diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/added_tokens.json b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/config.json b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/config.json new file mode 100644 index 0000000000000000000000000000000000000000..643480c33ae0d994d81260446eafd2d2d19994b6 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/config.json @@ -0,0 +1,197 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": true, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 7, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev32", + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.5, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/generation_config.json b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00001-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b0d0f0608781af054d2ec9a7f11c1cddf4a54ebe --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecf88bd826e2422e46bb44344ec13166b5528d8abe2979ea189721486cfb2d5a +size 4972489328 diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00002-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..679047eb2316229ea9ce6432972822dd68f8afd0 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75b6a1fccc27443f8bc30f9fdd03af9e806e13573c5f0e18414d698b93fefd46 +size 4985976068 diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00003-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..746b57a74af5a3c1175769f99b3881ba1afc6981 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86a9b747ea14004ff2860bafea7a5c9328df2c8c1adace994557c745dcb7cdc +size 248943552 diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model.safetensors.index.json b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..7cdc5da041253f30bfca8dad5f6a64a31333d1b4 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model.safetensors.index.json @@ -0,0 +1,1033 @@ +{ + "metadata": { + "total_size": 10207261884 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/special_tokens_map.json b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/tokenizer.model b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/tokenizer_config.json b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/trainer_state.json b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..65d36a5b03dbbe47132a7aaf51811eec808329ca --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/trainer_state.json @@ -0,0 +1,249523 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05028445, + "auxiliary_loss_mlp": 0.02215396, + "balance_loss_clip": 2.43573999, + "balance_loss_mlp": 1.76983953, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 55.01357859957846, + "language_loss": 2.85272503, + "learning_rate": 0.0, + "loss": 1.94613922, + "num_input_tokens_seen": 19155, + "step": 1, + "time_per_iteration": 16.99275779724121 + }, + { + "auxiliary_loss_clip": 0.0338048, + "auxiliary_loss_mlp": 0.01458816, + "balance_loss_clip": 1.62775421, + "balance_loss_mlp": 1.18911731, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 37.49078228221029, + "language_loss": 1.82644057, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87483335, + "num_input_tokens_seen": 36175, + "step": 2, + "time_per_iteration": 2.3849055767059326 + }, + { + "auxiliary_loss_clip": 0.03319546, + "auxiliary_loss_mlp": 0.01440765, + "balance_loss_clip": 1.62554133, + "balance_loss_mlp": 1.18804145, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 55.653028379013236, + "language_loss": 1.57191491, + "learning_rate": 7.073439208833112e-07, + "loss": 1.61951792, + "num_input_tokens_seen": 54870, + "step": 3, + "time_per_iteration": 2.3462157249450684 + }, + { + "auxiliary_loss_clip": 0.03362697, + "auxiliary_loss_mlp": 0.01452561, + "balance_loss_clip": 1.6242913, + "balance_loss_mlp": 1.15634966, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.199026644670454, + "language_loss": 1.67298365, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72113633, + "num_input_tokens_seen": 74575, + "step": 4, + "time_per_iteration": 2.4163434505462646 + }, + { + "auxiliary_loss_clip": 0.03402577, + "auxiliary_loss_mlp": 0.01503903, + "balance_loss_clip": 1.6250391, + "balance_loss_mlp": 1.21627522, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 55.97094444330911, + "language_loss": 1.91248035, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.96154523, + "num_input_tokens_seen": 92580, + "step": 5, + "time_per_iteration": 2.6308891773223877 + }, + { + "auxiliary_loss_clip": 0.03371383, + "auxiliary_loss_mlp": 0.01516207, + "balance_loss_clip": 1.61565065, + "balance_loss_mlp": 1.22152174, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.20162643903161, + "language_loss": 1.60806251, + "learning_rate": 1.153628246576487e-06, + "loss": 1.65693855, + "num_input_tokens_seen": 109705, + "step": 6, + "time_per_iteration": 2.652102470397949 + }, + { + "auxiliary_loss_clip": 0.03354459, + "auxiliary_loss_mlp": 0.01487812, + "balance_loss_clip": 1.61590815, + "balance_loss_mlp": 1.20380831, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 24.802051336661247, + "language_loss": 1.5341363, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58255911, + "num_input_tokens_seen": 129425, + "step": 7, + "time_per_iteration": 2.8288254737854004 + }, + { + "auxiliary_loss_clip": 0.03323346, + "auxiliary_loss_mlp": 0.01443314, + "balance_loss_clip": 1.6124599, + "balance_loss_mlp": 1.16579437, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 32.69766189188073, + "language_loss": 1.43773413, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48540068, + "num_input_tokens_seen": 149210, + "step": 8, + "time_per_iteration": 2.720519542694092 + }, + { + "auxiliary_loss_clip": 0.03370202, + "auxiliary_loss_mlp": 0.01495999, + "balance_loss_clip": 1.61180174, + "balance_loss_mlp": 1.21199441, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 33.084666501044424, + "language_loss": 1.4983716, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54703355, + "num_input_tokens_seen": 169055, + "step": 9, + "time_per_iteration": 2.718416452407837 + }, + { + "auxiliary_loss_clip": 0.03311124, + "auxiliary_loss_mlp": 0.01475296, + "balance_loss_clip": 1.61572063, + "balance_loss_mlp": 1.20635986, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 24.284744416411108, + "language_loss": 1.44740689, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49527097, + "num_input_tokens_seen": 188045, + "step": 10, + "time_per_iteration": 2.623753070831299 + }, + { + "auxiliary_loss_clip": 0.03366701, + "auxiliary_loss_mlp": 0.0149234, + "balance_loss_clip": 1.62098789, + "balance_loss_mlp": 1.21844459, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 19.36188675006359, + "language_loss": 1.45250297, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.50109339, + "num_input_tokens_seen": 207035, + "step": 11, + "time_per_iteration": 2.625054121017456 + }, + { + "auxiliary_loss_clip": 0.03295037, + "auxiliary_loss_mlp": 0.01450204, + "balance_loss_clip": 1.60805058, + "balance_loss_mlp": 1.17363846, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.694362924007965, + "language_loss": 1.4494009, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.49685335, + "num_input_tokens_seen": 223225, + "step": 12, + "time_per_iteration": 2.6011135578155518 + }, + { + "auxiliary_loss_clip": 0.03324181, + "auxiliary_loss_mlp": 0.01405718, + "balance_loss_clip": 1.61745763, + "balance_loss_mlp": 1.14708138, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 11.930397699172165, + "language_loss": 1.25121629, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.2985152, + "num_input_tokens_seen": 242570, + "step": 13, + "time_per_iteration": 2.645387649536133 + }, + { + "auxiliary_loss_clip": 0.03291187, + "auxiliary_loss_mlp": 0.0147245, + "balance_loss_clip": 1.61312854, + "balance_loss_mlp": 1.20351434, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.776027091153949, + "language_loss": 1.20676839, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25440478, + "num_input_tokens_seen": 261215, + "step": 14, + "time_per_iteration": 2.638593912124634 + }, + { + "auxiliary_loss_clip": 0.03275891, + "auxiliary_loss_mlp": 0.01432526, + "balance_loss_clip": 1.61814392, + "balance_loss_mlp": 1.16873956, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.190679917829633, + "language_loss": 1.13171339, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.17879748, + "num_input_tokens_seen": 280035, + "step": 15, + "time_per_iteration": 2.7264766693115234 + }, + { + "auxiliary_loss_clip": 0.03242807, + "auxiliary_loss_mlp": 0.0141162, + "balance_loss_clip": 1.60349536, + "balance_loss_mlp": 1.16194773, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.756327584046069, + "language_loss": 1.11124468, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15778887, + "num_input_tokens_seen": 300265, + "step": 16, + "time_per_iteration": 4.146451711654663 + }, + { + "auxiliary_loss_clip": 0.03229211, + "auxiliary_loss_mlp": 0.01416279, + "balance_loss_clip": 1.60991216, + "balance_loss_mlp": 1.17709792, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 4.785580631652641, + "language_loss": 1.12813509, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17458999, + "num_input_tokens_seen": 317375, + "step": 17, + "time_per_iteration": 4.026803970336914 + }, + { + "auxiliary_loss_clip": 0.03164909, + "auxiliary_loss_mlp": 0.01380013, + "balance_loss_clip": 1.60722864, + "balance_loss_mlp": 1.14884233, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.852530018715462, + "language_loss": 1.08151329, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12696242, + "num_input_tokens_seen": 337975, + "step": 18, + "time_per_iteration": 2.7187507152557373 + }, + { + "auxiliary_loss_clip": 0.03191582, + "auxiliary_loss_mlp": 0.01401165, + "balance_loss_clip": 1.60632813, + "balance_loss_mlp": 1.13585305, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.361063024456757, + "language_loss": 1.02253509, + "learning_rate": 1.89578346593066e-06, + "loss": 1.06846261, + "num_input_tokens_seen": 356635, + "step": 19, + "time_per_iteration": 2.6398427486419678 + }, + { + "auxiliary_loss_clip": 0.03134825, + "auxiliary_loss_mlp": 0.01341294, + "balance_loss_clip": 1.60777926, + "balance_loss_mlp": 1.12061334, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 5.814681839034755, + "language_loss": 1.16637862, + "learning_rate": 1.928808765521199e-06, + "loss": 1.2111398, + "num_input_tokens_seen": 375625, + "step": 20, + "time_per_iteration": 2.600081443786621 + }, + { + "auxiliary_loss_clip": 0.03121253, + "auxiliary_loss_mlp": 0.01379985, + "balance_loss_clip": 1.58914185, + "balance_loss_mlp": 1.13088524, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 4.204394836693334, + "language_loss": 1.05888343, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.10389578, + "num_input_tokens_seen": 394350, + "step": 21, + "time_per_iteration": 2.6679067611694336 + }, + { + "auxiliary_loss_clip": 0.0301829, + "auxiliary_loss_mlp": 0.01378849, + "balance_loss_clip": 1.57123065, + "balance_loss_mlp": 1.14519882, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 5.251892981733023, + "language_loss": 1.05595446, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.09992576, + "num_input_tokens_seen": 413255, + "step": 22, + "time_per_iteration": 2.656960964202881 + }, + { + "auxiliary_loss_clip": 0.02971793, + "auxiliary_loss_mlp": 0.01335988, + "balance_loss_clip": 1.5730052, + "balance_loss_mlp": 1.12541664, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 3.290668368052331, + "language_loss": 0.9190129, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96209079, + "num_input_tokens_seen": 433065, + "step": 23, + "time_per_iteration": 2.6649303436279297 + }, + { + "auxiliary_loss_clip": 0.02940327, + "auxiliary_loss_mlp": 0.0136417, + "balance_loss_clip": 1.56496239, + "balance_loss_mlp": 1.14310789, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.8703676529006725, + "language_loss": 1.08213818, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12518322, + "num_input_tokens_seen": 451175, + "step": 24, + "time_per_iteration": 2.6021625995635986 + }, + { + "auxiliary_loss_clip": 0.02831437, + "auxiliary_loss_mlp": 0.01330118, + "balance_loss_clip": 1.55828047, + "balance_loss_mlp": 1.11907029, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 3.437927953801944, + "language_loss": 1.01184893, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05346465, + "num_input_tokens_seen": 468775, + "step": 25, + "time_per_iteration": 2.605135917663574 + }, + { + "auxiliary_loss_clip": 0.02825907, + "auxiliary_loss_mlp": 0.01310276, + "balance_loss_clip": 1.56112146, + "balance_loss_mlp": 1.10027707, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 3.027209569216652, + "language_loss": 1.06458044, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10594225, + "num_input_tokens_seen": 488530, + "step": 26, + "time_per_iteration": 2.623779296875 + }, + { + "auxiliary_loss_clip": 0.02768452, + "auxiliary_loss_mlp": 0.01326988, + "balance_loss_clip": 1.5512917, + "balance_loss_mlp": 1.12652588, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 3.132022159767438, + "language_loss": 0.95463276, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99558723, + "num_input_tokens_seen": 510495, + "step": 27, + "time_per_iteration": 2.6869874000549316 + }, + { + "auxiliary_loss_clip": 0.02746207, + "auxiliary_loss_mlp": 0.01313806, + "balance_loss_clip": 1.55559921, + "balance_loss_mlp": 1.1321311, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.908655169413799, + "language_loss": 1.06381345, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.10441375, + "num_input_tokens_seen": 528605, + "step": 28, + "time_per_iteration": 2.6386659145355225 + }, + { + "auxiliary_loss_clip": 0.02711651, + "auxiliary_loss_mlp": 0.01319167, + "balance_loss_clip": 1.54097176, + "balance_loss_mlp": 1.13224649, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 3.3122863565797735, + "language_loss": 1.02549672, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06580496, + "num_input_tokens_seen": 548515, + "step": 29, + "time_per_iteration": 2.6951634883880615 + }, + { + "auxiliary_loss_clip": 0.02703975, + "auxiliary_loss_mlp": 0.01313834, + "balance_loss_clip": 1.53679454, + "balance_loss_mlp": 1.12700963, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 5.40849341436269, + "language_loss": 1.19384718, + "learning_rate": 2.189868360711334e-06, + "loss": 1.23402524, + "num_input_tokens_seen": 564025, + "step": 30, + "time_per_iteration": 2.6403684616088867 + }, + { + "auxiliary_loss_clip": 0.02623687, + "auxiliary_loss_mlp": 0.01338607, + "balance_loss_clip": 1.52349603, + "balance_loss_mlp": 1.15683651, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 3.090533627698779, + "language_loss": 1.02460873, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06423163, + "num_input_tokens_seen": 583345, + "step": 31, + "time_per_iteration": 2.734675168991089 + }, + { + "auxiliary_loss_clip": 0.0259671, + "auxiliary_loss_mlp": 0.01328545, + "balance_loss_clip": 1.52546322, + "balance_loss_mlp": 1.14820552, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 2.503935581784127, + "language_loss": 0.9555307, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99478328, + "num_input_tokens_seen": 600010, + "step": 32, + "time_per_iteration": 2.6334242820739746 + }, + { + "auxiliary_loss_clip": 0.02578119, + "auxiliary_loss_mlp": 0.01303937, + "balance_loss_clip": 1.52054727, + "balance_loss_mlp": 1.13618588, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.108254377730023, + "language_loss": 0.95230979, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99113035, + "num_input_tokens_seen": 616295, + "step": 33, + "time_per_iteration": 2.661289691925049 + }, + { + "auxiliary_loss_clip": 0.02432996, + "auxiliary_loss_mlp": 0.0130296, + "balance_loss_clip": 1.48701143, + "balance_loss_mlp": 1.14493632, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 2.310624037185416, + "language_loss": 0.91532081, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95268041, + "num_input_tokens_seen": 637640, + "step": 34, + "time_per_iteration": 2.6363108158111572 + }, + { + "auxiliary_loss_clip": 0.02387556, + "auxiliary_loss_mlp": 0.01270295, + "balance_loss_clip": 1.45412207, + "balance_loss_mlp": 1.11503696, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 3.0828346030924734, + "language_loss": 0.76627553, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80285406, + "num_input_tokens_seen": 659710, + "step": 35, + "time_per_iteration": 2.8584158420562744 + }, + { + "auxiliary_loss_clip": 0.02358708, + "auxiliary_loss_mlp": 0.01272597, + "balance_loss_clip": 1.46592367, + "balance_loss_mlp": 1.12840152, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.195611096879242, + "language_loss": 0.88666761, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92298067, + "num_input_tokens_seen": 679670, + "step": 36, + "time_per_iteration": 2.653135299682617 + }, + { + "auxiliary_loss_clip": 0.02300521, + "auxiliary_loss_mlp": 0.01332047, + "balance_loss_clip": 1.45313227, + "balance_loss_mlp": 1.18499005, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 2.5418100678790165, + "language_loss": 0.93061435, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96694005, + "num_input_tokens_seen": 700170, + "step": 37, + "time_per_iteration": 2.6911416053771973 + }, + { + "auxiliary_loss_clip": 0.02259628, + "auxiliary_loss_mlp": 0.01277966, + "balance_loss_clip": 1.44755137, + "balance_loss_mlp": 1.15618145, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.082459184016721, + "language_loss": 1.04144979, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07682574, + "num_input_tokens_seen": 718545, + "step": 38, + "time_per_iteration": 2.6396498680114746 + }, + { + "auxiliary_loss_clip": 0.02225626, + "auxiliary_loss_mlp": 0.01252245, + "balance_loss_clip": 1.43994188, + "balance_loss_mlp": 1.12903047, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 5.313432325919528, + "language_loss": 0.85210872, + "learning_rate": 2.358792165262154e-06, + "loss": 0.88688743, + "num_input_tokens_seen": 739865, + "step": 39, + "time_per_iteration": 2.66723895072937 + }, + { + "auxiliary_loss_clip": 0.02202873, + "auxiliary_loss_mlp": 0.01246424, + "balance_loss_clip": 1.43150878, + "balance_loss_mlp": 1.11815453, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.793220480827623, + "language_loss": 0.90291691, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93740988, + "num_input_tokens_seen": 755770, + "step": 40, + "time_per_iteration": 2.5831565856933594 + }, + { + "auxiliary_loss_clip": 0.02153578, + "auxiliary_loss_mlp": 0.01268788, + "balance_loss_clip": 1.42131376, + "balance_loss_mlp": 1.15625477, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 2.413102753597936, + "language_loss": 0.93449628, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96871996, + "num_input_tokens_seen": 773440, + "step": 41, + "time_per_iteration": 2.6257176399230957 + }, + { + "auxiliary_loss_clip": 0.02116382, + "auxiliary_loss_mlp": 0.01253326, + "balance_loss_clip": 1.41300488, + "balance_loss_mlp": 1.14980471, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 2.8133240440517358, + "language_loss": 0.97487867, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.0085758, + "num_input_tokens_seen": 790455, + "step": 42, + "time_per_iteration": 2.57853627204895 + }, + { + "auxiliary_loss_clip": 0.02077176, + "auxiliary_loss_mlp": 0.01301009, + "balance_loss_clip": 1.41385627, + "balance_loss_mlp": 1.19386375, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 3.0502234664997525, + "language_loss": 0.97647721, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.01025903, + "num_input_tokens_seen": 810645, + "step": 43, + "time_per_iteration": 2.714310646057129 + }, + { + "auxiliary_loss_clip": 0.02097273, + "auxiliary_loss_mlp": 0.01310826, + "balance_loss_clip": 1.41274703, + "balance_loss_mlp": 1.19886446, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 3.0983273158427767, + "language_loss": 0.93506986, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.9691509, + "num_input_tokens_seen": 827470, + "step": 44, + "time_per_iteration": 2.6341898441314697 + }, + { + "auxiliary_loss_clip": 0.02054953, + "auxiliary_loss_mlp": 0.01270993, + "balance_loss_clip": 1.40777385, + "balance_loss_mlp": 1.17243052, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 2.8302616844537782, + "language_loss": 0.98770845, + "learning_rate": 2.450927955901469e-06, + "loss": 1.02096796, + "num_input_tokens_seen": 847285, + "step": 45, + "time_per_iteration": 2.6566834449768066 + }, + { + "auxiliary_loss_clip": 0.02029826, + "auxiliary_loss_mlp": 0.01225262, + "balance_loss_clip": 1.3934747, + "balance_loss_mlp": 1.13838267, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.868172466252779, + "language_loss": 1.02683258, + "learning_rate": 2.465079122983384e-06, + "loss": 1.05938351, + "num_input_tokens_seen": 867545, + "step": 46, + "time_per_iteration": 2.653761863708496 + }, + { + "auxiliary_loss_clip": 0.0199908, + "auxiliary_loss_mlp": 0.01270466, + "balance_loss_clip": 1.38636351, + "balance_loss_mlp": 1.17967677, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 4.060285497059926, + "language_loss": 0.8791151, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91181052, + "num_input_tokens_seen": 889915, + "step": 47, + "time_per_iteration": 2.8718831539154053 + }, + { + "auxiliary_loss_clip": 0.01961072, + "auxiliary_loss_mlp": 0.01256785, + "balance_loss_clip": 1.37640202, + "balance_loss_mlp": 1.17042947, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 1.8293020190438818, + "language_loss": 0.88044512, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91262376, + "num_input_tokens_seen": 908975, + "step": 48, + "time_per_iteration": 2.686340808868408 + }, + { + "auxiliary_loss_clip": 0.019611, + "auxiliary_loss_mlp": 0.0124321, + "balance_loss_clip": 1.36616933, + "balance_loss_mlp": 1.15141857, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.428791124567508, + "language_loss": 0.89768839, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.92973149, + "num_input_tokens_seen": 929810, + "step": 49, + "time_per_iteration": 2.6370863914489746 + }, + { + "auxiliary_loss_clip": 0.01953053, + "auxiliary_loss_mlp": 0.0123489, + "balance_loss_clip": 1.35995233, + "balance_loss_mlp": 1.15010881, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 11.473727394974915, + "language_loss": 0.91041464, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94229406, + "num_input_tokens_seen": 948650, + "step": 50, + "time_per_iteration": 2.593992233276367 + }, + { + "auxiliary_loss_clip": 0.01951107, + "auxiliary_loss_mlp": 0.01200423, + "balance_loss_clip": 1.36499321, + "balance_loss_mlp": 1.11850226, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 4.8258098400106535, + "language_loss": 0.86964303, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90115839, + "num_input_tokens_seen": 966455, + "step": 51, + "time_per_iteration": 2.7042007446289062 + }, + { + "auxiliary_loss_clip": 0.01908105, + "auxiliary_loss_mlp": 0.01204295, + "balance_loss_clip": 1.35497737, + "balance_loss_mlp": 1.12518811, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 1.989038621137361, + "language_loss": 0.95351541, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98463935, + "num_input_tokens_seen": 988110, + "step": 52, + "time_per_iteration": 2.7988576889038086 + }, + { + "auxiliary_loss_clip": 0.01904863, + "auxiliary_loss_mlp": 0.01240036, + "balance_loss_clip": 1.35153794, + "balance_loss_mlp": 1.16045213, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 1.8714676577285358, + "language_loss": 0.9233852, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95483416, + "num_input_tokens_seen": 1008550, + "step": 53, + "time_per_iteration": 2.637906789779663 + }, + { + "auxiliary_loss_clip": 0.01894587, + "auxiliary_loss_mlp": 0.01195445, + "balance_loss_clip": 1.35532701, + "balance_loss_mlp": 1.1142869, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.473277836157304, + "language_loss": 0.82857406, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.85947442, + "num_input_tokens_seen": 1026840, + "step": 54, + "time_per_iteration": 2.5975570678710938 + }, + { + "auxiliary_loss_clip": 0.01891122, + "auxiliary_loss_mlp": 0.01207643, + "balance_loss_clip": 1.3442874, + "balance_loss_mlp": 1.12877393, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.5634287805853906, + "language_loss": 0.81393993, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84492755, + "num_input_tokens_seen": 1048875, + "step": 55, + "time_per_iteration": 2.720212936401367 + }, + { + "auxiliary_loss_clip": 0.01880174, + "auxiliary_loss_mlp": 0.01200385, + "balance_loss_clip": 1.33787608, + "balance_loss_mlp": 1.12075317, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.6298330734745137, + "language_loss": 0.86873615, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.89954174, + "num_input_tokens_seen": 1066435, + "step": 56, + "time_per_iteration": 4.121828556060791 + }, + { + "auxiliary_loss_clip": 0.01878911, + "auxiliary_loss_mlp": 0.01161092, + "balance_loss_clip": 1.32957554, + "balance_loss_mlp": 1.08699179, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 1.8087542009350004, + "language_loss": 0.92606533, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.9564653, + "num_input_tokens_seen": 1090330, + "step": 57, + "time_per_iteration": 4.128501892089844 + }, + { + "auxiliary_loss_clip": 0.01842975, + "auxiliary_loss_mlp": 0.01210818, + "balance_loss_clip": 1.33469343, + "balance_loss_mlp": 1.13810015, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.1137454433964495, + "language_loss": 0.9960705, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02660847, + "num_input_tokens_seen": 1109840, + "step": 58, + "time_per_iteration": 2.636068105697632 + }, + { + "auxiliary_loss_clip": 0.01824994, + "auxiliary_loss_mlp": 0.01196409, + "balance_loss_clip": 1.32101107, + "balance_loss_mlp": 1.12483585, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.159439125338344, + "language_loss": 0.8829025, + "learning_rate": 2.625331386578098e-06, + "loss": 0.91311646, + "num_input_tokens_seen": 1128415, + "step": 59, + "time_per_iteration": 2.5871942043304443 + }, + { + "auxiliary_loss_clip": 0.01847478, + "auxiliary_loss_mlp": 0.01163465, + "balance_loss_clip": 1.33004904, + "balance_loss_mlp": 1.08979321, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.717671784833823, + "language_loss": 0.9336943, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96380371, + "num_input_tokens_seen": 1146515, + "step": 60, + "time_per_iteration": 2.568873405456543 + }, + { + "auxiliary_loss_clip": 0.01824923, + "auxiliary_loss_mlp": 0.01172833, + "balance_loss_clip": 1.31321108, + "balance_loss_mlp": 1.10412073, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 3.0274292784063355, + "language_loss": 0.90047532, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.93045282, + "num_input_tokens_seen": 1166330, + "step": 61, + "time_per_iteration": 2.6147077083587646 + }, + { + "auxiliary_loss_clip": 0.01808161, + "auxiliary_loss_mlp": 0.01143043, + "balance_loss_clip": 1.30956578, + "balance_loss_mlp": 1.07542753, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 2.481927809435928, + "language_loss": 0.88622725, + "learning_rate": 2.657264485425803e-06, + "loss": 0.9157393, + "num_input_tokens_seen": 1186010, + "step": 62, + "time_per_iteration": 2.6347854137420654 + }, + { + "auxiliary_loss_clip": 0.0178877, + "auxiliary_loss_mlp": 0.01164638, + "balance_loss_clip": 1.30053854, + "balance_loss_mlp": 1.09406614, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.1448824570362603, + "language_loss": 0.96117878, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99071288, + "num_input_tokens_seen": 1204985, + "step": 63, + "time_per_iteration": 2.6861062049865723 + }, + { + "auxiliary_loss_clip": 0.01796666, + "auxiliary_loss_mlp": 0.01168357, + "balance_loss_clip": 1.30811977, + "balance_loss_mlp": 1.10145652, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 2.9704239132575223, + "language_loss": 0.98825753, + "learning_rate": 2.677705954159056e-06, + "loss": 1.01790786, + "num_input_tokens_seen": 1223545, + "step": 64, + "time_per_iteration": 2.666386842727661 + }, + { + "auxiliary_loss_clip": 0.01805463, + "auxiliary_loss_mlp": 0.01149932, + "balance_loss_clip": 1.30829644, + "balance_loss_mlp": 1.08193517, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.3088500493603554, + "language_loss": 0.85297942, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88253331, + "num_input_tokens_seen": 1241175, + "step": 65, + "time_per_iteration": 2.6331064701080322 + }, + { + "auxiliary_loss_clip": 0.01781582, + "auxiliary_loss_mlp": 0.01154703, + "balance_loss_clip": 1.29513025, + "balance_loss_mlp": 1.08680141, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 2.0853911148996334, + "language_loss": 0.8522647, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88162756, + "num_input_tokens_seen": 1259315, + "step": 66, + "time_per_iteration": 2.663648843765259 + }, + { + "auxiliary_loss_clip": 0.01784881, + "auxiliary_loss_mlp": 0.01153488, + "balance_loss_clip": 1.29285955, + "balance_loss_mlp": 1.07771873, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.255958069510289, + "language_loss": 0.96546042, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.9948442, + "num_input_tokens_seen": 1277055, + "step": 67, + "time_per_iteration": 2.6367568969726562 + }, + { + "auxiliary_loss_clip": 0.01755885, + "auxiliary_loss_mlp": 0.0115513, + "balance_loss_clip": 1.2865361, + "balance_loss_mlp": 1.08303261, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.844793388433355, + "language_loss": 0.94606817, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97517836, + "num_input_tokens_seen": 1294355, + "step": 68, + "time_per_iteration": 2.58103346824646 + }, + { + "auxiliary_loss_clip": 0.01749949, + "auxiliary_loss_mlp": 0.01156807, + "balance_loss_clip": 1.28382111, + "balance_loss_mlp": 1.0869031, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 2.0465709068319926, + "language_loss": 0.9573884, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98645598, + "num_input_tokens_seen": 1313525, + "step": 69, + "time_per_iteration": 2.586247682571411 + }, + { + "auxiliary_loss_clip": 0.01742994, + "auxiliary_loss_mlp": 0.0116278, + "balance_loss_clip": 1.28623533, + "balance_loss_mlp": 1.09735799, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.1383776965856405, + "language_loss": 0.97987217, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00892985, + "num_input_tokens_seen": 1330505, + "step": 70, + "time_per_iteration": 2.603670597076416 + }, + { + "auxiliary_loss_clip": 0.01748879, + "auxiliary_loss_mlp": 0.01146994, + "balance_loss_clip": 1.27590966, + "balance_loss_mlp": 1.07704139, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.457673276739256, + "language_loss": 0.93888831, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96784705, + "num_input_tokens_seen": 1349615, + "step": 71, + "time_per_iteration": 2.5643980503082275 + }, + { + "auxiliary_loss_clip": 0.01818744, + "auxiliary_loss_mlp": 0.01317245, + "balance_loss_clip": 1.43334627, + "balance_loss_mlp": 1.27871656, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.423979528506007, + "language_loss": 0.65686607, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68822598, + "num_input_tokens_seen": 1410275, + "step": 72, + "time_per_iteration": 3.138848304748535 + }, + { + "auxiliary_loss_clip": 0.01798594, + "auxiliary_loss_mlp": 0.01288939, + "balance_loss_clip": 1.42581332, + "balance_loss_mlp": 1.25117302, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.2516152660306035, + "language_loss": 0.63746232, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66833764, + "num_input_tokens_seen": 1473020, + "step": 73, + "time_per_iteration": 3.160325050354004 + }, + { + "auxiliary_loss_clip": 0.01722778, + "auxiliary_loss_mlp": 0.01144133, + "balance_loss_clip": 1.26790881, + "balance_loss_mlp": 1.07599318, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.7358368352783238, + "language_loss": 0.8603611, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88903022, + "num_input_tokens_seen": 1490385, + "step": 74, + "time_per_iteration": 2.611658811569214 + }, + { + "auxiliary_loss_clip": 0.01724844, + "auxiliary_loss_mlp": 0.01162642, + "balance_loss_clip": 1.26662505, + "balance_loss_mlp": 1.09373903, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.8514617346225926, + "language_loss": 0.97095287, + "learning_rate": 2.779824149153005e-06, + "loss": 0.9998278, + "num_input_tokens_seen": 1509725, + "step": 75, + "time_per_iteration": 2.622659206390381 + }, + { + "auxiliary_loss_clip": 0.01703703, + "auxiliary_loss_mlp": 0.01144625, + "balance_loss_clip": 1.262712, + "balance_loss_mlp": 1.07781959, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.199978747124834, + "language_loss": 0.87776697, + "learning_rate": 2.788352117317012e-06, + "loss": 0.90625024, + "num_input_tokens_seen": 1527245, + "step": 76, + "time_per_iteration": 2.6056952476501465 + }, + { + "auxiliary_loss_clip": 0.01704622, + "auxiliary_loss_mlp": 0.01147478, + "balance_loss_clip": 1.26140141, + "balance_loss_mlp": 1.07752585, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 2.026345188499006, + "language_loss": 0.91834164, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94686264, + "num_input_tokens_seen": 1548930, + "step": 77, + "time_per_iteration": 2.6821014881134033 + }, + { + "auxiliary_loss_clip": 0.01694606, + "auxiliary_loss_mlp": 0.01165935, + "balance_loss_clip": 1.26273429, + "balance_loss_mlp": 1.09536302, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.0921449243340953, + "language_loss": 0.92152309, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95012844, + "num_input_tokens_seen": 1565695, + "step": 78, + "time_per_iteration": 2.6102750301361084 + }, + { + "auxiliary_loss_clip": 0.01691268, + "auxiliary_loss_mlp": 0.01153831, + "balance_loss_clip": 1.25723553, + "balance_loss_mlp": 1.08488071, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.802128602132269, + "language_loss": 0.82570493, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85415596, + "num_input_tokens_seen": 1582625, + "step": 79, + "time_per_iteration": 2.6163341999053955 + }, + { + "auxiliary_loss_clip": 0.01703904, + "auxiliary_loss_mlp": 0.011355, + "balance_loss_clip": 1.2597903, + "balance_loss_mlp": 1.06511927, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 1.9235436207182746, + "language_loss": 0.91209447, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94048846, + "num_input_tokens_seen": 1601725, + "step": 80, + "time_per_iteration": 2.604393482208252 + }, + { + "auxiliary_loss_clip": 0.01672927, + "auxiliary_loss_mlp": 0.01141254, + "balance_loss_clip": 1.25235438, + "balance_loss_mlp": 1.07063484, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.2816037420892066, + "language_loss": 0.94980824, + "learning_rate": 2.829375683533245e-06, + "loss": 0.9779501, + "num_input_tokens_seen": 1622420, + "step": 81, + "time_per_iteration": 2.6310482025146484 + }, + { + "auxiliary_loss_clip": 0.01687507, + "auxiliary_loss_mlp": 0.01147981, + "balance_loss_clip": 1.25664902, + "balance_loss_mlp": 1.08160567, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 3.630597071165227, + "language_loss": 0.95833802, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.98669291, + "num_input_tokens_seen": 1640715, + "step": 82, + "time_per_iteration": 2.5686142444610596 + }, + { + "auxiliary_loss_clip": 0.0167031, + "auxiliary_loss_mlp": 0.01160232, + "balance_loss_clip": 1.2445631, + "balance_loss_mlp": 1.08994627, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.0524796222066537, + "language_loss": 0.86641341, + "learning_rate": 2.84508017388607e-06, + "loss": 0.89471883, + "num_input_tokens_seen": 1662210, + "step": 83, + "time_per_iteration": 2.6278891563415527 + }, + { + "auxiliary_loss_clip": 0.01662556, + "auxiliary_loss_mlp": 0.0115495, + "balance_loss_clip": 1.2453593, + "balance_loss_mlp": 1.08437765, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 2.628404349917254, + "language_loss": 0.91680586, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94498098, + "num_input_tokens_seen": 1681070, + "step": 84, + "time_per_iteration": 2.5797677040100098 + }, + { + "auxiliary_loss_clip": 0.01644173, + "auxiliary_loss_mlp": 0.01216214, + "balance_loss_clip": 1.35085094, + "balance_loss_mlp": 1.18035626, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.4037335074325064, + "language_loss": 0.62493408, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65353799, + "num_input_tokens_seen": 1747140, + "step": 85, + "time_per_iteration": 3.1743361949920654 + }, + { + "auxiliary_loss_clip": 0.01649806, + "auxiliary_loss_mlp": 0.01127309, + "balance_loss_clip": 1.23574376, + "balance_loss_mlp": 1.05630827, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.6723013988564652, + "language_loss": 0.90667963, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93445081, + "num_input_tokens_seen": 1767475, + "step": 86, + "time_per_iteration": 2.648634672164917 + }, + { + "auxiliary_loss_clip": 0.01654751, + "auxiliary_loss_mlp": 0.01162684, + "balance_loss_clip": 1.24177647, + "balance_loss_mlp": 1.09068131, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.2992435336131907, + "language_loss": 0.8184191, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84659344, + "num_input_tokens_seen": 1784980, + "step": 87, + "time_per_iteration": 2.584427833557129 + }, + { + "auxiliary_loss_clip": 0.01642766, + "auxiliary_loss_mlp": 0.01155615, + "balance_loss_clip": 1.23943424, + "balance_loss_mlp": 1.08618736, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.9051666552072895, + "language_loss": 0.95740676, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98539048, + "num_input_tokens_seen": 1803030, + "step": 88, + "time_per_iteration": 2.5711379051208496 + }, + { + "auxiliary_loss_clip": 0.01658267, + "auxiliary_loss_mlp": 0.01146548, + "balance_loss_clip": 1.23836982, + "balance_loss_mlp": 1.07888448, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 2.3338172352400233, + "language_loss": 0.86109859, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88914675, + "num_input_tokens_seen": 1822865, + "step": 89, + "time_per_iteration": 2.5905134677886963 + }, + { + "auxiliary_loss_clip": 0.01646594, + "auxiliary_loss_mlp": 0.01133051, + "balance_loss_clip": 1.23276305, + "balance_loss_mlp": 1.06591189, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 1.905087541321358, + "language_loss": 0.91609406, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94389045, + "num_input_tokens_seen": 1842435, + "step": 90, + "time_per_iteration": 2.6533782482147217 + }, + { + "auxiliary_loss_clip": 0.01626439, + "auxiliary_loss_mlp": 0.01133107, + "balance_loss_clip": 1.22822881, + "balance_loss_mlp": 1.06453812, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 3.182515783497975, + "language_loss": 0.85937059, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88696605, + "num_input_tokens_seen": 1860065, + "step": 91, + "time_per_iteration": 2.5663082599639893 + }, + { + "auxiliary_loss_clip": 0.01626309, + "auxiliary_loss_mlp": 0.01137525, + "balance_loss_clip": 1.22415185, + "balance_loss_mlp": 1.0710063, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.2624566345376933, + "language_loss": 0.86879933, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89643759, + "num_input_tokens_seen": 1878135, + "step": 92, + "time_per_iteration": 2.5949645042419434 + }, + { + "auxiliary_loss_clip": 0.01618782, + "auxiliary_loss_mlp": 0.0117537, + "balance_loss_clip": 1.21600091, + "balance_loss_mlp": 1.10727751, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 3.970204613063914, + "language_loss": 0.92043078, + "learning_rate": 2.918324080615938e-06, + "loss": 0.9483723, + "num_input_tokens_seen": 1894895, + "step": 93, + "time_per_iteration": 2.526972532272339 + }, + { + "auxiliary_loss_clip": 0.01631953, + "auxiliary_loss_mlp": 0.01150948, + "balance_loss_clip": 1.22291303, + "balance_loss_mlp": 1.07908869, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 2.196527342265881, + "language_loss": 0.87278342, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90061235, + "num_input_tokens_seen": 1913220, + "step": 94, + "time_per_iteration": 2.6020009517669678 + }, + { + "auxiliary_loss_clip": 0.01564436, + "auxiliary_loss_mlp": 0.0105773, + "balance_loss_clip": 1.31068063, + "balance_loss_mlp": 1.02301586, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3665628492705981, + "language_loss": 0.68097979, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70720148, + "num_input_tokens_seen": 1970970, + "step": 95, + "time_per_iteration": 4.440014600753784 + }, + { + "auxiliary_loss_clip": 0.01609532, + "auxiliary_loss_mlp": 0.01153418, + "balance_loss_clip": 1.21107829, + "balance_loss_mlp": 1.08565903, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 5.770707760705803, + "language_loss": 0.89880276, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92643225, + "num_input_tokens_seen": 1988930, + "step": 96, + "time_per_iteration": 4.033079147338867 + }, + { + "auxiliary_loss_clip": 0.0160074, + "auxiliary_loss_mlp": 0.01139091, + "balance_loss_clip": 1.21365428, + "balance_loss_mlp": 1.07605267, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 5.694332768028118, + "language_loss": 0.89812469, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92552304, + "num_input_tokens_seen": 2006285, + "step": 97, + "time_per_iteration": 5.4098756313323975 + }, + { + "auxiliary_loss_clip": 0.01589745, + "auxiliary_loss_mlp": 0.01136876, + "balance_loss_clip": 1.2067287, + "balance_loss_mlp": 1.06759119, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 1.879599026453437, + "language_loss": 0.76475173, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79201788, + "num_input_tokens_seen": 2024905, + "step": 98, + "time_per_iteration": 2.5702524185180664 + }, + { + "auxiliary_loss_clip": 0.01537396, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.29331899, + "balance_loss_mlp": 1.00993395, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0462556838186383, + "language_loss": 0.65503788, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.6808545, + "num_input_tokens_seen": 2086220, + "step": 99, + "time_per_iteration": 3.167391538619995 + }, + { + "auxiliary_loss_clip": 0.01590455, + "auxiliary_loss_mlp": 0.011438, + "balance_loss_clip": 1.20544827, + "balance_loss_mlp": 1.07265556, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.02140125623208, + "language_loss": 0.90818417, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93552667, + "num_input_tokens_seen": 2103365, + "step": 100, + "time_per_iteration": 2.599402904510498 + }, + { + "auxiliary_loss_clip": 0.0160306, + "auxiliary_loss_mlp": 0.01148967, + "balance_loss_clip": 1.20921731, + "balance_loss_mlp": 1.08178067, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 1.993058808395699, + "language_loss": 0.91183102, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93935126, + "num_input_tokens_seen": 2121995, + "step": 101, + "time_per_iteration": 2.5417609214782715 + }, + { + "auxiliary_loss_clip": 0.01590339, + "auxiliary_loss_mlp": 0.01152259, + "balance_loss_clip": 1.20873845, + "balance_loss_mlp": 1.081496, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 3.5599118564047587, + "language_loss": 0.90958905, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.93701506, + "num_input_tokens_seen": 2141815, + "step": 102, + "time_per_iteration": 2.6322169303894043 + }, + { + "auxiliary_loss_clip": 0.01585239, + "auxiliary_loss_mlp": 0.01134408, + "balance_loss_clip": 1.20647836, + "balance_loss_mlp": 1.0696063, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 2.2820438639775626, + "language_loss": 0.87700957, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.90420604, + "num_input_tokens_seen": 2161125, + "step": 103, + "time_per_iteration": 2.5723938941955566 + }, + { + "auxiliary_loss_clip": 0.01583837, + "auxiliary_loss_mlp": 0.01139131, + "balance_loss_clip": 1.20713913, + "balance_loss_mlp": 1.07189679, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.8784452113125296, + "language_loss": 0.93497616, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96220583, + "num_input_tokens_seen": 2179510, + "step": 104, + "time_per_iteration": 2.5587995052337646 + }, + { + "auxiliary_loss_clip": 0.01576697, + "auxiliary_loss_mlp": 0.01149345, + "balance_loss_clip": 1.19843483, + "balance_loss_mlp": 1.08454275, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.8634359088105565, + "language_loss": 0.96166813, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.98892856, + "num_input_tokens_seen": 2197870, + "step": 105, + "time_per_iteration": 2.5611565113067627 + }, + { + "auxiliary_loss_clip": 0.01573893, + "auxiliary_loss_mlp": 0.01158527, + "balance_loss_clip": 1.19741368, + "balance_loss_mlp": 1.08666754, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.4137376292814983, + "language_loss": 0.87135929, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89868349, + "num_input_tokens_seen": 2217495, + "step": 106, + "time_per_iteration": 2.6000452041625977 + }, + { + "auxiliary_loss_clip": 0.01557149, + "auxiliary_loss_mlp": 0.01144365, + "balance_loss_clip": 1.18400466, + "balance_loss_mlp": 1.07579625, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.2196601466721013, + "language_loss": 0.83377874, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86079395, + "num_input_tokens_seen": 2236520, + "step": 107, + "time_per_iteration": 2.545607566833496 + }, + { + "auxiliary_loss_clip": 0.01466773, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.24761844, + "balance_loss_mlp": 1.00264025, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9884052790927175, + "language_loss": 0.64819586, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67320657, + "num_input_tokens_seen": 2300140, + "step": 108, + "time_per_iteration": 3.15523362159729 + }, + { + "auxiliary_loss_clip": 0.01549976, + "auxiliary_loss_mlp": 0.01133518, + "balance_loss_clip": 1.18678892, + "balance_loss_mlp": 1.06413782, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 2.056749618220266, + "language_loss": 0.9765265, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00336146, + "num_input_tokens_seen": 2317320, + "step": 109, + "time_per_iteration": 2.6525185108184814 + }, + { + "auxiliary_loss_clip": 0.01547438, + "auxiliary_loss_mlp": 0.01143709, + "balance_loss_clip": 1.18782282, + "balance_loss_mlp": 1.07676089, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.706297525409082, + "language_loss": 0.83893037, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86584175, + "num_input_tokens_seen": 2337820, + "step": 110, + "time_per_iteration": 2.5954926013946533 + }, + { + "auxiliary_loss_clip": 0.01542128, + "auxiliary_loss_mlp": 0.01152754, + "balance_loss_clip": 1.18164968, + "balance_loss_mlp": 1.08575845, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 1.89946383500841, + "language_loss": 0.82925797, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85620677, + "num_input_tokens_seen": 2358560, + "step": 111, + "time_per_iteration": 2.6361405849456787 + }, + { + "auxiliary_loss_clip": 0.01543388, + "auxiliary_loss_mlp": 0.0113297, + "balance_loss_clip": 1.18476963, + "balance_loss_mlp": 1.06893039, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 2.4869990644584026, + "language_loss": 0.94044846, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96721202, + "num_input_tokens_seen": 2379005, + "step": 112, + "time_per_iteration": 2.573342800140381 + }, + { + "auxiliary_loss_clip": 0.01546726, + "auxiliary_loss_mlp": 0.01139094, + "balance_loss_clip": 1.18108237, + "balance_loss_mlp": 1.07467341, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.9615541580784486, + "language_loss": 0.79232472, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81918293, + "num_input_tokens_seen": 2395610, + "step": 113, + "time_per_iteration": 2.536496639251709 + }, + { + "auxiliary_loss_clip": 0.01535461, + "auxiliary_loss_mlp": 0.01133673, + "balance_loss_clip": 1.17780864, + "balance_loss_mlp": 1.06691599, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 3.8457483647645745, + "language_loss": 0.93229723, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95898855, + "num_input_tokens_seen": 2415005, + "step": 114, + "time_per_iteration": 2.5522632598876953 + }, + { + "auxiliary_loss_clip": 0.01542398, + "auxiliary_loss_mlp": 0.01133451, + "balance_loss_clip": 1.17769921, + "balance_loss_mlp": 1.07313156, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.3534736116202266, + "language_loss": 0.94563091, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97238934, + "num_input_tokens_seen": 2433965, + "step": 115, + "time_per_iteration": 2.559081792831421 + }, + { + "auxiliary_loss_clip": 0.0153625, + "auxiliary_loss_mlp": 0.01119503, + "balance_loss_clip": 1.1750865, + "balance_loss_mlp": 1.05241168, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 5.106132988016955, + "language_loss": 0.81887203, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.8454296, + "num_input_tokens_seen": 2451605, + "step": 116, + "time_per_iteration": 2.5519914627075195 + }, + { + "auxiliary_loss_clip": 0.01527799, + "auxiliary_loss_mlp": 0.01125652, + "balance_loss_clip": 1.17553246, + "balance_loss_mlp": 1.06104016, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 3.117230472985986, + "language_loss": 0.8807748, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90730929, + "num_input_tokens_seen": 2472035, + "step": 117, + "time_per_iteration": 2.5778846740722656 + }, + { + "auxiliary_loss_clip": 0.01527803, + "auxiliary_loss_mlp": 0.01147326, + "balance_loss_clip": 1.17342114, + "balance_loss_mlp": 1.08123636, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 3.477185744929344, + "language_loss": 0.84512496, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87187624, + "num_input_tokens_seen": 2489285, + "step": 118, + "time_per_iteration": 2.5213968753814697 + }, + { + "auxiliary_loss_clip": 0.01536659, + "auxiliary_loss_mlp": 0.01160996, + "balance_loss_clip": 1.17415309, + "balance_loss_mlp": 1.09562159, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.0430509805415946, + "language_loss": 0.99250436, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.01948094, + "num_input_tokens_seen": 2506460, + "step": 119, + "time_per_iteration": 2.5345778465270996 + }, + { + "auxiliary_loss_clip": 0.01539052, + "auxiliary_loss_mlp": 0.01122236, + "balance_loss_clip": 1.17097461, + "balance_loss_mlp": 1.05948377, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.621800670079336, + "language_loss": 0.89330608, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91991895, + "num_input_tokens_seen": 2525565, + "step": 120, + "time_per_iteration": 2.5346975326538086 + }, + { + "auxiliary_loss_clip": 0.01524846, + "auxiliary_loss_mlp": 0.01131799, + "balance_loss_clip": 1.17199504, + "balance_loss_mlp": 1.06709242, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.8290700386071395, + "language_loss": 0.93295485, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.95952129, + "num_input_tokens_seen": 2546605, + "step": 121, + "time_per_iteration": 2.6010594367980957 + }, + { + "auxiliary_loss_clip": 0.01525609, + "auxiliary_loss_mlp": 0.0114796, + "balance_loss_clip": 1.17215443, + "balance_loss_mlp": 1.08454037, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 2.3267676511363544, + "language_loss": 0.90109301, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92782873, + "num_input_tokens_seen": 2560730, + "step": 122, + "time_per_iteration": 2.5281710624694824 + }, + { + "auxiliary_loss_clip": 0.01520848, + "auxiliary_loss_mlp": 0.01145298, + "balance_loss_clip": 1.16624761, + "balance_loss_mlp": 1.07758701, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.151362274961761, + "language_loss": 0.92501247, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95167387, + "num_input_tokens_seen": 2579550, + "step": 123, + "time_per_iteration": 2.5765693187713623 + }, + { + "auxiliary_loss_clip": 0.01519205, + "auxiliary_loss_mlp": 0.01126951, + "balance_loss_clip": 1.16250467, + "balance_loss_mlp": 1.06415176, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 2.2436178720019684, + "language_loss": 0.71168, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73814154, + "num_input_tokens_seen": 2600390, + "step": 124, + "time_per_iteration": 2.615359306335449 + }, + { + "auxiliary_loss_clip": 0.01506924, + "auxiliary_loss_mlp": 0.01125487, + "balance_loss_clip": 1.16292334, + "balance_loss_mlp": 1.06149518, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.1127884158433394, + "language_loss": 0.88242519, + "learning_rate": 3.108720342404542e-06, + "loss": 0.90874934, + "num_input_tokens_seen": 2620770, + "step": 125, + "time_per_iteration": 2.673096179962158 + }, + { + "auxiliary_loss_clip": 0.01520494, + "auxiliary_loss_mlp": 0.01142731, + "balance_loss_clip": 1.16211152, + "balance_loss_mlp": 1.07921636, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.8706886191816308, + "language_loss": 0.82130033, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.84793258, + "num_input_tokens_seen": 2639900, + "step": 126, + "time_per_iteration": 2.5316643714904785 + }, + { + "auxiliary_loss_clip": 0.01515001, + "auxiliary_loss_mlp": 0.0114179, + "balance_loss_clip": 1.16189229, + "balance_loss_mlp": 1.07937193, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 3.0641964412253095, + "language_loss": 0.6726402, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.69920808, + "num_input_tokens_seen": 2657450, + "step": 127, + "time_per_iteration": 2.538203477859497 + }, + { + "auxiliary_loss_clip": 0.01504058, + "auxiliary_loss_mlp": 0.01133316, + "balance_loss_clip": 1.16425681, + "balance_loss_mlp": 1.06937242, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 2.028575890954625, + "language_loss": 0.8800832, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90645701, + "num_input_tokens_seen": 2678150, + "step": 128, + "time_per_iteration": 2.575871706008911 + }, + { + "auxiliary_loss_clip": 0.01505127, + "auxiliary_loss_mlp": 0.0114135, + "balance_loss_clip": 1.15851021, + "balance_loss_mlp": 1.07754874, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 2.0324298655872317, + "language_loss": 0.84685957, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87332433, + "num_input_tokens_seen": 2698290, + "step": 129, + "time_per_iteration": 2.624635934829712 + }, + { + "auxiliary_loss_clip": 0.01499639, + "auxiliary_loss_mlp": 0.01131195, + "balance_loss_clip": 1.15782952, + "balance_loss_mlp": 1.06701243, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 1.879545019927651, + "language_loss": 0.9737094, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00001776, + "num_input_tokens_seen": 2717630, + "step": 130, + "time_per_iteration": 2.5401411056518555 + }, + { + "auxiliary_loss_clip": 0.01492418, + "auxiliary_loss_mlp": 0.01134385, + "balance_loss_clip": 1.15452552, + "balance_loss_mlp": 1.06929719, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.134534295655713, + "language_loss": 0.82346332, + "learning_rate": 3.138906441556014e-06, + "loss": 0.84973133, + "num_input_tokens_seen": 2735835, + "step": 131, + "time_per_iteration": 2.532994031906128 + }, + { + "auxiliary_loss_clip": 0.01500929, + "auxiliary_loss_mlp": 0.01129268, + "balance_loss_clip": 1.15724015, + "balance_loss_mlp": 1.06742179, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 2.6259217574039235, + "language_loss": 0.82512367, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85142565, + "num_input_tokens_seen": 2756335, + "step": 132, + "time_per_iteration": 2.596552848815918 + }, + { + "auxiliary_loss_clip": 0.01493316, + "auxiliary_loss_mlp": 0.01128795, + "balance_loss_clip": 1.15224504, + "balance_loss_mlp": 1.0666157, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.077350586152436, + "language_loss": 0.95422804, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98044914, + "num_input_tokens_seen": 2775090, + "step": 133, + "time_per_iteration": 2.5284721851348877 + }, + { + "auxiliary_loss_clip": 0.01488915, + "auxiliary_loss_mlp": 0.01127747, + "balance_loss_clip": 1.16055775, + "balance_loss_mlp": 1.06690288, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.670944490475269, + "language_loss": 0.73289883, + "learning_rate": 3.153484849651286e-06, + "loss": 0.75906545, + "num_input_tokens_seen": 2795320, + "step": 134, + "time_per_iteration": 3.971756935119629 + }, + { + "auxiliary_loss_clip": 0.0148439, + "auxiliary_loss_mlp": 0.01134565, + "balance_loss_clip": 1.14918792, + "balance_loss_mlp": 1.06876183, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 6.755913887108555, + "language_loss": 0.88654339, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91273296, + "num_input_tokens_seen": 2812815, + "step": 135, + "time_per_iteration": 2.5557775497436523 + }, + { + "auxiliary_loss_clip": 0.01488183, + "auxiliary_loss_mlp": 0.01136355, + "balance_loss_clip": 1.15361536, + "balance_loss_mlp": 1.07059968, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.104151532326499, + "language_loss": 0.8918736, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.91811895, + "num_input_tokens_seen": 2830445, + "step": 136, + "time_per_iteration": 3.986496925354004 + }, + { + "auxiliary_loss_clip": 0.01483857, + "auxiliary_loss_mlp": 0.01110971, + "balance_loss_clip": 1.14740551, + "balance_loss_mlp": 1.05031681, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 2.37501423551953, + "language_loss": 0.84043127, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.8663795, + "num_input_tokens_seen": 2846965, + "step": 137, + "time_per_iteration": 3.9532763957977295 + }, + { + "auxiliary_loss_clip": 0.014809, + "auxiliary_loss_mlp": 0.01121093, + "balance_loss_clip": 1.1473484, + "balance_loss_mlp": 1.06072497, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 2.0450934072928493, + "language_loss": 0.90228134, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.92830127, + "num_input_tokens_seen": 2867520, + "step": 138, + "time_per_iteration": 3.9882595539093018 + }, + { + "auxiliary_loss_clip": 0.01469211, + "auxiliary_loss_mlp": 0.0112281, + "balance_loss_clip": 1.14618599, + "balance_loss_mlp": 1.05795979, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.758552657870626, + "language_loss": 0.91322923, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93914938, + "num_input_tokens_seen": 2885675, + "step": 139, + "time_per_iteration": 2.566387176513672 + }, + { + "auxiliary_loss_clip": 0.01486709, + "auxiliary_loss_mlp": 0.01124723, + "balance_loss_clip": 1.15344477, + "balance_loss_mlp": 1.06168556, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.4207252917009754, + "language_loss": 0.85676605, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88288039, + "num_input_tokens_seen": 2905960, + "step": 140, + "time_per_iteration": 2.5777218341827393 + }, + { + "auxiliary_loss_clip": 0.01473135, + "auxiliary_loss_mlp": 0.01122571, + "balance_loss_clip": 1.14732563, + "balance_loss_mlp": 1.06096411, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.487337502687863, + "language_loss": 0.84201372, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86797071, + "num_input_tokens_seen": 2922780, + "step": 141, + "time_per_iteration": 2.515693187713623 + }, + { + "auxiliary_loss_clip": 0.01476078, + "auxiliary_loss_mlp": 0.01133292, + "balance_loss_clip": 1.14427233, + "balance_loss_mlp": 1.07125592, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.3430660000853307, + "language_loss": 0.81109053, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.83718419, + "num_input_tokens_seen": 2938765, + "step": 142, + "time_per_iteration": 2.503025770187378 + }, + { + "auxiliary_loss_clip": 0.01377887, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.19453061, + "balance_loss_mlp": 1.00496507, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.4327746566994404, + "language_loss": 0.6690377, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69312179, + "num_input_tokens_seen": 3006665, + "step": 143, + "time_per_iteration": 3.233499526977539 + }, + { + "auxiliary_loss_clip": 0.01468624, + "auxiliary_loss_mlp": 0.01126051, + "balance_loss_clip": 1.14489794, + "balance_loss_mlp": 1.06444383, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 1.9933329888583515, + "language_loss": 0.84024644, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86619323, + "num_input_tokens_seen": 3024335, + "step": 144, + "time_per_iteration": 2.529996633529663 + }, + { + "auxiliary_loss_clip": 0.01455292, + "auxiliary_loss_mlp": 0.01113024, + "balance_loss_clip": 1.1383822, + "balance_loss_mlp": 1.0491755, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 1.856567970793949, + "language_loss": 0.88389611, + "learning_rate": 3.204280886775619e-06, + "loss": 0.90957928, + "num_input_tokens_seen": 3043300, + "step": 145, + "time_per_iteration": 2.5552010536193848 + }, + { + "auxiliary_loss_clip": 0.01471464, + "auxiliary_loss_mlp": 0.01127007, + "balance_loss_clip": 1.1412816, + "balance_loss_mlp": 1.06334925, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.9942065707337773, + "language_loss": 0.86046076, + "learning_rate": 3.208706005112005e-06, + "loss": 0.8864454, + "num_input_tokens_seen": 3064610, + "step": 146, + "time_per_iteration": 2.5782506465911865 + }, + { + "auxiliary_loss_clip": 0.01369002, + "auxiliary_loss_mlp": 0.01024942, + "balance_loss_clip": 1.18944621, + "balance_loss_mlp": 1.00052822, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8630183344709751, + "language_loss": 0.60113031, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62506974, + "num_input_tokens_seen": 3130385, + "step": 147, + "time_per_iteration": 3.1834676265716553 + }, + { + "auxiliary_loss_clip": 0.01462706, + "auxiliary_loss_mlp": 0.01125211, + "balance_loss_clip": 1.14459491, + "balance_loss_mlp": 1.06613111, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.8695487714308094, + "language_loss": 0.84663349, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.8725127, + "num_input_tokens_seen": 3149760, + "step": 148, + "time_per_iteration": 2.5618250370025635 + }, + { + "auxiliary_loss_clip": 0.01466776, + "auxiliary_loss_mlp": 0.01145326, + "balance_loss_clip": 1.14561868, + "balance_loss_mlp": 1.08014274, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.6368622521940295, + "language_loss": 0.88768446, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91380548, + "num_input_tokens_seen": 3164500, + "step": 149, + "time_per_iteration": 2.526968240737915 + }, + { + "auxiliary_loss_clip": 0.01463988, + "auxiliary_loss_mlp": 0.0111059, + "balance_loss_clip": 1.14106441, + "balance_loss_mlp": 1.05189121, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.2578738369873244, + "language_loss": 0.93016249, + "learning_rate": 3.226108474846181e-06, + "loss": 0.9559083, + "num_input_tokens_seen": 3182455, + "step": 150, + "time_per_iteration": 2.6052730083465576 + }, + { + "auxiliary_loss_clip": 0.01451798, + "auxiliary_loss_mlp": 0.01112177, + "balance_loss_clip": 1.13690603, + "balance_loss_mlp": 1.0552429, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1.8227245483382015, + "language_loss": 0.74353147, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76917118, + "num_input_tokens_seen": 3203995, + "step": 151, + "time_per_iteration": 2.6632320880889893 + }, + { + "auxiliary_loss_clip": 0.0146356, + "auxiliary_loss_mlp": 0.01127958, + "balance_loss_clip": 1.14092636, + "balance_loss_mlp": 1.06897366, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.3541730788414723, + "language_loss": 0.88215816, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90807343, + "num_input_tokens_seen": 3222575, + "step": 152, + "time_per_iteration": 2.5623040199279785 + }, + { + "auxiliary_loss_clip": 0.01462181, + "auxiliary_loss_mlp": 0.01124903, + "balance_loss_clip": 1.14587688, + "balance_loss_mlp": 1.06458282, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 8.44508662866577, + "language_loss": 0.8412686, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86713946, + "num_input_tokens_seen": 3240180, + "step": 153, + "time_per_iteration": 2.5392825603485107 + }, + { + "auxiliary_loss_clip": 0.01453668, + "auxiliary_loss_mlp": 0.01136619, + "balance_loss_clip": 1.1382004, + "balance_loss_mlp": 1.07529771, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 2.135524942311166, + "language_loss": 0.89753342, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92343628, + "num_input_tokens_seen": 3259800, + "step": 154, + "time_per_iteration": 2.5676019191741943 + }, + { + "auxiliary_loss_clip": 0.0145695, + "auxiliary_loss_mlp": 0.01152824, + "balance_loss_clip": 1.14049315, + "balance_loss_mlp": 1.09341037, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.1098913224199114, + "language_loss": 0.89755237, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92365015, + "num_input_tokens_seen": 3280400, + "step": 155, + "time_per_iteration": 2.6501522064208984 + }, + { + "auxiliary_loss_clip": 0.01461488, + "auxiliary_loss_mlp": 0.01116075, + "balance_loss_clip": 1.1386776, + "balance_loss_mlp": 1.0578053, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 2.454668078070131, + "language_loss": 0.86703807, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89281368, + "num_input_tokens_seen": 3297600, + "step": 156, + "time_per_iteration": 2.559771776199341 + }, + { + "auxiliary_loss_clip": 0.01458191, + "auxiliary_loss_mlp": 0.011202, + "balance_loss_clip": 1.14194357, + "balance_loss_mlp": 1.06159699, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.4872538935496387, + "language_loss": 0.99432427, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.02010822, + "num_input_tokens_seen": 3313635, + "step": 157, + "time_per_iteration": 2.6236939430236816 + }, + { + "auxiliary_loss_clip": 0.0144352, + "auxiliary_loss_mlp": 0.01142165, + "balance_loss_clip": 1.13685918, + "balance_loss_mlp": 1.0833714, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 9.897759605429817, + "language_loss": 0.88169312, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.90754998, + "num_input_tokens_seen": 3333735, + "step": 158, + "time_per_iteration": 2.6395232677459717 + }, + { + "auxiliary_loss_clip": 0.01451168, + "auxiliary_loss_mlp": 0.01123958, + "balance_loss_clip": 1.13722241, + "balance_loss_mlp": 1.06420982, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 2.2483293426212887, + "language_loss": 0.86527896, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89103019, + "num_input_tokens_seen": 3348800, + "step": 159, + "time_per_iteration": 2.5758678913116455 + }, + { + "auxiliary_loss_clip": 0.01440527, + "auxiliary_loss_mlp": 0.01134409, + "balance_loss_clip": 1.13238084, + "balance_loss_mlp": 1.07418501, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.7031681799806995, + "language_loss": 0.86509264, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89084196, + "num_input_tokens_seen": 3368595, + "step": 160, + "time_per_iteration": 2.570378303527832 + }, + { + "auxiliary_loss_clip": 0.0144815, + "auxiliary_loss_mlp": 0.01120158, + "balance_loss_clip": 1.13829505, + "balance_loss_mlp": 1.06460667, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.4890730128923133, + "language_loss": 0.91381037, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.93949354, + "num_input_tokens_seen": 3384975, + "step": 161, + "time_per_iteration": 2.5639450550079346 + }, + { + "auxiliary_loss_clip": 0.01452046, + "auxiliary_loss_mlp": 0.01111856, + "balance_loss_clip": 1.13818789, + "balance_loss_mlp": 1.05611348, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 2.068779281778435, + "language_loss": 0.91666651, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94230551, + "num_input_tokens_seen": 3404755, + "step": 162, + "time_per_iteration": 2.519643783569336 + }, + { + "auxiliary_loss_clip": 0.01324962, + "auxiliary_loss_mlp": 0.01023447, + "balance_loss_clip": 1.15750885, + "balance_loss_mlp": 1.0011307, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.1814360874494405, + "language_loss": 0.72414052, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74762464, + "num_input_tokens_seen": 3467210, + "step": 163, + "time_per_iteration": 3.077702283859253 + }, + { + "auxiliary_loss_clip": 0.01438618, + "auxiliary_loss_mlp": 0.01114658, + "balance_loss_clip": 1.13566279, + "balance_loss_mlp": 1.0580101, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.376655562638303, + "language_loss": 0.84604633, + "learning_rate": 3.283560135133457e-06, + "loss": 0.87157905, + "num_input_tokens_seen": 3483220, + "step": 164, + "time_per_iteration": 2.495720148086548 + }, + { + "auxiliary_loss_clip": 0.01429785, + "auxiliary_loss_mlp": 0.0110117, + "balance_loss_clip": 1.12715387, + "balance_loss_mlp": 1.04547596, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 1.959942751704732, + "language_loss": 0.89264327, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91795278, + "num_input_tokens_seen": 3501465, + "step": 165, + "time_per_iteration": 2.508188009262085 + }, + { + "auxiliary_loss_clip": 0.01433145, + "auxiliary_loss_mlp": 0.01129315, + "balance_loss_clip": 1.12803936, + "balance_loss_mlp": 1.06947172, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 2.034959724407712, + "language_loss": 0.80010056, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82572514, + "num_input_tokens_seen": 3520480, + "step": 166, + "time_per_iteration": 2.5566582679748535 + }, + { + "auxiliary_loss_clip": 0.01436965, + "auxiliary_loss_mlp": 0.01124857, + "balance_loss_clip": 1.13164473, + "balance_loss_mlp": 1.06625366, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 3.0175854741209123, + "language_loss": 0.92133796, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94695616, + "num_input_tokens_seen": 3539570, + "step": 167, + "time_per_iteration": 2.606592893600464 + }, + { + "auxiliary_loss_clip": 0.01428422, + "auxiliary_loss_mlp": 0.01134871, + "balance_loss_clip": 1.12982309, + "balance_loss_mlp": 1.07850885, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 2.685648534172902, + "language_loss": 0.90656072, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93219364, + "num_input_tokens_seen": 3555465, + "step": 168, + "time_per_iteration": 2.5074615478515625 + }, + { + "auxiliary_loss_clip": 0.01422821, + "auxiliary_loss_mlp": 0.01105962, + "balance_loss_clip": 1.12485814, + "balance_loss_mlp": 1.04869425, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 1.7893971309526824, + "language_loss": 0.87168789, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.8969757, + "num_input_tokens_seen": 3578970, + "step": 169, + "time_per_iteration": 2.6439402103424072 + }, + { + "auxiliary_loss_clip": 0.01424585, + "auxiliary_loss_mlp": 0.01112268, + "balance_loss_clip": 1.1270082, + "balance_loss_mlp": 1.0549047, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 1.8856687425623526, + "language_loss": 0.84661758, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87198615, + "num_input_tokens_seen": 3597275, + "step": 170, + "time_per_iteration": 2.5238234996795654 + }, + { + "auxiliary_loss_clip": 0.01433864, + "auxiliary_loss_mlp": 0.01134452, + "balance_loss_clip": 1.12747121, + "balance_loss_mlp": 1.0766592, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.631469797624154, + "language_loss": 0.89847147, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92415464, + "num_input_tokens_seen": 3618905, + "step": 171, + "time_per_iteration": 2.588636636734009 + }, + { + "auxiliary_loss_clip": 0.01427234, + "auxiliary_loss_mlp": 0.01107238, + "balance_loss_clip": 1.12941957, + "balance_loss_mlp": 1.05209208, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 9.179333839967295, + "language_loss": 0.88836658, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91371131, + "num_input_tokens_seen": 3639610, + "step": 172, + "time_per_iteration": 2.5616507530212402 + }, + { + "auxiliary_loss_clip": 0.01416918, + "auxiliary_loss_mlp": 0.01125083, + "balance_loss_clip": 1.1234026, + "balance_loss_mlp": 1.06912589, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 2.080497523175105, + "language_loss": 0.8116653, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83708531, + "num_input_tokens_seen": 3664030, + "step": 173, + "time_per_iteration": 2.65163254737854 + }, + { + "auxiliary_loss_clip": 0.01429001, + "auxiliary_loss_mlp": 0.01108889, + "balance_loss_clip": 1.12746823, + "balance_loss_mlp": 1.05581689, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 1.937710373907411, + "language_loss": 0.82660449, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85198331, + "num_input_tokens_seen": 3683615, + "step": 174, + "time_per_iteration": 4.0156121253967285 + }, + { + "auxiliary_loss_clip": 0.01421922, + "auxiliary_loss_mlp": 0.01119614, + "balance_loss_clip": 1.12405479, + "balance_loss_mlp": 1.06434822, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.867165436494436, + "language_loss": 0.72900623, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75442159, + "num_input_tokens_seen": 3704540, + "step": 175, + "time_per_iteration": 4.042726039886475 + }, + { + "auxiliary_loss_clip": 0.01424917, + "auxiliary_loss_mlp": 0.01129671, + "balance_loss_clip": 1.12494874, + "balance_loss_mlp": 1.07245028, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.4634535220464002, + "language_loss": 0.97760952, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00315535, + "num_input_tokens_seen": 3721320, + "step": 176, + "time_per_iteration": 2.491910219192505 + }, + { + "auxiliary_loss_clip": 0.01410739, + "auxiliary_loss_mlp": 0.01132274, + "balance_loss_clip": 1.12193346, + "balance_loss_mlp": 1.07877326, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.3342550838988023, + "language_loss": 0.76849347, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79392362, + "num_input_tokens_seen": 3739385, + "step": 177, + "time_per_iteration": 2.528158664703369 + }, + { + "auxiliary_loss_clip": 0.01420328, + "auxiliary_loss_mlp": 0.01103021, + "balance_loss_clip": 1.12002087, + "balance_loss_mlp": 1.04837596, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.401178401864099, + "language_loss": 0.7671752, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.7924087, + "num_input_tokens_seen": 3756360, + "step": 178, + "time_per_iteration": 4.009030342102051 + }, + { + "auxiliary_loss_clip": 0.01428337, + "auxiliary_loss_mlp": 0.0110923, + "balance_loss_clip": 1.12530255, + "balance_loss_mlp": 1.05258179, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.1909062564575477, + "language_loss": 0.84103662, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86641228, + "num_input_tokens_seen": 3773930, + "step": 179, + "time_per_iteration": 2.4920654296875 + }, + { + "auxiliary_loss_clip": 0.0141853, + "auxiliary_loss_mlp": 0.01111442, + "balance_loss_clip": 1.11901331, + "balance_loss_mlp": 1.05505657, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 2.895263210241493, + "language_loss": 0.83807492, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86337459, + "num_input_tokens_seen": 3793630, + "step": 180, + "time_per_iteration": 2.591749668121338 + }, + { + "auxiliary_loss_clip": 0.01418791, + "auxiliary_loss_mlp": 0.01122284, + "balance_loss_clip": 1.12184405, + "balance_loss_mlp": 1.06666064, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 2.4244751164357905, + "language_loss": 0.78023994, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80565071, + "num_input_tokens_seen": 3813610, + "step": 181, + "time_per_iteration": 2.543980598449707 + }, + { + "auxiliary_loss_clip": 0.01415187, + "auxiliary_loss_mlp": 0.01130739, + "balance_loss_clip": 1.11900496, + "balance_loss_mlp": 1.07628417, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 2.539809383717012, + "language_loss": 0.7621693, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78762853, + "num_input_tokens_seen": 3831390, + "step": 182, + "time_per_iteration": 2.5410749912261963 + }, + { + "auxiliary_loss_clip": 0.0141105, + "auxiliary_loss_mlp": 0.01123807, + "balance_loss_clip": 1.11725974, + "balance_loss_mlp": 1.06782627, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.902568813367098, + "language_loss": 0.87677443, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.90212297, + "num_input_tokens_seen": 3849705, + "step": 183, + "time_per_iteration": 2.4920449256896973 + }, + { + "auxiliary_loss_clip": 0.0141387, + "auxiliary_loss_mlp": 0.01112238, + "balance_loss_clip": 1.1202141, + "balance_loss_mlp": 1.06114459, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.372349688132654, + "language_loss": 0.86359632, + "learning_rate": 3.357647774369736e-06, + "loss": 0.88885742, + "num_input_tokens_seen": 3869230, + "step": 184, + "time_per_iteration": 2.542818307876587 + }, + { + "auxiliary_loss_clip": 0.01410566, + "auxiliary_loss_mlp": 0.01106953, + "balance_loss_clip": 1.12213778, + "balance_loss_mlp": 1.05178273, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 2.0227376471271254, + "language_loss": 0.83709204, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86226726, + "num_input_tokens_seen": 3889735, + "step": 185, + "time_per_iteration": 2.596196174621582 + }, + { + "auxiliary_loss_clip": 0.01420135, + "auxiliary_loss_mlp": 0.01112201, + "balance_loss_clip": 1.12022018, + "balance_loss_mlp": 1.05459857, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.3294978392264305, + "language_loss": 0.71079755, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73612094, + "num_input_tokens_seen": 3908855, + "step": 186, + "time_per_iteration": 2.5250742435455322 + }, + { + "auxiliary_loss_clip": 0.01416801, + "auxiliary_loss_mlp": 0.01106834, + "balance_loss_clip": 1.12015224, + "balance_loss_mlp": 1.05478764, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 2.50485571892187, + "language_loss": 1.01920462, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04444098, + "num_input_tokens_seen": 3923865, + "step": 187, + "time_per_iteration": 2.4954895973205566 + }, + { + "auxiliary_loss_clip": 0.01404925, + "auxiliary_loss_mlp": 0.01112499, + "balance_loss_clip": 1.11881971, + "balance_loss_mlp": 1.05737662, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.8043668651053981, + "language_loss": 0.75147086, + "learning_rate": 3.371494591560139e-06, + "loss": 0.77664506, + "num_input_tokens_seen": 3946870, + "step": 188, + "time_per_iteration": 2.703624725341797 + }, + { + "auxiliary_loss_clip": 0.01303787, + "auxiliary_loss_mlp": 0.01026149, + "balance_loss_clip": 1.144719, + "balance_loss_mlp": 1.00564504, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.8043959880166834, + "language_loss": 0.56250858, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.58580798, + "num_input_tokens_seen": 4010005, + "step": 189, + "time_per_iteration": 3.1372928619384766 + }, + { + "auxiliary_loss_clip": 0.0140351, + "auxiliary_loss_mlp": 0.01123016, + "balance_loss_clip": 1.11526453, + "balance_loss_mlp": 1.06765556, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.243696584226902, + "language_loss": 0.95039362, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97565895, + "num_input_tokens_seen": 4029035, + "step": 190, + "time_per_iteration": 2.5581321716308594 + }, + { + "auxiliary_loss_clip": 0.01404613, + "auxiliary_loss_mlp": 0.01104047, + "balance_loss_clip": 1.11628664, + "balance_loss_mlp": 1.05211902, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 3.048149478903249, + "language_loss": 0.84600115, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.87108773, + "num_input_tokens_seen": 4046995, + "step": 191, + "time_per_iteration": 2.493659734725952 + }, + { + "auxiliary_loss_clip": 0.01403055, + "auxiliary_loss_mlp": 0.01121309, + "balance_loss_clip": 1.1124965, + "balance_loss_mlp": 1.06961942, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 1.7989602221217813, + "language_loss": 0.91702729, + "learning_rate": 3.385049875042367e-06, + "loss": 0.94227093, + "num_input_tokens_seen": 4065865, + "step": 192, + "time_per_iteration": 2.5433974266052246 + }, + { + "auxiliary_loss_clip": 0.0139935, + "auxiliary_loss_mlp": 0.01117081, + "balance_loss_clip": 1.11455202, + "balance_loss_mlp": 1.06033778, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.5396593924267172, + "language_loss": 0.86945581, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89462018, + "num_input_tokens_seen": 4085305, + "step": 193, + "time_per_iteration": 2.5717296600341797 + }, + { + "auxiliary_loss_clip": 0.01402217, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_clip": 1.11234474, + "balance_loss_mlp": 1.04744005, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.0606635661222215, + "language_loss": 0.92491519, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.94992459, + "num_input_tokens_seen": 4105185, + "step": 194, + "time_per_iteration": 2.5559895038604736 + }, + { + "auxiliary_loss_clip": 0.01406937, + "auxiliary_loss_mlp": 0.01107148, + "balance_loss_clip": 1.11708724, + "balance_loss_mlp": 1.05510163, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 2.186803157282392, + "language_loss": 0.90019292, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92533374, + "num_input_tokens_seen": 4123160, + "step": 195, + "time_per_iteration": 2.5242624282836914 + }, + { + "auxiliary_loss_clip": 0.0140224, + "auxiliary_loss_mlp": 0.01117667, + "balance_loss_clip": 1.11658239, + "balance_loss_mlp": 1.06328404, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 2.6168067841642957, + "language_loss": 0.85848749, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88368654, + "num_input_tokens_seen": 4140425, + "step": 196, + "time_per_iteration": 2.483330488204956 + }, + { + "auxiliary_loss_clip": 0.01398877, + "auxiliary_loss_mlp": 0.01107598, + "balance_loss_clip": 1.11268377, + "balance_loss_mlp": 1.05409718, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 4.278581472749545, + "language_loss": 0.93384922, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95891398, + "num_input_tokens_seen": 4159555, + "step": 197, + "time_per_iteration": 2.5415878295898438 + }, + { + "auxiliary_loss_clip": 0.013957, + "auxiliary_loss_mlp": 0.01112554, + "balance_loss_clip": 1.11391592, + "balance_loss_mlp": 1.06036448, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.0763528005483036, + "language_loss": 0.79166043, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81674302, + "num_input_tokens_seen": 4180480, + "step": 198, + "time_per_iteration": 2.5906691551208496 + }, + { + "auxiliary_loss_clip": 0.01392775, + "auxiliary_loss_mlp": 0.01119902, + "balance_loss_clip": 1.11453617, + "balance_loss_mlp": 1.06835556, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 1.8840332463768013, + "language_loss": 0.88235807, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90748489, + "num_input_tokens_seen": 4198835, + "step": 199, + "time_per_iteration": 2.5263924598693848 + }, + { + "auxiliary_loss_clip": 0.01405256, + "auxiliary_loss_mlp": 0.01120087, + "balance_loss_clip": 1.11836088, + "balance_loss_mlp": 1.06379652, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 1.9744168777458815, + "language_loss": 0.81425297, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83950639, + "num_input_tokens_seen": 4219335, + "step": 200, + "time_per_iteration": 2.52980637550354 + }, + { + "auxiliary_loss_clip": 0.01401754, + "auxiliary_loss_mlp": 0.01103853, + "balance_loss_clip": 1.11355186, + "balance_loss_mlp": 1.05054271, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.6239616169801914, + "language_loss": 0.87604213, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90109813, + "num_input_tokens_seen": 4236940, + "step": 201, + "time_per_iteration": 2.5011496543884277 + }, + { + "auxiliary_loss_clip": 0.01400285, + "auxiliary_loss_mlp": 0.01108675, + "balance_loss_clip": 1.11617351, + "balance_loss_mlp": 1.0547924, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 1.6962275127100719, + "language_loss": 0.84156752, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86665714, + "num_input_tokens_seen": 4256755, + "step": 202, + "time_per_iteration": 2.50809907913208 + }, + { + "auxiliary_loss_clip": 0.01389812, + "auxiliary_loss_mlp": 0.0110599, + "balance_loss_clip": 1.10878146, + "balance_loss_mlp": 1.05403876, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 2.0716236546521785, + "language_loss": 0.90061975, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.92557782, + "num_input_tokens_seen": 4276505, + "step": 203, + "time_per_iteration": 2.525484085083008 + }, + { + "auxiliary_loss_clip": 0.01274064, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.12270176, + "balance_loss_mlp": 1.01063585, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0218184150574559, + "language_loss": 0.61291921, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.635957, + "num_input_tokens_seen": 4330965, + "step": 204, + "time_per_iteration": 2.9888126850128174 + }, + { + "auxiliary_loss_clip": 0.01398182, + "auxiliary_loss_mlp": 0.0110966, + "balance_loss_clip": 1.11198711, + "balance_loss_mlp": 1.05751801, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.351810878884594, + "language_loss": 0.91188943, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93696785, + "num_input_tokens_seen": 4348200, + "step": 205, + "time_per_iteration": 2.4792327880859375 + }, + { + "auxiliary_loss_clip": 0.01404408, + "auxiliary_loss_mlp": 0.01120556, + "balance_loss_clip": 1.11587512, + "balance_loss_mlp": 1.06722152, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.0165616699302227, + "language_loss": 0.89331889, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91856849, + "num_input_tokens_seen": 4365460, + "step": 206, + "time_per_iteration": 2.515995979309082 + }, + { + "auxiliary_loss_clip": 0.01394564, + "auxiliary_loss_mlp": 0.01102468, + "balance_loss_clip": 1.10995936, + "balance_loss_mlp": 1.05158901, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.177288988223646, + "language_loss": 0.95367503, + "learning_rate": 3.43348263905683e-06, + "loss": 0.97864532, + "num_input_tokens_seen": 4383650, + "step": 207, + "time_per_iteration": 2.514897108078003 + }, + { + "auxiliary_loss_clip": 0.01393886, + "auxiliary_loss_mlp": 0.01114755, + "balance_loss_clip": 1.11487579, + "balance_loss_mlp": 1.06304193, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 1.6545627437962536, + "language_loss": 0.75969583, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78478229, + "num_input_tokens_seen": 4403765, + "step": 208, + "time_per_iteration": 2.5217947959899902 + }, + { + "auxiliary_loss_clip": 0.01382313, + "auxiliary_loss_mlp": 0.0110806, + "balance_loss_clip": 1.10808253, + "balance_loss_mlp": 1.05596566, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.150597773075941, + "language_loss": 0.98711002, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01201379, + "num_input_tokens_seen": 4421935, + "step": 209, + "time_per_iteration": 2.5703837871551514 + }, + { + "auxiliary_loss_clip": 0.01387476, + "auxiliary_loss_mlp": 0.01112405, + "balance_loss_clip": 1.10979784, + "balance_loss_mlp": 1.06002486, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 3.850362924379261, + "language_loss": 0.8521058, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87710464, + "num_input_tokens_seen": 4441470, + "step": 210, + "time_per_iteration": 2.640751361846924 + }, + { + "auxiliary_loss_clip": 0.01384332, + "auxiliary_loss_mlp": 0.01118958, + "balance_loss_clip": 1.10805917, + "balance_loss_mlp": 1.06962943, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.27108284078795, + "language_loss": 0.97166228, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99669522, + "num_input_tokens_seen": 4459950, + "step": 211, + "time_per_iteration": 2.532622814178467 + }, + { + "auxiliary_loss_clip": 0.01393953, + "auxiliary_loss_mlp": 0.01116825, + "balance_loss_clip": 1.11354065, + "balance_loss_mlp": 1.06480193, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 2.209434533489822, + "language_loss": 0.95041275, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97552049, + "num_input_tokens_seen": 4478390, + "step": 212, + "time_per_iteration": 2.459486246109009 + }, + { + "auxiliary_loss_clip": 0.01381833, + "auxiliary_loss_mlp": 0.01116528, + "balance_loss_clip": 1.11066711, + "balance_loss_mlp": 1.06514835, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 4.112975116129714, + "language_loss": 0.76154745, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78653109, + "num_input_tokens_seen": 4501665, + "step": 213, + "time_per_iteration": 2.7084102630615234 + }, + { + "auxiliary_loss_clip": 0.01386845, + "auxiliary_loss_mlp": 0.0110692, + "balance_loss_clip": 1.10879612, + "balance_loss_mlp": 1.05673254, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 3.321789018836272, + "language_loss": 0.86460793, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.88954556, + "num_input_tokens_seen": 4519055, + "step": 214, + "time_per_iteration": 3.845842123031616 + }, + { + "auxiliary_loss_clip": 0.01383317, + "auxiliary_loss_mlp": 0.01127631, + "balance_loss_clip": 1.11395025, + "balance_loss_mlp": 1.07477379, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.152580147687718, + "language_loss": 0.77532208, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.80043161, + "num_input_tokens_seen": 4540870, + "step": 215, + "time_per_iteration": 5.457382917404175 + }, + { + "auxiliary_loss_clip": 0.01390968, + "auxiliary_loss_mlp": 0.01107698, + "balance_loss_clip": 1.11175239, + "balance_loss_mlp": 1.05779719, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.081550844929623, + "language_loss": 0.90512073, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93010741, + "num_input_tokens_seen": 4560395, + "step": 216, + "time_per_iteration": 2.559075355529785 + }, + { + "auxiliary_loss_clip": 0.01383663, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_clip": 1.10622871, + "balance_loss_mlp": 1.05349922, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 6.030324181268161, + "language_loss": 0.93489206, + "learning_rate": 3.463858658104523e-06, + "loss": 0.95977169, + "num_input_tokens_seen": 4575785, + "step": 217, + "time_per_iteration": 2.479166030883789 + }, + { + "auxiliary_loss_clip": 0.01378855, + "auxiliary_loss_mlp": 0.01104441, + "balance_loss_clip": 1.10552454, + "balance_loss_mlp": 1.05117798, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 2.66898266433367, + "language_loss": 0.93638206, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96121502, + "num_input_tokens_seen": 4594985, + "step": 218, + "time_per_iteration": 2.4722437858581543 + }, + { + "auxiliary_loss_clip": 0.01373833, + "auxiliary_loss_mlp": 0.01107145, + "balance_loss_clip": 1.10517824, + "balance_loss_mlp": 1.05748224, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 1.804648032454879, + "language_loss": 0.86237872, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88718843, + "num_input_tokens_seen": 4616125, + "step": 219, + "time_per_iteration": 3.949139356613159 + }, + { + "auxiliary_loss_clip": 0.01377987, + "auxiliary_loss_mlp": 0.01097631, + "balance_loss_clip": 1.10701323, + "balance_loss_mlp": 1.04806352, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 1.9031118088012042, + "language_loss": 0.87728441, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.9020406, + "num_input_tokens_seen": 4637795, + "step": 220, + "time_per_iteration": 2.572349786758423 + }, + { + "auxiliary_loss_clip": 0.01372646, + "auxiliary_loss_mlp": 0.0110794, + "balance_loss_clip": 1.10385418, + "balance_loss_mlp": 1.06166339, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 2.0752837894753875, + "language_loss": 0.86523306, + "learning_rate": 3.475618842282164e-06, + "loss": 0.89003897, + "num_input_tokens_seen": 4656835, + "step": 221, + "time_per_iteration": 2.5099239349365234 + }, + { + "auxiliary_loss_clip": 0.01375986, + "auxiliary_loss_mlp": 0.01110987, + "balance_loss_clip": 1.10239029, + "balance_loss_mlp": 1.06025112, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.722193053897704, + "language_loss": 0.92610401, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.95097369, + "num_input_tokens_seen": 4673015, + "step": 222, + "time_per_iteration": 2.4502313137054443 + }, + { + "auxiliary_loss_clip": 0.01375017, + "auxiliary_loss_mlp": 0.01106092, + "balance_loss_clip": 1.10570645, + "balance_loss_mlp": 1.05240011, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 3.0780594050336094, + "language_loss": 0.95865679, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98346794, + "num_input_tokens_seen": 4692355, + "step": 223, + "time_per_iteration": 2.4879422187805176 + }, + { + "auxiliary_loss_clip": 0.01375922, + "auxiliary_loss_mlp": 0.01105568, + "balance_loss_clip": 1.10663426, + "balance_loss_mlp": 1.05590582, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 3.5628949547149125, + "language_loss": 0.88098228, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90579712, + "num_input_tokens_seen": 4710080, + "step": 224, + "time_per_iteration": 2.4841973781585693 + }, + { + "auxiliary_loss_clip": 0.01375874, + "auxiliary_loss_mlp": 0.01101718, + "balance_loss_clip": 1.10557628, + "balance_loss_mlp": 1.04840755, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 4.4864887705108405, + "language_loss": 0.89849484, + "learning_rate": 3.487168070036317e-06, + "loss": 0.9232707, + "num_input_tokens_seen": 4728980, + "step": 225, + "time_per_iteration": 2.47271466255188 + }, + { + "auxiliary_loss_clip": 0.01371536, + "auxiliary_loss_mlp": 0.01119205, + "balance_loss_clip": 1.10462165, + "balance_loss_mlp": 1.06723022, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 3.332675754614929, + "language_loss": 0.99045455, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01536191, + "num_input_tokens_seen": 4747020, + "step": 226, + "time_per_iteration": 2.4641432762145996 + }, + { + "auxiliary_loss_clip": 0.01375619, + "auxiliary_loss_mlp": 0.01109898, + "balance_loss_clip": 1.10602212, + "balance_loss_mlp": 1.05680251, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 3.590431606541198, + "language_loss": 0.90959072, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93444586, + "num_input_tokens_seen": 4765000, + "step": 227, + "time_per_iteration": 2.4907190799713135 + }, + { + "auxiliary_loss_clip": 0.01257346, + "auxiliary_loss_mlp": 0.01047872, + "balance_loss_clip": 1.11236835, + "balance_loss_mlp": 1.02975178, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.9358486583353766, + "language_loss": 0.57678378, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59983599, + "num_input_tokens_seen": 4833210, + "step": 228, + "time_per_iteration": 3.169078826904297 + }, + { + "auxiliary_loss_clip": 0.01367072, + "auxiliary_loss_mlp": 0.01109752, + "balance_loss_clip": 1.10257173, + "balance_loss_mlp": 1.06120956, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.339058272273342, + "language_loss": 0.87817699, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90294528, + "num_input_tokens_seen": 4850120, + "step": 229, + "time_per_iteration": 2.4594578742980957 + }, + { + "auxiliary_loss_clip": 0.01375282, + "auxiliary_loss_mlp": 0.01096757, + "balance_loss_clip": 1.10411215, + "balance_loss_mlp": 1.04843009, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 2.708523202922955, + "language_loss": 0.83798689, + "learning_rate": 3.501319237118231e-06, + "loss": 0.86270726, + "num_input_tokens_seen": 4866215, + "step": 230, + "time_per_iteration": 2.4734203815460205 + }, + { + "auxiliary_loss_clip": 0.0137446, + "auxiliary_loss_mlp": 0.01111894, + "balance_loss_clip": 1.10590684, + "balance_loss_mlp": 1.06339979, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 1.9525235788310171, + "language_loss": 0.90339214, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92825568, + "num_input_tokens_seen": 4885630, + "step": 231, + "time_per_iteration": 2.5117592811584473 + }, + { + "auxiliary_loss_clip": 0.01377066, + "auxiliary_loss_mlp": 0.01106095, + "balance_loss_clip": 1.1091826, + "balance_loss_mlp": 1.05779171, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 1.9599842656652255, + "language_loss": 0.83900261, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86383426, + "num_input_tokens_seen": 4905570, + "step": 232, + "time_per_iteration": 2.488180160522461 + }, + { + "auxiliary_loss_clip": 0.01378314, + "auxiliary_loss_mlp": 0.01091694, + "balance_loss_clip": 1.10065079, + "balance_loss_mlp": 1.04160213, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 5.163727512317506, + "language_loss": 0.74346447, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76816463, + "num_input_tokens_seen": 4923535, + "step": 233, + "time_per_iteration": 2.526461124420166 + }, + { + "auxiliary_loss_clip": 0.01381817, + "auxiliary_loss_mlp": 0.01116363, + "balance_loss_clip": 1.10771263, + "balance_loss_mlp": 1.06529403, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.1390766148076716, + "language_loss": 0.85585225, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88083404, + "num_input_tokens_seen": 4939200, + "step": 234, + "time_per_iteration": 2.460657835006714 + }, + { + "auxiliary_loss_clip": 0.01375833, + "auxiliary_loss_mlp": 0.01106623, + "balance_loss_clip": 1.1075983, + "balance_loss_mlp": 1.05836678, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.608738573104185, + "language_loss": 0.89393133, + "learning_rate": 3.515166054308634e-06, + "loss": 0.91875589, + "num_input_tokens_seen": 4956620, + "step": 235, + "time_per_iteration": 2.5093982219696045 + }, + { + "auxiliary_loss_clip": 0.01374816, + "auxiliary_loss_mlp": 0.011154, + "balance_loss_clip": 1.10940754, + "balance_loss_mlp": 1.06611919, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 2.1046013946737765, + "language_loss": 0.85650074, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88140297, + "num_input_tokens_seen": 4975650, + "step": 236, + "time_per_iteration": 2.598095417022705 + }, + { + "auxiliary_loss_clip": 0.01369521, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_clip": 1.10124946, + "balance_loss_mlp": 1.04592538, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 1.9046037087213008, + "language_loss": 0.82534558, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84999144, + "num_input_tokens_seen": 4997415, + "step": 237, + "time_per_iteration": 2.652592420578003 + }, + { + "auxiliary_loss_clip": 0.01369821, + "auxiliary_loss_mlp": 0.01115905, + "balance_loss_clip": 1.1053071, + "balance_loss_mlp": 1.06385827, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 5.385203875399552, + "language_loss": 0.77069265, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79554987, + "num_input_tokens_seen": 5013905, + "step": 238, + "time_per_iteration": 2.50907564163208 + }, + { + "auxiliary_loss_clip": 0.01366649, + "auxiliary_loss_mlp": 0.01107856, + "balance_loss_clip": 1.10630155, + "balance_loss_mlp": 1.06184077, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.898837102845785, + "language_loss": 0.87004435, + "learning_rate": 3.526033015791284e-06, + "loss": 0.8947894, + "num_input_tokens_seen": 5033645, + "step": 239, + "time_per_iteration": 2.5379183292388916 + }, + { + "auxiliary_loss_clip": 0.01352557, + "auxiliary_loss_mlp": 0.01096413, + "balance_loss_clip": 1.09792256, + "balance_loss_mlp": 1.04977858, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.509125648741597, + "language_loss": 0.9309634, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95545304, + "num_input_tokens_seen": 5052875, + "step": 240, + "time_per_iteration": 2.5452969074249268 + }, + { + "auxiliary_loss_clip": 0.01361727, + "auxiliary_loss_mlp": 0.01103612, + "balance_loss_clip": 1.10336685, + "balance_loss_mlp": 1.05790734, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 2.6619743731592918, + "language_loss": 0.85306752, + "learning_rate": 3.531398481704111e-06, + "loss": 0.87772095, + "num_input_tokens_seen": 5075005, + "step": 241, + "time_per_iteration": 2.547118902206421 + }, + { + "auxiliary_loss_clip": 0.01360642, + "auxiliary_loss_mlp": 0.011183, + "balance_loss_clip": 1.10956383, + "balance_loss_mlp": 1.06975806, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 1.9724125345107713, + "language_loss": 0.88371098, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90850043, + "num_input_tokens_seen": 5091875, + "step": 242, + "time_per_iteration": 2.506615400314331 + }, + { + "auxiliary_loss_clip": 0.01360542, + "auxiliary_loss_mlp": 0.01098395, + "balance_loss_clip": 1.1028595, + "balance_loss_mlp": 1.04920936, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.110499783179231, + "language_loss": 0.86651802, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89110744, + "num_input_tokens_seen": 5111290, + "step": 243, + "time_per_iteration": 2.4969913959503174 + }, + { + "auxiliary_loss_clip": 0.01365643, + "auxiliary_loss_mlp": 0.01104354, + "balance_loss_clip": 1.10486078, + "balance_loss_mlp": 1.0555501, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.5780720038273512, + "language_loss": 0.84308362, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86778355, + "num_input_tokens_seen": 5132265, + "step": 244, + "time_per_iteration": 2.536526679992676 + }, + { + "auxiliary_loss_clip": 0.01374126, + "auxiliary_loss_mlp": 0.0110944, + "balance_loss_clip": 1.10766566, + "balance_loss_mlp": 1.05968213, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.578870390020949, + "language_loss": 0.78741097, + "learning_rate": 3.54199711087864e-06, + "loss": 0.81224662, + "num_input_tokens_seen": 5148575, + "step": 245, + "time_per_iteration": 2.493699073791504 + }, + { + "auxiliary_loss_clip": 0.01370693, + "auxiliary_loss_mlp": 0.01101682, + "balance_loss_clip": 1.10262299, + "balance_loss_mlp": 1.05030262, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 1.920487661423537, + "language_loss": 0.84381235, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86853611, + "num_input_tokens_seen": 5170415, + "step": 246, + "time_per_iteration": 2.529954671859741 + }, + { + "auxiliary_loss_clip": 0.01366776, + "auxiliary_loss_mlp": 0.01097203, + "balance_loss_clip": 1.10236406, + "balance_loss_mlp": 1.04804134, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 2.197439433150473, + "language_loss": 0.89861923, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92325896, + "num_input_tokens_seen": 5188565, + "step": 247, + "time_per_iteration": 2.471529483795166 + }, + { + "auxiliary_loss_clip": 0.01364738, + "auxiliary_loss_mlp": 0.01096974, + "balance_loss_clip": 1.09715724, + "balance_loss_mlp": 1.0498147, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.2300163971487357, + "language_loss": 0.78100777, + "learning_rate": 3.549833136812155e-06, + "loss": 0.8056249, + "num_input_tokens_seen": 5207810, + "step": 248, + "time_per_iteration": 2.49983549118042 + }, + { + "auxiliary_loss_clip": 0.0136533, + "auxiliary_loss_mlp": 0.01104187, + "balance_loss_clip": 1.10540831, + "balance_loss_mlp": 1.05624104, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.1766533723966974, + "language_loss": 0.83914971, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86384487, + "num_input_tokens_seen": 5226210, + "step": 249, + "time_per_iteration": 2.534379482269287 + }, + { + "auxiliary_loss_clip": 0.01358275, + "auxiliary_loss_mlp": 0.01098628, + "balance_loss_clip": 1.09910035, + "balance_loss_mlp": 1.05220747, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.0579835883067945, + "language_loss": 0.93476218, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95933121, + "num_input_tokens_seen": 5241660, + "step": 250, + "time_per_iteration": 2.4468421936035156 + }, + { + "auxiliary_loss_clip": 0.01367621, + "auxiliary_loss_mlp": 0.01113865, + "balance_loss_clip": 1.10496664, + "balance_loss_mlp": 1.06439352, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.5685031795794675, + "language_loss": 0.96709311, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99190789, + "num_input_tokens_seen": 5261090, + "step": 251, + "time_per_iteration": 2.5202202796936035 + }, + { + "auxiliary_loss_clip": 0.01360885, + "auxiliary_loss_mlp": 0.01099952, + "balance_loss_clip": 1.09914684, + "balance_loss_mlp": 1.05374622, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 2.535275163336554, + "language_loss": 0.84185517, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86646354, + "num_input_tokens_seen": 5279175, + "step": 252, + "time_per_iteration": 2.515549898147583 + }, + { + "auxiliary_loss_clip": 0.01356375, + "auxiliary_loss_mlp": 0.01110883, + "balance_loss_clip": 1.10115755, + "balance_loss_mlp": 1.06310403, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.202691332335342, + "language_loss": 0.9830966, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00776923, + "num_input_tokens_seen": 5296975, + "step": 253, + "time_per_iteration": 2.4925734996795654 + }, + { + "auxiliary_loss_clip": 0.01246185, + "auxiliary_loss_mlp": 0.01040127, + "balance_loss_clip": 1.10934019, + "balance_loss_mlp": 1.02124429, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8530149196690382, + "language_loss": 0.556624, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57948709, + "num_input_tokens_seen": 5358375, + "step": 254, + "time_per_iteration": 3.075920343399048 + }, + { + "auxiliary_loss_clip": 0.01359861, + "auxiliary_loss_mlp": 0.01110706, + "balance_loss_clip": 1.09620512, + "balance_loss_mlp": 1.06335664, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.32995153095399, + "language_loss": 0.90406823, + "learning_rate": 3.567754632921479e-06, + "loss": 0.92877388, + "num_input_tokens_seen": 5377255, + "step": 255, + "time_per_iteration": 3.984647750854492 + }, + { + "auxiliary_loss_clip": 0.0135954, + "auxiliary_loss_mlp": 0.01125005, + "balance_loss_clip": 1.10069513, + "balance_loss_mlp": 1.0769155, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.4075769168244623, + "language_loss": 0.85505843, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.87990391, + "num_input_tokens_seen": 5395320, + "step": 256, + "time_per_iteration": 5.445713996887207 + }, + { + "auxiliary_loss_clip": 0.01365422, + "auxiliary_loss_mlp": 0.01112983, + "balance_loss_clip": 1.10033751, + "balance_loss_mlp": 1.06529975, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 3.2916472766802, + "language_loss": 0.71234804, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73713213, + "num_input_tokens_seen": 5411970, + "step": 257, + "time_per_iteration": 2.4745001792907715 + }, + { + "auxiliary_loss_clip": 0.01356208, + "auxiliary_loss_mlp": 0.01104018, + "balance_loss_clip": 1.09925294, + "balance_loss_mlp": 1.05657268, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 2.0588610725565792, + "language_loss": 0.94845182, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97305405, + "num_input_tokens_seen": 5430245, + "step": 258, + "time_per_iteration": 2.4935879707336426 + }, + { + "auxiliary_loss_clip": 0.01356343, + "auxiliary_loss_mlp": 0.01106123, + "balance_loss_clip": 1.0976553, + "balance_loss_mlp": 1.0597024, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 3.3570787655551806, + "language_loss": 0.92775482, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95237947, + "num_input_tokens_seen": 5448905, + "step": 259, + "time_per_iteration": 2.529538631439209 + }, + { + "auxiliary_loss_clip": 0.01348215, + "auxiliary_loss_mlp": 0.01096357, + "balance_loss_clip": 1.09951711, + "balance_loss_mlp": 1.05267894, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 1.7561702387253788, + "language_loss": 0.9726826, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99712837, + "num_input_tokens_seen": 5466405, + "step": 260, + "time_per_iteration": 3.8897969722747803 + }, + { + "auxiliary_loss_clip": 0.01365505, + "auxiliary_loss_mlp": 0.01110549, + "balance_loss_clip": 1.10199404, + "balance_loss_mlp": 1.06463003, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 1.9496446620658698, + "language_loss": 0.87577462, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90053523, + "num_input_tokens_seen": 5487055, + "step": 261, + "time_per_iteration": 2.6225905418395996 + }, + { + "auxiliary_loss_clip": 0.01356252, + "auxiliary_loss_mlp": 0.01111257, + "balance_loss_clip": 1.09708786, + "balance_loss_mlp": 1.0644083, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 1.8445312277018382, + "language_loss": 0.67348367, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69815874, + "num_input_tokens_seen": 5506600, + "step": 262, + "time_per_iteration": 2.499211311340332 + }, + { + "auxiliary_loss_clip": 0.01354185, + "auxiliary_loss_mlp": 0.01120363, + "balance_loss_clip": 1.09910071, + "balance_loss_mlp": 1.07256007, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 3.3964030032577406, + "language_loss": 0.68212068, + "learning_rate": 3.587643540438383e-06, + "loss": 0.7068662, + "num_input_tokens_seen": 5524350, + "step": 263, + "time_per_iteration": 2.497523546218872 + }, + { + "auxiliary_loss_clip": 0.01355866, + "auxiliary_loss_mlp": 0.01106942, + "balance_loss_clip": 1.09583426, + "balance_loss_mlp": 1.05947304, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 3.247494278640288, + "language_loss": 0.85107124, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87569928, + "num_input_tokens_seen": 5542145, + "step": 264, + "time_per_iteration": 2.459629774093628 + }, + { + "auxiliary_loss_clip": 0.0135992, + "auxiliary_loss_mlp": 0.01089162, + "balance_loss_clip": 1.09949398, + "balance_loss_mlp": 1.04481637, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 2.5848456106214774, + "language_loss": 1.04296112, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06745195, + "num_input_tokens_seen": 5557920, + "step": 265, + "time_per_iteration": 2.4775149822235107 + }, + { + "auxiliary_loss_clip": 0.01363793, + "auxiliary_loss_mlp": 0.01109466, + "balance_loss_clip": 1.10303974, + "balance_loss_mlp": 1.060256, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.1637422989215414, + "language_loss": 0.74849772, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77323031, + "num_input_tokens_seen": 5576290, + "step": 266, + "time_per_iteration": 2.4606246948242188 + }, + { + "auxiliary_loss_clip": 0.01351046, + "auxiliary_loss_mlp": 0.01096552, + "balance_loss_clip": 1.10013413, + "balance_loss_mlp": 1.0501318, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.7809456952889415, + "language_loss": 0.90624082, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93071687, + "num_input_tokens_seen": 5595205, + "step": 267, + "time_per_iteration": 2.5125083923339844 + }, + { + "auxiliary_loss_clip": 0.0135916, + "auxiliary_loss_mlp": 0.01110504, + "balance_loss_clip": 1.10020208, + "balance_loss_mlp": 1.06532359, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.1932230666662935, + "language_loss": 0.85615957, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88085622, + "num_input_tokens_seen": 5612645, + "step": 268, + "time_per_iteration": 2.4802701473236084 + }, + { + "auxiliary_loss_clip": 0.01352978, + "auxiliary_loss_mlp": 0.01096541, + "balance_loss_clip": 1.10273004, + "balance_loss_mlp": 1.05131292, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 2.8340839348642044, + "language_loss": 0.88279212, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90728736, + "num_input_tokens_seen": 5628345, + "step": 269, + "time_per_iteration": 2.526850938796997 + }, + { + "auxiliary_loss_clip": 0.01357447, + "auxiliary_loss_mlp": 0.01097628, + "balance_loss_clip": 1.09927607, + "balance_loss_mlp": 1.04844213, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.0611008585730857, + "language_loss": 0.97002208, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99457282, + "num_input_tokens_seen": 5645940, + "step": 270, + "time_per_iteration": 2.538825511932373 + }, + { + "auxiliary_loss_clip": 0.01357458, + "auxiliary_loss_mlp": 0.01113541, + "balance_loss_clip": 1.1033802, + "balance_loss_mlp": 1.06707299, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.5355461313071035, + "language_loss": 0.86563289, + "learning_rate": 3.606936435072361e-06, + "loss": 0.89034283, + "num_input_tokens_seen": 5665690, + "step": 271, + "time_per_iteration": 2.5250065326690674 + }, + { + "auxiliary_loss_clip": 0.01354609, + "auxiliary_loss_mlp": 0.01099604, + "balance_loss_clip": 1.09634709, + "balance_loss_mlp": 1.05370879, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 2.7866678749862874, + "language_loss": 0.81285536, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83739746, + "num_input_tokens_seen": 5683190, + "step": 272, + "time_per_iteration": 2.525155544281006 + }, + { + "auxiliary_loss_clip": 0.01348014, + "auxiliary_loss_mlp": 0.01113736, + "balance_loss_clip": 1.09690666, + "balance_loss_mlp": 1.06917596, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.2289483971263206, + "language_loss": 0.81146073, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83607823, + "num_input_tokens_seen": 5699780, + "step": 273, + "time_per_iteration": 2.4777450561523438 + }, + { + "auxiliary_loss_clip": 0.0134901, + "auxiliary_loss_mlp": 0.01098093, + "balance_loss_clip": 1.09497809, + "balance_loss_mlp": 1.05236435, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 4.253964252280091, + "language_loss": 0.91417611, + "learning_rate": 3.614024787585744e-06, + "loss": 0.93864715, + "num_input_tokens_seen": 5716980, + "step": 274, + "time_per_iteration": 2.473722457885742 + }, + { + "auxiliary_loss_clip": 0.0134667, + "auxiliary_loss_mlp": 0.01104395, + "balance_loss_clip": 1.09690738, + "balance_loss_mlp": 1.05787969, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 1.8902194172800533, + "language_loss": 0.87996864, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90447927, + "num_input_tokens_seen": 5737780, + "step": 275, + "time_per_iteration": 2.5322914123535156 + }, + { + "auxiliary_loss_clip": 0.01348671, + "auxiliary_loss_mlp": 0.01101034, + "balance_loss_clip": 1.09627593, + "balance_loss_mlp": 1.05304003, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 2.025555952733325, + "language_loss": 0.8071782, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83167529, + "num_input_tokens_seen": 5758330, + "step": 276, + "time_per_iteration": 2.505620241165161 + }, + { + "auxiliary_loss_clip": 0.01340811, + "auxiliary_loss_mlp": 0.01097289, + "balance_loss_clip": 1.09616458, + "balance_loss_mlp": 1.05427837, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 1.97496996240197, + "language_loss": 0.81084448, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83522552, + "num_input_tokens_seen": 5778340, + "step": 277, + "time_per_iteration": 2.5837647914886475 + }, + { + "auxiliary_loss_clip": 0.0133843, + "auxiliary_loss_mlp": 0.01093563, + "balance_loss_clip": 1.08929491, + "balance_loss_mlp": 1.04854989, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 3.004268580596335, + "language_loss": 0.80319834, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82751822, + "num_input_tokens_seen": 5794295, + "step": 278, + "time_per_iteration": 2.4749159812927246 + }, + { + "auxiliary_loss_clip": 0.01343296, + "auxiliary_loss_mlp": 0.01094882, + "balance_loss_clip": 1.09471226, + "balance_loss_mlp": 1.05129921, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 5.576345345918437, + "language_loss": 0.9057911, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.93017286, + "num_input_tokens_seen": 5814405, + "step": 279, + "time_per_iteration": 2.538004159927368 + }, + { + "auxiliary_loss_clip": 0.01348408, + "auxiliary_loss_mlp": 0.0111305, + "balance_loss_clip": 1.09460592, + "balance_loss_mlp": 1.06703496, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 2.6562087817624738, + "language_loss": 0.94060361, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96521819, + "num_input_tokens_seen": 5832795, + "step": 280, + "time_per_iteration": 2.461811065673828 + }, + { + "auxiliary_loss_clip": 0.01350646, + "auxiliary_loss_mlp": 0.01102296, + "balance_loss_clip": 1.09483695, + "balance_loss_mlp": 1.05580413, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 2.7798304941523186, + "language_loss": 0.74170077, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76623017, + "num_input_tokens_seen": 5855750, + "step": 281, + "time_per_iteration": 2.5706076622009277 + }, + { + "auxiliary_loss_clip": 0.01344765, + "auxiliary_loss_mlp": 0.01108498, + "balance_loss_clip": 1.09605944, + "balance_loss_mlp": 1.06484365, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.4634173261800862, + "language_loss": 0.80018687, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82471949, + "num_input_tokens_seen": 5872610, + "step": 282, + "time_per_iteration": 2.4536666870117188 + }, + { + "auxiliary_loss_clip": 0.01349065, + "auxiliary_loss_mlp": 0.01115145, + "balance_loss_clip": 1.09786677, + "balance_loss_mlp": 1.06982148, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.1438993355040723, + "language_loss": 0.77696741, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80160946, + "num_input_tokens_seen": 5892985, + "step": 283, + "time_per_iteration": 2.514089345932007 + }, + { + "auxiliary_loss_clip": 0.01349034, + "auxiliary_loss_mlp": 0.01089016, + "balance_loss_clip": 1.09886742, + "balance_loss_mlp": 1.04614866, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.0167109522490234, + "language_loss": 0.84279883, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86717927, + "num_input_tokens_seen": 5914060, + "step": 284, + "time_per_iteration": 2.5875539779663086 + }, + { + "auxiliary_loss_clip": 0.01341633, + "auxiliary_loss_mlp": 0.01098185, + "balance_loss_clip": 1.09409785, + "balance_loss_mlp": 1.05271876, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 2.7116083078241573, + "language_loss": 0.96644068, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99083889, + "num_input_tokens_seen": 5932860, + "step": 285, + "time_per_iteration": 2.5067193508148193 + }, + { + "auxiliary_loss_clip": 0.01343277, + "auxiliary_loss_mlp": 0.01092336, + "balance_loss_clip": 1.09564328, + "balance_loss_mlp": 1.04992151, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.610558193457133, + "language_loss": 0.93784893, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96220493, + "num_input_tokens_seen": 5952725, + "step": 286, + "time_per_iteration": 2.5199899673461914 + }, + { + "auxiliary_loss_clip": 0.01334649, + "auxiliary_loss_mlp": 0.01088786, + "balance_loss_clip": 1.08995581, + "balance_loss_mlp": 1.04434502, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.7525734114699505, + "language_loss": 0.92269361, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94692791, + "num_input_tokens_seen": 5970560, + "step": 287, + "time_per_iteration": 2.522958517074585 + }, + { + "auxiliary_loss_clip": 0.01339753, + "auxiliary_loss_mlp": 0.01085337, + "balance_loss_clip": 1.09052587, + "balance_loss_mlp": 1.04139602, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 2.9337332631809603, + "language_loss": 1.01775944, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04201031, + "num_input_tokens_seen": 5982980, + "step": 288, + "time_per_iteration": 2.4166312217712402 + }, + { + "auxiliary_loss_clip": 0.01231926, + "auxiliary_loss_mlp": 0.01080217, + "balance_loss_clip": 1.09917426, + "balance_loss_mlp": 1.06362307, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9236942330296173, + "language_loss": 0.63925326, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66237468, + "num_input_tokens_seen": 6049445, + "step": 289, + "time_per_iteration": 3.2080752849578857 + }, + { + "auxiliary_loss_clip": 0.01343169, + "auxiliary_loss_mlp": 0.01103724, + "balance_loss_clip": 1.09676957, + "balance_loss_mlp": 1.06185806, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.4488817272544536, + "language_loss": 0.88445842, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.90892726, + "num_input_tokens_seen": 6064150, + "step": 290, + "time_per_iteration": 2.44972825050354 + }, + { + "auxiliary_loss_clip": 0.0133934, + "auxiliary_loss_mlp": 0.01092024, + "balance_loss_clip": 1.09372199, + "balance_loss_mlp": 1.04877484, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 3.6774414612584247, + "language_loss": 0.84859455, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87290812, + "num_input_tokens_seen": 6083920, + "step": 291, + "time_per_iteration": 2.53277325630188 + }, + { + "auxiliary_loss_clip": 0.01345236, + "auxiliary_loss_mlp": 0.0110898, + "balance_loss_clip": 1.10037279, + "balance_loss_mlp": 1.06201112, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.662535373406668, + "language_loss": 0.72688532, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75142753, + "num_input_tokens_seen": 6105460, + "step": 292, + "time_per_iteration": 2.5307374000549316 + }, + { + "auxiliary_loss_clip": 0.01335759, + "auxiliary_loss_mlp": 0.01100655, + "balance_loss_clip": 1.09430075, + "balance_loss_mlp": 1.05666673, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 12.59911602060746, + "language_loss": 0.87064266, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89500672, + "num_input_tokens_seen": 6122890, + "step": 293, + "time_per_iteration": 2.508338451385498 + }, + { + "auxiliary_loss_clip": 0.01335692, + "auxiliary_loss_mlp": 0.01105217, + "balance_loss_clip": 1.09301782, + "balance_loss_mlp": 1.06268311, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.1216397138645178, + "language_loss": 0.80994695, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83435601, + "num_input_tokens_seen": 6142890, + "step": 294, + "time_per_iteration": 2.510176181793213 + }, + { + "auxiliary_loss_clip": 0.0133443, + "auxiliary_loss_mlp": 0.01110088, + "balance_loss_clip": 1.08952761, + "balance_loss_mlp": 1.06750655, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 2.3273913843643674, + "language_loss": 0.84026003, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.8647052, + "num_input_tokens_seen": 6162030, + "step": 295, + "time_per_iteration": 3.938333511352539 + }, + { + "auxiliary_loss_clip": 0.01339869, + "auxiliary_loss_mlp": 0.01114777, + "balance_loss_clip": 1.09780574, + "balance_loss_mlp": 1.07198048, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.734420606153167, + "language_loss": 0.84432316, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.8688696, + "num_input_tokens_seen": 6180540, + "step": 296, + "time_per_iteration": 3.900998830795288 + }, + { + "auxiliary_loss_clip": 0.01340381, + "auxiliary_loss_mlp": 0.0110367, + "balance_loss_clip": 1.09460831, + "balance_loss_mlp": 1.06101739, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.196416924390639, + "language_loss": 0.87889975, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90334022, + "num_input_tokens_seen": 6199425, + "step": 297, + "time_per_iteration": 3.94921875 + }, + { + "auxiliary_loss_clip": 0.01339065, + "auxiliary_loss_mlp": 0.01098455, + "balance_loss_clip": 1.09390628, + "balance_loss_mlp": 1.05666065, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.179227888482242, + "language_loss": 0.88747871, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.91185391, + "num_input_tokens_seen": 6219170, + "step": 298, + "time_per_iteration": 2.4933362007141113 + }, + { + "auxiliary_loss_clip": 0.013337, + "auxiliary_loss_mlp": 0.01115641, + "balance_loss_clip": 1.09331632, + "balance_loss_mlp": 1.07174826, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 3.158942718098602, + "language_loss": 0.8868472, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91134059, + "num_input_tokens_seen": 6237930, + "step": 299, + "time_per_iteration": 3.914130687713623 + }, + { + "auxiliary_loss_clip": 0.01340532, + "auxiliary_loss_mlp": 0.01105238, + "balance_loss_clip": 1.09358263, + "balance_loss_mlp": 1.06203675, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 3.1833774600787907, + "language_loss": 0.65116233, + "learning_rate": 3.672392800539357e-06, + "loss": 0.67562008, + "num_input_tokens_seen": 6257170, + "step": 300, + "time_per_iteration": 2.5339884757995605 + }, + { + "auxiliary_loss_clip": 0.01339814, + "auxiliary_loss_mlp": 0.01104802, + "balance_loss_clip": 1.09704065, + "balance_loss_mlp": 1.06181479, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.750200409989873, + "language_loss": 0.8836804, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90812653, + "num_input_tokens_seen": 6274780, + "step": 301, + "time_per_iteration": 2.476020336151123 + }, + { + "auxiliary_loss_clip": 0.01226111, + "auxiliary_loss_mlp": 0.01047913, + "balance_loss_clip": 1.09287012, + "balance_loss_mlp": 1.03255868, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8365119364986705, + "language_loss": 0.62309098, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64583123, + "num_input_tokens_seen": 6340435, + "step": 302, + "time_per_iteration": 3.206728458404541 + }, + { + "auxiliary_loss_clip": 0.01331543, + "auxiliary_loss_mlp": 0.01104892, + "balance_loss_clip": 1.09083927, + "balance_loss_mlp": 1.06128526, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.17314706426626, + "language_loss": 0.8968991, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92126346, + "num_input_tokens_seen": 6358160, + "step": 303, + "time_per_iteration": 2.4857423305511475 + }, + { + "auxiliary_loss_clip": 0.01339178, + "auxiliary_loss_mlp": 0.01118617, + "balance_loss_clip": 1.09576619, + "balance_loss_mlp": 1.07326984, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 1.853249764479849, + "language_loss": 0.80275989, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82733786, + "num_input_tokens_seen": 6378485, + "step": 304, + "time_per_iteration": 2.5510451793670654 + }, + { + "auxiliary_loss_clip": 0.01332898, + "auxiliary_loss_mlp": 0.01088227, + "balance_loss_clip": 1.0969311, + "balance_loss_mlp": 1.04645586, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.7304720613151863, + "language_loss": 0.83009815, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85430938, + "num_input_tokens_seen": 6397845, + "step": 305, + "time_per_iteration": 2.489819049835205 + }, + { + "auxiliary_loss_clip": 0.01332507, + "auxiliary_loss_mlp": 0.0109183, + "balance_loss_clip": 1.08911729, + "balance_loss_mlp": 1.05065548, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.998992614005302, + "language_loss": 0.90865397, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93289733, + "num_input_tokens_seen": 6416475, + "step": 306, + "time_per_iteration": 2.460258722305298 + }, + { + "auxiliary_loss_clip": 0.01328223, + "auxiliary_loss_mlp": 0.01087101, + "balance_loss_clip": 1.08886886, + "balance_loss_mlp": 1.04592609, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.1754803175172475, + "language_loss": 0.86514831, + "learning_rate": 3.687243426879095e-06, + "loss": 0.88930154, + "num_input_tokens_seen": 6437520, + "step": 307, + "time_per_iteration": 2.550722360610962 + }, + { + "auxiliary_loss_clip": 0.01328772, + "auxiliary_loss_mlp": 0.01102816, + "balance_loss_clip": 1.09243715, + "balance_loss_mlp": 1.0570159, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 3.1665447372330817, + "language_loss": 0.71879208, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74310791, + "num_input_tokens_seen": 6455680, + "step": 308, + "time_per_iteration": 2.463118314743042 + }, + { + "auxiliary_loss_clip": 0.01333773, + "auxiliary_loss_mlp": 0.01099952, + "balance_loss_clip": 1.0891062, + "balance_loss_mlp": 1.05775237, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.0182912045877552, + "language_loss": 0.91948426, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94382155, + "num_input_tokens_seen": 6474880, + "step": 309, + "time_per_iteration": 2.4942262172698975 + }, + { + "auxiliary_loss_clip": 0.01343867, + "auxiliary_loss_mlp": 0.01106054, + "balance_loss_clip": 1.09179235, + "balance_loss_mlp": 1.06070662, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 1.899445356045275, + "language_loss": 0.72632545, + "learning_rate": 3.69350459956065e-06, + "loss": 0.75082463, + "num_input_tokens_seen": 6495945, + "step": 310, + "time_per_iteration": 2.5309481620788574 + }, + { + "auxiliary_loss_clip": 0.01332813, + "auxiliary_loss_mlp": 0.01109709, + "balance_loss_clip": 1.09472179, + "balance_loss_mlp": 1.0676285, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 2.1551720363282456, + "language_loss": 0.74244255, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76686776, + "num_input_tokens_seen": 6519930, + "step": 311, + "time_per_iteration": 2.7023937702178955 + }, + { + "auxiliary_loss_clip": 0.01341776, + "auxiliary_loss_mlp": 0.01105409, + "balance_loss_clip": 1.09252739, + "balance_loss_mlp": 1.06316125, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.5238694445447827, + "language_loss": 0.91717803, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.94164991, + "num_input_tokens_seen": 6535070, + "step": 312, + "time_per_iteration": 2.461899518966675 + }, + { + "auxiliary_loss_clip": 0.01339947, + "auxiliary_loss_mlp": 0.01113591, + "balance_loss_clip": 1.09409261, + "balance_loss_mlp": 1.06924522, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.136859507364212, + "language_loss": 0.89849353, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92302889, + "num_input_tokens_seen": 6554135, + "step": 313, + "time_per_iteration": 2.496241569519043 + }, + { + "auxiliary_loss_clip": 0.01341093, + "auxiliary_loss_mlp": 0.01099164, + "balance_loss_clip": 1.0911808, + "balance_loss_mlp": 1.05450845, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 3.4733316903202405, + "language_loss": 0.72862625, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75302881, + "num_input_tokens_seen": 6572275, + "step": 314, + "time_per_iteration": 2.475269317626953 + }, + { + "auxiliary_loss_clip": 0.01330786, + "auxiliary_loss_mlp": 0.01098747, + "balance_loss_clip": 1.09032428, + "balance_loss_mlp": 1.05657077, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.354521662221602, + "language_loss": 0.8975085, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92180377, + "num_input_tokens_seen": 6594520, + "step": 315, + "time_per_iteration": 2.5711212158203125 + }, + { + "auxiliary_loss_clip": 0.01332966, + "auxiliary_loss_mlp": 0.01094237, + "balance_loss_clip": 1.09188747, + "balance_loss_mlp": 1.0512979, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 1.8819825920231137, + "language_loss": 0.80462211, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.82889414, + "num_input_tokens_seen": 6614245, + "step": 316, + "time_per_iteration": 2.519044876098633 + }, + { + "auxiliary_loss_clip": 0.01326705, + "auxiliary_loss_mlp": 0.01089728, + "balance_loss_clip": 1.08858204, + "balance_loss_mlp": 1.04714608, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.45545861371881, + "language_loss": 0.90161741, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92578173, + "num_input_tokens_seen": 6632015, + "step": 317, + "time_per_iteration": 2.4678704738616943 + }, + { + "auxiliary_loss_clip": 0.01322969, + "auxiliary_loss_mlp": 0.01094816, + "balance_loss_clip": 1.08714461, + "balance_loss_mlp": 1.05151892, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.625226350272872, + "language_loss": 0.91141826, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93559611, + "num_input_tokens_seen": 6649015, + "step": 318, + "time_per_iteration": 2.43766713142395 + }, + { + "auxiliary_loss_clip": 0.01324805, + "auxiliary_loss_mlp": 0.0108682, + "balance_loss_clip": 1.08695912, + "balance_loss_mlp": 1.04621768, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.3503322945086436, + "language_loss": 0.93974102, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96385729, + "num_input_tokens_seen": 6669225, + "step": 319, + "time_per_iteration": 2.51778507232666 + }, + { + "auxiliary_loss_clip": 0.01217035, + "auxiliary_loss_mlp": 0.01092375, + "balance_loss_clip": 1.08695436, + "balance_loss_mlp": 1.07711661, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9463389720998256, + "language_loss": 0.59848481, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62157893, + "num_input_tokens_seen": 6725775, + "step": 320, + "time_per_iteration": 2.980090618133545 + }, + { + "auxiliary_loss_clip": 0.01322455, + "auxiliary_loss_mlp": 0.01100194, + "balance_loss_clip": 1.0859766, + "balance_loss_mlp": 1.05942476, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 5.932100754404701, + "language_loss": 0.90214074, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92636716, + "num_input_tokens_seen": 6744170, + "step": 321, + "time_per_iteration": 2.492007255554199 + }, + { + "auxiliary_loss_clip": 0.01332663, + "auxiliary_loss_mlp": 0.01115041, + "balance_loss_clip": 1.09055674, + "balance_loss_mlp": 1.07188737, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.230640637246534, + "language_loss": 0.83002436, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85450137, + "num_input_tokens_seen": 6764565, + "step": 322, + "time_per_iteration": 2.527339458465576 + }, + { + "auxiliary_loss_clip": 0.01332615, + "auxiliary_loss_mlp": 0.01085278, + "balance_loss_clip": 1.08884335, + "balance_loss_mlp": 1.04517555, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.1695544596704104, + "language_loss": 0.72848475, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75266373, + "num_input_tokens_seen": 6785310, + "step": 323, + "time_per_iteration": 2.5003013610839844 + }, + { + "auxiliary_loss_clip": 0.01322157, + "auxiliary_loss_mlp": 0.01087458, + "balance_loss_clip": 1.08413506, + "balance_loss_mlp": 1.04594898, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 2.3572753709025944, + "language_loss": 0.92329848, + "learning_rate": 3.721944334919596e-06, + "loss": 0.94739467, + "num_input_tokens_seen": 6803290, + "step": 324, + "time_per_iteration": 2.465348482131958 + }, + { + "auxiliary_loss_clip": 0.01331258, + "auxiliary_loss_mlp": 0.0108793, + "balance_loss_clip": 1.09108543, + "balance_loss_mlp": 1.04861498, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 3.73448382897044, + "language_loss": 0.6539402, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67813206, + "num_input_tokens_seen": 6822570, + "step": 325, + "time_per_iteration": 2.490997791290283 + }, + { + "auxiliary_loss_clip": 0.0133, + "auxiliary_loss_mlp": 0.01102521, + "balance_loss_clip": 1.09613824, + "balance_loss_mlp": 1.06084585, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 2.4010957537055657, + "language_loss": 0.76620531, + "learning_rate": 3.72590651470665e-06, + "loss": 0.79053056, + "num_input_tokens_seen": 6841910, + "step": 326, + "time_per_iteration": 2.467724561691284 + }, + { + "auxiliary_loss_clip": 0.01321449, + "auxiliary_loss_mlp": 0.0110181, + "balance_loss_clip": 1.0899657, + "balance_loss_mlp": 1.05982423, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.189692402650846, + "language_loss": 0.79579806, + "learning_rate": 3.727878498433505e-06, + "loss": 0.82003069, + "num_input_tokens_seen": 6862480, + "step": 327, + "time_per_iteration": 2.5952308177948 + }, + { + "auxiliary_loss_clip": 0.0133027, + "auxiliary_loss_mlp": 0.01113375, + "balance_loss_clip": 1.09185171, + "balance_loss_mlp": 1.07217646, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.8289461544197163, + "language_loss": 0.80942881, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83386528, + "num_input_tokens_seen": 6882015, + "step": 328, + "time_per_iteration": 2.5071957111358643 + }, + { + "auxiliary_loss_clip": 0.01327821, + "auxiliary_loss_mlp": 0.01096237, + "balance_loss_clip": 1.0860399, + "balance_loss_mlp": 1.05344129, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.3686315838429537, + "language_loss": 0.93946159, + "learning_rate": 3.731804438545683e-06, + "loss": 0.9637022, + "num_input_tokens_seen": 6899785, + "step": 329, + "time_per_iteration": 2.473954200744629 + }, + { + "auxiliary_loss_clip": 0.01335664, + "auxiliary_loss_mlp": 0.01108606, + "balance_loss_clip": 1.09284461, + "balance_loss_mlp": 1.06690633, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 4.319637204977247, + "language_loss": 0.74561107, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.7700538, + "num_input_tokens_seen": 6918575, + "step": 330, + "time_per_iteration": 2.4870998859405518 + }, + { + "auxiliary_loss_clip": 0.01330454, + "auxiliary_loss_mlp": 0.01123333, + "balance_loss_clip": 1.08976436, + "balance_loss_mlp": 1.08134794, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 3.197023503929683, + "language_loss": 0.93529898, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.95983684, + "num_input_tokens_seen": 6936965, + "step": 331, + "time_per_iteration": 2.470994234085083 + }, + { + "auxiliary_loss_clip": 0.01318953, + "auxiliary_loss_mlp": 0.01088227, + "balance_loss_clip": 1.08842516, + "balance_loss_mlp": 1.04881644, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.148698452923138, + "language_loss": 0.92590296, + "learning_rate": 3.737648825272422e-06, + "loss": 0.94997478, + "num_input_tokens_seen": 6953475, + "step": 332, + "time_per_iteration": 2.4409186840057373 + }, + { + "auxiliary_loss_clip": 0.01325237, + "auxiliary_loss_mlp": 0.01096265, + "balance_loss_clip": 1.0916245, + "balance_loss_mlp": 1.05435109, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 3.135842564504033, + "language_loss": 0.75710475, + "learning_rate": 3.739585224276384e-06, + "loss": 0.7813198, + "num_input_tokens_seen": 6971630, + "step": 333, + "time_per_iteration": 2.507233142852783 + }, + { + "auxiliary_loss_clip": 0.01327891, + "auxiliary_loss_mlp": 0.01088918, + "balance_loss_clip": 1.08948731, + "balance_loss_mlp": 1.04838705, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 3.1354887918276444, + "language_loss": 0.79034805, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81451619, + "num_input_tokens_seen": 6992775, + "step": 334, + "time_per_iteration": 2.616846799850464 + }, + { + "auxiliary_loss_clip": 0.01326511, + "auxiliary_loss_mlp": 0.01097109, + "balance_loss_clip": 1.08462965, + "balance_loss_mlp": 1.05319214, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 1.9147458894170153, + "language_loss": 0.83279043, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85702664, + "num_input_tokens_seen": 7011425, + "step": 335, + "time_per_iteration": 2.512467622756958 + }, + { + "auxiliary_loss_clip": 0.01323902, + "auxiliary_loss_mlp": 0.01089339, + "balance_loss_clip": 1.08833301, + "balance_loss_mlp": 1.0485692, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.5601958622623453, + "language_loss": 0.92171061, + "learning_rate": 3.745359722027911e-06, + "loss": 0.9458431, + "num_input_tokens_seen": 7029450, + "step": 336, + "time_per_iteration": 3.9015705585479736 + }, + { + "auxiliary_loss_clip": 0.0132187, + "auxiliary_loss_mlp": 0.01083777, + "balance_loss_clip": 1.08440232, + "balance_loss_mlp": 1.0435555, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 2.7776367708114926, + "language_loss": 0.88261974, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90667617, + "num_input_tokens_seen": 7047555, + "step": 337, + "time_per_iteration": 3.8530378341674805 + }, + { + "auxiliary_loss_clip": 0.01312823, + "auxiliary_loss_mlp": 0.01105027, + "balance_loss_clip": 1.08227158, + "balance_loss_mlp": 1.06213546, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.437707212420485, + "language_loss": 0.8990308, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92320931, + "num_input_tokens_seen": 7068185, + "step": 338, + "time_per_iteration": 2.506226062774658 + }, + { + "auxiliary_loss_clip": 0.01325037, + "auxiliary_loss_mlp": 0.01100155, + "balance_loss_clip": 1.08754766, + "balance_loss_mlp": 1.05900407, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.305037281582599, + "language_loss": 0.85032213, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87457407, + "num_input_tokens_seen": 7085955, + "step": 339, + "time_per_iteration": 2.480450391769409 + }, + { + "auxiliary_loss_clip": 0.01330519, + "auxiliary_loss_mlp": 0.01097299, + "balance_loss_clip": 1.09029722, + "balance_loss_mlp": 1.05512297, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.8845668019632695, + "language_loss": 0.88977486, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91405308, + "num_input_tokens_seen": 7106345, + "step": 340, + "time_per_iteration": 4.072977781295776 + }, + { + "auxiliary_loss_clip": 0.01324258, + "auxiliary_loss_mlp": 0.01083354, + "balance_loss_clip": 1.08607709, + "balance_loss_mlp": 1.03996181, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.307769604997878, + "language_loss": 0.88413668, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90821278, + "num_input_tokens_seen": 7125070, + "step": 341, + "time_per_iteration": 2.5404324531555176 + }, + { + "auxiliary_loss_clip": 0.01326837, + "auxiliary_loss_mlp": 0.01100216, + "balance_loss_clip": 1.08489037, + "balance_loss_mlp": 1.05844498, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 2.577409075404897, + "language_loss": 0.80751473, + "learning_rate": 3.756755633390458e-06, + "loss": 0.83178532, + "num_input_tokens_seen": 7144675, + "step": 342, + "time_per_iteration": 2.4913253784179688 + }, + { + "auxiliary_loss_clip": 0.0131645, + "auxiliary_loss_mlp": 0.01094506, + "balance_loss_clip": 1.08466792, + "balance_loss_mlp": 1.05049372, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 2.342418847543117, + "language_loss": 0.89469361, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91880316, + "num_input_tokens_seen": 7165505, + "step": 343, + "time_per_iteration": 2.518921375274658 + }, + { + "auxiliary_loss_clip": 0.01328053, + "auxiliary_loss_mlp": 0.01093929, + "balance_loss_clip": 1.09204304, + "balance_loss_mlp": 1.05318308, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 2.2758751624340015, + "language_loss": 0.78262615, + "learning_rate": 3.7605098841644e-06, + "loss": 0.8068459, + "num_input_tokens_seen": 7184605, + "step": 344, + "time_per_iteration": 2.487805128097534 + }, + { + "auxiliary_loss_clip": 0.01314016, + "auxiliary_loss_mlp": 0.01099225, + "balance_loss_clip": 1.08517361, + "balance_loss_mlp": 1.0567627, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 1.914468512626705, + "language_loss": 0.75181007, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.7759425, + "num_input_tokens_seen": 7203065, + "step": 345, + "time_per_iteration": 2.4860970973968506 + }, + { + "auxiliary_loss_clip": 0.01318401, + "auxiliary_loss_mlp": 0.01101651, + "balance_loss_clip": 1.08849525, + "balance_loss_mlp": 1.06018996, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 3.6134232053327877, + "language_loss": 0.90435696, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92855752, + "num_input_tokens_seen": 7222995, + "step": 346, + "time_per_iteration": 2.4894537925720215 + }, + { + "auxiliary_loss_clip": 0.01314846, + "auxiliary_loss_mlp": 0.01093877, + "balance_loss_clip": 1.08520341, + "balance_loss_mlp": 1.05527687, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.0139313886232166, + "language_loss": 0.79091573, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81500292, + "num_input_tokens_seen": 7244625, + "step": 347, + "time_per_iteration": 2.522524833679199 + }, + { + "auxiliary_loss_clip": 0.01319652, + "auxiliary_loss_mlp": 0.0109468, + "balance_loss_clip": 1.0892868, + "balance_loss_mlp": 1.05247951, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.175923569585439, + "language_loss": 0.71087229, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.73501557, + "num_input_tokens_seen": 7263255, + "step": 348, + "time_per_iteration": 2.505429744720459 + }, + { + "auxiliary_loss_clip": 0.0132249, + "auxiliary_loss_mlp": 0.01100222, + "balance_loss_clip": 1.0867523, + "balance_loss_mlp": 1.05802202, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 3.929685879904005, + "language_loss": 0.76482928, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.78905642, + "num_input_tokens_seen": 7279275, + "step": 349, + "time_per_iteration": 2.4882137775421143 + }, + { + "auxiliary_loss_clip": 0.01304808, + "auxiliary_loss_mlp": 0.01101858, + "balance_loss_clip": 1.08355212, + "balance_loss_mlp": 1.06156576, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 2.152371503642441, + "language_loss": 0.85246921, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87653589, + "num_input_tokens_seen": 7300180, + "step": 350, + "time_per_iteration": 2.5127670764923096 + }, + { + "auxiliary_loss_clip": 0.01313677, + "auxiliary_loss_mlp": 0.01089355, + "balance_loss_clip": 1.08753395, + "balance_loss_mlp": 1.05163765, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.473665906701108, + "language_loss": 0.79743111, + "learning_rate": 3.773480007028776e-06, + "loss": 0.8214615, + "num_input_tokens_seen": 7317430, + "step": 351, + "time_per_iteration": 2.491016149520874 + }, + { + "auxiliary_loss_clip": 0.01320817, + "auxiliary_loss_mlp": 0.01103205, + "balance_loss_clip": 1.08881044, + "balance_loss_mlp": 1.06186342, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 3.043079770547671, + "language_loss": 0.87514383, + "learning_rate": 3.775311735671078e-06, + "loss": 0.89938402, + "num_input_tokens_seen": 7334875, + "step": 352, + "time_per_iteration": 2.498291254043579 + }, + { + "auxiliary_loss_clip": 0.01312631, + "auxiliary_loss_mlp": 0.01099866, + "balance_loss_clip": 1.08639741, + "balance_loss_mlp": 1.05861998, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 1.8779648771346387, + "language_loss": 0.82383776, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.8479628, + "num_input_tokens_seen": 7355185, + "step": 353, + "time_per_iteration": 2.520554304122925 + }, + { + "auxiliary_loss_clip": 0.01310822, + "auxiliary_loss_mlp": 0.0108704, + "balance_loss_clip": 1.08638203, + "balance_loss_mlp": 1.04834497, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.1059474650725427, + "language_loss": 0.80989194, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83387053, + "num_input_tokens_seen": 7374425, + "step": 354, + "time_per_iteration": 2.526477575302124 + }, + { + "auxiliary_loss_clip": 0.01318077, + "auxiliary_loss_mlp": 0.01090365, + "balance_loss_clip": 1.08506918, + "balance_loss_mlp": 1.04845059, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.905702165991997, + "language_loss": 0.8095566, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83364099, + "num_input_tokens_seen": 7394175, + "step": 355, + "time_per_iteration": 2.5150527954101562 + }, + { + "auxiliary_loss_clip": 0.01313566, + "auxiliary_loss_mlp": 0.01089576, + "balance_loss_clip": 1.08419275, + "balance_loss_mlp": 1.05011821, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.2044783617533534, + "language_loss": 0.89647329, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.92050469, + "num_input_tokens_seen": 7412645, + "step": 356, + "time_per_iteration": 2.463325262069702 + }, + { + "auxiliary_loss_clip": 0.01309396, + "auxiliary_loss_mlp": 0.01083092, + "balance_loss_clip": 1.08513677, + "balance_loss_mlp": 1.04232264, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 2.0268176104672566, + "language_loss": 0.80260766, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82653254, + "num_input_tokens_seen": 7432275, + "step": 357, + "time_per_iteration": 2.554547071456909 + }, + { + "auxiliary_loss_clip": 0.01312592, + "auxiliary_loss_mlp": 0.01081223, + "balance_loss_clip": 1.08476079, + "balance_loss_mlp": 1.04369628, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.5385034954939103, + "language_loss": 0.76442295, + "learning_rate": 3.786194003461506e-06, + "loss": 0.78836113, + "num_input_tokens_seen": 7450245, + "step": 358, + "time_per_iteration": 2.4306373596191406 + }, + { + "auxiliary_loss_clip": 0.01308758, + "auxiliary_loss_mlp": 0.01088186, + "balance_loss_clip": 1.08131087, + "balance_loss_mlp": 1.04648662, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 2.5685636752614065, + "language_loss": 0.88125753, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90522683, + "num_input_tokens_seen": 7466845, + "step": 359, + "time_per_iteration": 2.475381851196289 + }, + { + "auxiliary_loss_clip": 0.01318939, + "auxiliary_loss_mlp": 0.01090638, + "balance_loss_clip": 1.08718371, + "balance_loss_mlp": 1.05351651, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 3.0777074474966475, + "language_loss": 0.7565074, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78060317, + "num_input_tokens_seen": 7485450, + "step": 360, + "time_per_iteration": 2.4796133041381836 + }, + { + "auxiliary_loss_clip": 0.01210211, + "auxiliary_loss_mlp": 0.01071956, + "balance_loss_clip": 1.08384395, + "balance_loss_mlp": 1.05946314, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8607846581582749, + "language_loss": 0.64957654, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67239821, + "num_input_tokens_seen": 7553780, + "step": 361, + "time_per_iteration": 3.185333490371704 + }, + { + "auxiliary_loss_clip": 0.01307522, + "auxiliary_loss_mlp": 0.01082462, + "balance_loss_clip": 1.080832, + "balance_loss_mlp": 1.04407692, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 3.966026651439888, + "language_loss": 0.78438008, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80827987, + "num_input_tokens_seen": 7574155, + "step": 362, + "time_per_iteration": 2.4928317070007324 + }, + { + "auxiliary_loss_clip": 0.01311704, + "auxiliary_loss_mlp": 0.01089976, + "balance_loss_clip": 1.08366203, + "balance_loss_mlp": 1.05221009, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.0939960884430384, + "language_loss": 0.92267394, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94669068, + "num_input_tokens_seen": 7592320, + "step": 363, + "time_per_iteration": 2.529503345489502 + }, + { + "auxiliary_loss_clip": 0.01309326, + "auxiliary_loss_mlp": 0.01094505, + "balance_loss_clip": 1.08443105, + "balance_loss_mlp": 1.05678678, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.213294049683643, + "language_loss": 0.89617538, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92021376, + "num_input_tokens_seen": 7611185, + "step": 364, + "time_per_iteration": 2.4914395809173584 + }, + { + "auxiliary_loss_clip": 0.01314878, + "auxiliary_loss_mlp": 0.01088064, + "balance_loss_clip": 1.08825719, + "balance_loss_mlp": 1.04774737, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 2.2836335819165123, + "language_loss": 0.79521787, + "learning_rate": 3.798661793553676e-06, + "loss": 0.81924736, + "num_input_tokens_seen": 7631970, + "step": 365, + "time_per_iteration": 2.501058340072632 + }, + { + "auxiliary_loss_clip": 0.0130876, + "auxiliary_loss_mlp": 0.01091767, + "balance_loss_clip": 1.0846529, + "balance_loss_mlp": 1.05033016, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 2.1485168931748837, + "language_loss": 0.84290522, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86691052, + "num_input_tokens_seen": 7649745, + "step": 366, + "time_per_iteration": 2.4495251178741455 + }, + { + "auxiliary_loss_clip": 0.01314463, + "auxiliary_loss_mlp": 0.01084932, + "balance_loss_clip": 1.08684516, + "balance_loss_mlp": 1.04816759, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 2.194239942585268, + "language_loss": 0.87036693, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.8943609, + "num_input_tokens_seen": 7668830, + "step": 367, + "time_per_iteration": 2.4696333408355713 + }, + { + "auxiliary_loss_clip": 0.01316324, + "auxiliary_loss_mlp": 0.01093839, + "balance_loss_clip": 1.08442497, + "balance_loss_mlp": 1.05345058, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 2.1722775011122297, + "language_loss": 0.84575629, + "learning_rate": 3.803932100062912e-06, + "loss": 0.86985791, + "num_input_tokens_seen": 7687240, + "step": 368, + "time_per_iteration": 2.47137188911438 + }, + { + "auxiliary_loss_clip": 0.01314719, + "auxiliary_loss_mlp": 0.01077342, + "balance_loss_clip": 1.08287907, + "balance_loss_mlp": 1.03931463, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 2.4945244445423653, + "language_loss": 0.75892448, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.78284514, + "num_input_tokens_seen": 7704440, + "step": 369, + "time_per_iteration": 2.475496530532837 + }, + { + "auxiliary_loss_clip": 0.01308245, + "auxiliary_loss_mlp": 0.01095037, + "balance_loss_clip": 1.08399904, + "balance_loss_mlp": 1.05760491, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.0542480946953403, + "language_loss": 0.83025014, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.85428298, + "num_input_tokens_seen": 7727160, + "step": 370, + "time_per_iteration": 2.5301005840301514 + }, + { + "auxiliary_loss_clip": 0.01308043, + "auxiliary_loss_mlp": 0.01094078, + "balance_loss_clip": 1.08381367, + "balance_loss_mlp": 1.05562162, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.7353895983111514, + "language_loss": 0.81829554, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.84231675, + "num_input_tokens_seen": 7747730, + "step": 371, + "time_per_iteration": 2.5108542442321777 + }, + { + "auxiliary_loss_clip": 0.01313407, + "auxiliary_loss_mlp": 0.01090319, + "balance_loss_clip": 1.08912718, + "balance_loss_mlp": 1.05071723, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.0379112475492267, + "language_loss": 0.83179808, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85583532, + "num_input_tokens_seen": 7766765, + "step": 372, + "time_per_iteration": 2.4872589111328125 + }, + { + "auxiliary_loss_clip": 0.01304775, + "auxiliary_loss_mlp": 0.0108615, + "balance_loss_clip": 1.08413458, + "balance_loss_mlp": 1.04771638, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 3.6617767675436035, + "language_loss": 0.7859689, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.80987811, + "num_input_tokens_seen": 7784010, + "step": 373, + "time_per_iteration": 2.4677906036376953 + }, + { + "auxiliary_loss_clip": 0.01309847, + "auxiliary_loss_mlp": 0.01083028, + "balance_loss_clip": 1.0855304, + "balance_loss_mlp": 1.04356992, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 2.5810413835394863, + "language_loss": 0.78303814, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.8069669, + "num_input_tokens_seen": 7801305, + "step": 374, + "time_per_iteration": 2.4326980113983154 + }, + { + "auxiliary_loss_clip": 0.01300331, + "auxiliary_loss_mlp": 0.01071621, + "balance_loss_clip": 1.07699478, + "balance_loss_mlp": 1.03218639, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 1.6316515062059884, + "language_loss": 0.86159229, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.88531184, + "num_input_tokens_seen": 7823965, + "step": 375, + "time_per_iteration": 2.5410475730895996 + }, + { + "auxiliary_loss_clip": 0.01306919, + "auxiliary_loss_mlp": 0.01096226, + "balance_loss_clip": 1.0840472, + "balance_loss_mlp": 1.05562317, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 6.289562698116345, + "language_loss": 0.88816154, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91219294, + "num_input_tokens_seen": 7842115, + "step": 376, + "time_per_iteration": 3.9480769634246826 + }, + { + "auxiliary_loss_clip": 0.0130698, + "auxiliary_loss_mlp": 0.01086079, + "balance_loss_clip": 1.07993031, + "balance_loss_mlp": 1.04919529, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 3.330520479535116, + "language_loss": 0.75252807, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77645862, + "num_input_tokens_seen": 7857830, + "step": 377, + "time_per_iteration": 2.5678420066833496 + }, + { + "auxiliary_loss_clip": 0.01299805, + "auxiliary_loss_mlp": 0.01086501, + "balance_loss_clip": 1.08348036, + "balance_loss_mlp": 1.0476861, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 2.6551973937155733, + "language_loss": 0.99406409, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01792717, + "num_input_tokens_seen": 7875840, + "step": 378, + "time_per_iteration": 5.317187070846558 + }, + { + "auxiliary_loss_clip": 0.01194173, + "auxiliary_loss_mlp": 0.01072041, + "balance_loss_clip": 1.07429576, + "balance_loss_mlp": 1.05716348, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 0.9811044272962381, + "language_loss": 0.7550478, + "learning_rate": 3.822895650276492e-06, + "loss": 0.7777099, + "num_input_tokens_seen": 7940190, + "step": 379, + "time_per_iteration": 3.1283857822418213 + }, + { + "auxiliary_loss_clip": 0.01308333, + "auxiliary_loss_mlp": 0.01082467, + "balance_loss_clip": 1.07980525, + "balance_loss_mlp": 1.04622698, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 2.3151593025535377, + "language_loss": 0.77980644, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80371439, + "num_input_tokens_seen": 7960840, + "step": 380, + "time_per_iteration": 2.6128807067871094 + }, + { + "auxiliary_loss_clip": 0.01302006, + "auxiliary_loss_mlp": 0.0108243, + "balance_loss_clip": 1.08125412, + "balance_loss_mlp": 1.04583311, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.1958098383831786, + "language_loss": 0.97055089, + "learning_rate": 3.826284353801652e-06, + "loss": 0.9943952, + "num_input_tokens_seen": 7975500, + "step": 381, + "time_per_iteration": 3.9644391536712646 + }, + { + "auxiliary_loss_clip": 0.01311673, + "auxiliary_loss_mlp": 0.01084631, + "balance_loss_clip": 1.08405733, + "balance_loss_mlp": 1.04657936, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.185596651504433, + "language_loss": 0.87781656, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90177965, + "num_input_tokens_seen": 7993880, + "step": 382, + "time_per_iteration": 2.476691961288452 + }, + { + "auxiliary_loss_clip": 0.01304569, + "auxiliary_loss_mlp": 0.01088573, + "balance_loss_clip": 1.08387375, + "balance_loss_mlp": 1.05111778, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.607792916111602, + "language_loss": 0.84893954, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87287092, + "num_input_tokens_seen": 8012730, + "step": 383, + "time_per_iteration": 2.4866368770599365 + }, + { + "auxiliary_loss_clip": 0.01301424, + "auxiliary_loss_mlp": 0.01106345, + "balance_loss_clip": 1.08344591, + "balance_loss_mlp": 1.06936622, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.178192374745353, + "language_loss": 0.83220589, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85628366, + "num_input_tokens_seen": 8031275, + "step": 384, + "time_per_iteration": 2.46148681640625 + }, + { + "auxiliary_loss_clip": 0.01301247, + "auxiliary_loss_mlp": 0.01086085, + "balance_loss_clip": 1.08650625, + "balance_loss_mlp": 1.05137157, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.7863812615996804, + "language_loss": 0.8933984, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91727173, + "num_input_tokens_seen": 8051600, + "step": 385, + "time_per_iteration": 2.4865779876708984 + }, + { + "auxiliary_loss_clip": 0.01304397, + "auxiliary_loss_mlp": 0.01108868, + "balance_loss_clip": 1.08423495, + "balance_loss_mlp": 1.07272387, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 1.9967449889801958, + "language_loss": 0.699655, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72378761, + "num_input_tokens_seen": 8070600, + "step": 386, + "time_per_iteration": 2.466581344604492 + }, + { + "auxiliary_loss_clip": 0.01306837, + "auxiliary_loss_mlp": 0.01092967, + "balance_loss_clip": 1.08537233, + "balance_loss_mlp": 1.05746686, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 2.103553029509805, + "language_loss": 0.88141358, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90541166, + "num_input_tokens_seen": 8090680, + "step": 387, + "time_per_iteration": 2.5130040645599365 + }, + { + "auxiliary_loss_clip": 0.01305959, + "auxiliary_loss_mlp": 0.01078973, + "balance_loss_clip": 1.08327246, + "balance_loss_mlp": 1.04146981, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 3.4137057405078783, + "language_loss": 0.83395278, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85780203, + "num_input_tokens_seen": 8114610, + "step": 388, + "time_per_iteration": 2.5434439182281494 + }, + { + "auxiliary_loss_clip": 0.01303492, + "auxiliary_loss_mlp": 0.01085136, + "balance_loss_clip": 1.08347368, + "balance_loss_mlp": 1.05023098, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 2.1658715627236567, + "language_loss": 0.93742704, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96131337, + "num_input_tokens_seen": 8133975, + "step": 389, + "time_per_iteration": 2.5323901176452637 + }, + { + "auxiliary_loss_clip": 0.01295797, + "auxiliary_loss_mlp": 0.01081117, + "balance_loss_clip": 1.08211088, + "balance_loss_mlp": 1.04487741, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.0081725522357328, + "language_loss": 0.87779927, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90156835, + "num_input_tokens_seen": 8153570, + "step": 390, + "time_per_iteration": 2.4881715774536133 + }, + { + "auxiliary_loss_clip": 0.01302802, + "auxiliary_loss_mlp": 0.01086593, + "balance_loss_clip": 1.08465302, + "balance_loss_mlp": 1.05207038, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.536145277836234, + "language_loss": 0.89219999, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91609401, + "num_input_tokens_seen": 8170075, + "step": 391, + "time_per_iteration": 2.51261830329895 + }, + { + "auxiliary_loss_clip": 0.01298044, + "auxiliary_loss_mlp": 0.01070904, + "balance_loss_clip": 1.08097756, + "balance_loss_mlp": 1.03530777, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.5999519589920266, + "language_loss": 0.86103284, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88472235, + "num_input_tokens_seen": 8190420, + "step": 392, + "time_per_iteration": 2.5214462280273438 + }, + { + "auxiliary_loss_clip": 0.01295618, + "auxiliary_loss_mlp": 0.01085332, + "balance_loss_clip": 1.08167684, + "balance_loss_mlp": 1.04964101, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 1.9176892886395394, + "language_loss": 0.88907981, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91288936, + "num_input_tokens_seen": 8208790, + "step": 393, + "time_per_iteration": 2.469419479370117 + }, + { + "auxiliary_loss_clip": 0.01307256, + "auxiliary_loss_mlp": 0.01105653, + "balance_loss_clip": 1.08801305, + "balance_loss_mlp": 1.06722033, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 1.9017638941867112, + "language_loss": 0.81403363, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83816266, + "num_input_tokens_seen": 8226885, + "step": 394, + "time_per_iteration": 2.457136392593384 + }, + { + "auxiliary_loss_clip": 0.01296568, + "auxiliary_loss_mlp": 0.01086904, + "balance_loss_clip": 1.0802536, + "balance_loss_mlp": 1.04956794, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.3405527034100664, + "language_loss": 0.85907555, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88291031, + "num_input_tokens_seen": 8246825, + "step": 395, + "time_per_iteration": 2.509035587310791 + }, + { + "auxiliary_loss_clip": 0.01192623, + "auxiliary_loss_mlp": 0.01042662, + "balance_loss_clip": 1.07655764, + "balance_loss_mlp": 1.03026474, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9507898659790229, + "language_loss": 0.63832939, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66068232, + "num_input_tokens_seen": 8302835, + "step": 396, + "time_per_iteration": 2.8933732509613037 + }, + { + "auxiliary_loss_clip": 0.01293366, + "auxiliary_loss_mlp": 0.01069853, + "balance_loss_clip": 1.07842159, + "balance_loss_mlp": 1.03454375, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.3637178861141295, + "language_loss": 0.83732355, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86095572, + "num_input_tokens_seen": 8320745, + "step": 397, + "time_per_iteration": 2.5534424781799316 + }, + { + "auxiliary_loss_clip": 0.01297772, + "auxiliary_loss_mlp": 0.01086536, + "balance_loss_clip": 1.08165598, + "balance_loss_mlp": 1.05017745, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 11.172432887904309, + "language_loss": 0.84120715, + "learning_rate": 3.854390195044404e-06, + "loss": 0.8650502, + "num_input_tokens_seen": 8339540, + "step": 398, + "time_per_iteration": 2.493992328643799 + }, + { + "auxiliary_loss_clip": 0.0129928, + "auxiliary_loss_mlp": 0.01075901, + "balance_loss_clip": 1.07870936, + "balance_loss_mlp": 1.03835011, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 3.0156776702330097, + "language_loss": 0.85680723, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88055909, + "num_input_tokens_seen": 8354890, + "step": 399, + "time_per_iteration": 2.4660167694091797 + }, + { + "auxiliary_loss_clip": 0.01294167, + "auxiliary_loss_mlp": 0.01089305, + "balance_loss_clip": 1.0814054, + "balance_loss_mlp": 1.05347037, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.094241530264203, + "language_loss": 0.86181217, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88564694, + "num_input_tokens_seen": 8375845, + "step": 400, + "time_per_iteration": 2.525749683380127 + }, + { + "auxiliary_loss_clip": 0.01301728, + "auxiliary_loss_mlp": 0.01082618, + "balance_loss_clip": 1.08432961, + "balance_loss_mlp": 1.04659307, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 1.8637418952071743, + "language_loss": 0.78989929, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.81374276, + "num_input_tokens_seen": 8395240, + "step": 401, + "time_per_iteration": 2.5477042198181152 + }, + { + "auxiliary_loss_clip": 0.01294474, + "auxiliary_loss_mlp": 0.01092828, + "balance_loss_clip": 1.07916999, + "balance_loss_mlp": 1.05718458, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 3.1802454518414045, + "language_loss": 0.78665495, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.81052798, + "num_input_tokens_seen": 8416950, + "step": 402, + "time_per_iteration": 2.5231077671051025 + }, + { + "auxiliary_loss_clip": 0.01299319, + "auxiliary_loss_mlp": 0.01080713, + "balance_loss_clip": 1.07944977, + "balance_loss_mlp": 1.04321015, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 3.9250748091609715, + "language_loss": 0.94584942, + "learning_rate": 3.86242840411147e-06, + "loss": 0.96964979, + "num_input_tokens_seen": 8433660, + "step": 403, + "time_per_iteration": 2.493530750274658 + }, + { + "auxiliary_loss_clip": 0.0130269, + "auxiliary_loss_mlp": 0.01092416, + "balance_loss_clip": 1.07818103, + "balance_loss_mlp": 1.05400658, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.414895491728752, + "language_loss": 0.99812508, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02207613, + "num_input_tokens_seen": 8450180, + "step": 404, + "time_per_iteration": 2.4340763092041016 + }, + { + "auxiliary_loss_clip": 0.01300483, + "auxiliary_loss_mlp": 0.01104464, + "balance_loss_clip": 1.08076072, + "balance_loss_mlp": 1.06801009, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.34664328730422, + "language_loss": 0.87707865, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90112817, + "num_input_tokens_seen": 8467775, + "step": 405, + "time_per_iteration": 2.4558143615722656 + }, + { + "auxiliary_loss_clip": 0.01311771, + "auxiliary_loss_mlp": 0.01105825, + "balance_loss_clip": 1.08601642, + "balance_loss_mlp": 1.06817877, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 2.4303212635324143, + "language_loss": 0.9338882, + "learning_rate": 3.867203596705844e-06, + "loss": 0.9580642, + "num_input_tokens_seen": 8486765, + "step": 406, + "time_per_iteration": 2.452406644821167 + }, + { + "auxiliary_loss_clip": 0.0130279, + "auxiliary_loss_mlp": 0.01090247, + "balance_loss_clip": 1.08376718, + "balance_loss_mlp": 1.0518856, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.3855633517558132, + "language_loss": 0.8724221, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89635247, + "num_input_tokens_seen": 8506515, + "step": 407, + "time_per_iteration": 2.4870669841766357 + }, + { + "auxiliary_loss_clip": 0.01299481, + "auxiliary_loss_mlp": 0.01090188, + "balance_loss_clip": 1.08416653, + "balance_loss_mlp": 1.05323315, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.074479056003595, + "language_loss": 0.7384069, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76230359, + "num_input_tokens_seen": 8528035, + "step": 408, + "time_per_iteration": 2.5655996799468994 + }, + { + "auxiliary_loss_clip": 0.01303049, + "auxiliary_loss_mlp": 0.01095285, + "balance_loss_clip": 1.08405781, + "balance_loss_mlp": 1.05780506, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 4.245260111962556, + "language_loss": 0.92660695, + "learning_rate": 3.871943634189376e-06, + "loss": 0.95059025, + "num_input_tokens_seen": 8546455, + "step": 409, + "time_per_iteration": 2.484515905380249 + }, + { + "auxiliary_loss_clip": 0.01302691, + "auxiliary_loss_mlp": 0.01081006, + "balance_loss_clip": 1.08468103, + "balance_loss_mlp": 1.04653096, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 2.146379205534621, + "language_loss": 0.82796389, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85180086, + "num_input_tokens_seen": 8568450, + "step": 410, + "time_per_iteration": 2.6026365756988525 + }, + { + "auxiliary_loss_clip": 0.01302061, + "auxiliary_loss_mlp": 0.01093402, + "balance_loss_clip": 1.08363175, + "balance_loss_mlp": 1.05735338, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 2.471129188503811, + "language_loss": 0.77822566, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80218029, + "num_input_tokens_seen": 8589340, + "step": 411, + "time_per_iteration": 2.5301759243011475 + }, + { + "auxiliary_loss_clip": 0.01301647, + "auxiliary_loss_mlp": 0.01102039, + "balance_loss_clip": 1.0812732, + "balance_loss_mlp": 1.06382036, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.3509227131430266, + "language_loss": 0.86688399, + "learning_rate": 3.87664903040738e-06, + "loss": 0.89092088, + "num_input_tokens_seen": 8607150, + "step": 412, + "time_per_iteration": 2.452902317047119 + }, + { + "auxiliary_loss_clip": 0.0118234, + "auxiliary_loss_mlp": 0.01087792, + "balance_loss_clip": 1.06828856, + "balance_loss_mlp": 1.07715881, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 1.2945188071605016, + "language_loss": 0.5849005, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60760188, + "num_input_tokens_seen": 8669865, + "step": 413, + "time_per_iteration": 3.1646206378936768 + }, + { + "auxiliary_loss_clip": 0.0129549, + "auxiliary_loss_mlp": 0.01092592, + "balance_loss_clip": 1.08061516, + "balance_loss_mlp": 1.05370605, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.8721259700923472, + "language_loss": 0.807796, + "learning_rate": 3.879766964750006e-06, + "loss": 0.83167684, + "num_input_tokens_seen": 8690235, + "step": 414, + "time_per_iteration": 2.55409574508667 + }, + { + "auxiliary_loss_clip": 0.01290341, + "auxiliary_loss_mlp": 0.01093011, + "balance_loss_clip": 1.07830334, + "balance_loss_mlp": 1.05677152, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.1640324534088067, + "language_loss": 0.80478883, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82862234, + "num_input_tokens_seen": 8706295, + "step": 415, + "time_per_iteration": 2.444949150085449 + }, + { + "auxiliary_loss_clip": 0.01306921, + "auxiliary_loss_mlp": 0.01083612, + "balance_loss_clip": 1.08477271, + "balance_loss_mlp": 1.04739594, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 2.832018627703688, + "language_loss": 0.96316797, + "learning_rate": 3.882869872844723e-06, + "loss": 0.9870733, + "num_input_tokens_seen": 8724200, + "step": 416, + "time_per_iteration": 5.320829153060913 + }, + { + "auxiliary_loss_clip": 0.01296769, + "auxiliary_loss_mlp": 0.01073264, + "balance_loss_clip": 1.08053017, + "balance_loss_mlp": 1.03492665, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 1.6850988648305947, + "language_loss": 0.77517772, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79887807, + "num_input_tokens_seen": 8744170, + "step": 417, + "time_per_iteration": 3.975346088409424 + }, + { + "auxiliary_loss_clip": 0.0129262, + "auxiliary_loss_mlp": 0.01087501, + "balance_loss_clip": 1.08269799, + "balance_loss_mlp": 1.05071306, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 2.6253316044763326, + "language_loss": 0.7676816, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79148281, + "num_input_tokens_seen": 8765120, + "step": 418, + "time_per_iteration": 2.5258097648620605 + }, + { + "auxiliary_loss_clip": 0.0130088, + "auxiliary_loss_mlp": 0.01074524, + "balance_loss_clip": 1.08388615, + "balance_loss_mlp": 1.0397625, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.6820294314460766, + "language_loss": 0.8160882, + "learning_rate": 3.887496375507294e-06, + "loss": 0.8398422, + "num_input_tokens_seen": 8783500, + "step": 419, + "time_per_iteration": 2.469996452331543 + }, + { + "auxiliary_loss_clip": 0.01294093, + "auxiliary_loss_mlp": 0.01084451, + "balance_loss_clip": 1.08222914, + "balance_loss_mlp": 1.04625618, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 2.1312476842526342, + "language_loss": 0.73548806, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.75927347, + "num_input_tokens_seen": 8801175, + "step": 420, + "time_per_iteration": 2.4492461681365967 + }, + { + "auxiliary_loss_clip": 0.01294173, + "auxiliary_loss_mlp": 0.0109458, + "balance_loss_clip": 1.07914662, + "balance_loss_mlp": 1.05862689, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.7323269917533584, + "language_loss": 0.78868544, + "learning_rate": 3.890562344079484e-06, + "loss": 0.81257308, + "num_input_tokens_seen": 8820215, + "step": 421, + "time_per_iteration": 2.506955146789551 + }, + { + "auxiliary_loss_clip": 0.01293103, + "auxiliary_loss_mlp": 0.01086858, + "balance_loss_clip": 1.08111429, + "balance_loss_mlp": 1.04883003, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.1596363434896784, + "language_loss": 0.81998521, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84378481, + "num_input_tokens_seen": 8839660, + "step": 422, + "time_per_iteration": 2.5949461460113525 + }, + { + "auxiliary_loss_clip": 0.01296005, + "auxiliary_loss_mlp": 0.01079046, + "balance_loss_clip": 1.07957089, + "balance_loss_mlp": 1.04526186, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 1.8118115410888183, + "language_loss": 0.83519304, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85894352, + "num_input_tokens_seen": 8859280, + "step": 423, + "time_per_iteration": 3.9203882217407227 + }, + { + "auxiliary_loss_clip": 0.01290184, + "auxiliary_loss_mlp": 0.01079816, + "balance_loss_clip": 1.07756937, + "balance_loss_mlp": 1.04486394, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2.355448577098984, + "language_loss": 0.74317527, + "learning_rate": 3.895134094768415e-06, + "loss": 0.76687527, + "num_input_tokens_seen": 8880560, + "step": 424, + "time_per_iteration": 2.608105421066284 + }, + { + "auxiliary_loss_clip": 0.01299283, + "auxiliary_loss_mlp": 0.01093584, + "balance_loss_clip": 1.08399022, + "balance_loss_mlp": 1.05903733, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 2.163361744412, + "language_loss": 0.83050299, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85443163, + "num_input_tokens_seen": 8899155, + "step": 425, + "time_per_iteration": 2.541419267654419 + }, + { + "auxiliary_loss_clip": 0.01296123, + "auxiliary_loss_mlp": 0.01087081, + "balance_loss_clip": 1.07601416, + "balance_loss_mlp": 1.04922032, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.1726550867656798, + "language_loss": 0.85232753, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87615961, + "num_input_tokens_seen": 8917890, + "step": 426, + "time_per_iteration": 2.5229058265686035 + }, + { + "auxiliary_loss_clip": 0.01176479, + "auxiliary_loss_mlp": 0.01013718, + "balance_loss_clip": 1.06789243, + "balance_loss_mlp": 1.00351357, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.8888876805171764, + "language_loss": 0.57253915, + "learning_rate": 3.899673611929491e-06, + "loss": 0.59444112, + "num_input_tokens_seen": 8978260, + "step": 427, + "time_per_iteration": 3.1506543159484863 + }, + { + "auxiliary_loss_clip": 0.01293096, + "auxiliary_loss_mlp": 0.01091402, + "balance_loss_clip": 1.08398426, + "balance_loss_mlp": 1.05652153, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.52188493549771, + "language_loss": 0.87825131, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90209633, + "num_input_tokens_seen": 8994460, + "step": 428, + "time_per_iteration": 2.449904203414917 + }, + { + "auxiliary_loss_clip": 0.01286207, + "auxiliary_loss_mlp": 0.01074631, + "balance_loss_clip": 1.07718015, + "balance_loss_mlp": 1.03700888, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.7372346385565982, + "language_loss": 0.85568726, + "learning_rate": 3.902682272467353e-06, + "loss": 0.87929565, + "num_input_tokens_seen": 9016670, + "step": 429, + "time_per_iteration": 2.5749669075012207 + }, + { + "auxiliary_loss_clip": 0.01292349, + "auxiliary_loss_mlp": 0.0108156, + "balance_loss_clip": 1.07648516, + "balance_loss_mlp": 1.04477215, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.412411603063884, + "language_loss": 0.8817426, + "learning_rate": 3.904181346912895e-06, + "loss": 0.9054817, + "num_input_tokens_seen": 9039720, + "step": 430, + "time_per_iteration": 2.5480337142944336 + }, + { + "auxiliary_loss_clip": 0.01294368, + "auxiliary_loss_mlp": 0.0107905, + "balance_loss_clip": 1.08381391, + "balance_loss_mlp": 1.04550433, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 2.006549929401175, + "language_loss": 0.84201729, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86575145, + "num_input_tokens_seen": 9059850, + "step": 431, + "time_per_iteration": 2.4783549308776855 + }, + { + "auxiliary_loss_clip": 0.01288968, + "auxiliary_loss_mlp": 0.01072859, + "balance_loss_clip": 1.07947481, + "balance_loss_mlp": 1.03993309, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 3.69379510431146, + "language_loss": 0.86792946, + "learning_rate": 3.907169065422638e-06, + "loss": 0.8915478, + "num_input_tokens_seen": 9077590, + "step": 432, + "time_per_iteration": 2.4694571495056152 + }, + { + "auxiliary_loss_clip": 0.01292393, + "auxiliary_loss_mlp": 0.01070828, + "balance_loss_clip": 1.08095729, + "balance_loss_mlp": 1.03797424, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 2.0812654705753517, + "language_loss": 0.75867558, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78230774, + "num_input_tokens_seen": 9099880, + "step": 433, + "time_per_iteration": 2.5453786849975586 + }, + { + "auxiliary_loss_clip": 0.01290848, + "auxiliary_loss_mlp": 0.01085117, + "balance_loss_clip": 1.07741928, + "balance_loss_mlp": 1.04785275, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.3041893508478135, + "language_loss": 0.89679503, + "learning_rate": 3.910142983797699e-06, + "loss": 0.9205547, + "num_input_tokens_seen": 9118620, + "step": 434, + "time_per_iteration": 2.4422571659088135 + }, + { + "auxiliary_loss_clip": 0.01293729, + "auxiliary_loss_mlp": 0.01095964, + "balance_loss_clip": 1.08406496, + "balance_loss_mlp": 1.05955756, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 3.1322265870348667, + "language_loss": 0.80137193, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82526881, + "num_input_tokens_seen": 9135655, + "step": 435, + "time_per_iteration": 2.453030824661255 + }, + { + "auxiliary_loss_clip": 0.01287363, + "auxiliary_loss_mlp": 0.01085011, + "balance_loss_clip": 1.07641876, + "balance_loss_mlp": 1.05003476, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 24.880752018299194, + "language_loss": 0.86399508, + "learning_rate": 3.913103228936546e-06, + "loss": 0.8877188, + "num_input_tokens_seen": 9153520, + "step": 436, + "time_per_iteration": 2.4605295658111572 + }, + { + "auxiliary_loss_clip": 0.0129125, + "auxiliary_loss_mlp": 0.01095111, + "balance_loss_clip": 1.08032465, + "balance_loss_mlp": 1.05980134, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.3781844530533505, + "language_loss": 0.75005299, + "learning_rate": 3.914578263220868e-06, + "loss": 0.7739166, + "num_input_tokens_seen": 9170750, + "step": 437, + "time_per_iteration": 2.4855802059173584 + }, + { + "auxiliary_loss_clip": 0.01290723, + "auxiliary_loss_mlp": 0.01093277, + "balance_loss_clip": 1.08141994, + "balance_loss_mlp": 1.05594087, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.6108038227588013, + "language_loss": 0.91493219, + "learning_rate": 3.916049925995316e-06, + "loss": 0.9387722, + "num_input_tokens_seen": 9188430, + "step": 438, + "time_per_iteration": 2.4764225482940674 + }, + { + "auxiliary_loss_clip": 0.01170789, + "auxiliary_loss_mlp": 0.01039169, + "balance_loss_clip": 1.06298983, + "balance_loss_mlp": 1.0292033, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.8710250973954692, + "language_loss": 0.62653303, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64863265, + "num_input_tokens_seen": 9255835, + "step": 439, + "time_per_iteration": 3.1295714378356934 + }, + { + "auxiliary_loss_clip": 0.0130036, + "auxiliary_loss_mlp": 0.01095933, + "balance_loss_clip": 1.08609176, + "balance_loss_mlp": 1.05919254, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 1.829561596361049, + "language_loss": 0.75819826, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78216118, + "num_input_tokens_seen": 9276835, + "step": 440, + "time_per_iteration": 2.509260416030884 + }, + { + "auxiliary_loss_clip": 0.01292162, + "auxiliary_loss_mlp": 0.01077393, + "balance_loss_clip": 1.08150947, + "balance_loss_mlp": 1.04167819, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 1.9512761207760536, + "language_loss": 0.83050501, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85420048, + "num_input_tokens_seen": 9295075, + "step": 441, + "time_per_iteration": 2.4488961696624756 + }, + { + "auxiliary_loss_clip": 0.0129454, + "auxiliary_loss_mlp": 0.01084563, + "balance_loss_clip": 1.07942319, + "balance_loss_mlp": 1.04765582, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 3.1787007844525004, + "language_loss": 0.78453279, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80832374, + "num_input_tokens_seen": 9314205, + "step": 442, + "time_per_iteration": 2.4520413875579834 + }, + { + "auxiliary_loss_clip": 0.01164697, + "auxiliary_loss_mlp": 0.01010449, + "balance_loss_clip": 1.05908787, + "balance_loss_mlp": 1.00095987, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9619000432475342, + "language_loss": 0.64471042, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66646188, + "num_input_tokens_seen": 9367395, + "step": 443, + "time_per_iteration": 2.9124019145965576 + }, + { + "auxiliary_loss_clip": 0.01294968, + "auxiliary_loss_mlp": 0.01086807, + "balance_loss_clip": 1.08420289, + "balance_loss_mlp": 1.05109155, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.3456635940558344, + "language_loss": 0.82454789, + "learning_rate": 3.924809954779425e-06, + "loss": 0.84836566, + "num_input_tokens_seen": 9385185, + "step": 444, + "time_per_iteration": 2.4532310962677 + }, + { + "auxiliary_loss_clip": 0.0129717, + "auxiliary_loss_mlp": 0.0108323, + "balance_loss_clip": 1.08139038, + "balance_loss_mlp": 1.04491639, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 4.045992036054237, + "language_loss": 0.95733821, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.98114228, + "num_input_tokens_seen": 9403225, + "step": 445, + "time_per_iteration": 2.5018186569213867 + }, + { + "auxiliary_loss_clip": 0.01294778, + "auxiliary_loss_mlp": 0.01090073, + "balance_loss_clip": 1.08412445, + "balance_loss_mlp": 1.05230772, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.4746763269577836, + "language_loss": 0.91610199, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.93995047, + "num_input_tokens_seen": 9420540, + "step": 446, + "time_per_iteration": 2.463296890258789 + }, + { + "auxiliary_loss_clip": 0.01290894, + "auxiliary_loss_mlp": 0.01080456, + "balance_loss_clip": 1.08244634, + "balance_loss_mlp": 1.04502738, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.357433434832568, + "language_loss": 0.796143, + "learning_rate": 3.92914567610317e-06, + "loss": 0.81985652, + "num_input_tokens_seen": 9438840, + "step": 447, + "time_per_iteration": 2.467989683151245 + }, + { + "auxiliary_loss_clip": 0.01292687, + "auxiliary_loss_mlp": 0.01072216, + "balance_loss_clip": 1.08260739, + "balance_loss_mlp": 1.03800297, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 2.0480986346862142, + "language_loss": 0.86427796, + "learning_rate": 3.930584452530952e-06, + "loss": 0.88792706, + "num_input_tokens_seen": 9457215, + "step": 448, + "time_per_iteration": 2.450880527496338 + }, + { + "auxiliary_loss_clip": 0.01285779, + "auxiliary_loss_mlp": 0.01092338, + "balance_loss_clip": 1.0807035, + "balance_loss_mlp": 1.05934131, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 2.5159642529157398, + "language_loss": 0.88755834, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91133946, + "num_input_tokens_seen": 9475615, + "step": 449, + "time_per_iteration": 2.480502128601074 + }, + { + "auxiliary_loss_clip": 0.01297317, + "auxiliary_loss_mlp": 0.01087825, + "balance_loss_clip": 1.08234918, + "balance_loss_mlp": 1.0509181, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 2.7966785652675834, + "language_loss": 0.80246222, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82631361, + "num_input_tokens_seen": 9493975, + "step": 450, + "time_per_iteration": 2.429137706756592 + }, + { + "auxiliary_loss_clip": 0.01290241, + "auxiliary_loss_mlp": 0.01073326, + "balance_loss_clip": 1.08516872, + "balance_loss_mlp": 1.03823102, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 2.1129708135946146, + "language_loss": 0.8148396, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83847523, + "num_input_tokens_seen": 9514810, + "step": 451, + "time_per_iteration": 2.5108237266540527 + }, + { + "auxiliary_loss_clip": 0.01290315, + "auxiliary_loss_mlp": 0.01090408, + "balance_loss_clip": 1.08545971, + "balance_loss_mlp": 1.05376267, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 2.8852191037530854, + "language_loss": 0.7689212, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79272842, + "num_input_tokens_seen": 9533635, + "step": 452, + "time_per_iteration": 2.484020948410034 + }, + { + "auxiliary_loss_clip": 0.01289819, + "auxiliary_loss_mlp": 0.010832, + "balance_loss_clip": 1.08328664, + "balance_loss_mlp": 1.04786611, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.8363137282212316, + "language_loss": 0.73107868, + "learning_rate": 3.937730499067294e-06, + "loss": 0.7548089, + "num_input_tokens_seen": 9555420, + "step": 453, + "time_per_iteration": 2.5219359397888184 + }, + { + "auxiliary_loss_clip": 0.01284617, + "auxiliary_loss_mlp": 0.01082411, + "balance_loss_clip": 1.08028364, + "balance_loss_mlp": 1.04793561, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 2.1893309958555633, + "language_loss": 0.82536066, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84903097, + "num_input_tokens_seen": 9578950, + "step": 454, + "time_per_iteration": 2.630223512649536 + }, + { + "auxiliary_loss_clip": 0.01289742, + "auxiliary_loss_mlp": 0.01078732, + "balance_loss_clip": 1.08274198, + "balance_loss_mlp": 1.04637837, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 1.7961545172811608, + "language_loss": 0.7543143, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.7779991, + "num_input_tokens_seen": 9598160, + "step": 455, + "time_per_iteration": 2.4986767768859863 + }, + { + "auxiliary_loss_clip": 0.01288016, + "auxiliary_loss_mlp": 0.01089961, + "balance_loss_clip": 1.07897639, + "balance_loss_mlp": 1.05634427, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.292841008231573, + "language_loss": 0.80629444, + "learning_rate": 3.941980363893499e-06, + "loss": 0.83007431, + "num_input_tokens_seen": 9616010, + "step": 456, + "time_per_iteration": 3.862703561782837 + }, + { + "auxiliary_loss_clip": 0.0128417, + "auxiliary_loss_mlp": 0.01076266, + "balance_loss_clip": 1.07994902, + "balance_loss_mlp": 1.04083741, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.241416638322389, + "language_loss": 0.81458175, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.83818609, + "num_input_tokens_seen": 9634000, + "step": 457, + "time_per_iteration": 3.8845856189727783 + }, + { + "auxiliary_loss_clip": 0.01291176, + "auxiliary_loss_mlp": 0.01079214, + "balance_loss_clip": 1.08069038, + "balance_loss_mlp": 1.04466701, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.0026319790471843, + "language_loss": 0.93954623, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96325016, + "num_input_tokens_seen": 9653455, + "step": 458, + "time_per_iteration": 3.9042749404907227 + }, + { + "auxiliary_loss_clip": 0.01287154, + "auxiliary_loss_mlp": 0.01090931, + "balance_loss_clip": 1.08025932, + "balance_loss_mlp": 1.05681288, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.5292243541886092, + "language_loss": 0.79697424, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.82075512, + "num_input_tokens_seen": 9669650, + "step": 459, + "time_per_iteration": 2.437376022338867 + }, + { + "auxiliary_loss_clip": 0.01292889, + "auxiliary_loss_mlp": 0.01082801, + "balance_loss_clip": 1.08568966, + "balance_loss_mlp": 1.04494047, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 1.8689705515176844, + "language_loss": 0.83302706, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85678399, + "num_input_tokens_seen": 9691415, + "step": 460, + "time_per_iteration": 2.50738787651062 + }, + { + "auxiliary_loss_clip": 0.01154961, + "auxiliary_loss_mlp": 0.01048461, + "balance_loss_clip": 1.05041718, + "balance_loss_mlp": 1.03935361, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5770907904819542, + "language_loss": 0.73607564, + "learning_rate": 3.949001722282675e-06, + "loss": 0.75810981, + "num_input_tokens_seen": 9755605, + "step": 461, + "time_per_iteration": 3.067850351333618 + }, + { + "auxiliary_loss_clip": 0.01289028, + "auxiliary_loss_mlp": 0.01079176, + "balance_loss_clip": 1.08786941, + "balance_loss_mlp": 1.04706144, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.8582070027308393, + "language_loss": 0.81166589, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83534795, + "num_input_tokens_seen": 9776270, + "step": 462, + "time_per_iteration": 2.5365984439849854 + }, + { + "auxiliary_loss_clip": 0.01285641, + "auxiliary_loss_mlp": 0.01076926, + "balance_loss_clip": 1.08169293, + "balance_loss_mlp": 1.04519212, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.2343350243834132, + "language_loss": 0.90335989, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92698562, + "num_input_tokens_seen": 9794465, + "step": 463, + "time_per_iteration": 2.4640347957611084 + }, + { + "auxiliary_loss_clip": 0.01152429, + "auxiliary_loss_mlp": 0.01009388, + "balance_loss_clip": 1.04996347, + "balance_loss_mlp": 1.00018501, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8839071888313383, + "language_loss": 0.59105688, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61267501, + "num_input_tokens_seen": 9849685, + "step": 464, + "time_per_iteration": 4.400179147720337 + }, + { + "auxiliary_loss_clip": 0.01300995, + "auxiliary_loss_mlp": 0.01097922, + "balance_loss_clip": 1.08983874, + "balance_loss_mlp": 1.0627079, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.47926988399744, + "language_loss": 0.81329083, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83728004, + "num_input_tokens_seen": 9869505, + "step": 465, + "time_per_iteration": 2.505235195159912 + }, + { + "auxiliary_loss_clip": 0.01286608, + "auxiliary_loss_mlp": 0.01082948, + "balance_loss_clip": 1.08133566, + "balance_loss_mlp": 1.04861557, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 3.7574909035851682, + "language_loss": 0.78580838, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80950391, + "num_input_tokens_seen": 9890950, + "step": 466, + "time_per_iteration": 2.4947669506073 + }, + { + "auxiliary_loss_clip": 0.01286079, + "auxiliary_loss_mlp": 0.01087251, + "balance_loss_clip": 1.08261716, + "balance_loss_mlp": 1.05346751, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 1.8131323484418362, + "language_loss": 0.87682253, + "learning_rate": 3.957327513084761e-06, + "loss": 0.90055585, + "num_input_tokens_seen": 9911265, + "step": 467, + "time_per_iteration": 2.5006372928619385 + }, + { + "auxiliary_loss_clip": 0.01292039, + "auxiliary_loss_mlp": 0.01100193, + "balance_loss_clip": 1.08479381, + "balance_loss_mlp": 1.06407237, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.009715505431471, + "language_loss": 0.8629359, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88685822, + "num_input_tokens_seen": 9929025, + "step": 468, + "time_per_iteration": 2.4640250205993652 + }, + { + "auxiliary_loss_clip": 0.01288595, + "auxiliary_loss_mlp": 0.01079486, + "balance_loss_clip": 1.08249366, + "balance_loss_mlp": 1.04384232, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 2.1947431465003437, + "language_loss": 0.91865355, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.94233429, + "num_input_tokens_seen": 9945190, + "step": 469, + "time_per_iteration": 2.458516836166382 + }, + { + "auxiliary_loss_clip": 0.0128626, + "auxiliary_loss_mlp": 0.0109401, + "balance_loss_clip": 1.08311272, + "balance_loss_mlp": 1.05881894, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 2.276024775163174, + "language_loss": 0.81705022, + "learning_rate": 3.96145038000181e-06, + "loss": 0.84085298, + "num_input_tokens_seen": 9962820, + "step": 470, + "time_per_iteration": 2.447678804397583 + }, + { + "auxiliary_loss_clip": 0.01285562, + "auxiliary_loss_mlp": 0.01086129, + "balance_loss_clip": 1.07904625, + "balance_loss_mlp": 1.05036664, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 2.1305886513461734, + "language_loss": 0.93162584, + "learning_rate": 3.962818822989861e-06, + "loss": 0.95534277, + "num_input_tokens_seen": 9982595, + "step": 471, + "time_per_iteration": 2.496142864227295 + }, + { + "auxiliary_loss_clip": 0.01283535, + "auxiliary_loss_mlp": 0.01094094, + "balance_loss_clip": 1.08015931, + "balance_loss_mlp": 1.05930865, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 2.615031793429671, + "language_loss": 0.76013547, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78391176, + "num_input_tokens_seen": 10004645, + "step": 472, + "time_per_iteration": 2.5124542713165283 + }, + { + "auxiliary_loss_clip": 0.01288026, + "auxiliary_loss_mlp": 0.01077903, + "balance_loss_clip": 1.07835162, + "balance_loss_mlp": 1.04442883, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 1.9886612290904822, + "language_loss": 0.9347744, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95843375, + "num_input_tokens_seen": 10022555, + "step": 473, + "time_per_iteration": 2.5006906986236572 + }, + { + "auxiliary_loss_clip": 0.01293195, + "auxiliary_loss_mlp": 0.01114844, + "balance_loss_clip": 1.08290839, + "balance_loss_mlp": 1.08196616, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 3.523770717269956, + "language_loss": 0.88402307, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90810347, + "num_input_tokens_seen": 10041025, + "step": 474, + "time_per_iteration": 2.4817323684692383 + }, + { + "auxiliary_loss_clip": 0.01284215, + "auxiliary_loss_mlp": 0.01085558, + "balance_loss_clip": 1.08066177, + "balance_loss_mlp": 1.05072486, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.0118192348857225, + "language_loss": 0.78807127, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81176901, + "num_input_tokens_seen": 10060775, + "step": 475, + "time_per_iteration": 2.516758680343628 + }, + { + "auxiliary_loss_clip": 0.01155213, + "auxiliary_loss_mlp": 0.01063152, + "balance_loss_clip": 1.04836679, + "balance_loss_mlp": 1.05490291, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9289904059462711, + "language_loss": 0.67002511, + "learning_rate": 3.969617747661569e-06, + "loss": 0.69220877, + "num_input_tokens_seen": 10120225, + "step": 476, + "time_per_iteration": 2.95268177986145 + }, + { + "auxiliary_loss_clip": 0.01288501, + "auxiliary_loss_mlp": 0.01079225, + "balance_loss_clip": 1.08339143, + "balance_loss_mlp": 1.04391551, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 3.2083550779762677, + "language_loss": 0.83861744, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86229467, + "num_input_tokens_seen": 10137880, + "step": 477, + "time_per_iteration": 2.4692654609680176 + }, + { + "auxiliary_loss_clip": 0.01292329, + "auxiliary_loss_mlp": 0.01090235, + "balance_loss_clip": 1.0851047, + "balance_loss_mlp": 1.05504429, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.0305898857771827, + "language_loss": 0.8239609, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84778655, + "num_input_tokens_seen": 10156930, + "step": 478, + "time_per_iteration": 2.5214927196502686 + }, + { + "auxiliary_loss_clip": 0.01284321, + "auxiliary_loss_mlp": 0.01078621, + "balance_loss_clip": 1.08079004, + "balance_loss_mlp": 1.04443145, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 1.8038068825856084, + "language_loss": 0.81306648, + "learning_rate": 3.973662905576082e-06, + "loss": 0.83669591, + "num_input_tokens_seen": 10176295, + "step": 479, + "time_per_iteration": 2.4543511867523193 + }, + { + "auxiliary_loss_clip": 0.01281979, + "auxiliary_loss_mlp": 0.01081657, + "balance_loss_clip": 1.07924747, + "balance_loss_mlp": 1.04567993, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.160126447438278, + "language_loss": 0.73598003, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75961643, + "num_input_tokens_seen": 10195790, + "step": 480, + "time_per_iteration": 2.457181215286255 + }, + { + "auxiliary_loss_clip": 0.01280189, + "auxiliary_loss_mlp": 0.0106732, + "balance_loss_clip": 1.08049798, + "balance_loss_mlp": 1.03603947, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.9988362925568401, + "language_loss": 0.87932837, + "learning_rate": 3.976345626888605e-06, + "loss": 0.90280342, + "num_input_tokens_seen": 10218405, + "step": 481, + "time_per_iteration": 2.5222909450531006 + }, + { + "auxiliary_loss_clip": 0.01151378, + "auxiliary_loss_mlp": 0.0100913, + "balance_loss_clip": 1.04463053, + "balance_loss_mlp": 1.00097597, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8256831333223844, + "language_loss": 0.66120255, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68280762, + "num_input_tokens_seen": 10271005, + "step": 482, + "time_per_iteration": 2.798541784286499 + }, + { + "auxiliary_loss_clip": 0.01295245, + "auxiliary_loss_mlp": 0.01071226, + "balance_loss_clip": 1.0837692, + "balance_loss_mlp": 1.0380857, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.814648066209773, + "language_loss": 0.79021496, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81387967, + "num_input_tokens_seen": 10288405, + "step": 483, + "time_per_iteration": 2.422499895095825 + }, + { + "auxiliary_loss_clip": 0.01291668, + "auxiliary_loss_mlp": 0.01089109, + "balance_loss_clip": 1.0846889, + "balance_loss_mlp": 1.05460966, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.209195826802924, + "language_loss": 0.75479198, + "learning_rate": 3.980348865796749e-06, + "loss": 0.77859974, + "num_input_tokens_seen": 10306875, + "step": 484, + "time_per_iteration": 2.4246771335601807 + }, + { + "auxiliary_loss_clip": 0.01286981, + "auxiliary_loss_mlp": 0.0108006, + "balance_loss_clip": 1.08221257, + "balance_loss_mlp": 1.04708719, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.0816032038989634, + "language_loss": 0.84257442, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86624479, + "num_input_tokens_seen": 10323965, + "step": 485, + "time_per_iteration": 2.4343996047973633 + }, + { + "auxiliary_loss_clip": 0.01293183, + "auxiliary_loss_mlp": 0.01080365, + "balance_loss_clip": 1.08978701, + "balance_loss_mlp": 1.04641438, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 2.562239725167488, + "language_loss": 0.84308338, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86681879, + "num_input_tokens_seen": 10342620, + "step": 486, + "time_per_iteration": 2.439990758895874 + }, + { + "auxiliary_loss_clip": 0.01284919, + "auxiliary_loss_mlp": 0.01087034, + "balance_loss_clip": 1.0801214, + "balance_loss_mlp": 1.05234408, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 1.7067383663827331, + "language_loss": 0.88858104, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91230059, + "num_input_tokens_seen": 10364610, + "step": 487, + "time_per_iteration": 2.478191375732422 + }, + { + "auxiliary_loss_clip": 0.01289949, + "auxiliary_loss_mlp": 0.01070047, + "balance_loss_clip": 1.08493364, + "balance_loss_mlp": 1.03821802, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.4314764560952744, + "language_loss": 0.88222229, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90582228, + "num_input_tokens_seen": 10380910, + "step": 488, + "time_per_iteration": 2.4348742961883545 + }, + { + "auxiliary_loss_clip": 0.01285143, + "auxiliary_loss_mlp": 0.01077951, + "balance_loss_clip": 1.08214092, + "balance_loss_mlp": 1.04435754, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 1.8387340964934769, + "language_loss": 0.88904583, + "learning_rate": 3.986966109896785e-06, + "loss": 0.91267669, + "num_input_tokens_seen": 10400665, + "step": 489, + "time_per_iteration": 2.478949546813965 + }, + { + "auxiliary_loss_clip": 0.01277387, + "auxiliary_loss_mlp": 0.0107469, + "balance_loss_clip": 1.07611775, + "balance_loss_mlp": 1.04069114, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 1.9133685525298934, + "language_loss": 0.88461757, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90813839, + "num_input_tokens_seen": 10420150, + "step": 490, + "time_per_iteration": 2.457620143890381 + }, + { + "auxiliary_loss_clip": 0.0128526, + "auxiliary_loss_mlp": 0.0108943, + "balance_loss_clip": 1.0790534, + "balance_loss_mlp": 1.05624223, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 3.6083385587020556, + "language_loss": 0.9167757, + "learning_rate": 3.989594081641164e-06, + "loss": 0.94052255, + "num_input_tokens_seen": 10438210, + "step": 491, + "time_per_iteration": 2.4559409618377686 + }, + { + "auxiliary_loss_clip": 0.01275483, + "auxiliary_loss_mlp": 0.01074688, + "balance_loss_clip": 1.07853985, + "balance_loss_mlp": 1.04266858, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 1.8148665782325895, + "language_loss": 0.85293072, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87643242, + "num_input_tokens_seen": 10455125, + "step": 492, + "time_per_iteration": 2.438765048980713 + }, + { + "auxiliary_loss_clip": 0.01286025, + "auxiliary_loss_mlp": 0.01097285, + "balance_loss_clip": 1.08294928, + "balance_loss_mlp": 1.06333399, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 3.3690879412425905, + "language_loss": 0.8424747, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86630774, + "num_input_tokens_seen": 10470990, + "step": 493, + "time_per_iteration": 2.421844244003296 + }, + { + "auxiliary_loss_clip": 0.01281497, + "auxiliary_loss_mlp": 0.01075246, + "balance_loss_clip": 1.07906139, + "balance_loss_mlp": 1.04246306, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 1.7598866608654486, + "language_loss": 0.86559778, + "learning_rate": 3.99351603600268e-06, + "loss": 0.88916516, + "num_input_tokens_seen": 10490685, + "step": 494, + "time_per_iteration": 2.461742401123047 + }, + { + "auxiliary_loss_clip": 0.01287929, + "auxiliary_loss_mlp": 0.01080288, + "balance_loss_clip": 1.08282733, + "balance_loss_mlp": 1.04955626, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 2.837876610239879, + "language_loss": 0.86302459, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88670671, + "num_input_tokens_seen": 10509435, + "step": 495, + "time_per_iteration": 2.4652841091156006 + }, + { + "auxiliary_loss_clip": 0.0127612, + "auxiliary_loss_mlp": 0.01074842, + "balance_loss_clip": 1.07888961, + "balance_loss_mlp": 1.04294157, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 3.3685274721139797, + "language_loss": 0.62126458, + "learning_rate": 3.99611746250533e-06, + "loss": 0.6447742, + "num_input_tokens_seen": 10530050, + "step": 496, + "time_per_iteration": 5.367973327636719 + }, + { + "auxiliary_loss_clip": 0.012786, + "auxiliary_loss_mlp": 0.01086944, + "balance_loss_clip": 1.08141732, + "balance_loss_mlp": 1.05501938, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 2.0360291902389003, + "language_loss": 0.8900333, + "learning_rate": 3.997414244783595e-06, + "loss": 0.91368872, + "num_input_tokens_seen": 10551370, + "step": 497, + "time_per_iteration": 2.484804630279541 + }, + { + "auxiliary_loss_clip": 0.01284159, + "auxiliary_loss_mlp": 0.01080084, + "balance_loss_clip": 1.08221054, + "balance_loss_mlp": 1.04761124, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 2.62783049673762, + "language_loss": 0.85088694, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87452936, + "num_input_tokens_seen": 10569225, + "step": 498, + "time_per_iteration": 2.416926622390747 + }, + { + "auxiliary_loss_clip": 0.01280894, + "auxiliary_loss_mlp": 0.01077499, + "balance_loss_clip": 1.08024216, + "balance_loss_mlp": 1.04590845, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 2.987627233382885, + "language_loss": 0.78171974, + "learning_rate": 4e-06, + "loss": 0.80530363, + "num_input_tokens_seen": 10586170, + "step": 499, + "time_per_iteration": 3.9549622535705566 + }, + { + "auxiliary_loss_clip": 0.01283339, + "auxiliary_loss_mlp": 0.01081357, + "balance_loss_clip": 1.08266819, + "balance_loss_mlp": 1.04990911, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 1.933849566964907, + "language_loss": 0.82589722, + "learning_rate": 3.9999999620799e-06, + "loss": 0.84954417, + "num_input_tokens_seen": 10606205, + "step": 500, + "time_per_iteration": 2.4614627361297607 + }, + { + "auxiliary_loss_clip": 0.01276297, + "auxiliary_loss_mlp": 0.01086769, + "balance_loss_clip": 1.07786417, + "balance_loss_mlp": 1.05150712, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.974216722597347, + "language_loss": 0.88191509, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90554571, + "num_input_tokens_seen": 10625995, + "step": 501, + "time_per_iteration": 2.4612977504730225 + }, + { + "auxiliary_loss_clip": 0.01282652, + "auxiliary_loss_mlp": 0.0107394, + "balance_loss_clip": 1.07998335, + "balance_loss_mlp": 1.04254055, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.303362430804444, + "language_loss": 0.86961716, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.89318311, + "num_input_tokens_seen": 10644105, + "step": 502, + "time_per_iteration": 2.4261627197265625 + }, + { + "auxiliary_loss_clip": 0.01278764, + "auxiliary_loss_mlp": 0.01077875, + "balance_loss_clip": 1.08149111, + "balance_loss_mlp": 1.04502106, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.499112467822762, + "language_loss": 0.84699726, + "learning_rate": 3.999999393278425e-06, + "loss": 0.87056363, + "num_input_tokens_seen": 10661090, + "step": 503, + "time_per_iteration": 3.8819870948791504 + }, + { + "auxiliary_loss_clip": 0.01272082, + "auxiliary_loss_mlp": 0.01090716, + "balance_loss_clip": 1.07856309, + "balance_loss_mlp": 1.05812454, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.7328185381033079, + "language_loss": 0.88613123, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90975916, + "num_input_tokens_seen": 10682380, + "step": 504, + "time_per_iteration": 2.5103535652160645 + }, + { + "auxiliary_loss_clip": 0.01275275, + "auxiliary_loss_mlp": 0.01087613, + "balance_loss_clip": 1.07887673, + "balance_loss_mlp": 1.05573654, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.272151332215829, + "language_loss": 0.78245932, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80608821, + "num_input_tokens_seen": 10699925, + "step": 505, + "time_per_iteration": 2.419429302215576 + }, + { + "auxiliary_loss_clip": 0.01155674, + "auxiliary_loss_mlp": 0.01025654, + "balance_loss_clip": 1.0425477, + "balance_loss_mlp": 1.01778626, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.8531156282589654, + "language_loss": 0.55031639, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57212967, + "num_input_tokens_seen": 10766525, + "step": 506, + "time_per_iteration": 3.188394546508789 + }, + { + "auxiliary_loss_clip": 0.01276439, + "auxiliary_loss_mlp": 0.01091341, + "balance_loss_clip": 1.07808948, + "balance_loss_mlp": 1.05858195, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 2.7538954352729195, + "language_loss": 0.83211052, + "learning_rate": 3.999997573114069e-06, + "loss": 0.85578835, + "num_input_tokens_seen": 10786725, + "step": 507, + "time_per_iteration": 2.463144063949585 + }, + { + "auxiliary_loss_clip": 0.01279215, + "auxiliary_loss_mlp": 0.01074784, + "balance_loss_clip": 1.07856464, + "balance_loss_mlp": 1.04257381, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.3729736015590714, + "language_loss": 0.8904717, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91401172, + "num_input_tokens_seen": 10805390, + "step": 508, + "time_per_iteration": 2.454904556274414 + }, + { + "auxiliary_loss_clip": 0.01282774, + "auxiliary_loss_mlp": 0.01065526, + "balance_loss_clip": 1.08007812, + "balance_loss_mlp": 1.03302956, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 1.8370304084079898, + "language_loss": 0.7163958, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73987877, + "num_input_tokens_seen": 10828030, + "step": 509, + "time_per_iteration": 2.558640480041504 + }, + { + "auxiliary_loss_clip": 0.01274771, + "auxiliary_loss_mlp": 0.01074396, + "balance_loss_clip": 1.07994235, + "balance_loss_mlp": 1.04435563, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 2.4850096417514145, + "language_loss": 0.82505959, + "learning_rate": 3.999995411669614e-06, + "loss": 0.84855127, + "num_input_tokens_seen": 10845240, + "step": 510, + "time_per_iteration": 2.4708471298217773 + }, + { + "auxiliary_loss_clip": 0.01278081, + "auxiliary_loss_mlp": 0.01080683, + "balance_loss_clip": 1.08246589, + "balance_loss_mlp": 1.0489738, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 2.6413452674597306, + "language_loss": 0.83845186, + "learning_rate": 3.999994539508036e-06, + "loss": 0.86203957, + "num_input_tokens_seen": 10864325, + "step": 511, + "time_per_iteration": 2.456199884414673 + }, + { + "auxiliary_loss_clip": 0.01279916, + "auxiliary_loss_mlp": 0.01082499, + "balance_loss_clip": 1.07851839, + "balance_loss_mlp": 1.05143332, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.7961546101440304, + "language_loss": 0.82054013, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.84416431, + "num_input_tokens_seen": 10883860, + "step": 512, + "time_per_iteration": 2.495436668395996 + }, + { + "auxiliary_loss_clip": 0.01275091, + "auxiliary_loss_mlp": 0.01077561, + "balance_loss_clip": 1.07728612, + "balance_loss_mlp": 1.04520774, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 1.8754960691493199, + "language_loss": 0.87018186, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89370841, + "num_input_tokens_seen": 10904555, + "step": 513, + "time_per_iteration": 2.530998468399048 + }, + { + "auxiliary_loss_clip": 0.01283498, + "auxiliary_loss_mlp": 0.01086228, + "balance_loss_clip": 1.0813477, + "balance_loss_mlp": 1.05349278, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 1.7935805320938538, + "language_loss": 0.79240751, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81610471, + "num_input_tokens_seen": 10923700, + "step": 514, + "time_per_iteration": 2.4612371921539307 + }, + { + "auxiliary_loss_clip": 0.01276133, + "auxiliary_loss_mlp": 0.01071005, + "balance_loss_clip": 1.08202958, + "balance_loss_mlp": 1.04017711, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 3.5229192897020707, + "language_loss": 0.77223432, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79570568, + "num_input_tokens_seen": 10942730, + "step": 515, + "time_per_iteration": 2.482990264892578 + }, + { + "auxiliary_loss_clip": 0.01271832, + "auxiliary_loss_mlp": 0.01069064, + "balance_loss_clip": 1.07467246, + "balance_loss_mlp": 1.03733027, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 1.8582036331725873, + "language_loss": 0.82481098, + "learning_rate": 3.999989041101011e-06, + "loss": 0.84821999, + "num_input_tokens_seen": 10967120, + "step": 516, + "time_per_iteration": 2.6621415615081787 + }, + { + "auxiliary_loss_clip": 0.01271547, + "auxiliary_loss_mlp": 0.01071699, + "balance_loss_clip": 1.077425, + "balance_loss_mlp": 1.03972709, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 1.8033833091426907, + "language_loss": 0.78844118, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81187367, + "num_input_tokens_seen": 10986775, + "step": 517, + "time_per_iteration": 2.472193956375122 + }, + { + "auxiliary_loss_clip": 0.01268015, + "auxiliary_loss_mlp": 0.01071817, + "balance_loss_clip": 1.07707071, + "balance_loss_mlp": 1.04153776, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 2.4640944810254033, + "language_loss": 0.90749544, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93089378, + "num_input_tokens_seen": 11011360, + "step": 518, + "time_per_iteration": 2.532752513885498 + }, + { + "auxiliary_loss_clip": 0.0128104, + "auxiliary_loss_mlp": 0.01094615, + "balance_loss_clip": 1.08535755, + "balance_loss_mlp": 1.0611645, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 1.9237247213287487, + "language_loss": 0.8640002, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88775671, + "num_input_tokens_seen": 11030150, + "step": 519, + "time_per_iteration": 2.4704062938690186 + }, + { + "auxiliary_loss_clip": 0.0127511, + "auxiliary_loss_mlp": 0.01085151, + "balance_loss_clip": 1.07612705, + "balance_loss_mlp": 1.05460954, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.2867340764818884, + "language_loss": 0.87054706, + "learning_rate": 3.999983277259057e-06, + "loss": 0.89414966, + "num_input_tokens_seen": 11049145, + "step": 520, + "time_per_iteration": 2.4690170288085938 + }, + { + "auxiliary_loss_clip": 0.01279381, + "auxiliary_loss_mlp": 0.01085138, + "balance_loss_clip": 1.07920992, + "balance_loss_mlp": 1.05308247, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 3.339533624625454, + "language_loss": 0.89124644, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91489166, + "num_input_tokens_seen": 11068835, + "step": 521, + "time_per_iteration": 2.4792206287384033 + }, + { + "auxiliary_loss_clip": 0.01272577, + "auxiliary_loss_mlp": 0.01081498, + "balance_loss_clip": 1.07840061, + "balance_loss_mlp": 1.04881072, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.058205829567261, + "language_loss": 0.71303087, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73657161, + "num_input_tokens_seen": 11088980, + "step": 522, + "time_per_iteration": 2.4696645736694336 + }, + { + "auxiliary_loss_clip": 0.01277211, + "auxiliary_loss_mlp": 0.01081418, + "balance_loss_clip": 1.0769732, + "balance_loss_mlp": 1.0506382, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.3456411981910508, + "language_loss": 0.85184807, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87543434, + "num_input_tokens_seen": 11104300, + "step": 523, + "time_per_iteration": 2.4361157417297363 + }, + { + "auxiliary_loss_clip": 0.01281229, + "auxiliary_loss_mlp": 0.01076697, + "balance_loss_clip": 1.07860994, + "balance_loss_mlp": 1.04355633, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 3.211062575658916, + "language_loss": 0.9016642, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92524344, + "num_input_tokens_seen": 11123335, + "step": 524, + "time_per_iteration": 2.4600226879119873 + }, + { + "auxiliary_loss_clip": 0.01284927, + "auxiliary_loss_mlp": 0.01080818, + "balance_loss_clip": 1.08182573, + "balance_loss_mlp": 1.04810715, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 6.588928727721471, + "language_loss": 0.80303568, + "learning_rate": 3.999974366066933e-06, + "loss": 0.82669306, + "num_input_tokens_seen": 11140880, + "step": 525, + "time_per_iteration": 2.511016368865967 + }, + { + "auxiliary_loss_clip": 0.01276602, + "auxiliary_loss_mlp": 0.0108252, + "balance_loss_clip": 1.07634807, + "balance_loss_mlp": 1.05002332, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.0145042338091423, + "language_loss": 0.80787379, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83146501, + "num_input_tokens_seen": 11158710, + "step": 526, + "time_per_iteration": 2.410109043121338 + }, + { + "auxiliary_loss_clip": 0.01284977, + "auxiliary_loss_mlp": 0.0106929, + "balance_loss_clip": 1.08297253, + "balance_loss_mlp": 1.03531551, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 2.2759331375199348, + "language_loss": 0.81258273, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83612537, + "num_input_tokens_seen": 11177550, + "step": 527, + "time_per_iteration": 2.433640480041504 + }, + { + "auxiliary_loss_clip": 0.01272454, + "auxiliary_loss_mlp": 0.01082145, + "balance_loss_clip": 1.07594776, + "balance_loss_mlp": 1.04933846, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.5987641435611426, + "language_loss": 0.9377448, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96129078, + "num_input_tokens_seen": 11196230, + "step": 528, + "time_per_iteration": 2.4397025108337402 + }, + { + "auxiliary_loss_clip": 0.01272511, + "auxiliary_loss_mlp": 0.01069026, + "balance_loss_clip": 1.07608998, + "balance_loss_mlp": 1.03776932, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.8928418333003785, + "language_loss": 0.83869928, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86211467, + "num_input_tokens_seen": 11214935, + "step": 529, + "time_per_iteration": 2.4642252922058105 + }, + { + "auxiliary_loss_clip": 0.01279297, + "auxiliary_loss_mlp": 0.01083145, + "balance_loss_clip": 1.08505464, + "balance_loss_mlp": 1.05192459, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 2.053292502763954, + "language_loss": 0.90368611, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92731053, + "num_input_tokens_seen": 11235310, + "step": 530, + "time_per_iteration": 2.4738636016845703 + }, + { + "auxiliary_loss_clip": 0.01272353, + "auxiliary_loss_mlp": 0.01073656, + "balance_loss_clip": 1.07331133, + "balance_loss_mlp": 1.04039705, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.2593702625933836, + "language_loss": 0.76184678, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78530687, + "num_input_tokens_seen": 11254425, + "step": 531, + "time_per_iteration": 2.4299888610839844 + }, + { + "auxiliary_loss_clip": 0.01272472, + "auxiliary_loss_mlp": 0.01062457, + "balance_loss_clip": 1.07552397, + "balance_loss_mlp": 1.03031838, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.7270732972066596, + "language_loss": 0.90576613, + "learning_rate": 3.999958705152843e-06, + "loss": 0.92911547, + "num_input_tokens_seen": 11274595, + "step": 532, + "time_per_iteration": 2.4768471717834473 + }, + { + "auxiliary_loss_clip": 0.01157538, + "auxiliary_loss_mlp": 0.01117177, + "balance_loss_clip": 1.05871749, + "balance_loss_mlp": 1.10816467, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7737839958956888, + "language_loss": 0.57963091, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60237807, + "num_input_tokens_seen": 11336705, + "step": 533, + "time_per_iteration": 3.0975115299224854 + }, + { + "auxiliary_loss_clip": 0.012718, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_clip": 1.07552671, + "balance_loss_mlp": 1.05097175, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 2.022700187638969, + "language_loss": 0.86566567, + "learning_rate": 3.999953548056907e-06, + "loss": 0.88920587, + "num_input_tokens_seen": 11356820, + "step": 534, + "time_per_iteration": 2.51423978805542 + }, + { + "auxiliary_loss_clip": 0.01272582, + "auxiliary_loss_mlp": 0.01071348, + "balance_loss_clip": 1.07795858, + "balance_loss_mlp": 1.03990066, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 3.992559375712948, + "language_loss": 0.77383405, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79727328, + "num_input_tokens_seen": 11376645, + "step": 535, + "time_per_iteration": 3.907583713531494 + }, + { + "auxiliary_loss_clip": 0.01277525, + "auxiliary_loss_mlp": 0.01082649, + "balance_loss_clip": 1.08039451, + "balance_loss_mlp": 1.05110669, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.194375549230747, + "language_loss": 0.80687088, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83047259, + "num_input_tokens_seen": 11397310, + "step": 536, + "time_per_iteration": 2.525546073913574 + }, + { + "auxiliary_loss_clip": 0.01278429, + "auxiliary_loss_mlp": 0.01085167, + "balance_loss_clip": 1.08120835, + "balance_loss_mlp": 1.05209875, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 2.0990928988204955, + "language_loss": 0.70358026, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72721624, + "num_input_tokens_seen": 11418475, + "step": 537, + "time_per_iteration": 2.5574045181274414 + }, + { + "auxiliary_loss_clip": 0.01277848, + "auxiliary_loss_mlp": 0.01092239, + "balance_loss_clip": 1.08588231, + "balance_loss_mlp": 1.06098258, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.0025468641384276, + "language_loss": 0.82651603, + "learning_rate": 3.999942323804607e-06, + "loss": 0.85021693, + "num_input_tokens_seen": 11436630, + "step": 538, + "time_per_iteration": 3.913520574569702 + }, + { + "auxiliary_loss_clip": 0.01284734, + "auxiliary_loss_mlp": 0.01085332, + "balance_loss_clip": 1.08241487, + "balance_loss_mlp": 1.05340791, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 1.7747986062954748, + "language_loss": 0.79389948, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81760013, + "num_input_tokens_seen": 11457275, + "step": 539, + "time_per_iteration": 2.5014429092407227 + }, + { + "auxiliary_loss_clip": 0.01276942, + "auxiliary_loss_mlp": 0.01079209, + "balance_loss_clip": 1.08110094, + "balance_loss_mlp": 1.04583037, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.2728468580643284, + "language_loss": 0.77607501, + "learning_rate": 3.999936256649943e-06, + "loss": 0.79963654, + "num_input_tokens_seen": 11476925, + "step": 540, + "time_per_iteration": 3.899293899536133 + }, + { + "auxiliary_loss_clip": 0.01285703, + "auxiliary_loss_mlp": 0.01081359, + "balance_loss_clip": 1.08640146, + "balance_loss_mlp": 1.05062687, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 1.8363283880760592, + "language_loss": 0.85493952, + "learning_rate": 3.999933109315878e-06, + "loss": 0.87861013, + "num_input_tokens_seen": 11496830, + "step": 541, + "time_per_iteration": 2.4757790565490723 + }, + { + "auxiliary_loss_clip": 0.01273999, + "auxiliary_loss_mlp": 0.01090835, + "balance_loss_clip": 1.08184218, + "balance_loss_mlp": 1.05812359, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.606535405086847, + "language_loss": 0.8930912, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91673952, + "num_input_tokens_seen": 11515605, + "step": 542, + "time_per_iteration": 2.45794677734375 + }, + { + "auxiliary_loss_clip": 0.01277902, + "auxiliary_loss_mlp": 0.01089588, + "balance_loss_clip": 1.08081698, + "balance_loss_mlp": 1.05764008, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.5966724481805965, + "language_loss": 0.70665574, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73033071, + "num_input_tokens_seen": 11536230, + "step": 543, + "time_per_iteration": 2.4855496883392334 + }, + { + "auxiliary_loss_clip": 0.01271775, + "auxiliary_loss_mlp": 0.01094802, + "balance_loss_clip": 1.07318473, + "balance_loss_mlp": 1.062783, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 2.9426259197939806, + "language_loss": 0.91510504, + "learning_rate": 3.999923212288192e-06, + "loss": 0.93877089, + "num_input_tokens_seen": 11554715, + "step": 544, + "time_per_iteration": 2.4508824348449707 + }, + { + "auxiliary_loss_clip": 0.01278714, + "auxiliary_loss_mlp": 0.01085057, + "balance_loss_clip": 1.08145046, + "balance_loss_mlp": 1.05554044, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 7.581931348630019, + "language_loss": 0.66306877, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68670648, + "num_input_tokens_seen": 11571370, + "step": 545, + "time_per_iteration": 3.8931336402893066 + }, + { + "auxiliary_loss_clip": 0.01273782, + "auxiliary_loss_mlp": 0.01069422, + "balance_loss_clip": 1.07677829, + "balance_loss_mlp": 1.03799808, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.2064519473313373, + "language_loss": 0.92015541, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94358742, + "num_input_tokens_seen": 11588560, + "step": 546, + "time_per_iteration": 2.4714996814727783 + }, + { + "auxiliary_loss_clip": 0.01271653, + "auxiliary_loss_mlp": 0.01074375, + "balance_loss_clip": 1.07390678, + "balance_loss_mlp": 1.04218888, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.0653259544769726, + "language_loss": 0.81844789, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84190822, + "num_input_tokens_seen": 11605685, + "step": 547, + "time_per_iteration": 2.44377064704895 + }, + { + "auxiliary_loss_clip": 0.01274061, + "auxiliary_loss_mlp": 0.01076897, + "balance_loss_clip": 1.0781163, + "balance_loss_mlp": 1.04494905, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 2.0031420567854195, + "language_loss": 0.81054044, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83405, + "num_input_tokens_seen": 11626290, + "step": 548, + "time_per_iteration": 2.4811224937438965 + }, + { + "auxiliary_loss_clip": 0.01273121, + "auxiliary_loss_mlp": 0.01080861, + "balance_loss_clip": 1.0773685, + "balance_loss_mlp": 1.04722023, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 2.520924559537777, + "language_loss": 0.67533571, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69887549, + "num_input_tokens_seen": 11643950, + "step": 549, + "time_per_iteration": 2.4228882789611816 + }, + { + "auxiliary_loss_clip": 0.01266703, + "auxiliary_loss_mlp": 0.01072514, + "balance_loss_clip": 1.07699156, + "balance_loss_mlp": 1.04066086, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 1.9706876690702926, + "language_loss": 0.86110604, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88449824, + "num_input_tokens_seen": 11662560, + "step": 550, + "time_per_iteration": 2.4409239292144775 + }, + { + "auxiliary_loss_clip": 0.01274864, + "auxiliary_loss_mlp": 0.01085131, + "balance_loss_clip": 1.08095872, + "balance_loss_mlp": 1.05330205, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 1.8356320838120885, + "language_loss": 0.81133437, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83493435, + "num_input_tokens_seen": 11682265, + "step": 551, + "time_per_iteration": 2.4629006385803223 + }, + { + "auxiliary_loss_clip": 0.01280759, + "auxiliary_loss_mlp": 0.01085231, + "balance_loss_clip": 1.08059263, + "balance_loss_mlp": 1.05254388, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.8660927595486108, + "language_loss": 0.86616862, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88982856, + "num_input_tokens_seen": 11699300, + "step": 552, + "time_per_iteration": 2.5037295818328857 + }, + { + "auxiliary_loss_clip": 0.012782, + "auxiliary_loss_mlp": 0.01077012, + "balance_loss_clip": 1.08190906, + "balance_loss_mlp": 1.04406285, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 3.328464646516859, + "language_loss": 0.9289819, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95253402, + "num_input_tokens_seen": 11716955, + "step": 553, + "time_per_iteration": 2.4460549354553223 + }, + { + "auxiliary_loss_clip": 0.01275005, + "auxiliary_loss_mlp": 0.0107345, + "balance_loss_clip": 1.07941675, + "balance_loss_mlp": 1.03983283, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 5.404319649804097, + "language_loss": 0.78582346, + "learning_rate": 3.999885292792986e-06, + "loss": 0.80930793, + "num_input_tokens_seen": 11736130, + "step": 554, + "time_per_iteration": 2.485914707183838 + }, + { + "auxiliary_loss_clip": 0.01268642, + "auxiliary_loss_mlp": 0.01080082, + "balance_loss_clip": 1.07654107, + "balance_loss_mlp": 1.04658437, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.1999115740002164, + "language_loss": 0.81961226, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84309947, + "num_input_tokens_seen": 11754425, + "step": 555, + "time_per_iteration": 2.450165271759033 + }, + { + "auxiliary_loss_clip": 0.01275076, + "auxiliary_loss_mlp": 0.01081206, + "balance_loss_clip": 1.07719874, + "balance_loss_mlp": 1.04780316, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.3569523864326687, + "language_loss": 0.88517725, + "learning_rate": 3.999876798858914e-06, + "loss": 0.9087401, + "num_input_tokens_seen": 11772845, + "step": 556, + "time_per_iteration": 2.498898506164551 + }, + { + "auxiliary_loss_clip": 0.01274542, + "auxiliary_loss_mlp": 0.01084471, + "balance_loss_clip": 1.07861447, + "balance_loss_mlp": 1.05085385, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.0575566372682434, + "language_loss": 0.83820462, + "learning_rate": 3.999872438138503e-06, + "loss": 0.86179471, + "num_input_tokens_seen": 11792850, + "step": 557, + "time_per_iteration": 2.470043897628784 + }, + { + "auxiliary_loss_clip": 0.01278325, + "auxiliary_loss_mlp": 0.01071819, + "balance_loss_clip": 1.08290339, + "balance_loss_mlp": 1.04115844, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 2.611673742667386, + "language_loss": 0.9402017, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96370304, + "num_input_tokens_seen": 11809670, + "step": 558, + "time_per_iteration": 2.43674898147583 + }, + { + "auxiliary_loss_clip": 0.01271947, + "auxiliary_loss_mlp": 0.01077501, + "balance_loss_clip": 1.07708395, + "balance_loss_mlp": 1.04510033, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.533188266389237, + "language_loss": 0.77572799, + "learning_rate": 3.99986348919176e-06, + "loss": 0.79922241, + "num_input_tokens_seen": 11829665, + "step": 559, + "time_per_iteration": 2.485779047012329 + }, + { + "auxiliary_loss_clip": 0.01272836, + "auxiliary_loss_mlp": 0.0108532, + "balance_loss_clip": 1.0773077, + "balance_loss_mlp": 1.05465913, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.483893101435379, + "language_loss": 0.87261724, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.89619887, + "num_input_tokens_seen": 11848190, + "step": 560, + "time_per_iteration": 2.4824392795562744 + }, + { + "auxiliary_loss_clip": 0.01269096, + "auxiliary_loss_mlp": 0.01074104, + "balance_loss_clip": 1.07715905, + "balance_loss_mlp": 1.04427803, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.5737790084798196, + "language_loss": 0.81367242, + "learning_rate": 3.999854236904925e-06, + "loss": 0.8371045, + "num_input_tokens_seen": 11864795, + "step": 561, + "time_per_iteration": 2.50199818611145 + }, + { + "auxiliary_loss_clip": 0.01269678, + "auxiliary_loss_mlp": 0.01074677, + "balance_loss_clip": 1.07910013, + "balance_loss_mlp": 1.04375386, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.9640193262184287, + "language_loss": 0.82199067, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84543419, + "num_input_tokens_seen": 11885275, + "step": 562, + "time_per_iteration": 2.550475835800171 + }, + { + "auxiliary_loss_clip": 0.01277068, + "auxiliary_loss_mlp": 0.01085605, + "balance_loss_clip": 1.08093762, + "balance_loss_mlp": 1.0539906, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 2.492291999931478, + "language_loss": 0.84448898, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86811572, + "num_input_tokens_seen": 11903595, + "step": 563, + "time_per_iteration": 2.4144771099090576 + }, + { + "auxiliary_loss_clip": 0.01274221, + "auxiliary_loss_mlp": 0.01088069, + "balance_loss_clip": 1.08001614, + "balance_loss_mlp": 1.05645514, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.342383407373896, + "language_loss": 0.94403398, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96765697, + "num_input_tokens_seen": 11917815, + "step": 564, + "time_per_iteration": 2.415493965148926 + }, + { + "auxiliary_loss_clip": 0.01272491, + "auxiliary_loss_mlp": 0.01075914, + "balance_loss_clip": 1.0754813, + "balance_loss_mlp": 1.04270244, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 3.9964655413036128, + "language_loss": 0.94136989, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96485394, + "num_input_tokens_seen": 11936305, + "step": 565, + "time_per_iteration": 2.450113534927368 + }, + { + "auxiliary_loss_clip": 0.01164466, + "auxiliary_loss_mlp": 0.01092026, + "balance_loss_clip": 1.0712955, + "balance_loss_mlp": 1.08453989, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.1431247523498664, + "language_loss": 0.54900742, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57157236, + "num_input_tokens_seen": 11998940, + "step": 566, + "time_per_iteration": 3.100250005722046 + }, + { + "auxiliary_loss_clip": 0.01275631, + "auxiliary_loss_mlp": 0.01076285, + "balance_loss_clip": 1.0794642, + "balance_loss_mlp": 1.04207134, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 2.2215857914768993, + "language_loss": 0.76678705, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79030615, + "num_input_tokens_seen": 12018860, + "step": 567, + "time_per_iteration": 2.477834701538086 + }, + { + "auxiliary_loss_clip": 0.01267885, + "auxiliary_loss_mlp": 0.01083958, + "balance_loss_clip": 1.07814646, + "balance_loss_mlp": 1.05336881, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 1.9001920014564326, + "language_loss": 0.80707216, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83059061, + "num_input_tokens_seen": 12039675, + "step": 568, + "time_per_iteration": 2.5157694816589355 + }, + { + "auxiliary_loss_clip": 0.01271553, + "auxiliary_loss_mlp": 0.01088101, + "balance_loss_clip": 1.08156502, + "balance_loss_mlp": 1.05655873, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.7976096499844985, + "language_loss": 0.86672163, + "learning_rate": 3.999814194385413e-06, + "loss": 0.89031821, + "num_input_tokens_seen": 12057680, + "step": 569, + "time_per_iteration": 2.45261549949646 + }, + { + "auxiliary_loss_clip": 0.01270213, + "auxiliary_loss_mlp": 0.01090005, + "balance_loss_clip": 1.07770634, + "balance_loss_mlp": 1.05793762, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.7880907682647778, + "language_loss": 0.95655596, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98015809, + "num_input_tokens_seen": 12076135, + "step": 570, + "time_per_iteration": 2.449108600616455 + }, + { + "auxiliary_loss_clip": 0.01270412, + "auxiliary_loss_mlp": 0.01081662, + "balance_loss_clip": 1.0736866, + "balance_loss_mlp": 1.0473063, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.1105245187621184, + "language_loss": 0.79912448, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.82264519, + "num_input_tokens_seen": 12094785, + "step": 571, + "time_per_iteration": 2.449002504348755 + }, + { + "auxiliary_loss_clip": 0.0127205, + "auxiliary_loss_mlp": 0.01084019, + "balance_loss_clip": 1.07921576, + "balance_loss_mlp": 1.05121243, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.1409596116480585, + "language_loss": 0.80686533, + "learning_rate": 3.999797927188199e-06, + "loss": 0.83042604, + "num_input_tokens_seen": 12114590, + "step": 572, + "time_per_iteration": 2.495663642883301 + }, + { + "auxiliary_loss_clip": 0.01278048, + "auxiliary_loss_mlp": 0.01079111, + "balance_loss_clip": 1.08151746, + "balance_loss_mlp": 1.04659033, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 2.2522948812067254, + "language_loss": 0.84344667, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86701822, + "num_input_tokens_seen": 12132390, + "step": 573, + "time_per_iteration": 2.5088279247283936 + }, + { + "auxiliary_loss_clip": 0.01270011, + "auxiliary_loss_mlp": 0.01068957, + "balance_loss_clip": 1.07469845, + "balance_loss_mlp": 1.03827226, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 3.8859481844639805, + "language_loss": 0.76421344, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78760314, + "num_input_tokens_seen": 12149035, + "step": 574, + "time_per_iteration": 2.5370726585388184 + }, + { + "auxiliary_loss_clip": 0.01270816, + "auxiliary_loss_mlp": 0.01069064, + "balance_loss_clip": 1.07699502, + "balance_loss_mlp": 1.03876066, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 2.826893768930259, + "language_loss": 0.83855736, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86195624, + "num_input_tokens_seen": 12167530, + "step": 575, + "time_per_iteration": 2.483168125152588 + }, + { + "auxiliary_loss_clip": 0.01265933, + "auxiliary_loss_mlp": 0.01076662, + "balance_loss_clip": 1.07870448, + "balance_loss_mlp": 1.04614449, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.435918267065688, + "language_loss": 0.83971137, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86313736, + "num_input_tokens_seen": 12186340, + "step": 576, + "time_per_iteration": 2.5043907165527344 + }, + { + "auxiliary_loss_clip": 0.01274154, + "auxiliary_loss_mlp": 0.01081244, + "balance_loss_clip": 1.08628273, + "balance_loss_mlp": 1.05153704, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.583231988336432, + "language_loss": 0.86323655, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88679051, + "num_input_tokens_seen": 12204090, + "step": 577, + "time_per_iteration": 2.4717066287994385 + }, + { + "auxiliary_loss_clip": 0.01269395, + "auxiliary_loss_mlp": 0.01076791, + "balance_loss_clip": 1.08136785, + "balance_loss_mlp": 1.04543924, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 2.7321104176239537, + "language_loss": 0.72365057, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74711239, + "num_input_tokens_seen": 12224850, + "step": 578, + "time_per_iteration": 3.9375436305999756 + }, + { + "auxiliary_loss_clip": 0.01270131, + "auxiliary_loss_mlp": 0.01078033, + "balance_loss_clip": 1.07833707, + "balance_loss_mlp": 1.04558372, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 2.0042742968497445, + "language_loss": 0.77460039, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79808199, + "num_input_tokens_seen": 12244935, + "step": 579, + "time_per_iteration": 2.4704537391662598 + }, + { + "auxiliary_loss_clip": 0.01264459, + "auxiliary_loss_mlp": 0.01084116, + "balance_loss_clip": 1.07630658, + "balance_loss_mlp": 1.05159545, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 1.9622530463063341, + "language_loss": 0.86709613, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89058185, + "num_input_tokens_seen": 12262140, + "step": 580, + "time_per_iteration": 3.8407881259918213 + }, + { + "auxiliary_loss_clip": 0.01271842, + "auxiliary_loss_mlp": 0.01069022, + "balance_loss_clip": 1.07767844, + "balance_loss_mlp": 1.04012537, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 2.490677887231509, + "language_loss": 0.81863439, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84204298, + "num_input_tokens_seen": 12280930, + "step": 581, + "time_per_iteration": 2.4333243370056152 + }, + { + "auxiliary_loss_clip": 0.01267923, + "auxiliary_loss_mlp": 0.01073569, + "balance_loss_clip": 1.07929897, + "balance_loss_mlp": 1.04424334, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.1576098969437485, + "language_loss": 0.77036393, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79377878, + "num_input_tokens_seen": 12299125, + "step": 582, + "time_per_iteration": 2.4525742530822754 + }, + { + "auxiliary_loss_clip": 0.01266575, + "auxiliary_loss_mlp": 0.01077345, + "balance_loss_clip": 1.07993782, + "balance_loss_mlp": 1.04546881, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.1946423784642706, + "language_loss": 0.87742811, + "learning_rate": 3.999732441737877e-06, + "loss": 0.90086734, + "num_input_tokens_seen": 12316905, + "step": 583, + "time_per_iteration": 2.438532829284668 + }, + { + "auxiliary_loss_clip": 0.01272013, + "auxiliary_loss_mlp": 0.01085042, + "balance_loss_clip": 1.07850873, + "balance_loss_mlp": 1.05409551, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 3.988945466609614, + "language_loss": 0.80981457, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83338517, + "num_input_tokens_seen": 12335070, + "step": 584, + "time_per_iteration": 2.472381830215454 + }, + { + "auxiliary_loss_clip": 0.01265697, + "auxiliary_loss_mlp": 0.01068583, + "balance_loss_clip": 1.07407594, + "balance_loss_mlp": 1.03811288, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 4.1822715781443245, + "language_loss": 0.92764026, + "learning_rate": 3.999719549492551e-06, + "loss": 0.95098305, + "num_input_tokens_seen": 12350315, + "step": 585, + "time_per_iteration": 2.4075777530670166 + }, + { + "auxiliary_loss_clip": 0.01267164, + "auxiliary_loss_mlp": 0.01072806, + "balance_loss_clip": 1.07620871, + "balance_loss_mlp": 1.04247952, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 3.2098136816151692, + "language_loss": 0.87391543, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.89731514, + "num_input_tokens_seen": 12366030, + "step": 586, + "time_per_iteration": 2.4479141235351562 + }, + { + "auxiliary_loss_clip": 0.01272024, + "auxiliary_loss_mlp": 0.01075271, + "balance_loss_clip": 1.07923806, + "balance_loss_mlp": 1.04527831, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 2.397103519853014, + "language_loss": 0.76539087, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78886384, + "num_input_tokens_seen": 12384895, + "step": 587, + "time_per_iteration": 3.9251513481140137 + }, + { + "auxiliary_loss_clip": 0.01272272, + "auxiliary_loss_mlp": 0.01063453, + "balance_loss_clip": 1.07778823, + "balance_loss_mlp": 1.0319103, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 4.739745646907559, + "language_loss": 0.7896179, + "learning_rate": 3.999699642403449e-06, + "loss": 0.81297517, + "num_input_tokens_seen": 12404980, + "step": 588, + "time_per_iteration": 2.4701409339904785 + }, + { + "auxiliary_loss_clip": 0.0126924, + "auxiliary_loss_mlp": 0.01074763, + "balance_loss_clip": 1.07699096, + "balance_loss_mlp": 1.04202855, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.091389782237175, + "language_loss": 0.94178873, + "learning_rate": 3.99969285504912e-06, + "loss": 0.9652288, + "num_input_tokens_seen": 12423835, + "step": 589, + "time_per_iteration": 2.444581985473633 + }, + { + "auxiliary_loss_clip": 0.01271811, + "auxiliary_loss_mlp": 0.01070098, + "balance_loss_clip": 1.07818723, + "balance_loss_mlp": 1.04098725, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.5340739181639913, + "language_loss": 0.83994007, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86335921, + "num_input_tokens_seen": 12443135, + "step": 590, + "time_per_iteration": 2.5634195804595947 + }, + { + "auxiliary_loss_clip": 0.01262145, + "auxiliary_loss_mlp": 0.01061879, + "balance_loss_clip": 1.07631254, + "balance_loss_mlp": 1.03329253, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.2351517318536325, + "language_loss": 0.86982334, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89306355, + "num_input_tokens_seen": 12462895, + "step": 591, + "time_per_iteration": 2.4361307621002197 + }, + { + "auxiliary_loss_clip": 0.01265415, + "auxiliary_loss_mlp": 0.01072462, + "balance_loss_clip": 1.0736444, + "balance_loss_mlp": 1.04199231, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 2.577496209032043, + "language_loss": 0.82591313, + "learning_rate": 3.999672038015861e-06, + "loss": 0.84929192, + "num_input_tokens_seen": 12481515, + "step": 592, + "time_per_iteration": 2.46256422996521 + }, + { + "auxiliary_loss_clip": 0.01141412, + "auxiliary_loss_mlp": 0.01058055, + "balance_loss_clip": 1.04968739, + "balance_loss_mlp": 1.05090201, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8852114288171674, + "language_loss": 0.59795022, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61994487, + "num_input_tokens_seen": 12548220, + "step": 593, + "time_per_iteration": 3.0864200592041016 + }, + { + "auxiliary_loss_clip": 0.01265732, + "auxiliary_loss_mlp": 0.0107012, + "balance_loss_clip": 1.07959819, + "balance_loss_mlp": 1.03955483, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 2.5315380478046543, + "language_loss": 0.86869836, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89205688, + "num_input_tokens_seen": 12566105, + "step": 594, + "time_per_iteration": 2.4606361389160156 + }, + { + "auxiliary_loss_clip": 0.01265785, + "auxiliary_loss_mlp": 0.01068676, + "balance_loss_clip": 1.07530296, + "balance_loss_mlp": 1.03932643, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.0895984720341283, + "language_loss": 0.83652461, + "learning_rate": 3.999650538532742e-06, + "loss": 0.85986924, + "num_input_tokens_seen": 12586680, + "step": 595, + "time_per_iteration": 2.4821298122406006 + }, + { + "auxiliary_loss_clip": 0.01264605, + "auxiliary_loss_mlp": 0.01078355, + "balance_loss_clip": 1.07744551, + "balance_loss_mlp": 1.0486486, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 2.6100916099635745, + "language_loss": 0.96244597, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98587549, + "num_input_tokens_seen": 12601605, + "step": 596, + "time_per_iteration": 2.448692560195923 + }, + { + "auxiliary_loss_clip": 0.01267195, + "auxiliary_loss_mlp": 0.01073464, + "balance_loss_clip": 1.07907498, + "balance_loss_mlp": 1.04546213, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 3.6806029806542235, + "language_loss": 0.82718378, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85059035, + "num_input_tokens_seen": 12620365, + "step": 597, + "time_per_iteration": 2.4463391304016113 + }, + { + "auxiliary_loss_clip": 0.01262084, + "auxiliary_loss_mlp": 0.01067227, + "balance_loss_clip": 1.07863009, + "balance_loss_mlp": 1.03848612, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 1.7697839854777417, + "language_loss": 0.8136909, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83698404, + "num_input_tokens_seen": 12641140, + "step": 598, + "time_per_iteration": 2.496406078338623 + }, + { + "auxiliary_loss_clip": 0.01258321, + "auxiliary_loss_mlp": 0.01070517, + "balance_loss_clip": 1.07833827, + "balance_loss_mlp": 1.04026175, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 2.349164937057976, + "language_loss": 0.81320536, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83649373, + "num_input_tokens_seen": 12661080, + "step": 599, + "time_per_iteration": 2.468496799468994 + }, + { + "auxiliary_loss_clip": 0.01267648, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_clip": 1.0763998, + "balance_loss_mlp": 1.05132949, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.232623411035475, + "language_loss": 0.85824347, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88171428, + "num_input_tokens_seen": 12678270, + "step": 600, + "time_per_iteration": 2.525188446044922 + }, + { + "auxiliary_loss_clip": 0.01261685, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_clip": 1.07403898, + "balance_loss_mlp": 1.05556011, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 2.341832453748546, + "language_loss": 0.82217181, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84564078, + "num_input_tokens_seen": 12697295, + "step": 601, + "time_per_iteration": 2.4552252292633057 + }, + { + "auxiliary_loss_clip": 0.01257293, + "auxiliary_loss_mlp": 0.01060764, + "balance_loss_clip": 1.07397902, + "balance_loss_mlp": 1.03122389, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.7970250779707366, + "language_loss": 0.75068569, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77386624, + "num_input_tokens_seen": 12716165, + "step": 602, + "time_per_iteration": 2.4838531017303467 + }, + { + "auxiliary_loss_clip": 0.01255266, + "auxiliary_loss_mlp": 0.01061772, + "balance_loss_clip": 1.07240307, + "balance_loss_mlp": 1.03263783, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 2.3699184693760134, + "language_loss": 0.7994535, + "learning_rate": 3.999589870212761e-06, + "loss": 0.82262391, + "num_input_tokens_seen": 12735475, + "step": 603, + "time_per_iteration": 2.4999868869781494 + }, + { + "auxiliary_loss_clip": 0.01261365, + "auxiliary_loss_mlp": 0.01064961, + "balance_loss_clip": 1.07849145, + "balance_loss_mlp": 1.03683972, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 2.002385899590124, + "language_loss": 0.87044954, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89371276, + "num_input_tokens_seen": 12754540, + "step": 604, + "time_per_iteration": 2.505256175994873 + }, + { + "auxiliary_loss_clip": 0.01262205, + "auxiliary_loss_mlp": 0.01065735, + "balance_loss_clip": 1.0783118, + "balance_loss_mlp": 1.03490782, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 2.5136929896558713, + "language_loss": 0.80286336, + "learning_rate": 3.999573944880424e-06, + "loss": 0.82614267, + "num_input_tokens_seen": 12773050, + "step": 605, + "time_per_iteration": 2.416313886642456 + }, + { + "auxiliary_loss_clip": 0.01259246, + "auxiliary_loss_mlp": 0.01070581, + "balance_loss_clip": 1.07542467, + "balance_loss_mlp": 1.04255486, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.518360864022548, + "language_loss": 0.85489815, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.87819636, + "num_input_tokens_seen": 12791240, + "step": 606, + "time_per_iteration": 2.4345858097076416 + }, + { + "auxiliary_loss_clip": 0.0126465, + "auxiliary_loss_mlp": 0.01076235, + "balance_loss_clip": 1.07782495, + "balance_loss_mlp": 1.04683805, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.768129313210865, + "language_loss": 0.82233775, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84574664, + "num_input_tokens_seen": 12812245, + "step": 607, + "time_per_iteration": 2.469715118408203 + }, + { + "auxiliary_loss_clip": 0.01259896, + "auxiliary_loss_mlp": 0.0107039, + "balance_loss_clip": 1.07737589, + "balance_loss_mlp": 1.04189885, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 2.263769642340842, + "language_loss": 0.83022541, + "learning_rate": 3.999549488202358e-06, + "loss": 0.85352826, + "num_input_tokens_seen": 12831085, + "step": 608, + "time_per_iteration": 2.4758353233337402 + }, + { + "auxiliary_loss_clip": 0.01264486, + "auxiliary_loss_mlp": 0.01063825, + "balance_loss_clip": 1.07917058, + "balance_loss_mlp": 1.03275943, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.3590629330848416, + "language_loss": 0.8216843, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84496742, + "num_input_tokens_seen": 12849115, + "step": 609, + "time_per_iteration": 2.4323623180389404 + }, + { + "auxiliary_loss_clip": 0.01271579, + "auxiliary_loss_mlp": 0.01087515, + "balance_loss_clip": 1.08529425, + "balance_loss_mlp": 1.05995405, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.046599289551559, + "language_loss": 0.79445648, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81804746, + "num_input_tokens_seen": 12868005, + "step": 610, + "time_per_iteration": 2.505591630935669 + }, + { + "auxiliary_loss_clip": 0.01270819, + "auxiliary_loss_mlp": 0.01083556, + "balance_loss_clip": 1.08154202, + "balance_loss_mlp": 1.05415893, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 1.976136204127922, + "language_loss": 0.87160063, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89514434, + "num_input_tokens_seen": 12886890, + "step": 611, + "time_per_iteration": 2.4598593711853027 + }, + { + "auxiliary_loss_clip": 0.01262824, + "auxiliary_loss_mlp": 0.01089164, + "balance_loss_clip": 1.07985544, + "balance_loss_mlp": 1.06007695, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 5.8153075729327455, + "language_loss": 0.72870517, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75222504, + "num_input_tokens_seen": 12906130, + "step": 612, + "time_per_iteration": 2.502063751220703 + }, + { + "auxiliary_loss_clip": 0.01264172, + "auxiliary_loss_mlp": 0.01073772, + "balance_loss_clip": 1.076249, + "balance_loss_mlp": 1.04402888, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 3.106293166270406, + "language_loss": 0.79378164, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81716108, + "num_input_tokens_seen": 12925260, + "step": 613, + "time_per_iteration": 2.460052728652954 + }, + { + "auxiliary_loss_clip": 0.01259623, + "auxiliary_loss_mlp": 0.01078398, + "balance_loss_clip": 1.07598281, + "balance_loss_mlp": 1.04919183, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.09559896555292, + "language_loss": 0.93633223, + "learning_rate": 3.9994985276307e-06, + "loss": 0.95971245, + "num_input_tokens_seen": 12944590, + "step": 614, + "time_per_iteration": 2.4726614952087402 + }, + { + "auxiliary_loss_clip": 0.01269835, + "auxiliary_loss_mlp": 0.01076394, + "balance_loss_clip": 1.08248258, + "balance_loss_mlp": 1.0442555, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 3.3062313399867946, + "language_loss": 0.73153806, + "learning_rate": 3.999489768826041e-06, + "loss": 0.75500035, + "num_input_tokens_seen": 12964785, + "step": 615, + "time_per_iteration": 2.5492353439331055 + }, + { + "auxiliary_loss_clip": 0.01264083, + "auxiliary_loss_mlp": 0.01072759, + "balance_loss_clip": 1.07623959, + "balance_loss_mlp": 1.04295671, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 1.9062097371707072, + "language_loss": 0.81668317, + "learning_rate": 3.999480934200528e-06, + "loss": 0.84005165, + "num_input_tokens_seen": 12986705, + "step": 616, + "time_per_iteration": 2.555856227874756 + }, + { + "auxiliary_loss_clip": 0.0126453, + "auxiliary_loss_mlp": 0.01071542, + "balance_loss_clip": 1.07872856, + "balance_loss_mlp": 1.0432179, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 2.822834291704613, + "language_loss": 0.68268991, + "learning_rate": 3.999472023754499e-06, + "loss": 0.70605063, + "num_input_tokens_seen": 13010560, + "step": 617, + "time_per_iteration": 4.027007579803467 + }, + { + "auxiliary_loss_clip": 0.01268653, + "auxiliary_loss_mlp": 0.01067116, + "balance_loss_clip": 1.08140206, + "balance_loss_mlp": 1.03578734, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 2.3145248134405683, + "language_loss": 0.80251431, + "learning_rate": 3.99946303748829e-06, + "loss": 0.82587194, + "num_input_tokens_seen": 13028935, + "step": 618, + "time_per_iteration": 3.883991003036499 + }, + { + "auxiliary_loss_clip": 0.0127141, + "auxiliary_loss_mlp": 0.01077706, + "balance_loss_clip": 1.07831299, + "balance_loss_mlp": 1.04549599, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.864721418289537, + "language_loss": 0.9150157, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93850684, + "num_input_tokens_seen": 13046000, + "step": 619, + "time_per_iteration": 2.4264655113220215 + }, + { + "auxiliary_loss_clip": 0.01265954, + "auxiliary_loss_mlp": 0.01084779, + "balance_loss_clip": 1.08125019, + "balance_loss_mlp": 1.05452323, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 8.568915600255718, + "language_loss": 0.94295073, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96645802, + "num_input_tokens_seen": 13062995, + "step": 620, + "time_per_iteration": 2.464831829071045 + }, + { + "auxiliary_loss_clip": 0.01263081, + "auxiliary_loss_mlp": 0.01080006, + "balance_loss_clip": 1.07739973, + "balance_loss_mlp": 1.04858291, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.7012688981444795, + "language_loss": 0.77273905, + "learning_rate": 3.999435623772008e-06, + "loss": 0.79616994, + "num_input_tokens_seen": 13084120, + "step": 621, + "time_per_iteration": 3.912515163421631 + }, + { + "auxiliary_loss_clip": 0.01260313, + "auxiliary_loss_mlp": 0.01065115, + "balance_loss_clip": 1.07862306, + "balance_loss_mlp": 1.03421617, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 2.7542784478081996, + "language_loss": 0.86684191, + "learning_rate": 3.999426334228518e-06, + "loss": 0.89009619, + "num_input_tokens_seen": 13100035, + "step": 622, + "time_per_iteration": 2.4365720748901367 + }, + { + "auxiliary_loss_clip": 0.01262444, + "auxiliary_loss_mlp": 0.01072437, + "balance_loss_clip": 1.07722306, + "balance_loss_mlp": 1.04295695, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.546702535573485, + "language_loss": 0.8995741, + "learning_rate": 3.999416968866581e-06, + "loss": 0.92292285, + "num_input_tokens_seen": 13118070, + "step": 623, + "time_per_iteration": 2.4824483394622803 + }, + { + "auxiliary_loss_clip": 0.01266726, + "auxiliary_loss_mlp": 0.0108606, + "balance_loss_clip": 1.08119535, + "balance_loss_mlp": 1.05547094, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.936198522054453, + "language_loss": 0.84139514, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86492294, + "num_input_tokens_seen": 13136355, + "step": 624, + "time_per_iteration": 2.4359848499298096 + }, + { + "auxiliary_loss_clip": 0.01266063, + "auxiliary_loss_mlp": 0.01072088, + "balance_loss_clip": 1.07721949, + "balance_loss_mlp": 1.04140353, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 5.219537613287356, + "language_loss": 0.66936731, + "learning_rate": 3.999398010688788e-06, + "loss": 0.69274879, + "num_input_tokens_seen": 13155435, + "step": 625, + "time_per_iteration": 2.543416738510132 + }, + { + "auxiliary_loss_clip": 0.01258637, + "auxiliary_loss_mlp": 0.010707, + "balance_loss_clip": 1.07651794, + "balance_loss_mlp": 1.03920448, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 2.247868113567358, + "language_loss": 0.77037531, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79366869, + "num_input_tokens_seen": 13174295, + "step": 626, + "time_per_iteration": 2.480976104736328 + }, + { + "auxiliary_loss_clip": 0.01261805, + "auxiliary_loss_mlp": 0.01073094, + "balance_loss_clip": 1.07801437, + "balance_loss_mlp": 1.04291058, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 4.121193813102297, + "language_loss": 0.81339717, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83674616, + "num_input_tokens_seen": 13192500, + "step": 627, + "time_per_iteration": 2.421220064163208 + }, + { + "auxiliary_loss_clip": 0.01266975, + "auxiliary_loss_mlp": 0.01083282, + "balance_loss_clip": 1.08011019, + "balance_loss_mlp": 1.05269301, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 1.7601007659337293, + "language_loss": 0.88900304, + "learning_rate": 3.999369004792719e-06, + "loss": 0.91250563, + "num_input_tokens_seen": 13213470, + "step": 628, + "time_per_iteration": 2.552432060241699 + }, + { + "auxiliary_loss_clip": 0.01260683, + "auxiliary_loss_mlp": 0.01068791, + "balance_loss_clip": 1.07397628, + "balance_loss_mlp": 1.03858352, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.537186875183631, + "language_loss": 0.80136067, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82465541, + "num_input_tokens_seen": 13232365, + "step": 629, + "time_per_iteration": 3.8882405757904053 + }, + { + "auxiliary_loss_clip": 0.01260661, + "auxiliary_loss_mlp": 0.01065117, + "balance_loss_clip": 1.0758512, + "balance_loss_mlp": 1.03701961, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 2.204272686387614, + "language_loss": 0.76897281, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79223061, + "num_input_tokens_seen": 13251920, + "step": 630, + "time_per_iteration": 2.5056071281433105 + }, + { + "auxiliary_loss_clip": 0.01267243, + "auxiliary_loss_mlp": 0.01072479, + "balance_loss_clip": 1.07765448, + "balance_loss_mlp": 1.04322505, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 2.9270989700722287, + "language_loss": 0.91422337, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93762058, + "num_input_tokens_seen": 13267440, + "step": 631, + "time_per_iteration": 2.472132921218872 + }, + { + "auxiliary_loss_clip": 0.01257158, + "auxiliary_loss_mlp": 0.01084108, + "balance_loss_clip": 1.07479417, + "balance_loss_mlp": 1.05137289, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.5804960485578536, + "language_loss": 0.92244303, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94585568, + "num_input_tokens_seen": 13287850, + "step": 632, + "time_per_iteration": 2.4485199451446533 + }, + { + "auxiliary_loss_clip": 0.01259846, + "auxiliary_loss_mlp": 0.01060899, + "balance_loss_clip": 1.07679248, + "balance_loss_mlp": 1.03143024, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 1.9805665928769092, + "language_loss": 0.83465672, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85786414, + "num_input_tokens_seen": 13307760, + "step": 633, + "time_per_iteration": 2.466552734375 + }, + { + "auxiliary_loss_clip": 0.01258808, + "auxiliary_loss_mlp": 0.01067657, + "balance_loss_clip": 1.07370448, + "balance_loss_mlp": 1.03787875, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.5150595097192527, + "language_loss": 0.69698572, + "learning_rate": 3.999308945971392e-06, + "loss": 0.72025037, + "num_input_tokens_seen": 13331230, + "step": 634, + "time_per_iteration": 2.514230251312256 + }, + { + "auxiliary_loss_clip": 0.01135425, + "auxiliary_loss_mlp": 0.01030759, + "balance_loss_clip": 1.05036831, + "balance_loss_mlp": 1.02446449, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8877660343974255, + "language_loss": 0.61627066, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63793254, + "num_input_tokens_seen": 13394760, + "step": 635, + "time_per_iteration": 3.073153495788574 + }, + { + "auxiliary_loss_clip": 0.01256249, + "auxiliary_loss_mlp": 0.01069508, + "balance_loss_clip": 1.07511365, + "balance_loss_mlp": 1.0398488, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.0984765841741964, + "language_loss": 0.83610684, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.85936445, + "num_input_tokens_seen": 13412775, + "step": 636, + "time_per_iteration": 2.434026002883911 + }, + { + "auxiliary_loss_clip": 0.01260591, + "auxiliary_loss_mlp": 0.01085734, + "balance_loss_clip": 1.07622647, + "balance_loss_mlp": 1.05609834, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 4.051571397768213, + "language_loss": 0.79518807, + "learning_rate": 3.999277893066632e-06, + "loss": 0.81865132, + "num_input_tokens_seen": 13427835, + "step": 637, + "time_per_iteration": 2.4016730785369873 + }, + { + "auxiliary_loss_clip": 0.01261088, + "auxiliary_loss_mlp": 0.01083561, + "balance_loss_clip": 1.07386768, + "balance_loss_mlp": 1.05213702, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 2.037983417461886, + "language_loss": 0.84279132, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86623782, + "num_input_tokens_seen": 13447295, + "step": 638, + "time_per_iteration": 2.4417881965637207 + }, + { + "auxiliary_loss_clip": 0.01266254, + "auxiliary_loss_mlp": 0.01070975, + "balance_loss_clip": 1.07597232, + "balance_loss_mlp": 1.04014754, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.987038793932823, + "language_loss": 0.70083934, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72421157, + "num_input_tokens_seen": 13468455, + "step": 639, + "time_per_iteration": 2.482313394546509 + }, + { + "auxiliary_loss_clip": 0.01261843, + "auxiliary_loss_mlp": 0.01080159, + "balance_loss_clip": 1.0761714, + "balance_loss_mlp": 1.04885411, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 4.966707070815759, + "language_loss": 0.85352004, + "learning_rate": 3.999246157846526e-06, + "loss": 0.87694007, + "num_input_tokens_seen": 13489085, + "step": 640, + "time_per_iteration": 2.466991662979126 + }, + { + "auxiliary_loss_clip": 0.01264265, + "auxiliary_loss_mlp": 0.01079416, + "balance_loss_clip": 1.07724833, + "balance_loss_mlp": 1.04721808, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.531304311129772, + "language_loss": 0.81679559, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84023237, + "num_input_tokens_seen": 13509120, + "step": 641, + "time_per_iteration": 2.465254306793213 + }, + { + "auxiliary_loss_clip": 0.01129297, + "auxiliary_loss_mlp": 0.0104019, + "balance_loss_clip": 1.04514909, + "balance_loss_mlp": 1.03394306, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.905696320499197, + "language_loss": 0.65436268, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67605758, + "num_input_tokens_seen": 13562005, + "step": 642, + "time_per_iteration": 3.0387585163116455 + }, + { + "auxiliary_loss_clip": 0.01256157, + "auxiliary_loss_mlp": 0.0106528, + "balance_loss_clip": 1.07324219, + "balance_loss_mlp": 1.03621721, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 2.016604674419226, + "language_loss": 0.79470116, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81791556, + "num_input_tokens_seen": 13582185, + "step": 643, + "time_per_iteration": 2.5422375202178955 + }, + { + "auxiliary_loss_clip": 0.0125573, + "auxiliary_loss_mlp": 0.01079586, + "balance_loss_clip": 1.07254612, + "balance_loss_mlp": 1.05010533, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 2.0843812112786866, + "language_loss": 0.82801706, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85137022, + "num_input_tokens_seen": 13599555, + "step": 644, + "time_per_iteration": 2.4398646354675293 + }, + { + "auxiliary_loss_clip": 0.01257496, + "auxiliary_loss_mlp": 0.01071312, + "balance_loss_clip": 1.07318318, + "balance_loss_mlp": 1.0403657, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.10051597095615, + "language_loss": 0.82543528, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84872341, + "num_input_tokens_seen": 13621160, + "step": 645, + "time_per_iteration": 2.5400338172912598 + }, + { + "auxiliary_loss_clip": 0.01261446, + "auxiliary_loss_mlp": 0.010726, + "balance_loss_clip": 1.07568455, + "balance_loss_mlp": 1.042714, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.2832252696520436, + "language_loss": 0.81611836, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.83945882, + "num_input_tokens_seen": 13641915, + "step": 646, + "time_per_iteration": 2.449458599090576 + }, + { + "auxiliary_loss_clip": 0.01260638, + "auxiliary_loss_mlp": 0.01077743, + "balance_loss_clip": 1.07934451, + "balance_loss_mlp": 1.04772615, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.9365363216708278, + "language_loss": 0.82047635, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84386009, + "num_input_tokens_seen": 13661410, + "step": 647, + "time_per_iteration": 2.4424870014190674 + }, + { + "auxiliary_loss_clip": 0.01258995, + "auxiliary_loss_mlp": 0.01067193, + "balance_loss_clip": 1.07617271, + "balance_loss_mlp": 1.03839183, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 2.0187099789516636, + "language_loss": 0.84484053, + "learning_rate": 3.999158194912106e-06, + "loss": 0.86810237, + "num_input_tokens_seen": 13681705, + "step": 648, + "time_per_iteration": 2.5082828998565674 + }, + { + "auxiliary_loss_clip": 0.01257026, + "auxiliary_loss_mlp": 0.01068855, + "balance_loss_clip": 1.07544446, + "balance_loss_mlp": 1.03983927, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 2.1777793180753995, + "language_loss": 0.84592193, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.8691808, + "num_input_tokens_seen": 13700400, + "step": 649, + "time_per_iteration": 2.431652069091797 + }, + { + "auxiliary_loss_clip": 0.01257253, + "auxiliary_loss_mlp": 0.01068883, + "balance_loss_clip": 1.07459962, + "balance_loss_mlp": 1.03857994, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 2.000114919570514, + "language_loss": 0.79797256, + "learning_rate": 3.999135446087263e-06, + "loss": 0.82123393, + "num_input_tokens_seen": 13720145, + "step": 650, + "time_per_iteration": 2.43290376663208 + }, + { + "auxiliary_loss_clip": 0.0125295, + "auxiliary_loss_mlp": 0.01072096, + "balance_loss_clip": 1.07065392, + "balance_loss_mlp": 1.04267526, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.129028658944843, + "language_loss": 0.78410476, + "learning_rate": 3.9991239579635e-06, + "loss": 0.80735523, + "num_input_tokens_seen": 13737500, + "step": 651, + "time_per_iteration": 2.451014995574951 + }, + { + "auxiliary_loss_clip": 0.01256079, + "auxiliary_loss_mlp": 0.0108135, + "balance_loss_clip": 1.0723772, + "balance_loss_mlp": 1.04909205, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.5917430053325248, + "language_loss": 0.87535274, + "learning_rate": 3.999112394032757e-06, + "loss": 0.89872706, + "num_input_tokens_seen": 13754750, + "step": 652, + "time_per_iteration": 2.408135414123535 + }, + { + "auxiliary_loss_clip": 0.01247954, + "auxiliary_loss_mlp": 0.01073135, + "balance_loss_clip": 1.07130647, + "balance_loss_mlp": 1.04481101, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 2.5613227399826575, + "language_loss": 0.79121542, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81442636, + "num_input_tokens_seen": 13771990, + "step": 653, + "time_per_iteration": 2.4804530143737793 + }, + { + "auxiliary_loss_clip": 0.01262485, + "auxiliary_loss_mlp": 0.01067164, + "balance_loss_clip": 1.07503653, + "balance_loss_mlp": 1.03757656, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.468649153254435, + "language_loss": 0.85788345, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88117999, + "num_input_tokens_seen": 13792750, + "step": 654, + "time_per_iteration": 2.486361265182495 + }, + { + "auxiliary_loss_clip": 0.01123325, + "auxiliary_loss_mlp": 0.01004716, + "balance_loss_clip": 1.04181552, + "balance_loss_mlp": 0.9991371, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7454570923921565, + "language_loss": 0.49952215, + "learning_rate": 3.999077247403041e-06, + "loss": 0.52080256, + "num_input_tokens_seen": 13858570, + "step": 655, + "time_per_iteration": 3.0884971618652344 + }, + { + "auxiliary_loss_clip": 0.01252298, + "auxiliary_loss_mlp": 0.01066418, + "balance_loss_clip": 1.07519937, + "balance_loss_mlp": 1.03823686, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.121351647785065, + "language_loss": 0.80925667, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83244383, + "num_input_tokens_seen": 13876335, + "step": 656, + "time_per_iteration": 2.445699453353882 + }, + { + "auxiliary_loss_clip": 0.01264017, + "auxiliary_loss_mlp": 0.01088852, + "balance_loss_clip": 1.0771935, + "balance_loss_mlp": 1.05513978, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.7648607333750785, + "language_loss": 0.76497459, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78850329, + "num_input_tokens_seen": 13892640, + "step": 657, + "time_per_iteration": 3.839083433151245 + }, + { + "auxiliary_loss_clip": 0.01258976, + "auxiliary_loss_mlp": 0.01069843, + "balance_loss_clip": 1.07516861, + "balance_loss_mlp": 1.03956354, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 1.9808577682281212, + "language_loss": 0.81637979, + "learning_rate": 3.999041418526457e-06, + "loss": 0.83966792, + "num_input_tokens_seen": 13910085, + "step": 658, + "time_per_iteration": 3.9408788681030273 + }, + { + "auxiliary_loss_clip": 0.01252321, + "auxiliary_loss_mlp": 0.01071427, + "balance_loss_clip": 1.07262564, + "balance_loss_mlp": 1.0395503, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.3868670510352774, + "language_loss": 0.91156626, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93480372, + "num_input_tokens_seen": 13928800, + "step": 659, + "time_per_iteration": 2.4411141872406006 + }, + { + "auxiliary_loss_clip": 0.01257269, + "auxiliary_loss_mlp": 0.0106362, + "balance_loss_clip": 1.07349491, + "balance_loss_mlp": 1.03454542, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 3.1138178575929834, + "language_loss": 0.79267091, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81587982, + "num_input_tokens_seen": 13948325, + "step": 660, + "time_per_iteration": 2.454603672027588 + }, + { + "auxiliary_loss_clip": 0.01257335, + "auxiliary_loss_mlp": 0.01070803, + "balance_loss_clip": 1.0772481, + "balance_loss_mlp": 1.03983259, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.5956905799791474, + "language_loss": 0.81669492, + "learning_rate": 3.999004907415231e-06, + "loss": 0.83997631, + "num_input_tokens_seen": 13969090, + "step": 661, + "time_per_iteration": 3.9009454250335693 + }, + { + "auxiliary_loss_clip": 0.01118857, + "auxiliary_loss_mlp": 0.01014931, + "balance_loss_clip": 1.03820455, + "balance_loss_mlp": 1.00940025, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.905396066804676, + "language_loss": 0.69317555, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71451342, + "num_input_tokens_seen": 14037555, + "step": 662, + "time_per_iteration": 3.167611837387085 + }, + { + "auxiliary_loss_clip": 0.01261102, + "auxiliary_loss_mlp": 0.01073137, + "balance_loss_clip": 1.07867825, + "balance_loss_mlp": 1.04228616, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.028592287723959, + "language_loss": 0.82921606, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85255843, + "num_input_tokens_seen": 14055765, + "step": 663, + "time_per_iteration": 2.419696807861328 + }, + { + "auxiliary_loss_clip": 0.01261404, + "auxiliary_loss_mlp": 0.01064177, + "balance_loss_clip": 1.07806182, + "balance_loss_mlp": 1.03308749, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.3301589870441486, + "language_loss": 0.87541443, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89867026, + "num_input_tokens_seen": 14074195, + "step": 664, + "time_per_iteration": 2.452430248260498 + }, + { + "auxiliary_loss_clip": 0.01251155, + "auxiliary_loss_mlp": 0.0106903, + "balance_loss_clip": 1.07425475, + "balance_loss_mlp": 1.03848863, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.787290365945044, + "language_loss": 0.84565032, + "learning_rate": 3.998955164701281e-06, + "loss": 0.86885214, + "num_input_tokens_seen": 14090215, + "step": 665, + "time_per_iteration": 2.406550407409668 + }, + { + "auxiliary_loss_clip": 0.01264208, + "auxiliary_loss_mlp": 0.01084956, + "balance_loss_clip": 1.07806146, + "balance_loss_mlp": 1.05257869, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 1.927662929201556, + "language_loss": 0.8166784, + "learning_rate": 3.998942539520158e-06, + "loss": 0.84017003, + "num_input_tokens_seen": 14112150, + "step": 666, + "time_per_iteration": 2.5004007816314697 + }, + { + "auxiliary_loss_clip": 0.01253594, + "auxiliary_loss_mlp": 0.01073753, + "balance_loss_clip": 1.07422829, + "balance_loss_mlp": 1.04151845, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 4.022324591474759, + "language_loss": 0.87163723, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89491075, + "num_input_tokens_seen": 14131475, + "step": 667, + "time_per_iteration": 2.446486473083496 + }, + { + "auxiliary_loss_clip": 0.01255034, + "auxiliary_loss_mlp": 0.01064477, + "balance_loss_clip": 1.07875991, + "balance_loss_mlp": 1.03494906, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 3.1849061670709804, + "language_loss": 0.80284679, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82604194, + "num_input_tokens_seen": 14146165, + "step": 668, + "time_per_iteration": 2.4341542720794678 + }, + { + "auxiliary_loss_clip": 0.01116425, + "auxiliary_loss_mlp": 0.01004066, + "balance_loss_clip": 1.03704739, + "balance_loss_mlp": 0.9984867, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.8096341441936321, + "language_loss": 0.60016406, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62136889, + "num_input_tokens_seen": 14215005, + "step": 669, + "time_per_iteration": 3.140928268432617 + }, + { + "auxiliary_loss_clip": 0.01254766, + "auxiliary_loss_mlp": 0.01075952, + "balance_loss_clip": 1.07341218, + "balance_loss_mlp": 1.0464592, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.8151190410849019, + "language_loss": 0.86360657, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88691378, + "num_input_tokens_seen": 14235510, + "step": 670, + "time_per_iteration": 3.8780951499938965 + }, + { + "auxiliary_loss_clip": 0.01251223, + "auxiliary_loss_mlp": 0.01070349, + "balance_loss_clip": 1.07565427, + "balance_loss_mlp": 1.04047561, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.8443247324680736, + "language_loss": 0.75059545, + "learning_rate": 3.998878276622692e-06, + "loss": 0.77381122, + "num_input_tokens_seen": 14254565, + "step": 671, + "time_per_iteration": 2.4271161556243896 + }, + { + "auxiliary_loss_clip": 0.01258815, + "auxiliary_loss_mlp": 0.01081377, + "balance_loss_clip": 1.07755005, + "balance_loss_mlp": 1.0507164, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 2.0048837137762576, + "language_loss": 0.92272663, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94612849, + "num_input_tokens_seen": 14271885, + "step": 672, + "time_per_iteration": 2.499119281768799 + }, + { + "auxiliary_loss_clip": 0.0125585, + "auxiliary_loss_mlp": 0.01073224, + "balance_loss_clip": 1.07617164, + "balance_loss_mlp": 1.04071617, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 154.8314755091104, + "language_loss": 0.90439534, + "learning_rate": 3.998852040876622e-06, + "loss": 0.9276861, + "num_input_tokens_seen": 14289670, + "step": 673, + "time_per_iteration": 2.4557104110717773 + }, + { + "auxiliary_loss_clip": 0.01250888, + "auxiliary_loss_mlp": 0.01084441, + "balance_loss_clip": 1.07274067, + "balance_loss_mlp": 1.05256462, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.0413866286281044, + "language_loss": 0.75191206, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77526534, + "num_input_tokens_seen": 14309285, + "step": 674, + "time_per_iteration": 2.4782376289367676 + }, + { + "auxiliary_loss_clip": 0.01261986, + "auxiliary_loss_mlp": 0.01060543, + "balance_loss_clip": 1.07699943, + "balance_loss_mlp": 1.0300138, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 4.229139144699554, + "language_loss": 0.77920187, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80242717, + "num_input_tokens_seen": 14328300, + "step": 675, + "time_per_iteration": 2.4398982524871826 + }, + { + "auxiliary_loss_clip": 0.01256883, + "auxiliary_loss_mlp": 0.01083384, + "balance_loss_clip": 1.07616353, + "balance_loss_mlp": 1.05080354, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.9409042793116875, + "language_loss": 0.76989198, + "learning_rate": 3.998812118783757e-06, + "loss": 0.79329467, + "num_input_tokens_seen": 14346395, + "step": 676, + "time_per_iteration": 2.490421772003174 + }, + { + "auxiliary_loss_clip": 0.01258726, + "auxiliary_loss_mlp": 0.01077186, + "balance_loss_clip": 1.07695746, + "balance_loss_mlp": 1.0457623, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.6520758223933276, + "language_loss": 0.85731173, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.88067079, + "num_input_tokens_seen": 14364605, + "step": 677, + "time_per_iteration": 2.463644027709961 + }, + { + "auxiliary_loss_clip": 0.01252156, + "auxiliary_loss_mlp": 0.01066337, + "balance_loss_clip": 1.0749259, + "balance_loss_mlp": 1.03516388, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 1.76638587990851, + "language_loss": 0.76651323, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78969824, + "num_input_tokens_seen": 14385265, + "step": 678, + "time_per_iteration": 2.5072972774505615 + }, + { + "auxiliary_loss_clip": 0.01251636, + "auxiliary_loss_mlp": 0.01067974, + "balance_loss_clip": 1.07271516, + "balance_loss_mlp": 1.03843379, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.9478256007569177, + "language_loss": 0.81992412, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84312022, + "num_input_tokens_seen": 14406090, + "step": 679, + "time_per_iteration": 2.566763162612915 + }, + { + "auxiliary_loss_clip": 0.01257545, + "auxiliary_loss_mlp": 0.0106621, + "balance_loss_clip": 1.07954347, + "balance_loss_mlp": 1.03628874, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 2.2137683904269556, + "language_loss": 0.7624827, + "learning_rate": 3.998757828196835e-06, + "loss": 0.78572023, + "num_input_tokens_seen": 14425130, + "step": 680, + "time_per_iteration": 2.50358510017395 + }, + { + "auxiliary_loss_clip": 0.01257013, + "auxiliary_loss_mlp": 0.01068095, + "balance_loss_clip": 1.07287097, + "balance_loss_mlp": 1.03536057, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 2.1063601778807244, + "language_loss": 0.83180553, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.85505658, + "num_input_tokens_seen": 14447355, + "step": 681, + "time_per_iteration": 2.4883453845977783 + }, + { + "auxiliary_loss_clip": 0.01255154, + "auxiliary_loss_mlp": 0.01067913, + "balance_loss_clip": 1.07377267, + "balance_loss_mlp": 1.0369544, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.7439493272688302, + "language_loss": 0.71629018, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73952085, + "num_input_tokens_seen": 14466790, + "step": 682, + "time_per_iteration": 2.4471304416656494 + }, + { + "auxiliary_loss_clip": 0.01254067, + "auxiliary_loss_mlp": 0.01069345, + "balance_loss_clip": 1.07352686, + "balance_loss_mlp": 1.03968561, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.7481711242919278, + "language_loss": 0.72302699, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74626106, + "num_input_tokens_seen": 14485195, + "step": 683, + "time_per_iteration": 2.436079502105713 + }, + { + "auxiliary_loss_clip": 0.01252946, + "auxiliary_loss_mlp": 0.01074931, + "balance_loss_clip": 1.07975221, + "balance_loss_mlp": 1.04484224, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.5661070900519194, + "language_loss": 0.81802523, + "learning_rate": 3.998702324920417e-06, + "loss": 0.84130394, + "num_input_tokens_seen": 14503370, + "step": 684, + "time_per_iteration": 2.436115026473999 + }, + { + "auxiliary_loss_clip": 0.01256088, + "auxiliary_loss_mlp": 0.01070189, + "balance_loss_clip": 1.07820129, + "balance_loss_mlp": 1.03836012, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.4972977050108183, + "language_loss": 0.90579611, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.92905885, + "num_input_tokens_seen": 14526415, + "step": 685, + "time_per_iteration": 2.525132894515991 + }, + { + "auxiliary_loss_clip": 0.0125456, + "auxiliary_loss_mlp": 0.01069206, + "balance_loss_clip": 1.07581711, + "balance_loss_mlp": 1.03849804, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.1691411393677678, + "language_loss": 0.87627232, + "learning_rate": 3.998674118534141e-06, + "loss": 0.89951009, + "num_input_tokens_seen": 14546595, + "step": 686, + "time_per_iteration": 2.4847657680511475 + }, + { + "auxiliary_loss_clip": 0.01260803, + "auxiliary_loss_mlp": 0.01073358, + "balance_loss_clip": 1.07670224, + "balance_loss_mlp": 1.0431509, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 3.673282954966934, + "language_loss": 0.71721756, + "learning_rate": 3.998659901655851e-06, + "loss": 0.74055922, + "num_input_tokens_seen": 14566590, + "step": 687, + "time_per_iteration": 2.4507224559783936 + }, + { + "auxiliary_loss_clip": 0.01254904, + "auxiliary_loss_mlp": 0.01073076, + "balance_loss_clip": 1.08055508, + "balance_loss_mlp": 1.04513383, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 2.5919824590067337, + "language_loss": 0.86054534, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88382512, + "num_input_tokens_seen": 14585965, + "step": 688, + "time_per_iteration": 2.439619541168213 + }, + { + "auxiliary_loss_clip": 0.01252299, + "auxiliary_loss_mlp": 0.01078352, + "balance_loss_clip": 1.07699347, + "balance_loss_mlp": 1.04900265, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 1.9162104523977503, + "language_loss": 0.83114219, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85444868, + "num_input_tokens_seen": 14606015, + "step": 689, + "time_per_iteration": 2.4513278007507324 + }, + { + "auxiliary_loss_clip": 0.01251225, + "auxiliary_loss_mlp": 0.01081332, + "balance_loss_clip": 1.07463551, + "balance_loss_mlp": 1.051458, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 2.7411862962892783, + "language_loss": 0.6816799, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70500553, + "num_input_tokens_seen": 14629955, + "step": 690, + "time_per_iteration": 2.514577865600586 + }, + { + "auxiliary_loss_clip": 0.01249539, + "auxiliary_loss_mlp": 0.01070711, + "balance_loss_clip": 1.07334483, + "balance_loss_mlp": 1.04155254, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.7302353157584556, + "language_loss": 0.75011325, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77331579, + "num_input_tokens_seen": 14648000, + "step": 691, + "time_per_iteration": 2.418124198913574 + }, + { + "auxiliary_loss_clip": 0.01250235, + "auxiliary_loss_mlp": 0.01078136, + "balance_loss_clip": 1.07584524, + "balance_loss_mlp": 1.04783297, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.2557879685501314, + "language_loss": 0.84467721, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86796093, + "num_input_tokens_seen": 14662235, + "step": 692, + "time_per_iteration": 2.4044322967529297 + }, + { + "auxiliary_loss_clip": 0.01254602, + "auxiliary_loss_mlp": 0.01074364, + "balance_loss_clip": 1.0725528, + "balance_loss_mlp": 1.04265404, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 2.737747381555863, + "language_loss": 0.88918161, + "learning_rate": 3.99857300882812e-06, + "loss": 0.91247129, + "num_input_tokens_seen": 14676065, + "step": 693, + "time_per_iteration": 2.3951618671417236 + }, + { + "auxiliary_loss_clip": 0.01258863, + "auxiliary_loss_mlp": 0.01067343, + "balance_loss_clip": 1.07915449, + "balance_loss_mlp": 1.03832734, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.489018735392628, + "language_loss": 0.82142043, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84468246, + "num_input_tokens_seen": 14694955, + "step": 694, + "time_per_iteration": 2.504085063934326 + }, + { + "auxiliary_loss_clip": 0.01254346, + "auxiliary_loss_mlp": 0.01070062, + "balance_loss_clip": 1.07257199, + "balance_loss_mlp": 1.04045033, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.1133967367238613, + "language_loss": 0.83256173, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85580581, + "num_input_tokens_seen": 14715510, + "step": 695, + "time_per_iteration": 2.4612696170806885 + }, + { + "auxiliary_loss_clip": 0.01252249, + "auxiliary_loss_mlp": 0.0107507, + "balance_loss_clip": 1.07482481, + "balance_loss_mlp": 1.04460025, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 3.5489659777446927, + "language_loss": 0.84465879, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86793196, + "num_input_tokens_seen": 14731755, + "step": 696, + "time_per_iteration": 2.4057557582855225 + }, + { + "auxiliary_loss_clip": 0.01250396, + "auxiliary_loss_mlp": 0.01081083, + "balance_loss_clip": 1.07380092, + "balance_loss_mlp": 1.04944479, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.7704279178473663, + "language_loss": 0.93085051, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95416528, + "num_input_tokens_seen": 14750810, + "step": 697, + "time_per_iteration": 2.4682528972625732 + }, + { + "auxiliary_loss_clip": 0.01247048, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_clip": 1.07382703, + "balance_loss_mlp": 1.04353642, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.4553431717693313, + "language_loss": 0.83536416, + "learning_rate": 3.998498514015987e-06, + "loss": 0.85855705, + "num_input_tokens_seen": 14768435, + "step": 698, + "time_per_iteration": 3.89860200881958 + }, + { + "auxiliary_loss_clip": 0.01249781, + "auxiliary_loss_mlp": 0.01090198, + "balance_loss_clip": 1.07220173, + "balance_loss_mlp": 1.05903661, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 2.0546515103765315, + "language_loss": 0.91211164, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93551141, + "num_input_tokens_seen": 14786690, + "step": 699, + "time_per_iteration": 4.005714178085327 + }, + { + "auxiliary_loss_clip": 0.01122423, + "auxiliary_loss_mlp": 0.01022566, + "balance_loss_clip": 1.04350424, + "balance_loss_mlp": 1.01760733, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.9149620836394229, + "language_loss": 0.67897081, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70042062, + "num_input_tokens_seen": 14853840, + "step": 700, + "time_per_iteration": 3.0998878479003906 + }, + { + "auxiliary_loss_clip": 0.01256868, + "auxiliary_loss_mlp": 0.01078249, + "balance_loss_clip": 1.07669377, + "balance_loss_mlp": 1.0459913, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.352589732732269, + "language_loss": 0.88765502, + "learning_rate": 3.998452907725016e-06, + "loss": 0.91100621, + "num_input_tokens_seen": 14869580, + "step": 701, + "time_per_iteration": 2.4350883960723877 + }, + { + "auxiliary_loss_clip": 0.01252601, + "auxiliary_loss_mlp": 0.01079532, + "balance_loss_clip": 1.07827008, + "balance_loss_mlp": 1.04851365, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 1.7709688648205595, + "language_loss": 0.67017531, + "learning_rate": 3.998437554064184e-06, + "loss": 0.6934967, + "num_input_tokens_seen": 14891065, + "step": 702, + "time_per_iteration": 3.847105026245117 + }, + { + "auxiliary_loss_clip": 0.01114392, + "auxiliary_loss_mlp": 0.01004215, + "balance_loss_clip": 1.03603685, + "balance_loss_mlp": 0.9993515, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8539485325663473, + "language_loss": 0.60765803, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62884414, + "num_input_tokens_seen": 14954815, + "step": 703, + "time_per_iteration": 3.0836164951324463 + }, + { + "auxiliary_loss_clip": 0.01113946, + "auxiliary_loss_mlp": 0.01004111, + "balance_loss_clip": 1.03610575, + "balance_loss_mlp": 0.99929476, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0230139196778778, + "language_loss": 0.57703382, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59821433, + "num_input_tokens_seen": 15003050, + "step": 704, + "time_per_iteration": 2.8816943168640137 + }, + { + "auxiliary_loss_clip": 0.01254053, + "auxiliary_loss_mlp": 0.01070757, + "balance_loss_clip": 1.07682896, + "balance_loss_mlp": 1.04002523, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.79673651920846, + "language_loss": 0.87507331, + "learning_rate": 3.998391038398319e-06, + "loss": 0.89832139, + "num_input_tokens_seen": 15021990, + "step": 705, + "time_per_iteration": 2.443605422973633 + }, + { + "auxiliary_loss_clip": 0.01239564, + "auxiliary_loss_mlp": 0.01064785, + "balance_loss_clip": 1.0693562, + "balance_loss_mlp": 1.03665161, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.7734619197901045, + "language_loss": 0.71384442, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73688793, + "num_input_tokens_seen": 15040700, + "step": 706, + "time_per_iteration": 2.4361302852630615 + }, + { + "auxiliary_loss_clip": 0.01246439, + "auxiliary_loss_mlp": 0.01064966, + "balance_loss_clip": 1.07366717, + "balance_loss_mlp": 1.0344727, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.2973349432100383, + "language_loss": 0.9349333, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95804739, + "num_input_tokens_seen": 15056725, + "step": 707, + "time_per_iteration": 2.4641900062561035 + }, + { + "auxiliary_loss_clip": 0.01249327, + "auxiliary_loss_mlp": 0.0106753, + "balance_loss_clip": 1.06948733, + "balance_loss_mlp": 1.03684556, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 3.026825626417623, + "language_loss": 0.81420982, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83737838, + "num_input_tokens_seen": 15077550, + "step": 708, + "time_per_iteration": 2.5150551795959473 + }, + { + "auxiliary_loss_clip": 0.01254389, + "auxiliary_loss_mlp": 0.01070311, + "balance_loss_clip": 1.07522869, + "balance_loss_mlp": 1.03900635, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.7253654484221346, + "language_loss": 0.82290018, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84614718, + "num_input_tokens_seen": 15094955, + "step": 709, + "time_per_iteration": 2.4121899604797363 + }, + { + "auxiliary_loss_clip": 0.01256049, + "auxiliary_loss_mlp": 0.01065528, + "balance_loss_clip": 1.07549858, + "balance_loss_mlp": 1.0359875, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.803077649870431, + "language_loss": 0.85320508, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87642086, + "num_input_tokens_seen": 15113395, + "step": 710, + "time_per_iteration": 2.4099626541137695 + }, + { + "auxiliary_loss_clip": 0.01253329, + "auxiliary_loss_mlp": 0.01071008, + "balance_loss_clip": 1.07694256, + "balance_loss_mlp": 1.04146743, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 6.470515878727774, + "language_loss": 0.84515542, + "learning_rate": 3.998295961044662e-06, + "loss": 0.86839885, + "num_input_tokens_seen": 15132920, + "step": 711, + "time_per_iteration": 2.414721965789795 + }, + { + "auxiliary_loss_clip": 0.01246018, + "auxiliary_loss_mlp": 0.01066282, + "balance_loss_clip": 1.07099307, + "balance_loss_mlp": 1.03633678, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 1.5953376374125516, + "language_loss": 0.85238987, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.8755129, + "num_input_tokens_seen": 15153115, + "step": 712, + "time_per_iteration": 3.8499321937561035 + }, + { + "auxiliary_loss_clip": 0.01253561, + "auxiliary_loss_mlp": 0.01070449, + "balance_loss_clip": 1.07162249, + "balance_loss_mlp": 1.0417558, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 3.0876084195808757, + "language_loss": 0.90908551, + "learning_rate": 3.998263662382328e-06, + "loss": 0.93232566, + "num_input_tokens_seen": 15172770, + "step": 713, + "time_per_iteration": 2.444467782974243 + }, + { + "auxiliary_loss_clip": 0.01111671, + "auxiliary_loss_mlp": 0.010062, + "balance_loss_clip": 1.03579378, + "balance_loss_mlp": 1.00174141, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8827621272586572, + "language_loss": 0.63751328, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65869194, + "num_input_tokens_seen": 15240055, + "step": 714, + "time_per_iteration": 3.1185355186462402 + }, + { + "auxiliary_loss_clip": 0.01249694, + "auxiliary_loss_mlp": 0.01079437, + "balance_loss_clip": 1.07638133, + "balance_loss_mlp": 1.04980135, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 1.836196220182354, + "language_loss": 0.7482512, + "learning_rate": 3.998231060622563e-06, + "loss": 0.77154243, + "num_input_tokens_seen": 15261585, + "step": 715, + "time_per_iteration": 2.529588460922241 + }, + { + "auxiliary_loss_clip": 0.01252768, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_clip": 1.07683158, + "balance_loss_mlp": 1.04104125, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 1.811335695106842, + "language_loss": 0.72693121, + "learning_rate": 3.998214646082688e-06, + "loss": 0.75018758, + "num_input_tokens_seen": 15281160, + "step": 716, + "time_per_iteration": 2.5528860092163086 + }, + { + "auxiliary_loss_clip": 0.01106508, + "auxiliary_loss_mlp": 0.01013021, + "balance_loss_clip": 1.03073525, + "balance_loss_mlp": 1.00810957, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.905104007154065, + "language_loss": 0.65571296, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67690825, + "num_input_tokens_seen": 15344505, + "step": 717, + "time_per_iteration": 3.07222580909729 + }, + { + "auxiliary_loss_clip": 0.01105179, + "auxiliary_loss_mlp": 0.01013412, + "balance_loss_clip": 1.02983522, + "balance_loss_mlp": 1.00871491, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9917085058474688, + "language_loss": 0.58785248, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60903835, + "num_input_tokens_seen": 15404050, + "step": 718, + "time_per_iteration": 2.8557965755462646 + }, + { + "auxiliary_loss_clip": 0.01248862, + "auxiliary_loss_mlp": 0.01071296, + "balance_loss_clip": 1.07675099, + "balance_loss_mlp": 1.0397768, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 2.0740100101820285, + "language_loss": 0.91461396, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93781555, + "num_input_tokens_seen": 15424190, + "step": 719, + "time_per_iteration": 2.4573326110839844 + }, + { + "auxiliary_loss_clip": 0.01245481, + "auxiliary_loss_mlp": 0.01069038, + "balance_loss_clip": 1.07039428, + "balance_loss_mlp": 1.0411545, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 2.1170329039921736, + "language_loss": 0.66386628, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68701148, + "num_input_tokens_seen": 15446500, + "step": 720, + "time_per_iteration": 2.5171873569488525 + }, + { + "auxiliary_loss_clip": 0.01244414, + "auxiliary_loss_mlp": 0.01071861, + "balance_loss_clip": 1.07207811, + "balance_loss_mlp": 1.04167759, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 2.0306355377615075, + "language_loss": 0.77513242, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.79829514, + "num_input_tokens_seen": 15465830, + "step": 721, + "time_per_iteration": 2.48276948928833 + }, + { + "auxiliary_loss_clip": 0.01250521, + "auxiliary_loss_mlp": 0.01084896, + "balance_loss_clip": 1.0755955, + "balance_loss_mlp": 1.05588019, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 4.780620135551904, + "language_loss": 0.88010025, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90345442, + "num_input_tokens_seen": 15479985, + "step": 722, + "time_per_iteration": 2.4006330966949463 + }, + { + "auxiliary_loss_clip": 0.0125604, + "auxiliary_loss_mlp": 0.01071379, + "balance_loss_clip": 1.07815051, + "balance_loss_mlp": 1.04288769, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 1.857117448846299, + "language_loss": 0.8459543, + "learning_rate": 3.998097622708792e-06, + "loss": 0.86922848, + "num_input_tokens_seen": 15501545, + "step": 723, + "time_per_iteration": 2.5043692588806152 + }, + { + "auxiliary_loss_clip": 0.01256876, + "auxiliary_loss_mlp": 0.01078308, + "balance_loss_clip": 1.07969964, + "balance_loss_mlp": 1.0490067, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.8423858779507039, + "language_loss": 0.82761866, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85097051, + "num_input_tokens_seen": 15521725, + "step": 724, + "time_per_iteration": 2.476010799407959 + }, + { + "auxiliary_loss_clip": 0.01251249, + "auxiliary_loss_mlp": 0.01081116, + "balance_loss_clip": 1.07604146, + "balance_loss_mlp": 1.04983568, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 2.1531230975731668, + "language_loss": 0.79788589, + "learning_rate": 3.998063505536971e-06, + "loss": 0.82120961, + "num_input_tokens_seen": 15540910, + "step": 725, + "time_per_iteration": 2.434398889541626 + }, + { + "auxiliary_loss_clip": 0.01262261, + "auxiliary_loss_mlp": 0.01072826, + "balance_loss_clip": 1.07884598, + "balance_loss_mlp": 1.04235601, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 3.8605208509133186, + "language_loss": 0.87033564, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89368653, + "num_input_tokens_seen": 15558640, + "step": 726, + "time_per_iteration": 2.4085304737091064 + }, + { + "auxiliary_loss_clip": 0.01108923, + "auxiliary_loss_mlp": 0.01025217, + "balance_loss_clip": 1.03441858, + "balance_loss_mlp": 1.02083039, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9157901031070013, + "language_loss": 0.55886817, + "learning_rate": 3.998029085298079e-06, + "loss": 0.58020955, + "num_input_tokens_seen": 15612975, + "step": 727, + "time_per_iteration": 3.143498420715332 + }, + { + "auxiliary_loss_clip": 0.01252015, + "auxiliary_loss_mlp": 0.01079618, + "balance_loss_clip": 1.07516587, + "balance_loss_mlp": 1.04960155, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 5.701577455857072, + "language_loss": 0.82508349, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84839976, + "num_input_tokens_seen": 15631070, + "step": 728, + "time_per_iteration": 2.413825035095215 + }, + { + "auxiliary_loss_clip": 0.01247272, + "auxiliary_loss_mlp": 0.01067415, + "balance_loss_clip": 1.07573342, + "balance_loss_mlp": 1.03944826, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.7026289199998015, + "language_loss": 0.77094638, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79409325, + "num_input_tokens_seen": 15647825, + "step": 729, + "time_per_iteration": 2.4478037357330322 + }, + { + "auxiliary_loss_clip": 0.01254578, + "auxiliary_loss_mlp": 0.01073023, + "balance_loss_clip": 1.07481039, + "balance_loss_mlp": 1.04360223, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.368203296202391, + "language_loss": 0.95461488, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97789085, + "num_input_tokens_seen": 15668260, + "step": 730, + "time_per_iteration": 2.464538335800171 + }, + { + "auxiliary_loss_clip": 0.01246264, + "auxiliary_loss_mlp": 0.01064678, + "balance_loss_clip": 1.06989312, + "balance_loss_mlp": 1.0344702, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.3544281616894396, + "language_loss": 0.88344109, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90655053, + "num_input_tokens_seen": 15685630, + "step": 731, + "time_per_iteration": 2.406344413757324 + }, + { + "auxiliary_loss_clip": 0.0125121, + "auxiliary_loss_mlp": 0.01072602, + "balance_loss_clip": 1.07553399, + "balance_loss_mlp": 1.04526711, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.054444421617323, + "language_loss": 0.89042616, + "learning_rate": 3.997941708816791e-06, + "loss": 0.91366422, + "num_input_tokens_seen": 15698645, + "step": 732, + "time_per_iteration": 2.3852972984313965 + }, + { + "auxiliary_loss_clip": 0.01250944, + "auxiliary_loss_mlp": 0.01082614, + "balance_loss_clip": 1.07445002, + "balance_loss_mlp": 1.05333614, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.423901433698813, + "language_loss": 0.85925555, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88259113, + "num_input_tokens_seen": 15716775, + "step": 733, + "time_per_iteration": 2.4376189708709717 + }, + { + "auxiliary_loss_clip": 0.01254181, + "auxiliary_loss_mlp": 0.01083919, + "balance_loss_clip": 1.07551825, + "balance_loss_mlp": 1.05244803, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.440523927063239, + "language_loss": 0.9122839, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93566489, + "num_input_tokens_seen": 15733320, + "step": 734, + "time_per_iteration": 2.4012246131896973 + }, + { + "auxiliary_loss_clip": 0.01245896, + "auxiliary_loss_mlp": 0.01067347, + "balance_loss_clip": 1.07495344, + "balance_loss_mlp": 1.0397861, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 2.0245880609341627, + "language_loss": 0.7797246, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.80285704, + "num_input_tokens_seen": 15752705, + "step": 735, + "time_per_iteration": 2.4949140548706055 + }, + { + "auxiliary_loss_clip": 0.01242568, + "auxiliary_loss_mlp": 0.0106953, + "balance_loss_clip": 1.06950974, + "balance_loss_mlp": 1.04096782, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.8849329285375793, + "language_loss": 0.88545996, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.9085809, + "num_input_tokens_seen": 15772800, + "step": 736, + "time_per_iteration": 2.489738702774048 + }, + { + "auxiliary_loss_clip": 0.01244363, + "auxiliary_loss_mlp": 0.01075442, + "balance_loss_clip": 1.07455826, + "balance_loss_mlp": 1.04759479, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 2.3237462468456216, + "language_loss": 0.8420583, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86525631, + "num_input_tokens_seen": 15793665, + "step": 737, + "time_per_iteration": 2.467203378677368 + }, + { + "auxiliary_loss_clip": 0.01250499, + "auxiliary_loss_mlp": 0.01070637, + "balance_loss_clip": 1.07582057, + "balance_loss_mlp": 1.03904676, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.223191437344619, + "language_loss": 0.84847426, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87168574, + "num_input_tokens_seen": 15813175, + "step": 738, + "time_per_iteration": 2.544058322906494 + }, + { + "auxiliary_loss_clip": 0.01106566, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.03339362, + "balance_loss_mlp": 1.02741742, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8776913506231073, + "language_loss": 0.5912987, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61267734, + "num_input_tokens_seen": 15872050, + "step": 739, + "time_per_iteration": 4.420870065689087 + }, + { + "auxiliary_loss_clip": 0.01246158, + "auxiliary_loss_mlp": 0.01066014, + "balance_loss_clip": 1.07544637, + "balance_loss_mlp": 1.03799987, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 4.285229232219046, + "language_loss": 0.91446948, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93759108, + "num_input_tokens_seen": 15891085, + "step": 740, + "time_per_iteration": 3.9254229068756104 + }, + { + "auxiliary_loss_clip": 0.01253388, + "auxiliary_loss_mlp": 0.01065933, + "balance_loss_clip": 1.0788691, + "balance_loss_mlp": 1.03770387, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.0328094989924472, + "language_loss": 0.72078276, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74397594, + "num_input_tokens_seen": 15914225, + "step": 741, + "time_per_iteration": 2.5874316692352295 + }, + { + "auxiliary_loss_clip": 0.01242385, + "auxiliary_loss_mlp": 0.01073057, + "balance_loss_clip": 1.07244647, + "balance_loss_mlp": 1.04591322, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.7399598978029773, + "language_loss": 0.88827926, + "learning_rate": 3.997761273778037e-06, + "loss": 0.9114337, + "num_input_tokens_seen": 15934540, + "step": 742, + "time_per_iteration": 2.492107629776001 + }, + { + "auxiliary_loss_clip": 0.01243522, + "auxiliary_loss_mlp": 0.01062216, + "balance_loss_clip": 1.07289732, + "balance_loss_mlp": 1.03268766, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.0759581236612212, + "language_loss": 0.84108043, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86413783, + "num_input_tokens_seen": 15952560, + "step": 743, + "time_per_iteration": 3.8907907009124756 + }, + { + "auxiliary_loss_clip": 0.01249372, + "auxiliary_loss_mlp": 0.01070067, + "balance_loss_clip": 1.07495451, + "balance_loss_mlp": 1.04243386, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 1.9776090638538169, + "language_loss": 0.80339324, + "learning_rate": 3.997724277684479e-06, + "loss": 0.82658762, + "num_input_tokens_seen": 15970620, + "step": 744, + "time_per_iteration": 2.421088218688965 + }, + { + "auxiliary_loss_clip": 0.01243892, + "auxiliary_loss_mlp": 0.01069603, + "balance_loss_clip": 1.07338834, + "balance_loss_mlp": 1.04134989, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.155363339058357, + "language_loss": 0.85119784, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87433279, + "num_input_tokens_seen": 15987325, + "step": 745, + "time_per_iteration": 2.4431419372558594 + }, + { + "auxiliary_loss_clip": 0.01242561, + "auxiliary_loss_mlp": 0.01060488, + "balance_loss_clip": 1.07208848, + "balance_loss_mlp": 1.03212798, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 2.0306773809641645, + "language_loss": 0.69037014, + "learning_rate": 3.997686978575302e-06, + "loss": 0.7134006, + "num_input_tokens_seen": 16008310, + "step": 746, + "time_per_iteration": 2.520010232925415 + }, + { + "auxiliary_loss_clip": 0.01248499, + "auxiliary_loss_mlp": 0.01080993, + "balance_loss_clip": 1.07689559, + "balance_loss_mlp": 1.0507139, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.027773356786609, + "language_loss": 0.68753242, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71082735, + "num_input_tokens_seen": 16029620, + "step": 747, + "time_per_iteration": 2.489347457885742 + }, + { + "auxiliary_loss_clip": 0.01247068, + "auxiliary_loss_mlp": 0.01082001, + "balance_loss_clip": 1.07372975, + "balance_loss_mlp": 1.05171013, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 2.222781072726067, + "language_loss": 0.67069829, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69398904, + "num_input_tokens_seen": 16049065, + "step": 748, + "time_per_iteration": 2.445081949234009 + }, + { + "auxiliary_loss_clip": 0.01248518, + "auxiliary_loss_mlp": 0.0108278, + "balance_loss_clip": 1.07691574, + "balance_loss_mlp": 1.0539676, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 3.3761615157959044, + "language_loss": 0.77063, + "learning_rate": 3.997630461769647e-06, + "loss": 0.79394293, + "num_input_tokens_seen": 16066765, + "step": 749, + "time_per_iteration": 2.408351421356201 + }, + { + "auxiliary_loss_clip": 0.0125047, + "auxiliary_loss_mlp": 0.01075392, + "balance_loss_clip": 1.07683969, + "balance_loss_mlp": 1.04712749, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.066376357597193, + "language_loss": 0.89057088, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91382957, + "num_input_tokens_seen": 16085980, + "step": 750, + "time_per_iteration": 2.4045424461364746 + }, + { + "auxiliary_loss_clip": 0.01247921, + "auxiliary_loss_mlp": 0.0107428, + "balance_loss_clip": 1.07252192, + "balance_loss_mlp": 1.04299903, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 2.1970989501687117, + "language_loss": 0.74923944, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.77246153, + "num_input_tokens_seen": 16106260, + "step": 751, + "time_per_iteration": 2.458491086959839 + }, + { + "auxiliary_loss_clip": 0.01243877, + "auxiliary_loss_mlp": 0.01074794, + "balance_loss_clip": 1.07184422, + "balance_loss_mlp": 1.04657733, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.249119525804231, + "language_loss": 0.68872362, + "learning_rate": 3.997573263210883e-06, + "loss": 0.71191031, + "num_input_tokens_seen": 16123475, + "step": 752, + "time_per_iteration": 3.8274989128112793 + }, + { + "auxiliary_loss_clip": 0.01244186, + "auxiliary_loss_mlp": 0.01055604, + "balance_loss_clip": 1.07202995, + "balance_loss_mlp": 1.02788758, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.811191199786401, + "language_loss": 0.92228347, + "learning_rate": 3.997554045527305e-06, + "loss": 0.94528139, + "num_input_tokens_seen": 16138335, + "step": 753, + "time_per_iteration": 2.39551043510437 + }, + { + "auxiliary_loss_clip": 0.0124754, + "auxiliary_loss_mlp": 0.01080026, + "balance_loss_clip": 1.07486701, + "balance_loss_mlp": 1.05155897, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 1.943082383614292, + "language_loss": 0.90947223, + "learning_rate": 3.997534752096277e-06, + "loss": 0.9327479, + "num_input_tokens_seen": 16157110, + "step": 754, + "time_per_iteration": 2.4465839862823486 + }, + { + "auxiliary_loss_clip": 0.01237917, + "auxiliary_loss_mlp": 0.01076607, + "balance_loss_clip": 1.07329619, + "balance_loss_mlp": 1.04705501, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.742287200105714, + "language_loss": 0.78404289, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80718809, + "num_input_tokens_seen": 16174155, + "step": 755, + "time_per_iteration": 2.4328012466430664 + }, + { + "auxiliary_loss_clip": 0.01249621, + "auxiliary_loss_mlp": 0.0107927, + "balance_loss_clip": 1.07553124, + "balance_loss_mlp": 1.05037403, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 2.1245709302457727, + "language_loss": 0.78772664, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.81101561, + "num_input_tokens_seen": 16192240, + "step": 756, + "time_per_iteration": 2.475555896759033 + }, + { + "auxiliary_loss_clip": 0.01100446, + "auxiliary_loss_mlp": 0.01009318, + "balance_loss_clip": 1.02805901, + "balance_loss_mlp": 1.00519359, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8108839241336978, + "language_loss": 0.6267004, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64779806, + "num_input_tokens_seen": 16255775, + "step": 757, + "time_per_iteration": 3.094111204147339 + }, + { + "auxiliary_loss_clip": 0.01242802, + "auxiliary_loss_mlp": 0.01070572, + "balance_loss_clip": 1.07228386, + "balance_loss_mlp": 1.04239106, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.5099986841513127, + "language_loss": 0.84421402, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86734784, + "num_input_tokens_seen": 16277015, + "step": 758, + "time_per_iteration": 2.5214457511901855 + }, + { + "auxiliary_loss_clip": 0.0123762, + "auxiliary_loss_mlp": 0.01067028, + "balance_loss_clip": 1.06859028, + "balance_loss_mlp": 1.03962147, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 1.8459511129995805, + "language_loss": 0.88360369, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90665019, + "num_input_tokens_seen": 16296005, + "step": 759, + "time_per_iteration": 2.4645683765411377 + }, + { + "auxiliary_loss_clip": 0.01246298, + "auxiliary_loss_mlp": 0.01075232, + "balance_loss_clip": 1.07530844, + "balance_loss_mlp": 1.04523873, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2.1813349608179036, + "language_loss": 0.73778182, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.76099718, + "num_input_tokens_seen": 16315300, + "step": 760, + "time_per_iteration": 2.4760265350341797 + }, + { + "auxiliary_loss_clip": 0.01247296, + "auxiliary_loss_mlp": 0.0107503, + "balance_loss_clip": 1.07866728, + "balance_loss_mlp": 1.04746842, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.836733069731927, + "language_loss": 0.82199907, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84522235, + "num_input_tokens_seen": 16333820, + "step": 761, + "time_per_iteration": 2.4397497177124023 + }, + { + "auxiliary_loss_clip": 0.01238658, + "auxiliary_loss_mlp": 0.01073499, + "balance_loss_clip": 1.07076049, + "balance_loss_mlp": 1.04497194, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.7291262180580171, + "language_loss": 0.79676831, + "learning_rate": 3.997377677828266e-06, + "loss": 0.8198899, + "num_input_tokens_seen": 16355290, + "step": 762, + "time_per_iteration": 2.508659839630127 + }, + { + "auxiliary_loss_clip": 0.01098846, + "auxiliary_loss_mlp": 0.01007822, + "balance_loss_clip": 1.02734065, + "balance_loss_mlp": 1.00407887, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.0056811481484476, + "language_loss": 0.58699518, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60806185, + "num_input_tokens_seen": 16415995, + "step": 763, + "time_per_iteration": 3.074852705001831 + }, + { + "auxiliary_loss_clip": 0.01242907, + "auxiliary_loss_mlp": 0.01075411, + "balance_loss_clip": 1.07143605, + "balance_loss_mlp": 1.04677737, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 3.3146740283022456, + "language_loss": 0.87486815, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.89805138, + "num_input_tokens_seen": 16433120, + "step": 764, + "time_per_iteration": 2.4355008602142334 + }, + { + "auxiliary_loss_clip": 0.0124562, + "auxiliary_loss_mlp": 0.0107726, + "balance_loss_clip": 1.07394803, + "balance_loss_mlp": 1.04927015, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 3.5725578575044756, + "language_loss": 0.86178911, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88501781, + "num_input_tokens_seen": 16453360, + "step": 765, + "time_per_iteration": 2.5214710235595703 + }, + { + "auxiliary_loss_clip": 0.01245418, + "auxiliary_loss_mlp": 0.01071453, + "balance_loss_clip": 1.07229853, + "balance_loss_mlp": 1.04004097, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 2.539872066747102, + "language_loss": 0.88364756, + "learning_rate": 3.997297322892056e-06, + "loss": 0.9068163, + "num_input_tokens_seen": 16471160, + "step": 766, + "time_per_iteration": 2.4418790340423584 + }, + { + "auxiliary_loss_clip": 0.012415, + "auxiliary_loss_mlp": 0.01071402, + "balance_loss_clip": 1.07098866, + "balance_loss_mlp": 1.04305422, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.872176549864573, + "language_loss": 0.84154224, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86467123, + "num_input_tokens_seen": 16488940, + "step": 767, + "time_per_iteration": 2.438732862472534 + }, + { + "auxiliary_loss_clip": 0.01242659, + "auxiliary_loss_mlp": 0.01067343, + "balance_loss_clip": 1.0753231, + "balance_loss_mlp": 1.03812504, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 1.960837422660921, + "language_loss": 0.86731279, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89041281, + "num_input_tokens_seen": 16509505, + "step": 768, + "time_per_iteration": 2.5055665969848633 + }, + { + "auxiliary_loss_clip": 0.0123915, + "auxiliary_loss_mlp": 0.01071061, + "balance_loss_clip": 1.06766546, + "balance_loss_mlp": 1.04321408, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.554753687181264, + "language_loss": 0.75032467, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77342677, + "num_input_tokens_seen": 16528840, + "step": 769, + "time_per_iteration": 2.470245838165283 + }, + { + "auxiliary_loss_clip": 0.01239138, + "auxiliary_loss_mlp": 0.01070979, + "balance_loss_clip": 1.0727787, + "balance_loss_mlp": 1.04389489, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 2.244827031965447, + "language_loss": 0.86172175, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88482296, + "num_input_tokens_seen": 16548335, + "step": 770, + "time_per_iteration": 2.4535813331604004 + }, + { + "auxiliary_loss_clip": 0.0124724, + "auxiliary_loss_mlp": 0.01066073, + "balance_loss_clip": 1.07162595, + "balance_loss_mlp": 1.03768897, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.2292546964167723, + "language_loss": 0.87144184, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89457488, + "num_input_tokens_seen": 16567725, + "step": 771, + "time_per_iteration": 2.4532744884490967 + }, + { + "auxiliary_loss_clip": 0.01245668, + "auxiliary_loss_mlp": 0.01079526, + "balance_loss_clip": 1.07032025, + "balance_loss_mlp": 1.04929495, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 3.0795561129775706, + "language_loss": 0.83903807, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86229002, + "num_input_tokens_seen": 16588175, + "step": 772, + "time_per_iteration": 2.472412347793579 + }, + { + "auxiliary_loss_clip": 0.01244517, + "auxiliary_loss_mlp": 0.01066757, + "balance_loss_clip": 1.07529116, + "balance_loss_mlp": 1.03949428, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 1.890734955139772, + "language_loss": 0.74083066, + "learning_rate": 3.997153785881557e-06, + "loss": 0.76394343, + "num_input_tokens_seen": 16607735, + "step": 773, + "time_per_iteration": 2.45942759513855 + }, + { + "auxiliary_loss_clip": 0.01239239, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.07242084, + "balance_loss_mlp": 1.04041624, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.2821545827956657, + "language_loss": 0.78728008, + "learning_rate": 3.997132977658996e-06, + "loss": 0.81037247, + "num_input_tokens_seen": 16627225, + "step": 774, + "time_per_iteration": 2.476518154144287 + }, + { + "auxiliary_loss_clip": 0.01237272, + "auxiliary_loss_mlp": 0.0106597, + "balance_loss_clip": 1.06983602, + "balance_loss_mlp": 1.03961241, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.1059083581549087, + "language_loss": 0.73319328, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75622576, + "num_input_tokens_seen": 16647785, + "step": 775, + "time_per_iteration": 2.558424949645996 + }, + { + "auxiliary_loss_clip": 0.01240222, + "auxiliary_loss_mlp": 0.01060326, + "balance_loss_clip": 1.07021809, + "balance_loss_mlp": 1.03185868, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 2.2912775810863524, + "language_loss": 0.77023041, + "learning_rate": 3.997091134020217e-06, + "loss": 0.7932359, + "num_input_tokens_seen": 16667555, + "step": 776, + "time_per_iteration": 2.4305427074432373 + }, + { + "auxiliary_loss_clip": 0.01234629, + "auxiliary_loss_mlp": 0.01068567, + "balance_loss_clip": 1.0697217, + "balance_loss_mlp": 1.04241228, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 2.3810790501855075, + "language_loss": 0.71150017, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73453218, + "num_input_tokens_seen": 16686875, + "step": 777, + "time_per_iteration": 2.5050745010375977 + }, + { + "auxiliary_loss_clip": 0.01239633, + "auxiliary_loss_mlp": 0.01081651, + "balance_loss_clip": 1.07125223, + "balance_loss_mlp": 1.05289757, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 1.816307619397008, + "language_loss": 0.76685005, + "learning_rate": 3.997048987461856e-06, + "loss": 0.7900629, + "num_input_tokens_seen": 16706420, + "step": 778, + "time_per_iteration": 2.4952847957611084 + }, + { + "auxiliary_loss_clip": 0.01236499, + "auxiliary_loss_mlp": 0.01068806, + "balance_loss_clip": 1.07019961, + "balance_loss_mlp": 1.03999329, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.1419663011832117, + "language_loss": 0.78967977, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81273282, + "num_input_tokens_seen": 16726390, + "step": 779, + "time_per_iteration": 3.9866929054260254 + }, + { + "auxiliary_loss_clip": 0.01229632, + "auxiliary_loss_mlp": 0.01064917, + "balance_loss_clip": 1.06723094, + "balance_loss_mlp": 1.03776145, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 2.2182127286050464, + "language_loss": 0.77214313, + "learning_rate": 3.997006537990308e-06, + "loss": 0.79508853, + "num_input_tokens_seen": 16748965, + "step": 780, + "time_per_iteration": 3.8876240253448486 + }, + { + "auxiliary_loss_clip": 0.01235072, + "auxiliary_loss_mlp": 0.0107275, + "balance_loss_clip": 1.07098305, + "balance_loss_mlp": 1.0461787, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 1.7469597059844197, + "language_loss": 0.76571733, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78879559, + "num_input_tokens_seen": 16768620, + "step": 781, + "time_per_iteration": 2.4632863998413086 + }, + { + "auxiliary_loss_clip": 0.0124723, + "auxiliary_loss_mlp": 0.01076726, + "balance_loss_clip": 1.07363629, + "balance_loss_mlp": 1.04735339, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.1491272305857447, + "language_loss": 0.73750043, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76074004, + "num_input_tokens_seen": 16789755, + "step": 782, + "time_per_iteration": 2.513108730316162 + }, + { + "auxiliary_loss_clip": 0.0123921, + "auxiliary_loss_mlp": 0.01068131, + "balance_loss_clip": 1.07346845, + "balance_loss_mlp": 1.04200029, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 2.1980178318270758, + "language_loss": 0.80083412, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82390749, + "num_input_tokens_seen": 16807585, + "step": 783, + "time_per_iteration": 2.460252285003662 + }, + { + "auxiliary_loss_clip": 0.01231595, + "auxiliary_loss_mlp": 0.01063987, + "balance_loss_clip": 1.070171, + "balance_loss_mlp": 1.03817868, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.0672900096109053, + "language_loss": 0.81743932, + "learning_rate": 3.996920730333448e-06, + "loss": 0.84039515, + "num_input_tokens_seen": 16827220, + "step": 784, + "time_per_iteration": 2.4419291019439697 + }, + { + "auxiliary_loss_clip": 0.01235998, + "auxiliary_loss_mlp": 0.01067481, + "balance_loss_clip": 1.06646657, + "balance_loss_mlp": 1.04018211, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.4988910101615276, + "language_loss": 0.80716878, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83020359, + "num_input_tokens_seen": 16846230, + "step": 785, + "time_per_iteration": 3.938485622406006 + }, + { + "auxiliary_loss_clip": 0.01241275, + "auxiliary_loss_mlp": 0.01070035, + "balance_loss_clip": 1.07560015, + "balance_loss_mlp": 1.04388058, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 1.9337675212387733, + "language_loss": 0.89416564, + "learning_rate": 3.996877372161152e-06, + "loss": 0.91727877, + "num_input_tokens_seen": 16865325, + "step": 786, + "time_per_iteration": 2.429950475692749 + }, + { + "auxiliary_loss_clip": 0.01238163, + "auxiliary_loss_mlp": 0.01064041, + "balance_loss_clip": 1.06411028, + "balance_loss_mlp": 1.0335474, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 2.648514436884273, + "language_loss": 0.76800102, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.79102302, + "num_input_tokens_seen": 16882930, + "step": 787, + "time_per_iteration": 2.412330389022827 + }, + { + "auxiliary_loss_clip": 0.01244032, + "auxiliary_loss_mlp": 0.01067331, + "balance_loss_clip": 1.07510829, + "balance_loss_mlp": 1.03883958, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 6.499051222319767, + "language_loss": 0.81525028, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83836389, + "num_input_tokens_seen": 16900710, + "step": 788, + "time_per_iteration": 2.4669766426086426 + }, + { + "auxiliary_loss_clip": 0.01235729, + "auxiliary_loss_mlp": 0.01076326, + "balance_loss_clip": 1.07162452, + "balance_loss_mlp": 1.04630911, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 2.142722615220005, + "language_loss": 0.84583473, + "learning_rate": 3.996811766991355e-06, + "loss": 0.86895537, + "num_input_tokens_seen": 16919210, + "step": 789, + "time_per_iteration": 2.4730541706085205 + }, + { + "auxiliary_loss_clip": 0.01241282, + "auxiliary_loss_mlp": 0.01074065, + "balance_loss_clip": 1.07235622, + "balance_loss_mlp": 1.04613388, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 1.9837400621210797, + "language_loss": 0.81986022, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84301364, + "num_input_tokens_seen": 16937125, + "step": 790, + "time_per_iteration": 2.4003517627716064 + }, + { + "auxiliary_loss_clip": 0.01235746, + "auxiliary_loss_mlp": 0.01067444, + "balance_loss_clip": 1.06868207, + "balance_loss_mlp": 1.0383805, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.7898890851803135, + "language_loss": 0.88114351, + "learning_rate": 3.996767651613597e-06, + "loss": 0.9041754, + "num_input_tokens_seen": 16958610, + "step": 791, + "time_per_iteration": 2.593871593475342 + }, + { + "auxiliary_loss_clip": 0.01239211, + "auxiliary_loss_mlp": 0.01069518, + "balance_loss_clip": 1.07153952, + "balance_loss_mlp": 1.04002607, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.616579866300993, + "language_loss": 0.9011417, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92422903, + "num_input_tokens_seen": 16977300, + "step": 792, + "time_per_iteration": 2.4150002002716064 + }, + { + "auxiliary_loss_clip": 0.01239782, + "auxiliary_loss_mlp": 0.01079459, + "balance_loss_clip": 1.0697186, + "balance_loss_mlp": 1.05232716, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 2.9535998828848316, + "language_loss": 0.73351938, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75671178, + "num_input_tokens_seen": 16994950, + "step": 793, + "time_per_iteration": 3.9140913486480713 + }, + { + "auxiliary_loss_clip": 0.01243389, + "auxiliary_loss_mlp": 0.01072442, + "balance_loss_clip": 1.07144475, + "balance_loss_mlp": 1.04285383, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 2.0913620413473226, + "language_loss": 0.86046457, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88362294, + "num_input_tokens_seen": 17014760, + "step": 794, + "time_per_iteration": 2.4610555171966553 + }, + { + "auxiliary_loss_clip": 0.01238609, + "auxiliary_loss_mlp": 0.01074966, + "balance_loss_clip": 1.06853342, + "balance_loss_mlp": 1.04542553, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 2.4523193173335223, + "language_loss": 0.69006181, + "learning_rate": 3.996678512253272e-06, + "loss": 0.71319753, + "num_input_tokens_seen": 17032715, + "step": 795, + "time_per_iteration": 2.5496227741241455 + }, + { + "auxiliary_loss_clip": 0.01236153, + "auxiliary_loss_mlp": 0.01076765, + "balance_loss_clip": 1.0692116, + "balance_loss_mlp": 1.04712999, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 2.3412111784164966, + "language_loss": 0.80878103, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83191019, + "num_input_tokens_seen": 17052215, + "step": 796, + "time_per_iteration": 2.477062940597534 + }, + { + "auxiliary_loss_clip": 0.0123876, + "auxiliary_loss_mlp": 0.01062563, + "balance_loss_clip": 1.07054293, + "balance_loss_mlp": 1.03322566, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 3.265517921664374, + "language_loss": 0.81445682, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83747011, + "num_input_tokens_seen": 17069225, + "step": 797, + "time_per_iteration": 2.4054341316223145 + }, + { + "auxiliary_loss_clip": 0.01101109, + "auxiliary_loss_mlp": 0.01007865, + "balance_loss_clip": 1.0311625, + "balance_loss_mlp": 1.00369263, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9194786550075268, + "language_loss": 0.64493906, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66602886, + "num_input_tokens_seen": 17126680, + "step": 798, + "time_per_iteration": 2.9645533561706543 + }, + { + "auxiliary_loss_clip": 0.01241551, + "auxiliary_loss_mlp": 0.0107016, + "balance_loss_clip": 1.06646121, + "balance_loss_mlp": 1.04197931, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 2.434010776772705, + "language_loss": 0.90982503, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93294221, + "num_input_tokens_seen": 17144835, + "step": 799, + "time_per_iteration": 2.440366506576538 + }, + { + "auxiliary_loss_clip": 0.01239856, + "auxiliary_loss_mlp": 0.01076045, + "balance_loss_clip": 1.07341468, + "balance_loss_mlp": 1.04681468, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.3868609638626648, + "language_loss": 0.86524975, + "learning_rate": 3.996565384488748e-06, + "loss": 0.88840878, + "num_input_tokens_seen": 17165030, + "step": 800, + "time_per_iteration": 2.454692840576172 + }, + { + "auxiliary_loss_clip": 0.01239009, + "auxiliary_loss_mlp": 0.01071679, + "balance_loss_clip": 1.06763756, + "balance_loss_mlp": 1.04414201, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 2.8216505102319505, + "language_loss": 0.84029329, + "learning_rate": 3.996542531802518e-06, + "loss": 0.8634001, + "num_input_tokens_seen": 17184895, + "step": 801, + "time_per_iteration": 2.431854009628296 + }, + { + "auxiliary_loss_clip": 0.01239581, + "auxiliary_loss_mlp": 0.01079426, + "balance_loss_clip": 1.07035422, + "balance_loss_mlp": 1.05137599, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.9165313716345285, + "language_loss": 0.7969256, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82011563, + "num_input_tokens_seen": 17208225, + "step": 802, + "time_per_iteration": 2.6429035663604736 + }, + { + "auxiliary_loss_clip": 0.01237789, + "auxiliary_loss_mlp": 0.01067505, + "balance_loss_clip": 1.07067502, + "balance_loss_mlp": 1.03971767, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 2.0257270038351227, + "language_loss": 0.86290431, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88595724, + "num_input_tokens_seen": 17226305, + "step": 803, + "time_per_iteration": 2.476884603500366 + }, + { + "auxiliary_loss_clip": 0.01232682, + "auxiliary_loss_mlp": 0.0106685, + "balance_loss_clip": 1.06954777, + "balance_loss_mlp": 1.0384779, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.8909254805060502, + "language_loss": 0.85074073, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87373608, + "num_input_tokens_seen": 17244545, + "step": 804, + "time_per_iteration": 2.4400839805603027 + }, + { + "auxiliary_loss_clip": 0.0123586, + "auxiliary_loss_mlp": 0.0107114, + "balance_loss_clip": 1.06857228, + "balance_loss_mlp": 1.0430069, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 1.946571414738605, + "language_loss": 0.85954505, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88261509, + "num_input_tokens_seen": 17265730, + "step": 805, + "time_per_iteration": 2.4777321815490723 + }, + { + "auxiliary_loss_clip": 0.01232081, + "auxiliary_loss_mlp": 0.01068958, + "balance_loss_clip": 1.06558788, + "balance_loss_mlp": 1.04165912, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 2.0139309401057135, + "language_loss": 0.68099433, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.7040047, + "num_input_tokens_seen": 17284820, + "step": 806, + "time_per_iteration": 2.460287570953369 + }, + { + "auxiliary_loss_clip": 0.01231663, + "auxiliary_loss_mlp": 0.01062113, + "balance_loss_clip": 1.06721449, + "balance_loss_mlp": 1.03337145, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.0599051077682726, + "language_loss": 0.7683779, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79131567, + "num_input_tokens_seen": 17305085, + "step": 807, + "time_per_iteration": 2.445340633392334 + }, + { + "auxiliary_loss_clip": 0.01229005, + "auxiliary_loss_mlp": 0.01071949, + "balance_loss_clip": 1.0628674, + "balance_loss_mlp": 1.04360151, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.3490510168236343, + "language_loss": 0.86825609, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.89126563, + "num_input_tokens_seen": 17322715, + "step": 808, + "time_per_iteration": 2.4186079502105713 + }, + { + "auxiliary_loss_clip": 0.01237223, + "auxiliary_loss_mlp": 0.01072115, + "balance_loss_clip": 1.06740308, + "balance_loss_mlp": 1.04407644, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 2.1229802666749693, + "language_loss": 0.90070331, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92379665, + "num_input_tokens_seen": 17341455, + "step": 809, + "time_per_iteration": 2.44983172416687 + }, + { + "auxiliary_loss_clip": 0.01235948, + "auxiliary_loss_mlp": 0.01079491, + "balance_loss_clip": 1.07026458, + "balance_loss_mlp": 1.05173922, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.0516863387799957, + "language_loss": 0.8425746, + "learning_rate": 3.996333450822208e-06, + "loss": 0.86572891, + "num_input_tokens_seen": 17360765, + "step": 810, + "time_per_iteration": 2.458997964859009 + }, + { + "auxiliary_loss_clip": 0.01236245, + "auxiliary_loss_mlp": 0.010657, + "balance_loss_clip": 1.0674758, + "balance_loss_mlp": 1.03728068, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.6853823260856213, + "language_loss": 0.80609804, + "learning_rate": 3.99630984108452e-06, + "loss": 0.82911754, + "num_input_tokens_seen": 17380625, + "step": 811, + "time_per_iteration": 2.441333055496216 + }, + { + "auxiliary_loss_clip": 0.01227444, + "auxiliary_loss_mlp": 0.01071589, + "balance_loss_clip": 1.06416488, + "balance_loss_mlp": 1.04345584, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 2.9451090541006715, + "language_loss": 0.74374521, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.76673555, + "num_input_tokens_seen": 17399355, + "step": 812, + "time_per_iteration": 2.431432008743286 + }, + { + "auxiliary_loss_clip": 0.01228958, + "auxiliary_loss_mlp": 0.01078879, + "balance_loss_clip": 1.06819725, + "balance_loss_mlp": 1.0518899, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 1.8873582154487936, + "language_loss": 0.9005689, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92364728, + "num_input_tokens_seen": 17418240, + "step": 813, + "time_per_iteration": 2.4605443477630615 + }, + { + "auxiliary_loss_clip": 0.01227252, + "auxiliary_loss_mlp": 0.01065114, + "balance_loss_clip": 1.06511796, + "balance_loss_mlp": 1.03824472, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.000096554449938, + "language_loss": 0.74541187, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.76833558, + "num_input_tokens_seen": 17436250, + "step": 814, + "time_per_iteration": 2.435209035873413 + }, + { + "auxiliary_loss_clip": 0.01229653, + "auxiliary_loss_mlp": 0.01074413, + "balance_loss_clip": 1.06439364, + "balance_loss_mlp": 1.04571927, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.141065182964396, + "language_loss": 0.83546644, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.85850716, + "num_input_tokens_seen": 17455750, + "step": 815, + "time_per_iteration": 2.467069387435913 + }, + { + "auxiliary_loss_clip": 0.01234913, + "auxiliary_loss_mlp": 0.01060462, + "balance_loss_clip": 1.06746876, + "balance_loss_mlp": 1.03166068, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.462878866424542, + "language_loss": 0.90575987, + "learning_rate": 3.996190656910043e-06, + "loss": 0.92871362, + "num_input_tokens_seen": 17474995, + "step": 816, + "time_per_iteration": 2.4726932048797607 + }, + { + "auxiliary_loss_clip": 0.01234976, + "auxiliary_loss_mlp": 0.0106455, + "balance_loss_clip": 1.06679976, + "balance_loss_mlp": 1.03611863, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 2.2332923563673717, + "language_loss": 0.79795063, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82094586, + "num_input_tokens_seen": 17493395, + "step": 817, + "time_per_iteration": 2.4487578868865967 + }, + { + "auxiliary_loss_clip": 0.01229192, + "auxiliary_loss_mlp": 0.01083167, + "balance_loss_clip": 1.06625581, + "balance_loss_mlp": 1.05536699, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 2.543633851474409, + "language_loss": 0.85016382, + "learning_rate": 3.996142453363656e-06, + "loss": 0.87328744, + "num_input_tokens_seen": 17514565, + "step": 818, + "time_per_iteration": 2.477248191833496 + }, + { + "auxiliary_loss_clip": 0.0123594, + "auxiliary_loss_mlp": 0.01070774, + "balance_loss_clip": 1.06567323, + "balance_loss_mlp": 1.04121041, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.3127434235885618, + "language_loss": 0.75353169, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77659881, + "num_input_tokens_seen": 17534590, + "step": 819, + "time_per_iteration": 3.9822440147399902 + }, + { + "auxiliary_loss_clip": 0.01232117, + "auxiliary_loss_mlp": 0.01064494, + "balance_loss_clip": 1.06848478, + "balance_loss_mlp": 1.03892338, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.2592270455559746, + "language_loss": 0.85086131, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87382746, + "num_input_tokens_seen": 17551900, + "step": 820, + "time_per_iteration": 2.4077768325805664 + }, + { + "auxiliary_loss_clip": 0.01228371, + "auxiliary_loss_mlp": 0.01066033, + "balance_loss_clip": 1.06266582, + "balance_loss_mlp": 1.03818619, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 2.0350754778947606, + "language_loss": 0.90447831, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92742234, + "num_input_tokens_seen": 17571485, + "step": 821, + "time_per_iteration": 3.9568443298339844 + }, + { + "auxiliary_loss_clip": 0.01226881, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_clip": 1.06260562, + "balance_loss_mlp": 1.05431581, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 1.880553917478659, + "language_loss": 0.89557123, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91865534, + "num_input_tokens_seen": 17591410, + "step": 822, + "time_per_iteration": 2.442838668823242 + }, + { + "auxiliary_loss_clip": 0.01229754, + "auxiliary_loss_mlp": 0.01065813, + "balance_loss_clip": 1.06679881, + "balance_loss_mlp": 1.03574848, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 1.828024873536172, + "language_loss": 0.67288631, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69584197, + "num_input_tokens_seen": 17612010, + "step": 823, + "time_per_iteration": 2.4995510578155518 + }, + { + "auxiliary_loss_clip": 0.01091125, + "auxiliary_loss_mlp": 0.01006897, + "balance_loss_clip": 1.02103853, + "balance_loss_mlp": 1.00322485, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.371227997919765, + "language_loss": 0.62275982, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64374, + "num_input_tokens_seen": 17673430, + "step": 824, + "time_per_iteration": 3.1009676456451416 + }, + { + "auxiliary_loss_clip": 0.01230541, + "auxiliary_loss_mlp": 0.01075686, + "balance_loss_clip": 1.06408072, + "balance_loss_mlp": 1.04652715, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 1.8402962095048574, + "language_loss": 0.90357554, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92663783, + "num_input_tokens_seen": 17689545, + "step": 825, + "time_per_iteration": 2.4253194332122803 + }, + { + "auxiliary_loss_clip": 0.01229858, + "auxiliary_loss_mlp": 0.01067922, + "balance_loss_clip": 1.06400132, + "balance_loss_mlp": 1.03895378, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 13.409175094123542, + "language_loss": 0.66978091, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69275874, + "num_input_tokens_seen": 17705965, + "step": 826, + "time_per_iteration": 3.8194074630737305 + }, + { + "auxiliary_loss_clip": 0.01230079, + "auxiliary_loss_mlp": 0.01069082, + "balance_loss_clip": 1.06426632, + "balance_loss_mlp": 1.04062629, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 1.7581373668976847, + "language_loss": 0.78497219, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80796379, + "num_input_tokens_seen": 17724580, + "step": 827, + "time_per_iteration": 2.4274981021881104 + }, + { + "auxiliary_loss_clip": 0.01232638, + "auxiliary_loss_mlp": 0.01086513, + "balance_loss_clip": 1.06492507, + "balance_loss_mlp": 1.05644846, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 1.7850049554092975, + "language_loss": 0.78769529, + "learning_rate": 3.995896894144294e-06, + "loss": 0.81088674, + "num_input_tokens_seen": 17747755, + "step": 828, + "time_per_iteration": 2.5983362197875977 + }, + { + "auxiliary_loss_clip": 0.01219777, + "auxiliary_loss_mlp": 0.01061993, + "balance_loss_clip": 1.05871415, + "balance_loss_mlp": 1.03515887, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.0418137714870985, + "language_loss": 0.83529603, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85811371, + "num_input_tokens_seen": 17768550, + "step": 829, + "time_per_iteration": 2.460244655609131 + }, + { + "auxiliary_loss_clip": 0.0122665, + "auxiliary_loss_mlp": 0.01080849, + "balance_loss_clip": 1.06164551, + "balance_loss_mlp": 1.04949689, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.0047721408152173, + "language_loss": 0.75678015, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77985513, + "num_input_tokens_seen": 17786080, + "step": 830, + "time_per_iteration": 2.402846097946167 + }, + { + "auxiliary_loss_clip": 0.01227575, + "auxiliary_loss_mlp": 0.01074342, + "balance_loss_clip": 1.06262529, + "balance_loss_mlp": 1.04512429, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 2.098754861963188, + "language_loss": 0.79487383, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81789303, + "num_input_tokens_seen": 17803635, + "step": 831, + "time_per_iteration": 2.4041030406951904 + }, + { + "auxiliary_loss_clip": 0.01178692, + "auxiliary_loss_mlp": 0.01077632, + "balance_loss_clip": 1.05687284, + "balance_loss_mlp": 1.05023742, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.764145010820449, + "language_loss": 0.91071451, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93327767, + "num_input_tokens_seen": 17822190, + "step": 832, + "time_per_iteration": 2.6146745681762695 + }, + { + "auxiliary_loss_clip": 0.01191694, + "auxiliary_loss_mlp": 0.01076365, + "balance_loss_clip": 1.05623138, + "balance_loss_mlp": 1.05021, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 2.461869744431905, + "language_loss": 0.83010626, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85278684, + "num_input_tokens_seen": 17846915, + "step": 833, + "time_per_iteration": 4.177637100219727 + }, + { + "auxiliary_loss_clip": 0.01200999, + "auxiliary_loss_mlp": 0.01061439, + "balance_loss_clip": 1.06012893, + "balance_loss_mlp": 1.03285241, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.302439903311429, + "language_loss": 0.82291353, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84553796, + "num_input_tokens_seen": 17867270, + "step": 834, + "time_per_iteration": 2.6502063274383545 + }, + { + "auxiliary_loss_clip": 0.01214686, + "auxiliary_loss_mlp": 0.01070458, + "balance_loss_clip": 1.06098747, + "balance_loss_mlp": 1.04181218, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 3.38457451517319, + "language_loss": 0.91800714, + "learning_rate": 3.995720499401282e-06, + "loss": 0.9408586, + "num_input_tokens_seen": 17884880, + "step": 835, + "time_per_iteration": 2.4583771228790283 + }, + { + "auxiliary_loss_clip": 0.01225701, + "auxiliary_loss_mlp": 0.01069059, + "balance_loss_clip": 1.05825794, + "balance_loss_mlp": 1.03965032, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.8045881350358135, + "language_loss": 0.76385713, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78680468, + "num_input_tokens_seen": 17903695, + "step": 836, + "time_per_iteration": 2.4993746280670166 + }, + { + "auxiliary_loss_clip": 0.01210819, + "auxiliary_loss_mlp": 0.01075207, + "balance_loss_clip": 1.06137836, + "balance_loss_mlp": 1.04808688, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.596284073134643, + "language_loss": 0.83927035, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.86213064, + "num_input_tokens_seen": 17920745, + "step": 837, + "time_per_iteration": 2.489959716796875 + }, + { + "auxiliary_loss_clip": 0.01196308, + "auxiliary_loss_mlp": 0.00797706, + "balance_loss_clip": 1.06093454, + "balance_loss_mlp": 1.00051856, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 4.850537473509262, + "language_loss": 0.72737455, + "learning_rate": 3.995643766466275e-06, + "loss": 0.74731469, + "num_input_tokens_seen": 17938220, + "step": 838, + "time_per_iteration": 2.5099668502807617 + }, + { + "auxiliary_loss_clip": 0.01191185, + "auxiliary_loss_mlp": 0.01069964, + "balance_loss_clip": 1.05405366, + "balance_loss_mlp": 1.04254603, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.7202706742063685, + "language_loss": 0.8322916, + "learning_rate": 3.995618037469953e-06, + "loss": 0.8549031, + "num_input_tokens_seen": 17957325, + "step": 839, + "time_per_iteration": 2.499276638031006 + }, + { + "auxiliary_loss_clip": 0.01221879, + "auxiliary_loss_mlp": 0.0107312, + "balance_loss_clip": 1.06067443, + "balance_loss_mlp": 1.04577339, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.412685271619069, + "language_loss": 0.85709155, + "learning_rate": 3.995592232799595e-06, + "loss": 0.8800416, + "num_input_tokens_seen": 17975875, + "step": 840, + "time_per_iteration": 2.4684126377105713 + }, + { + "auxiliary_loss_clip": 0.0119563, + "auxiliary_loss_mlp": 0.01064976, + "balance_loss_clip": 1.05545425, + "balance_loss_mlp": 1.03621125, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 1.7619397969340063, + "language_loss": 0.94405484, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96666086, + "num_input_tokens_seen": 17994340, + "step": 841, + "time_per_iteration": 2.5375781059265137 + }, + { + "auxiliary_loss_clip": 0.01227714, + "auxiliary_loss_mlp": 0.01071969, + "balance_loss_clip": 1.06312692, + "balance_loss_mlp": 1.0430969, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.4339156647895033, + "language_loss": 0.77705437, + "learning_rate": 3.995540396440688e-06, + "loss": 0.80005121, + "num_input_tokens_seen": 18015260, + "step": 842, + "time_per_iteration": 2.5282416343688965 + }, + { + "auxiliary_loss_clip": 0.01215567, + "auxiliary_loss_mlp": 0.01070051, + "balance_loss_clip": 1.06379008, + "balance_loss_mlp": 1.0426805, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 4.652788483674908, + "language_loss": 0.78117394, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80403018, + "num_input_tokens_seen": 18033960, + "step": 843, + "time_per_iteration": 2.4849913120269775 + }, + { + "auxiliary_loss_clip": 0.01214167, + "auxiliary_loss_mlp": 0.0106266, + "balance_loss_clip": 1.06261659, + "balance_loss_mlp": 1.03713679, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.0869192539844357, + "language_loss": 0.83367014, + "learning_rate": 3.995488257397417e-06, + "loss": 0.8564384, + "num_input_tokens_seen": 18056700, + "step": 844, + "time_per_iteration": 2.612112283706665 + }, + { + "auxiliary_loss_clip": 0.01213421, + "auxiliary_loss_mlp": 0.01067144, + "balance_loss_clip": 1.06074679, + "balance_loss_mlp": 1.04073954, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.137996664233096, + "language_loss": 0.76321322, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78601885, + "num_input_tokens_seen": 18075815, + "step": 845, + "time_per_iteration": 2.4834036827087402 + }, + { + "auxiliary_loss_clip": 0.01205436, + "auxiliary_loss_mlp": 0.01071374, + "balance_loss_clip": 1.05704093, + "balance_loss_mlp": 1.04363441, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 1.909089931846759, + "language_loss": 0.877693, + "learning_rate": 3.99543581567769e-06, + "loss": 0.90046108, + "num_input_tokens_seen": 18095095, + "step": 846, + "time_per_iteration": 2.4773426055908203 + }, + { + "auxiliary_loss_clip": 0.0120617, + "auxiliary_loss_mlp": 0.01069254, + "balance_loss_clip": 1.06009746, + "balance_loss_mlp": 1.04284954, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.695731149306032, + "language_loss": 0.8756249, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89837921, + "num_input_tokens_seen": 18112675, + "step": 847, + "time_per_iteration": 2.483109474182129 + }, + { + "auxiliary_loss_clip": 0.01172536, + "auxiliary_loss_mlp": 0.01065373, + "balance_loss_clip": 1.05837035, + "balance_loss_mlp": 1.03776383, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.2783644447253137, + "language_loss": 0.81838596, + "learning_rate": 3.995383071289462e-06, + "loss": 0.840765, + "num_input_tokens_seen": 18130745, + "step": 848, + "time_per_iteration": 2.5975418090820312 + }, + { + "auxiliary_loss_clip": 0.01225235, + "auxiliary_loss_mlp": 0.0107119, + "balance_loss_clip": 1.06577611, + "balance_loss_mlp": 1.04395103, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.8562488427320754, + "language_loss": 0.87720466, + "learning_rate": 3.995356585597158e-06, + "loss": 0.9001689, + "num_input_tokens_seen": 18152410, + "step": 849, + "time_per_iteration": 2.5288310050964355 + }, + { + "auxiliary_loss_clip": 0.01221448, + "auxiliary_loss_mlp": 0.01060326, + "balance_loss_clip": 1.06115496, + "balance_loss_mlp": 1.03368306, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 1.8955226226505284, + "language_loss": 0.82833147, + "learning_rate": 3.995330024240732e-06, + "loss": 0.8511492, + "num_input_tokens_seen": 18170870, + "step": 850, + "time_per_iteration": 2.4306674003601074 + }, + { + "auxiliary_loss_clip": 0.012098, + "auxiliary_loss_mlp": 0.0106349, + "balance_loss_clip": 1.05954313, + "balance_loss_mlp": 1.03701353, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.507728107163912, + "language_loss": 0.65038192, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67311478, + "num_input_tokens_seen": 18191555, + "step": 851, + "time_per_iteration": 2.622093915939331 + }, + { + "auxiliary_loss_clip": 0.01207935, + "auxiliary_loss_mlp": 0.0107275, + "balance_loss_clip": 1.05911851, + "balance_loss_mlp": 1.04434264, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.386106011599839, + "language_loss": 0.83459967, + "learning_rate": 3.995276674539547e-06, + "loss": 0.85740656, + "num_input_tokens_seen": 18208620, + "step": 852, + "time_per_iteration": 2.485888719558716 + }, + { + "auxiliary_loss_clip": 0.01198026, + "auxiliary_loss_mlp": 0.01075765, + "balance_loss_clip": 1.06176138, + "balance_loss_mlp": 1.04814458, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 2.073101658166736, + "language_loss": 0.80419934, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82693732, + "num_input_tokens_seen": 18226370, + "step": 853, + "time_per_iteration": 2.492105484008789 + }, + { + "auxiliary_loss_clip": 0.01221289, + "auxiliary_loss_mlp": 0.0106926, + "balance_loss_clip": 1.06139517, + "balance_loss_mlp": 1.04134095, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 2.332269190504179, + "language_loss": 0.75495136, + "learning_rate": 3.995223022193999e-06, + "loss": 0.77785689, + "num_input_tokens_seen": 18247075, + "step": 854, + "time_per_iteration": 2.4944069385528564 + }, + { + "auxiliary_loss_clip": 0.01200473, + "auxiliary_loss_mlp": 0.01069892, + "balance_loss_clip": 1.05951333, + "balance_loss_mlp": 1.04159141, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.2162091490106413, + "language_loss": 0.81547236, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83817607, + "num_input_tokens_seen": 18265680, + "step": 855, + "time_per_iteration": 2.5825555324554443 + }, + { + "auxiliary_loss_clip": 0.0107136, + "auxiliary_loss_mlp": 0.00766713, + "balance_loss_clip": 1.02948904, + "balance_loss_mlp": 1.00051546, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 0.9942383132758487, + "language_loss": 0.65723801, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67561865, + "num_input_tokens_seen": 18327015, + "step": 856, + "time_per_iteration": 3.1011388301849365 + }, + { + "auxiliary_loss_clip": 0.01191223, + "auxiliary_loss_mlp": 0.01059113, + "balance_loss_clip": 1.05770421, + "balance_loss_mlp": 1.03229117, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.969303829652318, + "language_loss": 0.76915562, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.791659, + "num_input_tokens_seen": 18345235, + "step": 857, + "time_per_iteration": 2.523027181625366 + }, + { + "auxiliary_loss_clip": 0.0118069, + "auxiliary_loss_mlp": 0.01061451, + "balance_loss_clip": 1.05304587, + "balance_loss_mlp": 1.03356743, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 2.3905064468330943, + "language_loss": 0.8902756, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91269696, + "num_input_tokens_seen": 18362350, + "step": 858, + "time_per_iteration": 2.535240411758423 + }, + { + "auxiliary_loss_clip": 0.01196628, + "auxiliary_loss_mlp": 0.01059962, + "balance_loss_clip": 1.06116319, + "balance_loss_mlp": 1.03319895, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 1.8940883074159334, + "language_loss": 0.75124407, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77380991, + "num_input_tokens_seen": 18383390, + "step": 859, + "time_per_iteration": 4.003504037857056 + }, + { + "auxiliary_loss_clip": 0.01190692, + "auxiliary_loss_mlp": 0.0107453, + "balance_loss_clip": 1.05780053, + "balance_loss_mlp": 1.04423845, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.869091248240738, + "language_loss": 0.90736568, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93001789, + "num_input_tokens_seen": 18399220, + "step": 860, + "time_per_iteration": 2.509953498840332 + }, + { + "auxiliary_loss_clip": 0.01227545, + "auxiliary_loss_mlp": 0.01062592, + "balance_loss_clip": 1.06628597, + "balance_loss_mlp": 1.03562653, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.948260287705401, + "language_loss": 0.82418263, + "learning_rate": 3.99503285577813e-06, + "loss": 0.84708405, + "num_input_tokens_seen": 18419005, + "step": 861, + "time_per_iteration": 3.950298547744751 + }, + { + "auxiliary_loss_clip": 0.011982, + "auxiliary_loss_mlp": 0.01060372, + "balance_loss_clip": 1.06008196, + "balance_loss_mlp": 1.03325236, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 1.6973841410895438, + "language_loss": 0.78429914, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80688477, + "num_input_tokens_seen": 18440550, + "step": 862, + "time_per_iteration": 2.6051254272460938 + }, + { + "auxiliary_loss_clip": 0.01192922, + "auxiliary_loss_mlp": 0.0107149, + "balance_loss_clip": 1.06008816, + "balance_loss_mlp": 1.04518056, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 2.1241859790696553, + "language_loss": 0.8909955, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91363961, + "num_input_tokens_seen": 18461950, + "step": 863, + "time_per_iteration": 2.5761871337890625 + }, + { + "auxiliary_loss_clip": 0.0120317, + "auxiliary_loss_mlp": 0.01064457, + "balance_loss_clip": 1.06452072, + "balance_loss_mlp": 1.03598976, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.9175266999137004, + "language_loss": 0.75846553, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78114176, + "num_input_tokens_seen": 18480555, + "step": 864, + "time_per_iteration": 2.554403305053711 + }, + { + "auxiliary_loss_clip": 0.01189975, + "auxiliary_loss_mlp": 0.0107295, + "balance_loss_clip": 1.0570879, + "balance_loss_mlp": 1.04410112, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 1.8808909474563305, + "language_loss": 0.7881037, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81073296, + "num_input_tokens_seen": 18499645, + "step": 865, + "time_per_iteration": 2.5132880210876465 + }, + { + "auxiliary_loss_clip": 0.01210393, + "auxiliary_loss_mlp": 0.01067568, + "balance_loss_clip": 1.06000113, + "balance_loss_mlp": 1.03993487, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 3.574554389746405, + "language_loss": 0.86017549, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88295513, + "num_input_tokens_seen": 18516810, + "step": 866, + "time_per_iteration": 3.8746252059936523 + }, + { + "auxiliary_loss_clip": 0.01190881, + "auxiliary_loss_mlp": 0.01067368, + "balance_loss_clip": 1.06580484, + "balance_loss_mlp": 1.03953207, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.2565447090295514, + "language_loss": 0.87351185, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89609438, + "num_input_tokens_seen": 18532510, + "step": 867, + "time_per_iteration": 2.532383680343628 + }, + { + "auxiliary_loss_clip": 0.01171831, + "auxiliary_loss_mlp": 0.01075512, + "balance_loss_clip": 1.05886555, + "balance_loss_mlp": 1.05049014, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.5057044802546473, + "language_loss": 0.63670397, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65917742, + "num_input_tokens_seen": 18557380, + "step": 868, + "time_per_iteration": 2.640751838684082 + }, + { + "auxiliary_loss_clip": 0.01227444, + "auxiliary_loss_mlp": 0.01072082, + "balance_loss_clip": 1.06528854, + "balance_loss_mlp": 1.04219651, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 2.1005293423911953, + "language_loss": 0.8380146, + "learning_rate": 3.994810983642281e-06, + "loss": 0.86100996, + "num_input_tokens_seen": 18575720, + "step": 869, + "time_per_iteration": 2.4836316108703613 + }, + { + "auxiliary_loss_clip": 0.01214459, + "auxiliary_loss_mlp": 0.01060661, + "balance_loss_clip": 1.0631299, + "balance_loss_mlp": 1.03380334, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 5.055641785141173, + "language_loss": 0.87790811, + "learning_rate": 3.994782909218751e-06, + "loss": 0.90065932, + "num_input_tokens_seen": 18592185, + "step": 870, + "time_per_iteration": 2.4708313941955566 + }, + { + "auxiliary_loss_clip": 0.01225631, + "auxiliary_loss_mlp": 0.01065366, + "balance_loss_clip": 1.06469536, + "balance_loss_mlp": 1.03891325, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 2.497487235887047, + "language_loss": 0.80579704, + "learning_rate": 3.994754759152854e-06, + "loss": 0.82870698, + "num_input_tokens_seen": 18609560, + "step": 871, + "time_per_iteration": 2.4551045894622803 + }, + { + "auxiliary_loss_clip": 0.01194868, + "auxiliary_loss_mlp": 0.01069888, + "balance_loss_clip": 1.06409335, + "balance_loss_mlp": 1.04374552, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 2.8416296589484524, + "language_loss": 0.81543493, + "learning_rate": 3.994726533445656e-06, + "loss": 0.83808243, + "num_input_tokens_seen": 18629405, + "step": 872, + "time_per_iteration": 2.5270702838897705 + }, + { + "auxiliary_loss_clip": 0.01073976, + "auxiliary_loss_mlp": 0.01014194, + "balance_loss_clip": 1.0278163, + "balance_loss_mlp": 1.01004505, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8805670996508453, + "language_loss": 0.61577356, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63665521, + "num_input_tokens_seen": 18681480, + "step": 873, + "time_per_iteration": 4.351958751678467 + }, + { + "auxiliary_loss_clip": 0.01195709, + "auxiliary_loss_mlp": 0.01059967, + "balance_loss_clip": 1.06125712, + "balance_loss_mlp": 1.03270411, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 2.6729393417192453, + "language_loss": 0.89082885, + "learning_rate": 3.994669855111643e-06, + "loss": 0.91338563, + "num_input_tokens_seen": 18700390, + "step": 874, + "time_per_iteration": 2.5410587787628174 + }, + { + "auxiliary_loss_clip": 0.01196579, + "auxiliary_loss_mlp": 0.01067976, + "balance_loss_clip": 1.05964923, + "balance_loss_mlp": 1.04084373, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 1.9955694571422589, + "language_loss": 0.74790943, + "learning_rate": 3.994641402486977e-06, + "loss": 0.77055496, + "num_input_tokens_seen": 18721280, + "step": 875, + "time_per_iteration": 2.61405348777771 + }, + { + "auxiliary_loss_clip": 0.01204247, + "auxiliary_loss_mlp": 0.01057706, + "balance_loss_clip": 1.06175554, + "balance_loss_mlp": 1.02995431, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 2.0243003716286654, + "language_loss": 0.92933851, + "learning_rate": 3.99461287422531e-06, + "loss": 0.95195806, + "num_input_tokens_seen": 18741545, + "step": 876, + "time_per_iteration": 2.521303415298462 + }, + { + "auxiliary_loss_clip": 0.01093645, + "auxiliary_loss_mlp": 0.01003513, + "balance_loss_clip": 1.02197409, + "balance_loss_mlp": 0.99929333, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.8636004238973327, + "language_loss": 0.62864506, + "learning_rate": 3.994584270327722e-06, + "loss": 0.64961666, + "num_input_tokens_seen": 18801400, + "step": 877, + "time_per_iteration": 3.0433216094970703 + }, + { + "auxiliary_loss_clip": 0.01197915, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_clip": 1.05908251, + "balance_loss_mlp": 1.04703712, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.9654833560040843, + "language_loss": 0.85329771, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87602758, + "num_input_tokens_seen": 18819670, + "step": 878, + "time_per_iteration": 2.5386528968811035 + }, + { + "auxiliary_loss_clip": 0.01226211, + "auxiliary_loss_mlp": 0.01062162, + "balance_loss_clip": 1.06297863, + "balance_loss_mlp": 1.03513741, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 1.759349287192384, + "language_loss": 0.83095241, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85383612, + "num_input_tokens_seen": 18840580, + "step": 879, + "time_per_iteration": 2.5388169288635254 + }, + { + "auxiliary_loss_clip": 0.01193718, + "auxiliary_loss_mlp": 0.01071702, + "balance_loss_clip": 1.06229305, + "balance_loss_mlp": 1.04416502, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 1.9228788524012972, + "language_loss": 0.84215546, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86480975, + "num_input_tokens_seen": 18859295, + "step": 880, + "time_per_iteration": 2.5041861534118652 + }, + { + "auxiliary_loss_clip": 0.01185634, + "auxiliary_loss_mlp": 0.01067518, + "balance_loss_clip": 1.06018114, + "balance_loss_mlp": 1.04107761, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 2.3451045462492117, + "language_loss": 0.86973321, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89226472, + "num_input_tokens_seen": 18877485, + "step": 881, + "time_per_iteration": 2.5471794605255127 + }, + { + "auxiliary_loss_clip": 0.01207465, + "auxiliary_loss_mlp": 0.01065209, + "balance_loss_clip": 1.0597496, + "balance_loss_mlp": 1.03655136, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.744744129778095, + "language_loss": 0.88042551, + "learning_rate": 3.994440116339046e-06, + "loss": 0.90315229, + "num_input_tokens_seen": 18898275, + "step": 882, + "time_per_iteration": 2.537738084793091 + }, + { + "auxiliary_loss_clip": 0.01226593, + "auxiliary_loss_mlp": 0.01064667, + "balance_loss_clip": 1.06392276, + "balance_loss_mlp": 1.03591371, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 3.7770039807333724, + "language_loss": 0.6924786, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71539116, + "num_input_tokens_seen": 18920665, + "step": 883, + "time_per_iteration": 2.55863356590271 + }, + { + "auxiliary_loss_clip": 0.0117173, + "auxiliary_loss_mlp": 0.01067222, + "balance_loss_clip": 1.05536366, + "balance_loss_mlp": 1.0404954, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 1.941419342075546, + "language_loss": 0.76233119, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78472078, + "num_input_tokens_seen": 18939835, + "step": 884, + "time_per_iteration": 2.5535781383514404 + }, + { + "auxiliary_loss_clip": 0.01175098, + "auxiliary_loss_mlp": 0.01064046, + "balance_loss_clip": 1.06494296, + "balance_loss_mlp": 1.03810644, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.5329088433853735, + "language_loss": 0.85485196, + "learning_rate": 3.994352716384659e-06, + "loss": 0.87724334, + "num_input_tokens_seen": 18958405, + "step": 885, + "time_per_iteration": 2.572176218032837 + }, + { + "auxiliary_loss_clip": 0.01188756, + "auxiliary_loss_mlp": 0.01069357, + "balance_loss_clip": 1.05544567, + "balance_loss_mlp": 1.04098547, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.7212553650227385, + "language_loss": 0.86587512, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88845628, + "num_input_tokens_seen": 18975445, + "step": 886, + "time_per_iteration": 2.4905951023101807 + }, + { + "auxiliary_loss_clip": 0.01174678, + "auxiliary_loss_mlp": 0.01069945, + "balance_loss_clip": 1.05799317, + "balance_loss_mlp": 1.04146552, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 1.9371065870499078, + "language_loss": 0.89428759, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91673386, + "num_input_tokens_seen": 18991930, + "step": 887, + "time_per_iteration": 2.52738881111145 + }, + { + "auxiliary_loss_clip": 0.01151298, + "auxiliary_loss_mlp": 0.01077746, + "balance_loss_clip": 1.04702842, + "balance_loss_mlp": 1.04747844, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 5.359491548668891, + "language_loss": 0.75126338, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77355385, + "num_input_tokens_seen": 19009790, + "step": 888, + "time_per_iteration": 2.607337713241577 + }, + { + "auxiliary_loss_clip": 0.01167135, + "auxiliary_loss_mlp": 0.01075044, + "balance_loss_clip": 1.05773044, + "balance_loss_mlp": 1.04679132, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 11.891086296640053, + "language_loss": 0.88721037, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90963215, + "num_input_tokens_seen": 19030170, + "step": 889, + "time_per_iteration": 2.6040265560150146 + }, + { + "auxiliary_loss_clip": 0.01217064, + "auxiliary_loss_mlp": 0.01055636, + "balance_loss_clip": 1.06152868, + "balance_loss_mlp": 1.02920771, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 2.080233791266751, + "language_loss": 0.88412642, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90685344, + "num_input_tokens_seen": 19048075, + "step": 890, + "time_per_iteration": 2.449506998062134 + }, + { + "auxiliary_loss_clip": 0.01196121, + "auxiliary_loss_mlp": 0.01072347, + "balance_loss_clip": 1.05773759, + "balance_loss_mlp": 1.04628778, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.5416643515864705, + "language_loss": 0.93595612, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95864075, + "num_input_tokens_seen": 19067465, + "step": 891, + "time_per_iteration": 2.565561056137085 + }, + { + "auxiliary_loss_clip": 0.01194868, + "auxiliary_loss_mlp": 0.01072087, + "balance_loss_clip": 1.05811143, + "balance_loss_mlp": 1.0427134, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.148466100420052, + "language_loss": 0.72020185, + "learning_rate": 3.994146136297893e-06, + "loss": 0.7428714, + "num_input_tokens_seen": 19085505, + "step": 892, + "time_per_iteration": 2.4417145252227783 + }, + { + "auxiliary_loss_clip": 0.01198411, + "auxiliary_loss_mlp": 0.0079776, + "balance_loss_clip": 1.06165934, + "balance_loss_mlp": 1.00019407, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 2.2197938670811324, + "language_loss": 0.82533729, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84529901, + "num_input_tokens_seen": 19104360, + "step": 893, + "time_per_iteration": 2.5682311058044434 + }, + { + "auxiliary_loss_clip": 0.01198661, + "auxiliary_loss_mlp": 0.0106897, + "balance_loss_clip": 1.06173587, + "balance_loss_mlp": 1.04291129, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 2.6176276412582458, + "language_loss": 0.81711006, + "learning_rate": 3.994086432835114e-06, + "loss": 0.83978641, + "num_input_tokens_seen": 19124680, + "step": 894, + "time_per_iteration": 2.5556201934814453 + }, + { + "auxiliary_loss_clip": 0.01205087, + "auxiliary_loss_mlp": 0.01063455, + "balance_loss_clip": 1.05897701, + "balance_loss_mlp": 1.03635859, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.276206181079277, + "language_loss": 0.75518048, + "learning_rate": 3.994056467679221e-06, + "loss": 0.77786589, + "num_input_tokens_seen": 19142895, + "step": 895, + "time_per_iteration": 2.473027467727661 + }, + { + "auxiliary_loss_clip": 0.01202816, + "auxiliary_loss_mlp": 0.01063328, + "balance_loss_clip": 1.06678331, + "balance_loss_mlp": 1.03654182, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.4254460083761074, + "language_loss": 0.8644954, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88715684, + "num_input_tokens_seen": 19163125, + "step": 896, + "time_per_iteration": 2.5322916507720947 + }, + { + "auxiliary_loss_clip": 0.01224999, + "auxiliary_loss_mlp": 0.00796734, + "balance_loss_clip": 1.06336451, + "balance_loss_mlp": 1.00024295, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.5604505113776015, + "language_loss": 0.88300252, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.90321982, + "num_input_tokens_seen": 19179385, + "step": 897, + "time_per_iteration": 2.41622257232666 + }, + { + "auxiliary_loss_clip": 0.01204755, + "auxiliary_loss_mlp": 0.01065114, + "balance_loss_clip": 1.06193042, + "balance_loss_mlp": 1.0365988, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.6433249573088438, + "language_loss": 0.9036411, + "learning_rate": 3.993966118527175e-06, + "loss": 0.92633981, + "num_input_tokens_seen": 19198725, + "step": 898, + "time_per_iteration": 4.023061513900757 + }, + { + "auxiliary_loss_clip": 0.01204909, + "auxiliary_loss_mlp": 0.01082829, + "balance_loss_clip": 1.06122231, + "balance_loss_mlp": 1.05584049, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 3.024975324749003, + "language_loss": 0.91888297, + "learning_rate": 3.993935850918845e-06, + "loss": 0.9417603, + "num_input_tokens_seen": 19212380, + "step": 899, + "time_per_iteration": 2.505399227142334 + }, + { + "auxiliary_loss_clip": 0.01192243, + "auxiliary_loss_mlp": 0.0107274, + "balance_loss_clip": 1.05994487, + "balance_loss_mlp": 1.04454744, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.207694958755704, + "language_loss": 0.75333345, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77598333, + "num_input_tokens_seen": 19232235, + "step": 900, + "time_per_iteration": 3.9387872219085693 + }, + { + "auxiliary_loss_clip": 0.01217315, + "auxiliary_loss_mlp": 0.01057997, + "balance_loss_clip": 1.06369627, + "balance_loss_mlp": 1.03265285, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.233349388301411, + "language_loss": 0.74144924, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76420242, + "num_input_tokens_seen": 19251460, + "step": 901, + "time_per_iteration": 2.4992516040802 + }, + { + "auxiliary_loss_clip": 0.01185799, + "auxiliary_loss_mlp": 0.01070615, + "balance_loss_clip": 1.06202388, + "balance_loss_mlp": 1.04589152, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.1249302965470447, + "language_loss": 0.85013276, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87269694, + "num_input_tokens_seen": 19269060, + "step": 902, + "time_per_iteration": 2.6636290550231934 + }, + { + "auxiliary_loss_clip": 0.01168717, + "auxiliary_loss_mlp": 0.01070557, + "balance_loss_clip": 1.05301118, + "balance_loss_mlp": 1.04321074, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 1.8612943531485155, + "language_loss": 0.86597848, + "learning_rate": 3.993814024394569e-06, + "loss": 0.88837123, + "num_input_tokens_seen": 19288620, + "step": 903, + "time_per_iteration": 2.6270058155059814 + }, + { + "auxiliary_loss_clip": 0.01209654, + "auxiliary_loss_mlp": 0.01064749, + "balance_loss_clip": 1.06046891, + "balance_loss_mlp": 1.03933334, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.149592034855251, + "language_loss": 0.75275266, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77549672, + "num_input_tokens_seen": 19306615, + "step": 904, + "time_per_iteration": 2.4654436111450195 + }, + { + "auxiliary_loss_clip": 0.01208456, + "auxiliary_loss_mlp": 0.01077974, + "balance_loss_clip": 1.05971813, + "balance_loss_mlp": 1.05193889, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.091155730420312, + "language_loss": 0.85872817, + "learning_rate": 3.993752657494039e-06, + "loss": 0.88159245, + "num_input_tokens_seen": 19321680, + "step": 905, + "time_per_iteration": 3.8718619346618652 + }, + { + "auxiliary_loss_clip": 0.01200092, + "auxiliary_loss_mlp": 0.0107542, + "balance_loss_clip": 1.06639266, + "balance_loss_mlp": 1.05017161, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.9744525332853664, + "language_loss": 0.7424897, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76524484, + "num_input_tokens_seen": 19339760, + "step": 906, + "time_per_iteration": 2.519153356552124 + }, + { + "auxiliary_loss_clip": 0.01194211, + "auxiliary_loss_mlp": 0.01068681, + "balance_loss_clip": 1.06094432, + "balance_loss_mlp": 1.0422281, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 1.9402921035478111, + "language_loss": 0.87230611, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89493507, + "num_input_tokens_seen": 19359585, + "step": 907, + "time_per_iteration": 2.5565602779388428 + }, + { + "auxiliary_loss_clip": 0.01207175, + "auxiliary_loss_mlp": 0.01076659, + "balance_loss_clip": 1.06300211, + "balance_loss_mlp": 1.05044472, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 2.2067043128721657, + "language_loss": 0.87127674, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89411509, + "num_input_tokens_seen": 19378590, + "step": 908, + "time_per_iteration": 2.4730541706085205 + }, + { + "auxiliary_loss_clip": 0.01203216, + "auxiliary_loss_mlp": 0.01071808, + "balance_loss_clip": 1.06370616, + "balance_loss_mlp": 1.04427111, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.706020366074503, + "language_loss": 0.89825344, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.9210037, + "num_input_tokens_seen": 19397910, + "step": 909, + "time_per_iteration": 2.502551794052124 + }, + { + "auxiliary_loss_clip": 0.01201587, + "auxiliary_loss_mlp": 0.01071366, + "balance_loss_clip": 1.06263542, + "balance_loss_mlp": 1.04427016, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.3326296899581824, + "language_loss": 0.71532094, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73805046, + "num_input_tokens_seen": 19415950, + "step": 910, + "time_per_iteration": 2.5533487796783447 + }, + { + "auxiliary_loss_clip": 0.01190894, + "auxiliary_loss_mlp": 0.01060935, + "balance_loss_clip": 1.05753136, + "balance_loss_mlp": 1.03593683, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 1.6356599433799432, + "language_loss": 0.8356818, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85820007, + "num_input_tokens_seen": 19435275, + "step": 911, + "time_per_iteration": 2.531658172607422 + }, + { + "auxiliary_loss_clip": 0.01192202, + "auxiliary_loss_mlp": 0.01080441, + "balance_loss_clip": 1.05618119, + "balance_loss_mlp": 1.05336881, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 10.688976457387904, + "language_loss": 0.76237464, + "learning_rate": 3.993535491899736e-06, + "loss": 0.78510106, + "num_input_tokens_seen": 19452090, + "step": 912, + "time_per_iteration": 3.8775603771209717 + }, + { + "auxiliary_loss_clip": 0.01188936, + "auxiliary_loss_mlp": 0.01053503, + "balance_loss_clip": 1.05893528, + "balance_loss_mlp": 1.02800417, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.092957740039875, + "language_loss": 0.8229531, + "learning_rate": 3.993504165853694e-06, + "loss": 0.84537745, + "num_input_tokens_seen": 19470865, + "step": 913, + "time_per_iteration": 2.487596273422241 + }, + { + "auxiliary_loss_clip": 0.01198591, + "auxiliary_loss_mlp": 0.01058953, + "balance_loss_clip": 1.06045806, + "balance_loss_mlp": 1.03389573, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 1.7919055336678702, + "language_loss": 0.83522213, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85779756, + "num_input_tokens_seen": 19492145, + "step": 914, + "time_per_iteration": 2.5156304836273193 + }, + { + "auxiliary_loss_clip": 0.01211538, + "auxiliary_loss_mlp": 0.00794784, + "balance_loss_clip": 1.06328416, + "balance_loss_mlp": 1.00032938, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.0827512516886193, + "language_loss": 0.9015584, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92162168, + "num_input_tokens_seen": 19511015, + "step": 915, + "time_per_iteration": 2.513103485107422 + }, + { + "auxiliary_loss_clip": 0.01202333, + "auxiliary_loss_mlp": 0.01059073, + "balance_loss_clip": 1.06331205, + "balance_loss_mlp": 1.03507638, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 2.017182296941591, + "language_loss": 0.89555848, + "learning_rate": 3.993409734157064e-06, + "loss": 0.9181726, + "num_input_tokens_seen": 19529040, + "step": 916, + "time_per_iteration": 2.4621684551239014 + }, + { + "auxiliary_loss_clip": 0.01178178, + "auxiliary_loss_mlp": 0.01071131, + "balance_loss_clip": 1.05952334, + "balance_loss_mlp": 1.04545307, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 2.0479586780988397, + "language_loss": 0.80300701, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82550007, + "num_input_tokens_seen": 19549540, + "step": 917, + "time_per_iteration": 2.561722755432129 + }, + { + "auxiliary_loss_clip": 0.01142962, + "auxiliary_loss_mlp": 0.01063778, + "balance_loss_clip": 1.05268645, + "balance_loss_mlp": 1.03700399, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 2.4303982647237223, + "language_loss": 0.79513824, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81720567, + "num_input_tokens_seen": 19567570, + "step": 918, + "time_per_iteration": 2.688284397125244 + }, + { + "auxiliary_loss_clip": 0.01202058, + "auxiliary_loss_mlp": 0.01060876, + "balance_loss_clip": 1.05969715, + "balance_loss_mlp": 1.03544915, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 2.409333522760644, + "language_loss": 0.88955724, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91218656, + "num_input_tokens_seen": 19585330, + "step": 919, + "time_per_iteration": 2.506772994995117 + }, + { + "auxiliary_loss_clip": 0.01214925, + "auxiliary_loss_mlp": 0.01071932, + "balance_loss_clip": 1.06115568, + "balance_loss_mlp": 1.04617107, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.146889508560325, + "language_loss": 0.87322319, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89609182, + "num_input_tokens_seen": 19604970, + "step": 920, + "time_per_iteration": 2.5155625343322754 + }, + { + "auxiliary_loss_clip": 0.01191859, + "auxiliary_loss_mlp": 0.0106758, + "balance_loss_clip": 1.06179988, + "balance_loss_mlp": 1.04233193, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.0871826547894416, + "language_loss": 0.66088027, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68347466, + "num_input_tokens_seen": 19626235, + "step": 921, + "time_per_iteration": 2.661132335662842 + }, + { + "auxiliary_loss_clip": 0.01211615, + "auxiliary_loss_mlp": 0.01066141, + "balance_loss_clip": 1.06232381, + "balance_loss_mlp": 1.03829408, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 2.0556249882735744, + "language_loss": 0.72214878, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74492633, + "num_input_tokens_seen": 19644305, + "step": 922, + "time_per_iteration": 2.535003900527954 + }, + { + "auxiliary_loss_clip": 0.01191548, + "auxiliary_loss_mlp": 0.01069409, + "balance_loss_clip": 1.05616534, + "balance_loss_mlp": 1.04287291, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 3.311606615180674, + "language_loss": 0.81949306, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84210259, + "num_input_tokens_seen": 19662130, + "step": 923, + "time_per_iteration": 2.5394012928009033 + }, + { + "auxiliary_loss_clip": 0.01204862, + "auxiliary_loss_mlp": 0.010609, + "balance_loss_clip": 1.05900168, + "balance_loss_mlp": 1.03523421, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 1.9680182978832548, + "language_loss": 0.78892833, + "learning_rate": 3.993154590414675e-06, + "loss": 0.8115859, + "num_input_tokens_seen": 19680715, + "step": 924, + "time_per_iteration": 2.457176923751831 + }, + { + "auxiliary_loss_clip": 0.01168116, + "auxiliary_loss_mlp": 0.01062122, + "balance_loss_clip": 1.05679893, + "balance_loss_mlp": 1.03544259, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 1.9972539443582407, + "language_loss": 1.02071929, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04302168, + "num_input_tokens_seen": 19700535, + "step": 925, + "time_per_iteration": 2.5740957260131836 + }, + { + "auxiliary_loss_clip": 0.0116886, + "auxiliary_loss_mlp": 0.01055487, + "balance_loss_clip": 1.05516768, + "balance_loss_mlp": 1.03015542, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.0307829460136033, + "language_loss": 0.81123626, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83347976, + "num_input_tokens_seen": 19718825, + "step": 926, + "time_per_iteration": 2.586808919906616 + }, + { + "auxiliary_loss_clip": 0.0121356, + "auxiliary_loss_mlp": 0.01064619, + "balance_loss_clip": 1.06380677, + "balance_loss_mlp": 1.03803515, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 3.1186474941980262, + "language_loss": 0.73389959, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75668144, + "num_input_tokens_seen": 19739080, + "step": 927, + "time_per_iteration": 2.5152552127838135 + }, + { + "auxiliary_loss_clip": 0.01093654, + "auxiliary_loss_mlp": 0.01016439, + "balance_loss_clip": 1.0289191, + "balance_loss_mlp": 1.01138449, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.8428765284264085, + "language_loss": 0.59822297, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.61932391, + "num_input_tokens_seen": 19802960, + "step": 928, + "time_per_iteration": 3.0696444511413574 + }, + { + "auxiliary_loss_clip": 0.01205899, + "auxiliary_loss_mlp": 0.01068324, + "balance_loss_clip": 1.06224442, + "balance_loss_mlp": 1.04251528, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 9.859094601811266, + "language_loss": 0.95128113, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97402334, + "num_input_tokens_seen": 19822765, + "step": 929, + "time_per_iteration": 2.5362470149993896 + }, + { + "auxiliary_loss_clip": 0.01176576, + "auxiliary_loss_mlp": 0.01071013, + "balance_loss_clip": 1.05679452, + "balance_loss_mlp": 1.04206944, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.659615419259833, + "language_loss": 0.71901304, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74148893, + "num_input_tokens_seen": 19843590, + "step": 930, + "time_per_iteration": 2.6562440395355225 + }, + { + "auxiliary_loss_clip": 0.01193149, + "auxiliary_loss_mlp": 0.0106204, + "balance_loss_clip": 1.06113338, + "balance_loss_mlp": 1.03590941, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 5.400461766772312, + "language_loss": 0.85967803, + "learning_rate": 3.992927371636776e-06, + "loss": 0.88222992, + "num_input_tokens_seen": 19860230, + "step": 931, + "time_per_iteration": 2.495847463607788 + }, + { + "auxiliary_loss_clip": 0.01209077, + "auxiliary_loss_mlp": 0.00795836, + "balance_loss_clip": 1.06042206, + "balance_loss_mlp": 1.00036752, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 1.84057041572846, + "language_loss": 0.83427596, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85432506, + "num_input_tokens_seen": 19880795, + "step": 932, + "time_per_iteration": 2.533517599105835 + }, + { + "auxiliary_loss_clip": 0.01204536, + "auxiliary_loss_mlp": 0.01066631, + "balance_loss_clip": 1.06301975, + "balance_loss_mlp": 1.03951049, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 2.835248038238831, + "language_loss": 0.73789299, + "learning_rate": 3.992861771819365e-06, + "loss": 0.76060462, + "num_input_tokens_seen": 19897960, + "step": 933, + "time_per_iteration": 2.458385944366455 + }, + { + "auxiliary_loss_clip": 0.01159274, + "auxiliary_loss_mlp": 0.01070368, + "balance_loss_clip": 1.05337858, + "balance_loss_mlp": 1.04390311, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 9.33243598470413, + "language_loss": 0.86876404, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89106047, + "num_input_tokens_seen": 19913315, + "step": 934, + "time_per_iteration": 2.5505881309509277 + }, + { + "auxiliary_loss_clip": 0.01171787, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_clip": 1.06180501, + "balance_loss_mlp": 1.04274917, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.585809132220728, + "language_loss": 0.80315053, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82554841, + "num_input_tokens_seen": 19928790, + "step": 935, + "time_per_iteration": 2.5647218227386475 + }, + { + "auxiliary_loss_clip": 0.0109016, + "auxiliary_loss_mlp": 0.01008188, + "balance_loss_clip": 1.02579832, + "balance_loss_mlp": 1.00337148, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8283114319719378, + "language_loss": 0.69135749, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71234095, + "num_input_tokens_seen": 19988785, + "step": 936, + "time_per_iteration": 2.9346718788146973 + }, + { + "auxiliary_loss_clip": 0.01216401, + "auxiliary_loss_mlp": 0.01063293, + "balance_loss_clip": 1.06070495, + "balance_loss_mlp": 1.03771067, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.4315288179496606, + "language_loss": 0.76039088, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78318775, + "num_input_tokens_seen": 20007685, + "step": 937, + "time_per_iteration": 2.4780523777008057 + }, + { + "auxiliary_loss_clip": 0.01078739, + "auxiliary_loss_mlp": 0.01004555, + "balance_loss_clip": 1.02400601, + "balance_loss_mlp": 0.99966747, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8557666669475316, + "language_loss": 0.64351153, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66434443, + "num_input_tokens_seen": 20072750, + "step": 938, + "time_per_iteration": 4.465017557144165 + }, + { + "auxiliary_loss_clip": 0.01175056, + "auxiliary_loss_mlp": 0.01066604, + "balance_loss_clip": 1.05353963, + "balance_loss_mlp": 1.03918624, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 2.450471075269209, + "language_loss": 0.79377878, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81619537, + "num_input_tokens_seen": 20089070, + "step": 939, + "time_per_iteration": 2.5608301162719727 + }, + { + "auxiliary_loss_clip": 0.01180277, + "auxiliary_loss_mlp": 0.01069081, + "balance_loss_clip": 1.05342984, + "balance_loss_mlp": 1.04341555, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.6434464037393746, + "language_loss": 0.73801792, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76051146, + "num_input_tokens_seen": 20108790, + "step": 940, + "time_per_iteration": 3.9671545028686523 + }, + { + "auxiliary_loss_clip": 0.01202515, + "auxiliary_loss_mlp": 0.01064832, + "balance_loss_clip": 1.06290126, + "balance_loss_mlp": 1.03858232, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 2.574850061917643, + "language_loss": 0.70767975, + "learning_rate": 3.992596349869216e-06, + "loss": 0.73035318, + "num_input_tokens_seen": 20128455, + "step": 941, + "time_per_iteration": 2.5131137371063232 + }, + { + "auxiliary_loss_clip": 0.01138376, + "auxiliary_loss_mlp": 0.01064804, + "balance_loss_clip": 1.0504086, + "balance_loss_mlp": 1.03873301, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 2.062434150939125, + "language_loss": 0.81075907, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83279085, + "num_input_tokens_seen": 20145775, + "step": 942, + "time_per_iteration": 2.5941317081451416 + }, + { + "auxiliary_loss_clip": 0.01190923, + "auxiliary_loss_mlp": 0.01067418, + "balance_loss_clip": 1.05523157, + "balance_loss_mlp": 1.04060805, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.283659991218431, + "language_loss": 0.8839485, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.90653193, + "num_input_tokens_seen": 20164315, + "step": 943, + "time_per_iteration": 2.480616569519043 + }, + { + "auxiliary_loss_clip": 0.01203698, + "auxiliary_loss_mlp": 0.01057774, + "balance_loss_clip": 1.06482887, + "balance_loss_mlp": 1.03309798, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.522838912418345, + "language_loss": 0.75230771, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77492237, + "num_input_tokens_seen": 20182760, + "step": 944, + "time_per_iteration": 3.8858606815338135 + }, + { + "auxiliary_loss_clip": 0.01209635, + "auxiliary_loss_mlp": 0.01067437, + "balance_loss_clip": 1.06387556, + "balance_loss_mlp": 1.04297483, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.7718823629578908, + "language_loss": 0.79593635, + "learning_rate": 3.992461825426906e-06, + "loss": 0.81870711, + "num_input_tokens_seen": 20203830, + "step": 945, + "time_per_iteration": 2.5220866203308105 + }, + { + "auxiliary_loss_clip": 0.01201604, + "auxiliary_loss_mlp": 0.0106593, + "balance_loss_clip": 1.0626508, + "balance_loss_mlp": 1.04099143, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.367268793765993, + "language_loss": 0.82295799, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84563339, + "num_input_tokens_seen": 20220365, + "step": 946, + "time_per_iteration": 2.446208953857422 + }, + { + "auxiliary_loss_clip": 0.01222191, + "auxiliary_loss_mlp": 0.01062797, + "balance_loss_clip": 1.06460702, + "balance_loss_mlp": 1.03660631, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 2.2585902877458883, + "language_loss": 0.78966153, + "learning_rate": 3.992394109874529e-06, + "loss": 0.81251144, + "num_input_tokens_seen": 20238640, + "step": 947, + "time_per_iteration": 2.430302858352661 + }, + { + "auxiliary_loss_clip": 0.01181108, + "auxiliary_loss_mlp": 0.01070659, + "balance_loss_clip": 1.06060457, + "balance_loss_mlp": 1.04425466, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 2.740265706906397, + "language_loss": 0.85379434, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.87631202, + "num_input_tokens_seen": 20251025, + "step": 948, + "time_per_iteration": 2.4824702739715576 + }, + { + "auxiliary_loss_clip": 0.01216776, + "auxiliary_loss_mlp": 0.01065071, + "balance_loss_clip": 1.06173873, + "balance_loss_mlp": 1.03839183, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 1.9195229933315245, + "language_loss": 0.87337923, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89619768, + "num_input_tokens_seen": 20269775, + "step": 949, + "time_per_iteration": 2.4357810020446777 + }, + { + "auxiliary_loss_clip": 0.01197381, + "auxiliary_loss_mlp": 0.01070154, + "balance_loss_clip": 1.06254649, + "balance_loss_mlp": 1.04607403, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 2.2889220456081474, + "language_loss": 0.79531634, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81799173, + "num_input_tokens_seen": 20287715, + "step": 950, + "time_per_iteration": 2.459195137023926 + }, + { + "auxiliary_loss_clip": 0.01187662, + "auxiliary_loss_mlp": 0.01076197, + "balance_loss_clip": 1.05814958, + "balance_loss_mlp": 1.05029297, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 2.854543503821481, + "language_loss": 0.82244879, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84508735, + "num_input_tokens_seen": 20307070, + "step": 951, + "time_per_iteration": 2.558215379714966 + }, + { + "auxiliary_loss_clip": 0.01172877, + "auxiliary_loss_mlp": 0.010633, + "balance_loss_clip": 1.05225694, + "balance_loss_mlp": 1.03608477, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 3.3047105586186225, + "language_loss": 0.86471164, + "learning_rate": 3.992223498859958e-06, + "loss": 0.8870734, + "num_input_tokens_seen": 20324945, + "step": 952, + "time_per_iteration": 3.9505715370178223 + }, + { + "auxiliary_loss_clip": 0.01194223, + "auxiliary_loss_mlp": 0.0106403, + "balance_loss_clip": 1.05683088, + "balance_loss_mlp": 1.0353955, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 1.8828414317405202, + "language_loss": 0.79357338, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81615591, + "num_input_tokens_seen": 20346135, + "step": 953, + "time_per_iteration": 2.524160146713257 + }, + { + "auxiliary_loss_clip": 0.01192601, + "auxiliary_loss_mlp": 0.01069305, + "balance_loss_clip": 1.06085563, + "balance_loss_mlp": 1.04220939, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 2.3901964105786804, + "language_loss": 0.86948138, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89210045, + "num_input_tokens_seen": 20364450, + "step": 954, + "time_per_iteration": 2.5554678440093994 + }, + { + "auxiliary_loss_clip": 0.01213544, + "auxiliary_loss_mlp": 0.01059869, + "balance_loss_clip": 1.06484091, + "balance_loss_mlp": 1.03438222, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.6558697722758837, + "language_loss": 0.88371158, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90644574, + "num_input_tokens_seen": 20383500, + "step": 955, + "time_per_iteration": 2.4942455291748047 + }, + { + "auxiliary_loss_clip": 0.01184188, + "auxiliary_loss_mlp": 0.01066048, + "balance_loss_clip": 1.05368924, + "balance_loss_mlp": 1.03943992, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 2.0943857442109675, + "language_loss": 0.89460146, + "learning_rate": 3.992085650224914e-06, + "loss": 0.91710377, + "num_input_tokens_seen": 20400295, + "step": 956, + "time_per_iteration": 2.4868409633636475 + }, + { + "auxiliary_loss_clip": 0.01171033, + "auxiliary_loss_mlp": 0.01057906, + "balance_loss_clip": 1.05891931, + "balance_loss_mlp": 1.03163242, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 1.6598385382833427, + "language_loss": 0.75417459, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77646399, + "num_input_tokens_seen": 20419085, + "step": 957, + "time_per_iteration": 2.527536392211914 + }, + { + "auxiliary_loss_clip": 0.01180208, + "auxiliary_loss_mlp": 0.01080076, + "balance_loss_clip": 1.05620849, + "balance_loss_mlp": 1.05070317, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 2.0134577146799817, + "language_loss": 0.79826361, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82086647, + "num_input_tokens_seen": 20437465, + "step": 958, + "time_per_iteration": 2.5466723442077637 + }, + { + "auxiliary_loss_clip": 0.01188362, + "auxiliary_loss_mlp": 0.01060967, + "balance_loss_clip": 1.05922198, + "balance_loss_mlp": 1.03710163, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.655388233344215, + "language_loss": 0.88244408, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90493739, + "num_input_tokens_seen": 20456235, + "step": 959, + "time_per_iteration": 2.5283312797546387 + }, + { + "auxiliary_loss_clip": 0.01176688, + "auxiliary_loss_mlp": 0.0106219, + "balance_loss_clip": 1.06054735, + "balance_loss_mlp": 1.03739417, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.363614319763566, + "language_loss": 0.78689623, + "learning_rate": 3.991946592948529e-06, + "loss": 0.80928504, + "num_input_tokens_seen": 20476825, + "step": 960, + "time_per_iteration": 2.65584397315979 + }, + { + "auxiliary_loss_clip": 0.01138388, + "auxiliary_loss_mlp": 0.01066996, + "balance_loss_clip": 1.0550952, + "balance_loss_mlp": 1.03939903, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 2.2378815060524637, + "language_loss": 0.9297992, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95185304, + "num_input_tokens_seen": 20496965, + "step": 961, + "time_per_iteration": 2.657447099685669 + }, + { + "auxiliary_loss_clip": 0.01186804, + "auxiliary_loss_mlp": 0.01070619, + "balance_loss_clip": 1.05780625, + "balance_loss_mlp": 1.04330778, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.3991687804123973, + "language_loss": 0.68117642, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70375061, + "num_input_tokens_seen": 20518035, + "step": 962, + "time_per_iteration": 2.601824998855591 + }, + { + "auxiliary_loss_clip": 0.01166185, + "auxiliary_loss_mlp": 0.01067977, + "balance_loss_clip": 1.05485857, + "balance_loss_mlp": 1.04287124, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.4746542096212027, + "language_loss": 0.88265854, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90500015, + "num_input_tokens_seen": 20534740, + "step": 963, + "time_per_iteration": 2.5483686923980713 + }, + { + "auxiliary_loss_clip": 0.01187344, + "auxiliary_loss_mlp": 0.01065698, + "balance_loss_clip": 1.06324244, + "balance_loss_mlp": 1.03891158, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.6158051836208, + "language_loss": 0.8479535, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87048393, + "num_input_tokens_seen": 20553485, + "step": 964, + "time_per_iteration": 2.549250841140747 + }, + { + "auxiliary_loss_clip": 0.01191775, + "auxiliary_loss_mlp": 0.01069926, + "balance_loss_clip": 1.06184161, + "balance_loss_mlp": 1.04250765, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 10.287166380122107, + "language_loss": 0.78113985, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80375683, + "num_input_tokens_seen": 20572155, + "step": 965, + "time_per_iteration": 2.5139150619506836 + }, + { + "auxiliary_loss_clip": 0.0116143, + "auxiliary_loss_mlp": 0.01068899, + "balance_loss_clip": 1.05478215, + "balance_loss_mlp": 1.041839, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 1.9887217446437944, + "language_loss": 0.81236637, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83466965, + "num_input_tokens_seen": 20590395, + "step": 966, + "time_per_iteration": 2.5853271484375 + }, + { + "auxiliary_loss_clip": 0.01202509, + "auxiliary_loss_mlp": 0.01064522, + "balance_loss_clip": 1.06201518, + "balance_loss_mlp": 1.03986919, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.990587216198467, + "language_loss": 0.76532745, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78799778, + "num_input_tokens_seen": 20608435, + "step": 967, + "time_per_iteration": 2.4858269691467285 + }, + { + "auxiliary_loss_clip": 0.01074487, + "auxiliary_loss_mlp": 0.01022903, + "balance_loss_clip": 1.02141571, + "balance_loss_mlp": 1.01839685, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.7865977074040174, + "language_loss": 0.57313925, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59411317, + "num_input_tokens_seen": 20668575, + "step": 968, + "time_per_iteration": 2.9934651851654053 + }, + { + "auxiliary_loss_clip": 0.01196191, + "auxiliary_loss_mlp": 0.01063586, + "balance_loss_clip": 1.06278896, + "balance_loss_mlp": 1.03598952, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.1186182011120596, + "language_loss": 0.82340956, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84600729, + "num_input_tokens_seen": 20687355, + "step": 969, + "time_per_iteration": 2.5105457305908203 + }, + { + "auxiliary_loss_clip": 0.01206443, + "auxiliary_loss_mlp": 0.00794822, + "balance_loss_clip": 1.06297612, + "balance_loss_mlp": 1.00046468, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 5.32946590959027, + "language_loss": 0.78271806, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80273074, + "num_input_tokens_seen": 20705710, + "step": 970, + "time_per_iteration": 2.5435903072357178 + }, + { + "auxiliary_loss_clip": 0.0119039, + "auxiliary_loss_mlp": 0.01061728, + "balance_loss_clip": 1.06416023, + "balance_loss_mlp": 1.03465557, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.6724844296381685, + "language_loss": 0.92327487, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94579601, + "num_input_tokens_seen": 20722405, + "step": 971, + "time_per_iteration": 2.5370614528656006 + }, + { + "auxiliary_loss_clip": 0.01186694, + "auxiliary_loss_mlp": 0.01061849, + "balance_loss_clip": 1.0573144, + "balance_loss_mlp": 1.03533733, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 2.379949600539884, + "language_loss": 0.86010075, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88258612, + "num_input_tokens_seen": 20741480, + "step": 972, + "time_per_iteration": 2.5467824935913086 + }, + { + "auxiliary_loss_clip": 0.01171214, + "auxiliary_loss_mlp": 0.01063488, + "balance_loss_clip": 1.05934048, + "balance_loss_mlp": 1.03950357, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.7857839519974754, + "language_loss": 0.87435973, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89670676, + "num_input_tokens_seen": 20759685, + "step": 973, + "time_per_iteration": 2.5741240978240967 + }, + { + "auxiliary_loss_clip": 0.01208944, + "auxiliary_loss_mlp": 0.00797078, + "balance_loss_clip": 1.06413627, + "balance_loss_mlp": 1.00051081, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 3.5088839556875553, + "language_loss": 0.75006479, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77012497, + "num_input_tokens_seen": 20778180, + "step": 974, + "time_per_iteration": 2.481875419616699 + }, + { + "auxiliary_loss_clip": 0.01201419, + "auxiliary_loss_mlp": 0.00795011, + "balance_loss_clip": 1.06292391, + "balance_loss_mlp": 1.00048375, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.6009366468010462, + "language_loss": 0.76809549, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78805977, + "num_input_tokens_seen": 20802705, + "step": 975, + "time_per_iteration": 2.6791670322418213 + }, + { + "auxiliary_loss_clip": 0.01221144, + "auxiliary_loss_mlp": 0.01067123, + "balance_loss_clip": 1.06496537, + "balance_loss_mlp": 1.04221976, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 2.0404362708543475, + "language_loss": 0.7663027, + "learning_rate": 3.99137827912794e-06, + "loss": 0.7891854, + "num_input_tokens_seen": 20822540, + "step": 976, + "time_per_iteration": 2.4751837253570557 + }, + { + "auxiliary_loss_clip": 0.01178941, + "auxiliary_loss_mlp": 0.01068818, + "balance_loss_clip": 1.05525541, + "balance_loss_mlp": 1.04091072, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 2.0173579111680104, + "language_loss": 0.87487674, + "learning_rate": 3.991342117593679e-06, + "loss": 0.8973543, + "num_input_tokens_seen": 20844175, + "step": 977, + "time_per_iteration": 4.012849569320679 + }, + { + "auxiliary_loss_clip": 0.01185744, + "auxiliary_loss_mlp": 0.01062289, + "balance_loss_clip": 1.06113994, + "balance_loss_mlp": 1.03580081, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.5009914410473604, + "language_loss": 0.79332882, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81580913, + "num_input_tokens_seen": 20864730, + "step": 978, + "time_per_iteration": 2.514894723892212 + }, + { + "auxiliary_loss_clip": 0.01135933, + "auxiliary_loss_mlp": 0.01072337, + "balance_loss_clip": 1.05586851, + "balance_loss_mlp": 1.04463279, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 2.5127035666391317, + "language_loss": 0.80754822, + "learning_rate": 3.991269567990855e-06, + "loss": 0.82963085, + "num_input_tokens_seen": 20885200, + "step": 979, + "time_per_iteration": 2.9153053760528564 + }, + { + "auxiliary_loss_clip": 0.01069152, + "auxiliary_loss_mlp": 0.01006529, + "balance_loss_clip": 1.02209163, + "balance_loss_mlp": 1.00180864, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.9384488569295661, + "language_loss": 0.58999121, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61074805, + "num_input_tokens_seen": 20940325, + "step": 980, + "time_per_iteration": 4.398587226867676 + }, + { + "auxiliary_loss_clip": 0.01218074, + "auxiliary_loss_mlp": 0.01067451, + "balance_loss_clip": 1.06672049, + "balance_loss_mlp": 1.03969908, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.3090350685316237, + "language_loss": 0.86624789, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88910323, + "num_input_tokens_seen": 20958220, + "step": 981, + "time_per_iteration": 2.4500675201416016 + }, + { + "auxiliary_loss_clip": 0.01193257, + "auxiliary_loss_mlp": 0.01058483, + "balance_loss_clip": 1.0630337, + "balance_loss_mlp": 1.03449869, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 2.5260912049718125, + "language_loss": 0.79626209, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81877953, + "num_input_tokens_seen": 20978920, + "step": 982, + "time_per_iteration": 2.5495247840881348 + }, + { + "auxiliary_loss_clip": 0.01197566, + "auxiliary_loss_mlp": 0.01063929, + "balance_loss_clip": 1.06309795, + "balance_loss_mlp": 1.03820348, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.492966056942262, + "language_loss": 0.84572786, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86834282, + "num_input_tokens_seen": 20999490, + "step": 983, + "time_per_iteration": 2.547924041748047 + }, + { + "auxiliary_loss_clip": 0.01202999, + "auxiliary_loss_mlp": 0.01073967, + "balance_loss_clip": 1.06158876, + "balance_loss_mlp": 1.04933798, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.9972272137558984, + "language_loss": 0.84206986, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86483949, + "num_input_tokens_seen": 21017865, + "step": 984, + "time_per_iteration": 3.965256452560425 + }, + { + "auxiliary_loss_clip": 0.0119417, + "auxiliary_loss_mlp": 0.01060794, + "balance_loss_clip": 1.05964458, + "balance_loss_mlp": 1.03661835, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.2600528913368776, + "language_loss": 0.77106273, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.79361242, + "num_input_tokens_seen": 21035900, + "step": 985, + "time_per_iteration": 2.486788749694824 + }, + { + "auxiliary_loss_clip": 0.01154665, + "auxiliary_loss_mlp": 0.01064188, + "balance_loss_clip": 1.05355668, + "balance_loss_mlp": 1.03818798, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 1.8957518874536308, + "language_loss": 0.90697217, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92916071, + "num_input_tokens_seen": 21053235, + "step": 986, + "time_per_iteration": 2.584097146987915 + }, + { + "auxiliary_loss_clip": 0.01204323, + "auxiliary_loss_mlp": 0.01064785, + "balance_loss_clip": 1.05781233, + "balance_loss_mlp": 1.0365684, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 2.377118221142195, + "language_loss": 0.75447041, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.77716148, + "num_input_tokens_seen": 21073090, + "step": 987, + "time_per_iteration": 2.5198678970336914 + }, + { + "auxiliary_loss_clip": 0.01205158, + "auxiliary_loss_mlp": 0.01059068, + "balance_loss_clip": 1.0627861, + "balance_loss_mlp": 1.0335809, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.6052248911736773, + "language_loss": 0.71293694, + "learning_rate": 3.990939357235621e-06, + "loss": 0.73557919, + "num_input_tokens_seen": 21094895, + "step": 988, + "time_per_iteration": 2.6472394466400146 + }, + { + "auxiliary_loss_clip": 0.01052188, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.02419829, + "balance_loss_mlp": 1.02265263, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9397371156807897, + "language_loss": 0.7112999, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73209059, + "num_input_tokens_seen": 21147555, + "step": 989, + "time_per_iteration": 3.2319490909576416 + }, + { + "auxiliary_loss_clip": 0.01182007, + "auxiliary_loss_mlp": 0.01069936, + "balance_loss_clip": 1.05879974, + "balance_loss_mlp": 1.04130161, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 7.8430449482202995, + "language_loss": 0.78317797, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80569744, + "num_input_tokens_seen": 21167845, + "step": 990, + "time_per_iteration": 2.851012706756592 + }, + { + "auxiliary_loss_clip": 0.01197664, + "auxiliary_loss_mlp": 0.0105787, + "balance_loss_clip": 1.0602746, + "balance_loss_mlp": 1.03169191, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 8.824496811730755, + "language_loss": 0.86360085, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88615614, + "num_input_tokens_seen": 21185085, + "step": 991, + "time_per_iteration": 2.4899368286132812 + }, + { + "auxiliary_loss_clip": 0.01220655, + "auxiliary_loss_mlp": 0.01064698, + "balance_loss_clip": 1.06351423, + "balance_loss_mlp": 1.0390563, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 2.0513231270054146, + "language_loss": 0.77197653, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79483008, + "num_input_tokens_seen": 21204230, + "step": 992, + "time_per_iteration": 3.8426074981689453 + }, + { + "auxiliary_loss_clip": 0.01149311, + "auxiliary_loss_mlp": 0.01063889, + "balance_loss_clip": 1.05638957, + "balance_loss_mlp": 1.03968954, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.1461067241695697, + "language_loss": 0.74455929, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.76669133, + "num_input_tokens_seen": 21222655, + "step": 993, + "time_per_iteration": 2.567333698272705 + }, + { + "auxiliary_loss_clip": 0.01161588, + "auxiliary_loss_mlp": 0.0108602, + "balance_loss_clip": 1.05418086, + "balance_loss_mlp": 1.05724287, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 2.5242753814110404, + "language_loss": 0.78927255, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81174862, + "num_input_tokens_seen": 21242310, + "step": 994, + "time_per_iteration": 2.6249077320098877 + }, + { + "auxiliary_loss_clip": 0.01217111, + "auxiliary_loss_mlp": 0.0108432, + "balance_loss_clip": 1.06416619, + "balance_loss_mlp": 1.05939293, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.55392456785408, + "language_loss": 0.79843104, + "learning_rate": 3.99067829878596e-06, + "loss": 0.82144535, + "num_input_tokens_seen": 21261410, + "step": 995, + "time_per_iteration": 2.460832118988037 + }, + { + "auxiliary_loss_clip": 0.01168338, + "auxiliary_loss_mlp": 0.01071625, + "balance_loss_clip": 1.05815387, + "balance_loss_mlp": 1.04542327, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 3.4794336175889558, + "language_loss": 0.87120283, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89360249, + "num_input_tokens_seen": 21280080, + "step": 996, + "time_per_iteration": 2.5735085010528564 + }, + { + "auxiliary_loss_clip": 0.01184154, + "auxiliary_loss_mlp": 0.01085733, + "balance_loss_clip": 1.05874109, + "balance_loss_mlp": 1.05511999, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 3.9924334161807997, + "language_loss": 0.87588775, + "learning_rate": 3.990603031255718e-06, + "loss": 0.89858663, + "num_input_tokens_seen": 21296765, + "step": 997, + "time_per_iteration": 2.5707802772521973 + }, + { + "auxiliary_loss_clip": 0.01079288, + "auxiliary_loss_mlp": 0.01002647, + "balance_loss_clip": 1.02948439, + "balance_loss_mlp": 0.99864179, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.0242668534636874, + "language_loss": 0.75415128, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77497059, + "num_input_tokens_seen": 21363345, + "step": 998, + "time_per_iteration": 3.19120454788208 + }, + { + "auxiliary_loss_clip": 0.01171934, + "auxiliary_loss_mlp": 0.01063469, + "balance_loss_clip": 1.05872834, + "balance_loss_mlp": 1.03755295, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.813518564143697, + "language_loss": 0.7597965, + "learning_rate": 3.990527461790013e-06, + "loss": 0.78215051, + "num_input_tokens_seen": 21385290, + "step": 999, + "time_per_iteration": 2.621809244155884 + }, + { + "auxiliary_loss_clip": 0.01200397, + "auxiliary_loss_mlp": 0.01065574, + "balance_loss_clip": 1.05797839, + "balance_loss_mlp": 1.0390377, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 1.8048619878194847, + "language_loss": 0.82966673, + "learning_rate": 3.990489563834943e-06, + "loss": 0.85232651, + "num_input_tokens_seen": 21407625, + "step": 1000, + "time_per_iteration": 2.562167167663574 + }, + { + "auxiliary_loss_clip": 0.01186285, + "auxiliary_loss_mlp": 0.01067041, + "balance_loss_clip": 1.0585748, + "balance_loss_mlp": 1.03981411, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 2.075731386688293, + "language_loss": 0.86206549, + "learning_rate": 3.990451590400309e-06, + "loss": 0.88459873, + "num_input_tokens_seen": 21426835, + "step": 1001, + "time_per_iteration": 2.552652597427368 + }, + { + "auxiliary_loss_clip": 0.0119078, + "auxiliary_loss_mlp": 0.01061418, + "balance_loss_clip": 1.0606029, + "balance_loss_mlp": 1.03682566, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.6400946163180588, + "language_loss": 0.74272788, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76524991, + "num_input_tokens_seen": 21444920, + "step": 1002, + "time_per_iteration": 2.541564464569092 + }, + { + "auxiliary_loss_clip": 0.01215177, + "auxiliary_loss_mlp": 0.01064621, + "balance_loss_clip": 1.06463575, + "balance_loss_mlp": 1.03947937, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 2.44869339624523, + "language_loss": 0.75714922, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77994716, + "num_input_tokens_seen": 21463555, + "step": 1003, + "time_per_iteration": 2.492462396621704 + }, + { + "auxiliary_loss_clip": 0.01190521, + "auxiliary_loss_mlp": 0.01063149, + "balance_loss_clip": 1.06117618, + "balance_loss_mlp": 1.03780496, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.2426894898946292, + "language_loss": 0.70451212, + "learning_rate": 3.990337217233437e-06, + "loss": 0.72704881, + "num_input_tokens_seen": 21481990, + "step": 1004, + "time_per_iteration": 2.505380392074585 + }, + { + "auxiliary_loss_clip": 0.01213634, + "auxiliary_loss_mlp": 0.01073608, + "balance_loss_clip": 1.06688714, + "balance_loss_mlp": 1.04841924, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.1913958796538595, + "language_loss": 0.83584976, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85872215, + "num_input_tokens_seen": 21500385, + "step": 1005, + "time_per_iteration": 2.454409122467041 + }, + { + "auxiliary_loss_clip": 0.01077635, + "auxiliary_loss_mlp": 0.01008211, + "balance_loss_clip": 1.02382421, + "balance_loss_mlp": 1.00413442, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.9057610005253366, + "language_loss": 0.59053707, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.6113956, + "num_input_tokens_seen": 21561040, + "step": 1006, + "time_per_iteration": 3.0940146446228027 + }, + { + "auxiliary_loss_clip": 0.01183907, + "auxiliary_loss_mlp": 0.0105699, + "balance_loss_clip": 1.0534662, + "balance_loss_mlp": 1.03122854, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 1.8610660382781197, + "language_loss": 0.74989927, + "learning_rate": 3.990222164802503e-06, + "loss": 0.77230823, + "num_input_tokens_seen": 21580655, + "step": 1007, + "time_per_iteration": 2.5223608016967773 + }, + { + "auxiliary_loss_clip": 0.01195221, + "auxiliary_loss_mlp": 0.01062574, + "balance_loss_clip": 1.0607276, + "balance_loss_mlp": 1.03666985, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.8551970539129299, + "language_loss": 0.80406237, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.82664031, + "num_input_tokens_seen": 21599650, + "step": 1008, + "time_per_iteration": 2.541649580001831 + }, + { + "auxiliary_loss_clip": 0.0117923, + "auxiliary_loss_mlp": 0.01060921, + "balance_loss_clip": 1.06263161, + "balance_loss_mlp": 1.03541017, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 1.786703838140289, + "language_loss": 0.77751368, + "learning_rate": 3.990145085832335e-06, + "loss": 0.79991519, + "num_input_tokens_seen": 21617550, + "step": 1009, + "time_per_iteration": 2.5705912113189697 + }, + { + "auxiliary_loss_clip": 0.01195552, + "auxiliary_loss_mlp": 0.01056731, + "balance_loss_clip": 1.06155717, + "balance_loss_mlp": 1.03212619, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 1.8163139389092389, + "language_loss": 0.92648929, + "learning_rate": 3.990106433146769e-06, + "loss": 0.94901216, + "num_input_tokens_seen": 21635865, + "step": 1010, + "time_per_iteration": 2.519193649291992 + }, + { + "auxiliary_loss_clip": 0.01150386, + "auxiliary_loss_mlp": 0.00797012, + "balance_loss_clip": 1.05245543, + "balance_loss_mlp": 1.00070167, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 2.848814211526652, + "language_loss": 0.72285813, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.7423321, + "num_input_tokens_seen": 21653945, + "step": 1011, + "time_per_iteration": 2.611058235168457 + }, + { + "auxiliary_loss_clip": 0.01194534, + "auxiliary_loss_mlp": 0.01075405, + "balance_loss_clip": 1.05865109, + "balance_loss_mlp": 1.04731917, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.218180920573816, + "language_loss": 0.87324697, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89594638, + "num_input_tokens_seen": 21671230, + "step": 1012, + "time_per_iteration": 2.5107831954956055 + }, + { + "auxiliary_loss_clip": 0.01186479, + "auxiliary_loss_mlp": 0.01067663, + "balance_loss_clip": 1.05733764, + "balance_loss_mlp": 1.04268861, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 1.9743549765846344, + "language_loss": 0.76680756, + "learning_rate": 3.989990022305734e-06, + "loss": 0.78934896, + "num_input_tokens_seen": 21691155, + "step": 1013, + "time_per_iteration": 2.5377447605133057 + }, + { + "auxiliary_loss_clip": 0.01208806, + "auxiliary_loss_mlp": 0.00796659, + "balance_loss_clip": 1.06782568, + "balance_loss_mlp": 1.00066793, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 3.096024299045627, + "language_loss": 0.86447775, + "learning_rate": 3.98995106776885e-06, + "loss": 0.88453245, + "num_input_tokens_seen": 21707405, + "step": 1014, + "time_per_iteration": 2.486562967300415 + }, + { + "auxiliary_loss_clip": 0.01215734, + "auxiliary_loss_mlp": 0.01073697, + "balance_loss_clip": 1.06763911, + "balance_loss_mlp": 1.04571855, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.106567470111203, + "language_loss": 0.73668265, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75957692, + "num_input_tokens_seen": 21728090, + "step": 1015, + "time_per_iteration": 2.5803024768829346 + }, + { + "auxiliary_loss_clip": 0.01186533, + "auxiliary_loss_mlp": 0.01068509, + "balance_loss_clip": 1.0599817, + "balance_loss_mlp": 1.04295087, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.8911453055783667, + "language_loss": 0.7949177, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.81746805, + "num_input_tokens_seen": 21747950, + "step": 1016, + "time_per_iteration": 2.5735788345336914 + }, + { + "auxiliary_loss_clip": 0.01169694, + "auxiliary_loss_mlp": 0.01056535, + "balance_loss_clip": 1.05810738, + "balance_loss_mlp": 1.03182316, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 1.6783936998727615, + "language_loss": 0.75933701, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78159928, + "num_input_tokens_seen": 21767900, + "step": 1017, + "time_per_iteration": 4.023869037628174 + }, + { + "auxiliary_loss_clip": 0.01188508, + "auxiliary_loss_mlp": 0.01075234, + "balance_loss_clip": 1.06528234, + "balance_loss_mlp": 1.04870963, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 2.496669316430644, + "language_loss": 0.86019385, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88283128, + "num_input_tokens_seen": 21787375, + "step": 1018, + "time_per_iteration": 2.520742893218994 + }, + { + "auxiliary_loss_clip": 0.01170338, + "auxiliary_loss_mlp": 0.01076972, + "balance_loss_clip": 1.06001997, + "balance_loss_mlp": 1.05040073, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 2.601189104540428, + "language_loss": 0.77326751, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79574072, + "num_input_tokens_seen": 21806275, + "step": 1019, + "time_per_iteration": 3.8847053050994873 + }, + { + "auxiliary_loss_clip": 0.01165106, + "auxiliary_loss_mlp": 0.01063859, + "balance_loss_clip": 1.05543244, + "balance_loss_mlp": 1.03720367, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 2.0063728630840094, + "language_loss": 0.84259272, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86488235, + "num_input_tokens_seen": 21826430, + "step": 1020, + "time_per_iteration": 2.644388198852539 + }, + { + "auxiliary_loss_clip": 0.01197364, + "auxiliary_loss_mlp": 0.01064447, + "balance_loss_clip": 1.06226707, + "balance_loss_mlp": 1.0368979, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.7222363213381304, + "language_loss": 0.79342997, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81604815, + "num_input_tokens_seen": 21847800, + "step": 1021, + "time_per_iteration": 2.615569591522217 + }, + { + "auxiliary_loss_clip": 0.01187078, + "auxiliary_loss_mlp": 0.01065092, + "balance_loss_clip": 1.05875421, + "balance_loss_mlp": 1.04253793, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.3865686454030683, + "language_loss": 0.87889338, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.90141505, + "num_input_tokens_seen": 21863385, + "step": 1022, + "time_per_iteration": 2.495743989944458 + }, + { + "auxiliary_loss_clip": 0.01196236, + "auxiliary_loss_mlp": 0.01063468, + "balance_loss_clip": 1.06479502, + "balance_loss_mlp": 1.03775406, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 4.619568377746462, + "language_loss": 0.8310923, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85368937, + "num_input_tokens_seen": 21881880, + "step": 1023, + "time_per_iteration": 4.017882347106934 + }, + { + "auxiliary_loss_clip": 0.01090808, + "auxiliary_loss_mlp": 0.01004817, + "balance_loss_clip": 1.02629757, + "balance_loss_mlp": 1.00052547, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.888412106859327, + "language_loss": 0.65058625, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67154247, + "num_input_tokens_seen": 21940550, + "step": 1024, + "time_per_iteration": 3.069974422454834 + }, + { + "auxiliary_loss_clip": 0.01165168, + "auxiliary_loss_mlp": 0.0107371, + "balance_loss_clip": 1.05496383, + "balance_loss_mlp": 1.04604173, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 2.121794756724109, + "language_loss": 0.88124037, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90362918, + "num_input_tokens_seen": 21958390, + "step": 1025, + "time_per_iteration": 2.516308069229126 + }, + { + "auxiliary_loss_clip": 0.01195209, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_clip": 1.06423593, + "balance_loss_mlp": 1.03916466, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 1.4981344449463034, + "language_loss": 0.8456465, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86822891, + "num_input_tokens_seen": 21978625, + "step": 1026, + "time_per_iteration": 2.5562846660614014 + }, + { + "auxiliary_loss_clip": 0.01161774, + "auxiliary_loss_mlp": 0.01066572, + "balance_loss_clip": 1.05574155, + "balance_loss_mlp": 1.04129982, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 2.032617569498257, + "language_loss": 0.82241923, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84470272, + "num_input_tokens_seen": 21996035, + "step": 1027, + "time_per_iteration": 2.5252492427825928 + }, + { + "auxiliary_loss_clip": 0.01157005, + "auxiliary_loss_mlp": 0.01062644, + "balance_loss_clip": 1.06036973, + "balance_loss_mlp": 1.03814685, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.3247272029673494, + "language_loss": 0.83882248, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86101902, + "num_input_tokens_seen": 22011625, + "step": 1028, + "time_per_iteration": 2.5363829135894775 + }, + { + "auxiliary_loss_clip": 0.01066666, + "auxiliary_loss_mlp": 0.01006677, + "balance_loss_clip": 1.02525914, + "balance_loss_mlp": 1.00250423, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.938503717730277, + "language_loss": 0.60523641, + "learning_rate": 3.989357695452323e-06, + "loss": 0.62596983, + "num_input_tokens_seen": 22066035, + "step": 1029, + "time_per_iteration": 2.882622718811035 + }, + { + "auxiliary_loss_clip": 0.01177325, + "auxiliary_loss_mlp": 0.01068267, + "balance_loss_clip": 1.05766487, + "balance_loss_mlp": 1.04144526, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 2.011737213740967, + "language_loss": 0.82558268, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84803855, + "num_input_tokens_seen": 22085015, + "step": 1030, + "time_per_iteration": 2.512664794921875 + }, + { + "auxiliary_loss_clip": 0.01221025, + "auxiliary_loss_mlp": 0.01071992, + "balance_loss_clip": 1.06801128, + "balance_loss_mlp": 1.04623115, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.392406222765845, + "language_loss": 0.79832, + "learning_rate": 3.989277296609237e-06, + "loss": 0.8212502, + "num_input_tokens_seen": 22102775, + "step": 1031, + "time_per_iteration": 3.907224655151367 + }, + { + "auxiliary_loss_clip": 0.01186534, + "auxiliary_loss_mlp": 0.01077717, + "balance_loss_clip": 1.06429172, + "balance_loss_mlp": 1.05126476, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.645382690591809, + "language_loss": 0.77479541, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79743791, + "num_input_tokens_seen": 22121680, + "step": 1032, + "time_per_iteration": 2.560107469558716 + }, + { + "auxiliary_loss_clip": 0.01195037, + "auxiliary_loss_mlp": 0.0107118, + "balance_loss_clip": 1.05725825, + "balance_loss_mlp": 1.04588401, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 3.0592952048287776, + "language_loss": 0.89325708, + "learning_rate": 3.989196596031776e-06, + "loss": 0.9159193, + "num_input_tokens_seen": 22138155, + "step": 1033, + "time_per_iteration": 2.467421054840088 + }, + { + "auxiliary_loss_clip": 0.01205014, + "auxiliary_loss_mlp": 0.01063819, + "balance_loss_clip": 1.06134713, + "balance_loss_mlp": 1.0397507, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.083132512730526, + "language_loss": 0.84880215, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87149048, + "num_input_tokens_seen": 22157420, + "step": 1034, + "time_per_iteration": 2.5169193744659424 + }, + { + "auxiliary_loss_clip": 0.01180821, + "auxiliary_loss_mlp": 0.01057388, + "balance_loss_clip": 1.0606215, + "balance_loss_mlp": 1.03206849, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 1.8084304540865253, + "language_loss": 0.80988908, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83227122, + "num_input_tokens_seen": 22178620, + "step": 1035, + "time_per_iteration": 2.609344959259033 + }, + { + "auxiliary_loss_clip": 0.01156566, + "auxiliary_loss_mlp": 0.01070915, + "balance_loss_clip": 1.05955982, + "balance_loss_mlp": 1.04344893, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 1.9728590211641406, + "language_loss": 0.78254431, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80481911, + "num_input_tokens_seen": 22197125, + "step": 1036, + "time_per_iteration": 2.60343599319458 + }, + { + "auxiliary_loss_clip": 0.01191981, + "auxiliary_loss_mlp": 0.01068304, + "balance_loss_clip": 1.06123936, + "balance_loss_mlp": 1.0438062, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 5.870494173108104, + "language_loss": 0.86879992, + "learning_rate": 3.989034289722739e-06, + "loss": 0.89140272, + "num_input_tokens_seen": 22217575, + "step": 1037, + "time_per_iteration": 2.5174145698547363 + }, + { + "auxiliary_loss_clip": 0.01195998, + "auxiliary_loss_mlp": 0.01058474, + "balance_loss_clip": 1.06177068, + "balance_loss_mlp": 1.03233099, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.4830649960583764, + "language_loss": 0.81060886, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83315355, + "num_input_tokens_seen": 22236840, + "step": 1038, + "time_per_iteration": 2.51792049407959 + }, + { + "auxiliary_loss_clip": 0.01146773, + "auxiliary_loss_mlp": 0.01075085, + "balance_loss_clip": 1.0510112, + "balance_loss_mlp": 1.0468328, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 2.897677889469806, + "language_loss": 0.85626376, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87848228, + "num_input_tokens_seen": 22256465, + "step": 1039, + "time_per_iteration": 2.5339274406433105 + }, + { + "auxiliary_loss_clip": 0.01191294, + "auxiliary_loss_mlp": 0.01067695, + "balance_loss_clip": 1.06102169, + "balance_loss_mlp": 1.04196978, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 2.059960598128164, + "language_loss": 0.80905986, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83164972, + "num_input_tokens_seen": 22274025, + "step": 1040, + "time_per_iteration": 2.53395938873291 + }, + { + "auxiliary_loss_clip": 0.01215388, + "auxiliary_loss_mlp": 0.01065328, + "balance_loss_clip": 1.06770039, + "balance_loss_mlp": 1.04021049, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.6915504988940095, + "language_loss": 0.69768, + "learning_rate": 3.988870776623685e-06, + "loss": 0.72048712, + "num_input_tokens_seen": 22292245, + "step": 1041, + "time_per_iteration": 2.485478639602661 + }, + { + "auxiliary_loss_clip": 0.01213082, + "auxiliary_loss_mlp": 0.01059797, + "balance_loss_clip": 1.06186485, + "balance_loss_mlp": 1.03372622, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 2.3485757627459543, + "language_loss": 0.81478155, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83751035, + "num_input_tokens_seen": 22311455, + "step": 1042, + "time_per_iteration": 2.460920810699463 + }, + { + "auxiliary_loss_clip": 0.0121077, + "auxiliary_loss_mlp": 0.0105343, + "balance_loss_clip": 1.06237745, + "balance_loss_mlp": 1.02914703, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 3.7585768080994555, + "language_loss": 0.76367766, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78631967, + "num_input_tokens_seen": 22333750, + "step": 1043, + "time_per_iteration": 2.589465856552124 + }, + { + "auxiliary_loss_clip": 0.01191795, + "auxiliary_loss_mlp": 0.01064212, + "balance_loss_clip": 1.0634551, + "balance_loss_mlp": 1.03990507, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 2.1986666986762162, + "language_loss": 0.92445707, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94701713, + "num_input_tokens_seen": 22351940, + "step": 1044, + "time_per_iteration": 2.487664222717285 + }, + { + "auxiliary_loss_clip": 0.0119295, + "auxiliary_loss_mlp": 0.01072112, + "balance_loss_clip": 1.05923772, + "balance_loss_mlp": 1.04730499, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 1.8701469714783363, + "language_loss": 0.85687792, + "learning_rate": 3.988706056833821e-06, + "loss": 0.87952858, + "num_input_tokens_seen": 22372085, + "step": 1045, + "time_per_iteration": 2.5214595794677734 + }, + { + "auxiliary_loss_clip": 0.01180839, + "auxiliary_loss_mlp": 0.01064639, + "balance_loss_clip": 1.0598588, + "balance_loss_mlp": 1.04038024, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.073879844672886, + "language_loss": 0.78065747, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.80311227, + "num_input_tokens_seen": 22392020, + "step": 1046, + "time_per_iteration": 2.6058924198150635 + }, + { + "auxiliary_loss_clip": 0.01194631, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.061625, + "balance_loss_mlp": 1.04620731, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.8465371717335004, + "language_loss": 0.77436978, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79701608, + "num_input_tokens_seen": 22411180, + "step": 1047, + "time_per_iteration": 2.482666015625 + }, + { + "auxiliary_loss_clip": 0.01200111, + "auxiliary_loss_mlp": 0.01063988, + "balance_loss_clip": 1.06215417, + "balance_loss_mlp": 1.0389421, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.1202387834475616, + "language_loss": 0.77061433, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79325533, + "num_input_tokens_seen": 22435105, + "step": 1048, + "time_per_iteration": 2.6262168884277344 + }, + { + "auxiliary_loss_clip": 0.01185303, + "auxiliary_loss_mlp": 0.0106908, + "balance_loss_clip": 1.06333232, + "balance_loss_mlp": 1.04350996, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 4.126335033034052, + "language_loss": 0.77205813, + "learning_rate": 3.988540130453087e-06, + "loss": 0.79460192, + "num_input_tokens_seen": 22452710, + "step": 1049, + "time_per_iteration": 2.5870285034179688 + }, + { + "auxiliary_loss_clip": 0.01197387, + "auxiliary_loss_mlp": 0.0106386, + "balance_loss_clip": 1.06280255, + "balance_loss_mlp": 1.0391953, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.449676260834476, + "language_loss": 0.83009672, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85270917, + "num_input_tokens_seen": 22470175, + "step": 1050, + "time_per_iteration": 2.4861371517181396 + }, + { + "auxiliary_loss_clip": 0.01210368, + "auxiliary_loss_mlp": 0.01063384, + "balance_loss_clip": 1.06583667, + "balance_loss_mlp": 1.03970909, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 3.945449736777856, + "language_loss": 0.76846743, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79120493, + "num_input_tokens_seen": 22490020, + "step": 1051, + "time_per_iteration": 2.4860241413116455 + }, + { + "auxiliary_loss_clip": 0.01189692, + "auxiliary_loss_mlp": 0.01069607, + "balance_loss_clip": 1.06334102, + "balance_loss_mlp": 1.04419124, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 2.240795305588503, + "language_loss": 0.80445218, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82704514, + "num_input_tokens_seen": 22509685, + "step": 1052, + "time_per_iteration": 2.5313336849212646 + }, + { + "auxiliary_loss_clip": 0.01214595, + "auxiliary_loss_mlp": 0.01062141, + "balance_loss_clip": 1.0658772, + "balance_loss_mlp": 1.03739309, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.3734581604091938, + "language_loss": 0.78086019, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80362761, + "num_input_tokens_seen": 22527905, + "step": 1053, + "time_per_iteration": 2.438170909881592 + }, + { + "auxiliary_loss_clip": 0.0119727, + "auxiliary_loss_mlp": 0.00793467, + "balance_loss_clip": 1.06721151, + "balance_loss_mlp": 1.00067031, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 1.8207679177183418, + "language_loss": 0.84764218, + "learning_rate": 3.988331025862195e-06, + "loss": 0.86754954, + "num_input_tokens_seen": 22546335, + "step": 1054, + "time_per_iteration": 2.536095380783081 + }, + { + "auxiliary_loss_clip": 0.01174494, + "auxiliary_loss_mlp": 0.01065212, + "balance_loss_clip": 1.05568814, + "balance_loss_mlp": 1.04005909, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 2.1689338763619483, + "language_loss": 0.85559905, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87799609, + "num_input_tokens_seen": 22563885, + "step": 1055, + "time_per_iteration": 2.5076372623443604 + }, + { + "auxiliary_loss_clip": 0.01167399, + "auxiliary_loss_mlp": 0.01079706, + "balance_loss_clip": 1.05836511, + "balance_loss_mlp": 1.05412388, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.506568320837334, + "language_loss": 0.8079499, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83042097, + "num_input_tokens_seen": 22583035, + "step": 1056, + "time_per_iteration": 4.088842391967773 + }, + { + "auxiliary_loss_clip": 0.01149902, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_clip": 1.05275166, + "balance_loss_mlp": 1.0441668, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 2.540918933658966, + "language_loss": 0.81002557, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83223814, + "num_input_tokens_seen": 22605055, + "step": 1057, + "time_per_iteration": 2.7567241191864014 + }, + { + "auxiliary_loss_clip": 0.01142457, + "auxiliary_loss_mlp": 0.0106806, + "balance_loss_clip": 1.04921865, + "balance_loss_mlp": 1.04483843, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 1.8808407864041035, + "language_loss": 0.8335427, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85564792, + "num_input_tokens_seen": 22623760, + "step": 1058, + "time_per_iteration": 4.535060405731201 + }, + { + "auxiliary_loss_clip": 0.01183148, + "auxiliary_loss_mlp": 0.01069871, + "balance_loss_clip": 1.06061769, + "balance_loss_mlp": 1.04242873, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 2.274969760111609, + "language_loss": 0.87689525, + "learning_rate": 3.988120036328651e-06, + "loss": 0.89942539, + "num_input_tokens_seen": 22643000, + "step": 1059, + "time_per_iteration": 2.5581676959991455 + }, + { + "auxiliary_loss_clip": 0.01172752, + "auxiliary_loss_mlp": 0.01067402, + "balance_loss_clip": 1.06159639, + "balance_loss_mlp": 1.0407114, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.235054686764936, + "language_loss": 0.91558373, + "learning_rate": 3.988077612246394e-06, + "loss": 0.9379853, + "num_input_tokens_seen": 22660460, + "step": 1060, + "time_per_iteration": 2.571329116821289 + }, + { + "auxiliary_loss_clip": 0.01176809, + "auxiliary_loss_mlp": 0.01065825, + "balance_loss_clip": 1.05855405, + "balance_loss_mlp": 1.04000401, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 2.2591918133626128, + "language_loss": 0.87255061, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89497691, + "num_input_tokens_seen": 22679270, + "step": 1061, + "time_per_iteration": 2.4955036640167236 + }, + { + "auxiliary_loss_clip": 0.01195745, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_clip": 1.06236887, + "balance_loss_mlp": 1.04061222, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.6494692201141854, + "language_loss": 0.77308345, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79572535, + "num_input_tokens_seen": 22699330, + "step": 1062, + "time_per_iteration": 4.047708511352539 + }, + { + "auxiliary_loss_clip": 0.01177425, + "auxiliary_loss_mlp": 0.01066252, + "balance_loss_clip": 1.06275332, + "balance_loss_mlp": 1.04167104, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.569206098588811, + "language_loss": 0.86583078, + "learning_rate": 3.987949887677459e-06, + "loss": 0.88826752, + "num_input_tokens_seen": 22717945, + "step": 1063, + "time_per_iteration": 2.5868282318115234 + }, + { + "auxiliary_loss_clip": 0.01212095, + "auxiliary_loss_mlp": 0.01066029, + "balance_loss_clip": 1.06251407, + "balance_loss_mlp": 1.03976679, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.6169252442400457, + "language_loss": 0.80558121, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82836246, + "num_input_tokens_seen": 22736790, + "step": 1064, + "time_per_iteration": 2.4577114582061768 + }, + { + "auxiliary_loss_clip": 0.01202064, + "auxiliary_loss_mlp": 0.01072518, + "balance_loss_clip": 1.06153643, + "balance_loss_mlp": 1.04591095, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.4441783144545366, + "language_loss": 0.84278572, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86553156, + "num_input_tokens_seen": 22754745, + "step": 1065, + "time_per_iteration": 2.47993803024292 + }, + { + "auxiliary_loss_clip": 0.01172624, + "auxiliary_loss_mlp": 0.01056515, + "balance_loss_clip": 1.06419849, + "balance_loss_mlp": 1.0327208, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.8287883584457774, + "language_loss": 0.6850872, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70737863, + "num_input_tokens_seen": 22776780, + "step": 1066, + "time_per_iteration": 2.7044577598571777 + }, + { + "auxiliary_loss_clip": 0.01214263, + "auxiliary_loss_mlp": 0.01075861, + "balance_loss_clip": 1.06740856, + "balance_loss_mlp": 1.04928935, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 2.433808195633437, + "language_loss": 0.9022187, + "learning_rate": 3.987778532894181e-06, + "loss": 0.92512, + "num_input_tokens_seen": 22793915, + "step": 1067, + "time_per_iteration": 2.462482213973999 + }, + { + "auxiliary_loss_clip": 0.01189558, + "auxiliary_loss_mlp": 0.01068743, + "balance_loss_clip": 1.06329322, + "balance_loss_mlp": 1.04482985, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.291214916183063, + "language_loss": 0.83819652, + "learning_rate": 3.987735505752391e-06, + "loss": 0.86077952, + "num_input_tokens_seen": 22812670, + "step": 1068, + "time_per_iteration": 2.511389970779419 + }, + { + "auxiliary_loss_clip": 0.01181739, + "auxiliary_loss_mlp": 0.01072239, + "balance_loss_clip": 1.05964184, + "balance_loss_mlp": 1.04740763, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 2.6141841752720567, + "language_loss": 0.8970297, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91956949, + "num_input_tokens_seen": 22832440, + "step": 1069, + "time_per_iteration": 2.550661325454712 + }, + { + "auxiliary_loss_clip": 0.01189983, + "auxiliary_loss_mlp": 0.01077938, + "balance_loss_clip": 1.0645088, + "balance_loss_mlp": 1.05168772, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.553019431934199, + "language_loss": 0.96245015, + "learning_rate": 3.987649225345056e-06, + "loss": 0.98512942, + "num_input_tokens_seen": 22845495, + "step": 1070, + "time_per_iteration": 2.477825403213501 + }, + { + "auxiliary_loss_clip": 0.01142508, + "auxiliary_loss_mlp": 0.01059068, + "balance_loss_clip": 1.05719697, + "balance_loss_mlp": 1.03224611, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.725106928453437, + "language_loss": 0.88331175, + "learning_rate": 3.987605972082782e-06, + "loss": 0.9053275, + "num_input_tokens_seen": 22865390, + "step": 1071, + "time_per_iteration": 4.08437442779541 + }, + { + "auxiliary_loss_clip": 0.01158039, + "auxiliary_loss_mlp": 0.01056389, + "balance_loss_clip": 1.05491245, + "balance_loss_mlp": 1.03164101, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.793468765765354, + "language_loss": 0.7586925, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78083682, + "num_input_tokens_seen": 22885495, + "step": 1072, + "time_per_iteration": 2.558497905731201 + }, + { + "auxiliary_loss_clip": 0.01172764, + "auxiliary_loss_mlp": 0.01070593, + "balance_loss_clip": 1.06027102, + "balance_loss_mlp": 1.04352045, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 5.729236611729031, + "language_loss": 0.80529058, + "learning_rate": 3.987519239449226e-06, + "loss": 0.82772422, + "num_input_tokens_seen": 22904845, + "step": 1073, + "time_per_iteration": 2.5578954219818115 + }, + { + "auxiliary_loss_clip": 0.01191529, + "auxiliary_loss_mlp": 0.01063736, + "balance_loss_clip": 1.06197047, + "balance_loss_mlp": 1.03934598, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.7337324731993047, + "language_loss": 0.8053143, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82786691, + "num_input_tokens_seen": 22925940, + "step": 1074, + "time_per_iteration": 2.6316418647766113 + }, + { + "auxiliary_loss_clip": 0.01172382, + "auxiliary_loss_mlp": 0.01063352, + "balance_loss_clip": 1.05751443, + "balance_loss_mlp": 1.03773355, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 2.209401032968639, + "language_loss": 0.79456228, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81691957, + "num_input_tokens_seen": 22944375, + "step": 1075, + "time_per_iteration": 2.4974989891052246 + }, + { + "auxiliary_loss_clip": 0.01171239, + "auxiliary_loss_mlp": 0.01068312, + "balance_loss_clip": 1.05504656, + "balance_loss_mlp": 1.04389787, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 7.942488267141024, + "language_loss": 0.87918973, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90158522, + "num_input_tokens_seen": 22959145, + "step": 1076, + "time_per_iteration": 2.558539628982544 + }, + { + "auxiliary_loss_clip": 0.01186472, + "auxiliary_loss_mlp": 0.01057275, + "balance_loss_clip": 1.05667973, + "balance_loss_mlp": 1.03243148, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 2.0199849889578676, + "language_loss": 0.80726552, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82970303, + "num_input_tokens_seen": 22978100, + "step": 1077, + "time_per_iteration": 2.4722440242767334 + }, + { + "auxiliary_loss_clip": 0.01198839, + "auxiliary_loss_mlp": 0.01066645, + "balance_loss_clip": 1.06384277, + "balance_loss_mlp": 1.04010928, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.754388645738751, + "language_loss": 0.91646111, + "learning_rate": 3.987301088972986e-06, + "loss": 0.939116, + "num_input_tokens_seen": 22997285, + "step": 1078, + "time_per_iteration": 2.566697120666504 + }, + { + "auxiliary_loss_clip": 0.01217989, + "auxiliary_loss_mlp": 0.01064224, + "balance_loss_clip": 1.06510878, + "balance_loss_mlp": 1.03814089, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.1766847432965077, + "language_loss": 0.78933609, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81215823, + "num_input_tokens_seen": 23016285, + "step": 1079, + "time_per_iteration": 2.452397108078003 + }, + { + "auxiliary_loss_clip": 0.01154047, + "auxiliary_loss_mlp": 0.0106612, + "balance_loss_clip": 1.05251527, + "balance_loss_mlp": 1.04052567, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.0034096285595893, + "language_loss": 0.69555402, + "learning_rate": 3.987213301260294e-06, + "loss": 0.71775568, + "num_input_tokens_seen": 23036420, + "step": 1080, + "time_per_iteration": 2.5891592502593994 + }, + { + "auxiliary_loss_clip": 0.01180114, + "auxiliary_loss_mlp": 0.0106837, + "balance_loss_clip": 1.07167959, + "balance_loss_mlp": 1.04126167, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.790522606101361, + "language_loss": 0.725178, + "learning_rate": 3.987169294370123e-06, + "loss": 0.7476629, + "num_input_tokens_seen": 23056945, + "step": 1081, + "time_per_iteration": 2.626499891281128 + }, + { + "auxiliary_loss_clip": 0.01140139, + "auxiliary_loss_mlp": 0.01063146, + "balance_loss_clip": 1.04788423, + "balance_loss_mlp": 1.036538, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.517829268379828, + "language_loss": 0.84194809, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86398089, + "num_input_tokens_seen": 23074940, + "step": 1082, + "time_per_iteration": 2.5597856044769287 + }, + { + "auxiliary_loss_clip": 0.01202512, + "auxiliary_loss_mlp": 0.01065972, + "balance_loss_clip": 1.06180286, + "balance_loss_mlp": 1.03979385, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 3.210666436889839, + "language_loss": 0.82959962, + "learning_rate": 3.987081054530478e-06, + "loss": 0.85228455, + "num_input_tokens_seen": 23093420, + "step": 1083, + "time_per_iteration": 2.506822109222412 + }, + { + "auxiliary_loss_clip": 0.01168052, + "auxiliary_loss_mlp": 0.01062328, + "balance_loss_clip": 1.05958092, + "balance_loss_mlp": 1.03568459, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.797694025564787, + "language_loss": 0.79181719, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81412095, + "num_input_tokens_seen": 23111550, + "step": 1084, + "time_per_iteration": 2.536992073059082 + }, + { + "auxiliary_loss_clip": 0.0117366, + "auxiliary_loss_mlp": 0.01065466, + "balance_loss_clip": 1.05713296, + "balance_loss_mlp": 1.03954983, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.0031267982401437, + "language_loss": 0.66456729, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68695855, + "num_input_tokens_seen": 23130335, + "step": 1085, + "time_per_iteration": 2.5872998237609863 + }, + { + "auxiliary_loss_clip": 0.01170329, + "auxiliary_loss_mlp": 0.01072361, + "balance_loss_clip": 1.05859733, + "balance_loss_mlp": 1.04738665, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 2.0414457049493997, + "language_loss": 0.76706672, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.78949362, + "num_input_tokens_seen": 23152380, + "step": 1086, + "time_per_iteration": 2.6307997703552246 + }, + { + "auxiliary_loss_clip": 0.01195198, + "auxiliary_loss_mlp": 0.0105905, + "balance_loss_clip": 1.06079555, + "balance_loss_mlp": 1.03374159, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.7457326206597887, + "language_loss": 0.85335886, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87590128, + "num_input_tokens_seen": 23171630, + "step": 1087, + "time_per_iteration": 2.470217227935791 + }, + { + "auxiliary_loss_clip": 0.01188, + "auxiliary_loss_mlp": 0.01061745, + "balance_loss_clip": 1.06498158, + "balance_loss_mlp": 1.03706908, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 2.7088979894374887, + "language_loss": 0.77703029, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.79952776, + "num_input_tokens_seen": 23192520, + "step": 1088, + "time_per_iteration": 2.569124937057495 + }, + { + "auxiliary_loss_clip": 0.01190799, + "auxiliary_loss_mlp": 0.01061681, + "balance_loss_clip": 1.06485307, + "balance_loss_mlp": 1.03839993, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 1.8314618847431274, + "language_loss": 0.71379608, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73632091, + "num_input_tokens_seen": 23210710, + "step": 1089, + "time_per_iteration": 2.540496587753296 + }, + { + "auxiliary_loss_clip": 0.01171179, + "auxiliary_loss_mlp": 0.00796092, + "balance_loss_clip": 1.05894947, + "balance_loss_mlp": 1.0007906, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.6388526170283193, + "language_loss": 0.85697711, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.87664986, + "num_input_tokens_seen": 23230305, + "step": 1090, + "time_per_iteration": 2.537444591522217 + }, + { + "auxiliary_loss_clip": 0.0121328, + "auxiliary_loss_mlp": 0.01063279, + "balance_loss_clip": 1.06569326, + "balance_loss_mlp": 1.03843617, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 1.7476818198943462, + "language_loss": 0.71614128, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.73890686, + "num_input_tokens_seen": 23249015, + "step": 1091, + "time_per_iteration": 2.483433723449707 + }, + { + "auxiliary_loss_clip": 0.01121828, + "auxiliary_loss_mlp": 0.01069717, + "balance_loss_clip": 1.05140495, + "balance_loss_mlp": 1.04376483, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.3662320712311065, + "language_loss": 0.8290332, + "learning_rate": 3.986680245605936e-06, + "loss": 0.85094863, + "num_input_tokens_seen": 23265105, + "step": 1092, + "time_per_iteration": 2.9230129718780518 + }, + { + "auxiliary_loss_clip": 0.01212549, + "auxiliary_loss_mlp": 0.0106653, + "balance_loss_clip": 1.06103945, + "balance_loss_mlp": 1.0384686, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 1.9848530078041557, + "language_loss": 0.71070993, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73350066, + "num_input_tokens_seen": 23283950, + "step": 1093, + "time_per_iteration": 3.0072731971740723 + }, + { + "auxiliary_loss_clip": 0.01188276, + "auxiliary_loss_mlp": 0.0106507, + "balance_loss_clip": 1.06465626, + "balance_loss_mlp": 1.03824806, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.7175163467618353, + "language_loss": 0.87972271, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90225619, + "num_input_tokens_seen": 23305005, + "step": 1094, + "time_per_iteration": 2.552563190460205 + }, + { + "auxiliary_loss_clip": 0.01192506, + "auxiliary_loss_mlp": 0.01068295, + "balance_loss_clip": 1.06378317, + "balance_loss_mlp": 1.0405674, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 2.582835518653637, + "language_loss": 0.81442082, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83702886, + "num_input_tokens_seen": 23323220, + "step": 1095, + "time_per_iteration": 2.5536696910858154 + }, + { + "auxiliary_loss_clip": 0.01162873, + "auxiliary_loss_mlp": 0.01057891, + "balance_loss_clip": 1.06038225, + "balance_loss_mlp": 1.03489566, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.1982192492438495, + "language_loss": 0.6956259, + "learning_rate": 3.986500149519811e-06, + "loss": 0.71783358, + "num_input_tokens_seen": 23342235, + "step": 1096, + "time_per_iteration": 4.034250020980835 + }, + { + "auxiliary_loss_clip": 0.01202248, + "auxiliary_loss_mlp": 0.0107665, + "balance_loss_clip": 1.06417823, + "balance_loss_mlp": 1.05079341, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 2.249467289786796, + "language_loss": 0.77905083, + "learning_rate": 3.986454937173292e-06, + "loss": 0.80183971, + "num_input_tokens_seen": 23363680, + "step": 1097, + "time_per_iteration": 2.520798921585083 + }, + { + "auxiliary_loss_clip": 0.01210115, + "auxiliary_loss_mlp": 0.01068975, + "balance_loss_clip": 1.06258559, + "balance_loss_mlp": 1.04354775, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 3.379701718066528, + "language_loss": 0.77963519, + "learning_rate": 3.986409649500203e-06, + "loss": 0.8024261, + "num_input_tokens_seen": 23385590, + "step": 1098, + "time_per_iteration": 3.967559814453125 + }, + { + "auxiliary_loss_clip": 0.01195445, + "auxiliary_loss_mlp": 0.01076228, + "balance_loss_clip": 1.06020272, + "balance_loss_mlp": 1.04940593, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 1.920354706834002, + "language_loss": 0.82041609, + "learning_rate": 3.986364286502261e-06, + "loss": 0.84313279, + "num_input_tokens_seen": 23402945, + "step": 1099, + "time_per_iteration": 2.479102373123169 + }, + { + "auxiliary_loss_clip": 0.01176277, + "auxiliary_loss_mlp": 0.01061042, + "balance_loss_clip": 1.05339396, + "balance_loss_mlp": 1.03494728, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 3.5662691070025705, + "language_loss": 0.82891846, + "learning_rate": 3.986318848181186e-06, + "loss": 0.85129166, + "num_input_tokens_seen": 23421410, + "step": 1100, + "time_per_iteration": 2.4698448181152344 + }, + { + "auxiliary_loss_clip": 0.01185198, + "auxiliary_loss_mlp": 0.01071187, + "balance_loss_clip": 1.06465065, + "balance_loss_mlp": 1.04655862, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 12.955492390187858, + "language_loss": 0.73132193, + "learning_rate": 3.986273334538702e-06, + "loss": 0.75388581, + "num_input_tokens_seen": 23438870, + "step": 1101, + "time_per_iteration": 2.500384569168091 + }, + { + "auxiliary_loss_clip": 0.01195306, + "auxiliary_loss_mlp": 0.01065047, + "balance_loss_clip": 1.057863, + "balance_loss_mlp": 1.04027486, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 2.670219440649662, + "language_loss": 0.86015546, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88275898, + "num_input_tokens_seen": 23456975, + "step": 1102, + "time_per_iteration": 3.85739803314209 + }, + { + "auxiliary_loss_clip": 0.01185658, + "auxiliary_loss_mlp": 0.01064143, + "balance_loss_clip": 1.06260812, + "balance_loss_mlp": 1.03833437, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.1432493588661297, + "language_loss": 0.81283331, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83533126, + "num_input_tokens_seen": 23473440, + "step": 1103, + "time_per_iteration": 2.486717939376831 + }, + { + "auxiliary_loss_clip": 0.01199576, + "auxiliary_loss_mlp": 0.00793973, + "balance_loss_clip": 1.06350088, + "balance_loss_mlp": 1.00080562, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 2.146728095098504, + "language_loss": 0.82200688, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84194231, + "num_input_tokens_seen": 23493880, + "step": 1104, + "time_per_iteration": 2.4758152961730957 + }, + { + "auxiliary_loss_clip": 0.0115953, + "auxiliary_loss_mlp": 0.01054805, + "balance_loss_clip": 1.05183113, + "balance_loss_mlp": 1.02835286, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 2.2386895871543726, + "language_loss": 0.80573392, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82787728, + "num_input_tokens_seen": 23514920, + "step": 1105, + "time_per_iteration": 2.5654919147491455 + }, + { + "auxiliary_loss_clip": 0.01176278, + "auxiliary_loss_mlp": 0.01063242, + "balance_loss_clip": 1.0598402, + "balance_loss_mlp": 1.03943563, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 2.2703647952330224, + "language_loss": 0.96605468, + "learning_rate": 3.986044636565639e-06, + "loss": 0.98844981, + "num_input_tokens_seen": 23531635, + "step": 1106, + "time_per_iteration": 2.4987831115722656 + }, + { + "auxiliary_loss_clip": 0.01196862, + "auxiliary_loss_mlp": 0.01064324, + "balance_loss_clip": 1.05887198, + "balance_loss_mlp": 1.03865814, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 2.4741475784942955, + "language_loss": 0.82699049, + "learning_rate": 3.985998671031039e-06, + "loss": 0.84960234, + "num_input_tokens_seen": 23551020, + "step": 1107, + "time_per_iteration": 2.463742256164551 + }, + { + "auxiliary_loss_clip": 0.0108124, + "auxiliary_loss_mlp": 0.01018674, + "balance_loss_clip": 1.02879834, + "balance_loss_mlp": 1.01464427, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.7994576644484309, + "language_loss": 0.56688792, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58788699, + "num_input_tokens_seen": 23610675, + "step": 1108, + "time_per_iteration": 3.0393779277801514 + }, + { + "auxiliary_loss_clip": 0.01182794, + "auxiliary_loss_mlp": 0.01062515, + "balance_loss_clip": 1.05426145, + "balance_loss_mlp": 1.03636026, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 2.6586028141751568, + "language_loss": 0.72580659, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.74825966, + "num_input_tokens_seen": 23628710, + "step": 1109, + "time_per_iteration": 2.5064187049865723 + }, + { + "auxiliary_loss_clip": 0.01148887, + "auxiliary_loss_mlp": 0.01066654, + "balance_loss_clip": 1.05282807, + "balance_loss_mlp": 1.04089308, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 1.6882726193570703, + "language_loss": 0.78183305, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80398846, + "num_input_tokens_seen": 23649160, + "step": 1110, + "time_per_iteration": 2.58931565284729 + }, + { + "auxiliary_loss_clip": 0.01153341, + "auxiliary_loss_mlp": 0.01064094, + "balance_loss_clip": 1.0536499, + "balance_loss_mlp": 1.03879786, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.070190155789009, + "language_loss": 0.7153132, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73748755, + "num_input_tokens_seen": 23671995, + "step": 1111, + "time_per_iteration": 4.036217212677002 + }, + { + "auxiliary_loss_clip": 0.01169531, + "auxiliary_loss_mlp": 0.0107435, + "balance_loss_clip": 1.05476487, + "balance_loss_mlp": 1.04908919, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 1.9814389425406356, + "language_loss": 0.78610122, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80853999, + "num_input_tokens_seen": 23690705, + "step": 1112, + "time_per_iteration": 2.574164628982544 + }, + { + "auxiliary_loss_clip": 0.0115379, + "auxiliary_loss_mlp": 0.01066016, + "balance_loss_clip": 1.0519619, + "balance_loss_mlp": 1.04155397, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 3.0285785719378295, + "language_loss": 0.78878516, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81098324, + "num_input_tokens_seen": 23709990, + "step": 1113, + "time_per_iteration": 2.5713179111480713 + }, + { + "auxiliary_loss_clip": 0.01147245, + "auxiliary_loss_mlp": 0.01064011, + "balance_loss_clip": 1.04773211, + "balance_loss_mlp": 1.03981185, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 2.4407514916467314, + "language_loss": 0.82483733, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84694988, + "num_input_tokens_seen": 23728485, + "step": 1114, + "time_per_iteration": 2.586035966873169 + }, + { + "auxiliary_loss_clip": 0.0105482, + "auxiliary_loss_mlp": 0.01060662, + "balance_loss_clip": 1.03398824, + "balance_loss_mlp": 1.05691898, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8467354087605319, + "language_loss": 0.58120012, + "learning_rate": 3.985628235767584e-06, + "loss": 0.602355, + "num_input_tokens_seen": 23786650, + "step": 1115, + "time_per_iteration": 3.634582996368408 + }, + { + "auxiliary_loss_clip": 0.01177719, + "auxiliary_loss_mlp": 0.0106499, + "balance_loss_clip": 1.06049156, + "balance_loss_mlp": 1.0387876, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 3.307709318845581, + "language_loss": 0.91389418, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93632126, + "num_input_tokens_seen": 23802555, + "step": 1116, + "time_per_iteration": 2.8012733459472656 + }, + { + "auxiliary_loss_clip": 0.01167189, + "auxiliary_loss_mlp": 0.00795755, + "balance_loss_clip": 1.05762458, + "balance_loss_mlp": 1.00089216, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 3.046981632972151, + "language_loss": 0.87607139, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89570081, + "num_input_tokens_seen": 23822945, + "step": 1117, + "time_per_iteration": 2.6060760021209717 + }, + { + "auxiliary_loss_clip": 0.01081393, + "auxiliary_loss_mlp": 0.01005271, + "balance_loss_clip": 1.03147125, + "balance_loss_mlp": 1.00186121, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.8594883097155388, + "language_loss": 0.59761041, + "learning_rate": 3.985488080124218e-06, + "loss": 0.61847705, + "num_input_tokens_seen": 23874075, + "step": 1118, + "time_per_iteration": 3.005596399307251 + }, + { + "auxiliary_loss_clip": 0.01177962, + "auxiliary_loss_mlp": 0.01058522, + "balance_loss_clip": 1.0519973, + "balance_loss_mlp": 1.03316617, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.3243258734468926, + "language_loss": 0.83576965, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85813451, + "num_input_tokens_seen": 23889720, + "step": 1119, + "time_per_iteration": 2.535994291305542 + }, + { + "auxiliary_loss_clip": 0.01181718, + "auxiliary_loss_mlp": 0.01066877, + "balance_loss_clip": 1.05993903, + "balance_loss_mlp": 1.0439415, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 1.9828265243894598, + "language_loss": 0.85109377, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87357974, + "num_input_tokens_seen": 23909385, + "step": 1120, + "time_per_iteration": 2.564821243286133 + }, + { + "auxiliary_loss_clip": 0.0121412, + "auxiliary_loss_mlp": 0.01065915, + "balance_loss_clip": 1.06624579, + "balance_loss_mlp": 1.04049957, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 1.8095988489477266, + "language_loss": 0.78564101, + "learning_rate": 3.985347246871708e-06, + "loss": 0.80844128, + "num_input_tokens_seen": 23926830, + "step": 1121, + "time_per_iteration": 2.5046629905700684 + }, + { + "auxiliary_loss_clip": 0.01076313, + "auxiliary_loss_mlp": 0.01007093, + "balance_loss_clip": 1.02938724, + "balance_loss_mlp": 1.00344551, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7501880960958303, + "language_loss": 0.58359683, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60443079, + "num_input_tokens_seen": 23992640, + "step": 1122, + "time_per_iteration": 3.1954939365386963 + }, + { + "auxiliary_loss_clip": 0.01145752, + "auxiliary_loss_mlp": 0.01064326, + "balance_loss_clip": 1.05508852, + "balance_loss_mlp": 1.03953052, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 2.0339570474604183, + "language_loss": 0.71583378, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73793459, + "num_input_tokens_seen": 24011135, + "step": 1123, + "time_per_iteration": 2.6383590698242188 + }, + { + "auxiliary_loss_clip": 0.01143889, + "auxiliary_loss_mlp": 0.01064724, + "balance_loss_clip": 1.05194283, + "balance_loss_mlp": 1.03651905, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 1.809784490103701, + "language_loss": 0.78886998, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81095612, + "num_input_tokens_seen": 24030695, + "step": 1124, + "time_per_iteration": 2.6419880390167236 + }, + { + "auxiliary_loss_clip": 0.01188953, + "auxiliary_loss_mlp": 0.01057089, + "balance_loss_clip": 1.05701721, + "balance_loss_mlp": 1.03426039, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.470480890507755, + "language_loss": 0.71823448, + "learning_rate": 3.985158415226128e-06, + "loss": 0.74069488, + "num_input_tokens_seen": 24050680, + "step": 1125, + "time_per_iteration": 2.519120216369629 + }, + { + "auxiliary_loss_clip": 0.01168633, + "auxiliary_loss_mlp": 0.01075303, + "balance_loss_clip": 1.06105971, + "balance_loss_mlp": 1.04948258, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 3.1367164956960756, + "language_loss": 0.81196654, + "learning_rate": 3.985111019116736e-06, + "loss": 0.8344059, + "num_input_tokens_seen": 24067205, + "step": 1126, + "time_per_iteration": 2.5861518383026123 + }, + { + "auxiliary_loss_clip": 0.01065085, + "auxiliary_loss_mlp": 0.01012676, + "balance_loss_clip": 1.02601314, + "balance_loss_mlp": 1.00864649, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.9037812939541403, + "language_loss": 0.59770507, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61848271, + "num_input_tokens_seen": 24131320, + "step": 1127, + "time_per_iteration": 3.1128363609313965 + }, + { + "auxiliary_loss_clip": 0.01204817, + "auxiliary_loss_mlp": 0.01056232, + "balance_loss_clip": 1.06277382, + "balance_loss_mlp": 1.0324018, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.3909383840228045, + "language_loss": 0.8156873, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83829778, + "num_input_tokens_seen": 24149930, + "step": 1128, + "time_per_iteration": 2.509495973587036 + }, + { + "auxiliary_loss_clip": 0.0116217, + "auxiliary_loss_mlp": 0.01055287, + "balance_loss_clip": 1.05514646, + "balance_loss_mlp": 1.03025341, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 2.7590569135249994, + "language_loss": 0.75436091, + "learning_rate": 3.984968379142109e-06, + "loss": 0.77653545, + "num_input_tokens_seen": 24169590, + "step": 1129, + "time_per_iteration": 2.5896360874176025 + }, + { + "auxiliary_loss_clip": 0.01124013, + "auxiliary_loss_mlp": 0.01058763, + "balance_loss_clip": 1.04833579, + "balance_loss_mlp": 1.03399134, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.9601803134967726, + "language_loss": 0.7186805, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74050832, + "num_input_tokens_seen": 24189965, + "step": 1130, + "time_per_iteration": 2.895988702774048 + }, + { + "auxiliary_loss_clip": 0.01160166, + "auxiliary_loss_mlp": 0.010631, + "balance_loss_clip": 1.05939722, + "balance_loss_mlp": 1.03891182, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 3.5522486939116296, + "language_loss": 0.80949038, + "learning_rate": 3.984872909471688e-06, + "loss": 0.83172303, + "num_input_tokens_seen": 24208045, + "step": 1131, + "time_per_iteration": 3.1089272499084473 + }, + { + "auxiliary_loss_clip": 0.01197392, + "auxiliary_loss_mlp": 0.01069022, + "balance_loss_clip": 1.06431651, + "balance_loss_mlp": 1.04541814, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.73269156593683, + "language_loss": 0.80695581, + "learning_rate": 3.984825061735701e-06, + "loss": 0.82961994, + "num_input_tokens_seen": 24223805, + "step": 1132, + "time_per_iteration": 2.48874831199646 + }, + { + "auxiliary_loss_clip": 0.01173971, + "auxiliary_loss_mlp": 0.01063492, + "balance_loss_clip": 1.05761826, + "balance_loss_mlp": 1.04005551, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.855127676223435, + "language_loss": 0.63590693, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65828156, + "num_input_tokens_seen": 24249475, + "step": 1133, + "time_per_iteration": 2.834726572036743 + }, + { + "auxiliary_loss_clip": 0.01128134, + "auxiliary_loss_mlp": 0.01068375, + "balance_loss_clip": 1.04731297, + "balance_loss_mlp": 1.04106379, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.858527713598741, + "language_loss": 0.74736333, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.76932842, + "num_input_tokens_seen": 24267980, + "step": 1134, + "time_per_iteration": 2.6321966648101807 + }, + { + "auxiliary_loss_clip": 0.01169931, + "auxiliary_loss_mlp": 0.00794433, + "balance_loss_clip": 1.05800128, + "balance_loss_mlp": 1.00079179, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 1.915609656264408, + "language_loss": 0.87083465, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89047825, + "num_input_tokens_seen": 24286805, + "step": 1135, + "time_per_iteration": 4.10443639755249 + }, + { + "auxiliary_loss_clip": 0.01176951, + "auxiliary_loss_mlp": 0.00793603, + "balance_loss_clip": 1.05484819, + "balance_loss_mlp": 1.00081921, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.359656193826117, + "language_loss": 0.78508532, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80479085, + "num_input_tokens_seen": 24305855, + "step": 1136, + "time_per_iteration": 2.561399459838867 + }, + { + "auxiliary_loss_clip": 0.01188212, + "auxiliary_loss_mlp": 0.0106646, + "balance_loss_clip": 1.06252551, + "balance_loss_mlp": 1.04167676, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 2.065672592476005, + "language_loss": 0.84015656, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86270332, + "num_input_tokens_seen": 24326535, + "step": 1137, + "time_per_iteration": 2.716338872909546 + }, + { + "auxiliary_loss_clip": 0.01158043, + "auxiliary_loss_mlp": 0.01062607, + "balance_loss_clip": 1.05409217, + "balance_loss_mlp": 1.03836012, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 3.2833067832619975, + "language_loss": 0.78919619, + "learning_rate": 3.984536394823418e-06, + "loss": 0.81140268, + "num_input_tokens_seen": 24345810, + "step": 1138, + "time_per_iteration": 4.061971187591553 + }, + { + "auxiliary_loss_clip": 0.01210372, + "auxiliary_loss_mlp": 0.01061646, + "balance_loss_clip": 1.06496179, + "balance_loss_mlp": 1.03754234, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.5774397168973895, + "language_loss": 0.85795498, + "learning_rate": 3.984488020272336e-06, + "loss": 0.8806752, + "num_input_tokens_seen": 24366095, + "step": 1139, + "time_per_iteration": 2.5201356410980225 + }, + { + "auxiliary_loss_clip": 0.01157243, + "auxiliary_loss_mlp": 0.01062389, + "balance_loss_clip": 1.05437601, + "balance_loss_mlp": 1.03640187, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 2.5517098984827804, + "language_loss": 0.74757707, + "learning_rate": 3.984439570469271e-06, + "loss": 0.76977336, + "num_input_tokens_seen": 24388665, + "step": 1140, + "time_per_iteration": 2.71315598487854 + }, + { + "auxiliary_loss_clip": 0.01187793, + "auxiliary_loss_mlp": 0.0079434, + "balance_loss_clip": 1.06125617, + "balance_loss_mlp": 1.00075912, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 5.150163211250629, + "language_loss": 0.6786654, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.69848669, + "num_input_tokens_seen": 24407705, + "step": 1141, + "time_per_iteration": 4.0122971534729 + }, + { + "auxiliary_loss_clip": 0.01201512, + "auxiliary_loss_mlp": 0.01067778, + "balance_loss_clip": 1.063223, + "balance_loss_mlp": 1.04329216, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 2.220045203617276, + "language_loss": 0.79111302, + "learning_rate": 3.984342445114538e-06, + "loss": 0.81380594, + "num_input_tokens_seen": 24428390, + "step": 1142, + "time_per_iteration": 2.5490665435791016 + }, + { + "auxiliary_loss_clip": 0.01185749, + "auxiliary_loss_mlp": 0.01065211, + "balance_loss_clip": 1.06195116, + "balance_loss_mlp": 1.04145288, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 1.8661838957129564, + "language_loss": 0.68831909, + "learning_rate": 3.984293769566553e-06, + "loss": 0.71082866, + "num_input_tokens_seen": 24450810, + "step": 1143, + "time_per_iteration": 2.615567207336426 + }, + { + "auxiliary_loss_clip": 0.01178849, + "auxiliary_loss_mlp": 0.0106522, + "balance_loss_clip": 1.05906081, + "balance_loss_mlp": 1.04297519, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 2.8010484094086494, + "language_loss": 0.74260473, + "learning_rate": 3.98424501877395e-06, + "loss": 0.7650454, + "num_input_tokens_seen": 24469965, + "step": 1144, + "time_per_iteration": 2.5528669357299805 + }, + { + "auxiliary_loss_clip": 0.01195163, + "auxiliary_loss_mlp": 0.01065554, + "balance_loss_clip": 1.06009531, + "balance_loss_mlp": 1.03991151, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.7752862161267493, + "language_loss": 0.91910094, + "learning_rate": 3.984196192738577e-06, + "loss": 0.94170809, + "num_input_tokens_seen": 24486370, + "step": 1145, + "time_per_iteration": 2.507546901702881 + }, + { + "auxiliary_loss_clip": 0.01214481, + "auxiliary_loss_mlp": 0.01071678, + "balance_loss_clip": 1.06415868, + "balance_loss_mlp": 1.04632187, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.877334961826918, + "language_loss": 0.81787765, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84073925, + "num_input_tokens_seen": 24503780, + "step": 1146, + "time_per_iteration": 2.4889025688171387 + }, + { + "auxiliary_loss_clip": 0.01205318, + "auxiliary_loss_mlp": 0.01062644, + "balance_loss_clip": 1.06447101, + "balance_loss_mlp": 1.03977942, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.093584845579487, + "language_loss": 0.8532021, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87588173, + "num_input_tokens_seen": 24522320, + "step": 1147, + "time_per_iteration": 2.559809684753418 + }, + { + "auxiliary_loss_clip": 0.01155213, + "auxiliary_loss_mlp": 0.01064068, + "balance_loss_clip": 1.05613279, + "balance_loss_mlp": 1.03980875, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 3.0073094324669776, + "language_loss": 0.86092937, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88312215, + "num_input_tokens_seen": 24540445, + "step": 1148, + "time_per_iteration": 2.585946559906006 + }, + { + "auxiliary_loss_clip": 0.0117591, + "auxiliary_loss_mlp": 0.01062605, + "balance_loss_clip": 1.05865765, + "balance_loss_mlp": 1.03624749, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.328324665182935, + "language_loss": 0.69144922, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.7138344, + "num_input_tokens_seen": 24557105, + "step": 1149, + "time_per_iteration": 2.5432522296905518 + }, + { + "auxiliary_loss_clip": 0.01212083, + "auxiliary_loss_mlp": 0.01057979, + "balance_loss_clip": 1.06471872, + "balance_loss_mlp": 1.03276634, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 2.363323167147532, + "language_loss": 0.83813876, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86083937, + "num_input_tokens_seen": 24578240, + "step": 1150, + "time_per_iteration": 4.000298500061035 + }, + { + "auxiliary_loss_clip": 0.01185573, + "auxiliary_loss_mlp": 0.0105827, + "balance_loss_clip": 1.06498671, + "balance_loss_mlp": 1.03415346, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 4.563681318568086, + "language_loss": 0.81640667, + "learning_rate": 3.983901656532052e-06, + "loss": 0.83884513, + "num_input_tokens_seen": 24593585, + "step": 1151, + "time_per_iteration": 2.60555362701416 + }, + { + "auxiliary_loss_clip": 0.01207145, + "auxiliary_loss_mlp": 0.01059881, + "balance_loss_clip": 1.06634259, + "balance_loss_mlp": 1.03617036, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.9291108510536752, + "language_loss": 0.85470575, + "learning_rate": 3.983852303849291e-06, + "loss": 0.87737608, + "num_input_tokens_seen": 24613110, + "step": 1152, + "time_per_iteration": 2.518155574798584 + }, + { + "auxiliary_loss_clip": 0.01191305, + "auxiliary_loss_mlp": 0.01058545, + "balance_loss_clip": 1.06479383, + "balance_loss_mlp": 1.03564453, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 3.315040605696331, + "language_loss": 0.90708554, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92958391, + "num_input_tokens_seen": 24628795, + "step": 1153, + "time_per_iteration": 2.577810525894165 + }, + { + "auxiliary_loss_clip": 0.01174179, + "auxiliary_loss_mlp": 0.01058219, + "balance_loss_clip": 1.06064558, + "balance_loss_mlp": 1.03460383, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.271941546021802, + "language_loss": 0.81658846, + "learning_rate": 3.983753372802008e-06, + "loss": 0.83891249, + "num_input_tokens_seen": 24645480, + "step": 1154, + "time_per_iteration": 2.5514533519744873 + }, + { + "auxiliary_loss_clip": 0.01184924, + "auxiliary_loss_mlp": 0.01066901, + "balance_loss_clip": 1.06749666, + "balance_loss_mlp": 1.04295158, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 1.8034584481841862, + "language_loss": 0.74776363, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77028191, + "num_input_tokens_seen": 24664630, + "step": 1155, + "time_per_iteration": 2.587172508239746 + }, + { + "auxiliary_loss_clip": 0.01183999, + "auxiliary_loss_mlp": 0.0079465, + "balance_loss_clip": 1.05777097, + "balance_loss_mlp": 1.00083113, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.775726040236368, + "language_loss": 0.7067309, + "learning_rate": 3.98365414085822e-06, + "loss": 0.72651744, + "num_input_tokens_seen": 24684210, + "step": 1156, + "time_per_iteration": 2.5589537620544434 + }, + { + "auxiliary_loss_clip": 0.01181464, + "auxiliary_loss_mlp": 0.00795742, + "balance_loss_clip": 1.06204391, + "balance_loss_mlp": 1.00077796, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 2.134879496968512, + "language_loss": 0.74748194, + "learning_rate": 3.98360441205484e-06, + "loss": 0.76725399, + "num_input_tokens_seen": 24702490, + "step": 1157, + "time_per_iteration": 2.6166908740997314 + }, + { + "auxiliary_loss_clip": 0.01179174, + "auxiliary_loss_mlp": 0.01058881, + "balance_loss_clip": 1.05946791, + "balance_loss_mlp": 1.03419244, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 2.9353382111248894, + "language_loss": 0.71640992, + "learning_rate": 3.983554608032982e-06, + "loss": 0.73879051, + "num_input_tokens_seen": 24724340, + "step": 1158, + "time_per_iteration": 2.6053261756896973 + }, + { + "auxiliary_loss_clip": 0.01210297, + "auxiliary_loss_mlp": 0.01061535, + "balance_loss_clip": 1.06362116, + "balance_loss_mlp": 1.03642917, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.9119902667531807, + "language_loss": 0.79729557, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82001388, + "num_input_tokens_seen": 24745550, + "step": 1159, + "time_per_iteration": 2.5413379669189453 + }, + { + "auxiliary_loss_clip": 0.01212358, + "auxiliary_loss_mlp": 0.01066016, + "balance_loss_clip": 1.06555152, + "balance_loss_mlp": 1.03845477, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 3.2013435420059384, + "language_loss": 0.80739278, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83017647, + "num_input_tokens_seen": 24762575, + "step": 1160, + "time_per_iteration": 2.4652037620544434 + }, + { + "auxiliary_loss_clip": 0.01193808, + "auxiliary_loss_mlp": 0.01064051, + "balance_loss_clip": 1.05951142, + "balance_loss_mlp": 1.03851676, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.6593283927845397, + "language_loss": 0.76101589, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78359449, + "num_input_tokens_seen": 24782605, + "step": 1161, + "time_per_iteration": 2.5545239448547363 + }, + { + "auxiliary_loss_clip": 0.01175255, + "auxiliary_loss_mlp": 0.01062327, + "balance_loss_clip": 1.05795395, + "balance_loss_mlp": 1.03734028, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 1.9424124508838683, + "language_loss": 0.82849807, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85087383, + "num_input_tokens_seen": 24802910, + "step": 1162, + "time_per_iteration": 2.538623809814453 + }, + { + "auxiliary_loss_clip": 0.01183733, + "auxiliary_loss_mlp": 0.01058573, + "balance_loss_clip": 1.0580864, + "balance_loss_mlp": 1.03195322, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 2.2233203617138133, + "language_loss": 0.79328966, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81571275, + "num_input_tokens_seen": 24823305, + "step": 1163, + "time_per_iteration": 2.5804176330566406 + }, + { + "auxiliary_loss_clip": 0.01192359, + "auxiliary_loss_mlp": 0.01060457, + "balance_loss_clip": 1.06171608, + "balance_loss_mlp": 1.03454065, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 8.119712309320827, + "language_loss": 0.79151672, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81404495, + "num_input_tokens_seen": 24842155, + "step": 1164, + "time_per_iteration": 2.5069992542266846 + }, + { + "auxiliary_loss_clip": 0.01149218, + "auxiliary_loss_mlp": 0.01074895, + "balance_loss_clip": 1.05614316, + "balance_loss_mlp": 1.04670238, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 2.7916275986808388, + "language_loss": 0.72835171, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75059289, + "num_input_tokens_seen": 24862080, + "step": 1165, + "time_per_iteration": 2.5836689472198486 + }, + { + "auxiliary_loss_clip": 0.01184031, + "auxiliary_loss_mlp": 0.01060352, + "balance_loss_clip": 1.06144404, + "balance_loss_mlp": 1.0356164, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 1.9946473648907965, + "language_loss": 0.81131059, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83375442, + "num_input_tokens_seen": 24886165, + "step": 1166, + "time_per_iteration": 2.5847630500793457 + }, + { + "auxiliary_loss_clip": 0.01177291, + "auxiliary_loss_mlp": 0.01051661, + "balance_loss_clip": 1.05917394, + "balance_loss_mlp": 1.0266037, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 2.5361897988544877, + "language_loss": 0.84405887, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86634839, + "num_input_tokens_seen": 24905775, + "step": 1167, + "time_per_iteration": 2.5613019466400146 + }, + { + "auxiliary_loss_clip": 0.01194339, + "auxiliary_loss_mlp": 0.01066389, + "balance_loss_clip": 1.06330478, + "balance_loss_mlp": 1.04056859, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 7.15948362488454, + "language_loss": 0.89604771, + "learning_rate": 3.983052431214997e-06, + "loss": 0.91865498, + "num_input_tokens_seen": 24924295, + "step": 1168, + "time_per_iteration": 2.4760067462921143 + }, + { + "auxiliary_loss_clip": 0.01188266, + "auxiliary_loss_mlp": 0.01068082, + "balance_loss_clip": 1.05921984, + "balance_loss_mlp": 1.03937602, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.0807695531868946, + "language_loss": 0.88801658, + "learning_rate": 3.983001799915153e-06, + "loss": 0.9105801, + "num_input_tokens_seen": 24943210, + "step": 1169, + "time_per_iteration": 2.541893243789673 + }, + { + "auxiliary_loss_clip": 0.01210291, + "auxiliary_loss_mlp": 0.0107266, + "balance_loss_clip": 1.06439495, + "balance_loss_mlp": 1.04662514, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.2482619280569867, + "language_loss": 0.84014547, + "learning_rate": 3.982951093419681e-06, + "loss": 0.862975, + "num_input_tokens_seen": 24960360, + "step": 1170, + "time_per_iteration": 2.500476598739624 + }, + { + "auxiliary_loss_clip": 0.01180304, + "auxiliary_loss_mlp": 0.00794356, + "balance_loss_clip": 1.06156015, + "balance_loss_mlp": 1.00085115, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 9.59888666738657, + "language_loss": 0.75724322, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77698982, + "num_input_tokens_seen": 24978290, + "step": 1171, + "time_per_iteration": 2.5182011127471924 + }, + { + "auxiliary_loss_clip": 0.01177961, + "auxiliary_loss_mlp": 0.01062579, + "balance_loss_clip": 1.06023371, + "balance_loss_mlp": 1.03809297, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 1.8799459478158969, + "language_loss": 0.88830912, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91071451, + "num_input_tokens_seen": 24997055, + "step": 1172, + "time_per_iteration": 2.566152572631836 + }, + { + "auxiliary_loss_clip": 0.01190031, + "auxiliary_loss_mlp": 0.01061471, + "balance_loss_clip": 1.05899453, + "balance_loss_mlp": 1.03545952, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 4.039877925086409, + "language_loss": 0.8182689, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84078389, + "num_input_tokens_seen": 25017490, + "step": 1173, + "time_per_iteration": 2.57320499420166 + }, + { + "auxiliary_loss_clip": 0.01197585, + "auxiliary_loss_mlp": 0.01063032, + "balance_loss_clip": 1.06107271, + "balance_loss_mlp": 1.03696084, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 1.9863615033067774, + "language_loss": 0.82131064, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.84391683, + "num_input_tokens_seen": 25035660, + "step": 1174, + "time_per_iteration": 2.504293203353882 + }, + { + "auxiliary_loss_clip": 0.01176223, + "auxiliary_loss_mlp": 0.01064664, + "balance_loss_clip": 1.05417824, + "balance_loss_mlp": 1.04002357, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.9839090426156043, + "language_loss": 0.85283178, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87524062, + "num_input_tokens_seen": 25054785, + "step": 1175, + "time_per_iteration": 4.08319354057312 + }, + { + "auxiliary_loss_clip": 0.01194971, + "auxiliary_loss_mlp": 0.01073387, + "balance_loss_clip": 1.06332564, + "balance_loss_mlp": 1.04958093, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 1.8690715796785335, + "language_loss": 0.83180779, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85449147, + "num_input_tokens_seen": 25075180, + "step": 1176, + "time_per_iteration": 2.588122844696045 + }, + { + "auxiliary_loss_clip": 0.01153441, + "auxiliary_loss_mlp": 0.01064717, + "balance_loss_clip": 1.05346656, + "balance_loss_mlp": 1.03869379, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.663645540295704, + "language_loss": 0.74809784, + "learning_rate": 3.982594042635701e-06, + "loss": 0.77027941, + "num_input_tokens_seen": 25093035, + "step": 1177, + "time_per_iteration": 4.090673208236694 + }, + { + "auxiliary_loss_clip": 0.01185451, + "auxiliary_loss_mlp": 0.01059487, + "balance_loss_clip": 1.05887198, + "balance_loss_mlp": 1.03304672, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 2.0853386042657447, + "language_loss": 0.85581368, + "learning_rate": 3.982542734644673e-06, + "loss": 0.878263, + "num_input_tokens_seen": 25112520, + "step": 1178, + "time_per_iteration": 2.5626580715179443 + }, + { + "auxiliary_loss_clip": 0.01066136, + "auxiliary_loss_mlp": 0.01065605, + "balance_loss_clip": 1.02104449, + "balance_loss_mlp": 1.06126547, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8565069957067418, + "language_loss": 0.63180196, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65311939, + "num_input_tokens_seen": 25177760, + "step": 1179, + "time_per_iteration": 3.2646076679229736 + }, + { + "auxiliary_loss_clip": 0.0120509, + "auxiliary_loss_mlp": 0.01063384, + "balance_loss_clip": 1.06506658, + "balance_loss_mlp": 1.03920817, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.5888233884550917, + "language_loss": 0.83887738, + "learning_rate": 3.98243989312991e-06, + "loss": 0.86156213, + "num_input_tokens_seen": 25195260, + "step": 1180, + "time_per_iteration": 2.5844969749450684 + }, + { + "auxiliary_loss_clip": 0.01178994, + "auxiliary_loss_mlp": 0.01064571, + "balance_loss_clip": 1.05904901, + "balance_loss_mlp": 1.03938174, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.0761593003623946, + "language_loss": 0.88222933, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90466499, + "num_input_tokens_seen": 25212740, + "step": 1181, + "time_per_iteration": 4.089690923690796 + }, + { + "auxiliary_loss_clip": 0.0118211, + "auxiliary_loss_mlp": 0.01060479, + "balance_loss_clip": 1.06322551, + "balance_loss_mlp": 1.03563619, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 1.9598214824723361, + "language_loss": 0.83196402, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85438991, + "num_input_tokens_seen": 25236420, + "step": 1182, + "time_per_iteration": 2.809509754180908 + }, + { + "auxiliary_loss_clip": 0.01195529, + "auxiliary_loss_mlp": 0.01061709, + "balance_loss_clip": 1.06487858, + "balance_loss_mlp": 1.03586435, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 2.820563709373279, + "language_loss": 0.7867974, + "learning_rate": 3.982285067055262e-06, + "loss": 0.8093698, + "num_input_tokens_seen": 25255120, + "step": 1183, + "time_per_iteration": 2.574307918548584 + }, + { + "auxiliary_loss_clip": 0.01209108, + "auxiliary_loss_mlp": 0.01064184, + "balance_loss_clip": 1.05943584, + "balance_loss_mlp": 1.03831518, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.392838212373492, + "language_loss": 0.78950715, + "learning_rate": 3.982233308024204e-06, + "loss": 0.81224, + "num_input_tokens_seen": 25275150, + "step": 1184, + "time_per_iteration": 2.564800262451172 + }, + { + "auxiliary_loss_clip": 0.01149586, + "auxiliary_loss_mlp": 0.01060565, + "balance_loss_clip": 1.05682409, + "balance_loss_mlp": 1.03592443, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.012735455962106, + "language_loss": 0.76965243, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79175401, + "num_input_tokens_seen": 25293680, + "step": 1185, + "time_per_iteration": 2.6747121810913086 + }, + { + "auxiliary_loss_clip": 0.01207624, + "auxiliary_loss_mlp": 0.01068773, + "balance_loss_clip": 1.06285501, + "balance_loss_mlp": 1.04419231, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.491918078098215, + "language_loss": 0.65921593, + "learning_rate": 3.982129564464596e-06, + "loss": 0.68197989, + "num_input_tokens_seen": 25310050, + "step": 1186, + "time_per_iteration": 2.4548301696777344 + }, + { + "auxiliary_loss_clip": 0.01190424, + "auxiliary_loss_mlp": 0.01057115, + "balance_loss_clip": 1.05963421, + "balance_loss_mlp": 1.03249812, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 2.7359212264836614, + "language_loss": 0.6954236, + "learning_rate": 3.98207757993998e-06, + "loss": 0.71789896, + "num_input_tokens_seen": 25331020, + "step": 1187, + "time_per_iteration": 2.5516748428344727 + }, + { + "auxiliary_loss_clip": 0.01151767, + "auxiliary_loss_mlp": 0.01063648, + "balance_loss_clip": 1.05282867, + "balance_loss_mlp": 1.03903174, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.8542690725487176, + "language_loss": 0.78836906, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.81052315, + "num_input_tokens_seen": 25347875, + "step": 1188, + "time_per_iteration": 2.5896453857421875 + }, + { + "auxiliary_loss_clip": 0.01205727, + "auxiliary_loss_mlp": 0.01064258, + "balance_loss_clip": 1.06443334, + "balance_loss_mlp": 1.03849709, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 1.9334779891092022, + "language_loss": 0.84966445, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87236428, + "num_input_tokens_seen": 25366715, + "step": 1189, + "time_per_iteration": 3.9274652004241943 + }, + { + "auxiliary_loss_clip": 0.01172205, + "auxiliary_loss_mlp": 0.00793839, + "balance_loss_clip": 1.06265473, + "balance_loss_mlp": 1.00074911, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 2.3774187411723164, + "language_loss": 0.76715457, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.78681505, + "num_input_tokens_seen": 25385450, + "step": 1190, + "time_per_iteration": 2.638573408126831 + }, + { + "auxiliary_loss_clip": 0.01207597, + "auxiliary_loss_mlp": 0.01071107, + "balance_loss_clip": 1.06197262, + "balance_loss_mlp": 1.04442799, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.225428534942411, + "language_loss": 0.75490516, + "learning_rate": 3.981868890255468e-06, + "loss": 0.7776922, + "num_input_tokens_seen": 25403940, + "step": 1191, + "time_per_iteration": 2.4853079319000244 + }, + { + "auxiliary_loss_clip": 0.01166132, + "auxiliary_loss_mlp": 0.01066272, + "balance_loss_clip": 1.05320406, + "balance_loss_mlp": 1.03880596, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 3.1405422735225494, + "language_loss": 0.73852575, + "learning_rate": 3.981816529947719e-06, + "loss": 0.76084971, + "num_input_tokens_seen": 25420410, + "step": 1192, + "time_per_iteration": 2.6261332035064697 + }, + { + "auxiliary_loss_clip": 0.01201101, + "auxiliary_loss_mlp": 0.01053792, + "balance_loss_clip": 1.05762982, + "balance_loss_mlp": 1.03008175, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.0177199846908596, + "language_loss": 0.78081667, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80336565, + "num_input_tokens_seen": 25439415, + "step": 1193, + "time_per_iteration": 2.4986112117767334 + }, + { + "auxiliary_loss_clip": 0.01186946, + "auxiliary_loss_mlp": 0.01060832, + "balance_loss_clip": 1.06346917, + "balance_loss_mlp": 1.03472471, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 2.4368714964527154, + "language_loss": 0.85389876, + "learning_rate": 3.981711583882166e-06, + "loss": 0.87637651, + "num_input_tokens_seen": 25458715, + "step": 1194, + "time_per_iteration": 2.5442254543304443 + }, + { + "auxiliary_loss_clip": 0.0118501, + "auxiliary_loss_mlp": 0.01061254, + "balance_loss_clip": 1.0565753, + "balance_loss_mlp": 1.0357312, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 1.8965936644753374, + "language_loss": 0.81422639, + "learning_rate": 3.981658998128341e-06, + "loss": 0.836689, + "num_input_tokens_seen": 25477985, + "step": 1195, + "time_per_iteration": 2.5593955516815186 + }, + { + "auxiliary_loss_clip": 0.01168811, + "auxiliary_loss_mlp": 0.01059887, + "balance_loss_clip": 1.05853403, + "balance_loss_mlp": 1.03627193, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 2.138028822301237, + "language_loss": 0.79692066, + "learning_rate": 3.981606337229808e-06, + "loss": 0.81920767, + "num_input_tokens_seen": 25497110, + "step": 1196, + "time_per_iteration": 2.548786163330078 + }, + { + "auxiliary_loss_clip": 0.01176532, + "auxiliary_loss_mlp": 0.00797078, + "balance_loss_clip": 1.06083262, + "balance_loss_mlp": 1.00064588, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 2.472106533664537, + "language_loss": 0.7079348, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.72767091, + "num_input_tokens_seen": 25516555, + "step": 1197, + "time_per_iteration": 2.6654975414276123 + }, + { + "auxiliary_loss_clip": 0.01159135, + "auxiliary_loss_mlp": 0.01056681, + "balance_loss_clip": 1.06322134, + "balance_loss_mlp": 1.03130102, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.749192162901002, + "language_loss": 0.86033857, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88249671, + "num_input_tokens_seen": 25533895, + "step": 1198, + "time_per_iteration": 2.5852725505828857 + }, + { + "auxiliary_loss_clip": 0.01162573, + "auxiliary_loss_mlp": 0.01062938, + "balance_loss_clip": 1.06164289, + "balance_loss_mlp": 1.03685498, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.060402669067644, + "language_loss": 0.83885622, + "learning_rate": 3.981447903685947e-06, + "loss": 0.86111128, + "num_input_tokens_seen": 25554195, + "step": 1199, + "time_per_iteration": 2.644512414932251 + }, + { + "auxiliary_loss_clip": 0.0120966, + "auxiliary_loss_mlp": 0.01058571, + "balance_loss_clip": 1.06635404, + "balance_loss_mlp": 1.03475308, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.854260533977387, + "language_loss": 0.76560992, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78829229, + "num_input_tokens_seen": 25574155, + "step": 1200, + "time_per_iteration": 2.527773857116699 + }, + { + "auxiliary_loss_clip": 0.0119061, + "auxiliary_loss_mlp": 0.01073217, + "balance_loss_clip": 1.06537414, + "balance_loss_mlp": 1.04752791, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 2.21076598647393, + "language_loss": 0.82603467, + "learning_rate": 3.98134190563652e-06, + "loss": 0.84867293, + "num_input_tokens_seen": 25592735, + "step": 1201, + "time_per_iteration": 2.581367015838623 + }, + { + "auxiliary_loss_clip": 0.01198349, + "auxiliary_loss_mlp": 0.01061969, + "balance_loss_clip": 1.06226993, + "balance_loss_mlp": 1.03478956, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 2.992221936805793, + "language_loss": 0.68730503, + "learning_rate": 3.981288793911775e-06, + "loss": 0.70990825, + "num_input_tokens_seen": 25611510, + "step": 1202, + "time_per_iteration": 2.517117738723755 + }, + { + "auxiliary_loss_clip": 0.01182024, + "auxiliary_loss_mlp": 0.00795915, + "balance_loss_clip": 1.06087708, + "balance_loss_mlp": 1.00066507, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 2.2725198120090124, + "language_loss": 0.87765658, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89743596, + "num_input_tokens_seen": 25629560, + "step": 1203, + "time_per_iteration": 2.52459979057312 + }, + { + "auxiliary_loss_clip": 0.01161479, + "auxiliary_loss_mlp": 0.0106524, + "balance_loss_clip": 1.05606794, + "balance_loss_mlp": 1.03846538, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 2.082596330393399, + "language_loss": 0.7877022, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80996943, + "num_input_tokens_seen": 25648330, + "step": 1204, + "time_per_iteration": 2.5657901763916016 + }, + { + "auxiliary_loss_clip": 0.01190985, + "auxiliary_loss_mlp": 0.01073341, + "balance_loss_clip": 1.05985737, + "balance_loss_mlp": 1.04817617, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.6071104339804037, + "language_loss": 0.82425791, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84690118, + "num_input_tokens_seen": 25669470, + "step": 1205, + "time_per_iteration": 2.5783824920654297 + }, + { + "auxiliary_loss_clip": 0.01181435, + "auxiliary_loss_mlp": 0.00795146, + "balance_loss_clip": 1.06337857, + "balance_loss_mlp": 1.00064921, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 1.767070665758486, + "language_loss": 0.76510578, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78487158, + "num_input_tokens_seen": 25690470, + "step": 1206, + "time_per_iteration": 2.6302506923675537 + }, + { + "auxiliary_loss_clip": 0.01190253, + "auxiliary_loss_mlp": 0.01065665, + "balance_loss_clip": 1.06413364, + "balance_loss_mlp": 1.03993988, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 2.6052262519249956, + "language_loss": 0.77672434, + "learning_rate": 3.981022108368387e-06, + "loss": 0.7992835, + "num_input_tokens_seen": 25709205, + "step": 1207, + "time_per_iteration": 2.5195748805999756 + }, + { + "auxiliary_loss_clip": 0.01189163, + "auxiliary_loss_mlp": 0.01057529, + "balance_loss_clip": 1.05968809, + "balance_loss_mlp": 1.03380692, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 2.0969894319543565, + "language_loss": 0.79583555, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81830251, + "num_input_tokens_seen": 25728485, + "step": 1208, + "time_per_iteration": 2.547898054122925 + }, + { + "auxiliary_loss_clip": 0.01187499, + "auxiliary_loss_mlp": 0.01062507, + "balance_loss_clip": 1.05860996, + "balance_loss_mlp": 1.03771162, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 6.389526617657428, + "language_loss": 0.78647184, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80897188, + "num_input_tokens_seen": 25747730, + "step": 1209, + "time_per_iteration": 2.5279693603515625 + }, + { + "auxiliary_loss_clip": 0.01194368, + "auxiliary_loss_mlp": 0.01067338, + "balance_loss_clip": 1.06104112, + "balance_loss_mlp": 1.04279292, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 3.466796254739717, + "language_loss": 0.80955172, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83216882, + "num_input_tokens_seen": 25768050, + "step": 1210, + "time_per_iteration": 2.57283616065979 + }, + { + "auxiliary_loss_clip": 0.01174766, + "auxiliary_loss_mlp": 0.01066218, + "balance_loss_clip": 1.0596751, + "balance_loss_mlp": 1.04123211, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 3.068694873823838, + "language_loss": 0.84206003, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86446989, + "num_input_tokens_seen": 25787985, + "step": 1211, + "time_per_iteration": 2.5958750247955322 + }, + { + "auxiliary_loss_clip": 0.01162954, + "auxiliary_loss_mlp": 0.01056999, + "balance_loss_clip": 1.0516001, + "balance_loss_mlp": 1.03187013, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.9678588431418076, + "language_loss": 0.90476698, + "learning_rate": 3.98075354481122e-06, + "loss": 0.92696655, + "num_input_tokens_seen": 25803620, + "step": 1212, + "time_per_iteration": 2.5151641368865967 + }, + { + "auxiliary_loss_clip": 0.01203056, + "auxiliary_loss_mlp": 0.01061406, + "balance_loss_clip": 1.06214631, + "balance_loss_mlp": 1.03631234, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 1.927291170156467, + "language_loss": 0.72314441, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74578911, + "num_input_tokens_seen": 25823315, + "step": 1213, + "time_per_iteration": 2.517224073410034 + }, + { + "auxiliary_loss_clip": 0.01150786, + "auxiliary_loss_mlp": 0.01057942, + "balance_loss_clip": 1.05271077, + "balance_loss_mlp": 1.03276467, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 4.154430830600615, + "language_loss": 0.84015763, + "learning_rate": 3.980645593601465e-06, + "loss": 0.8622449, + "num_input_tokens_seen": 25842605, + "step": 1214, + "time_per_iteration": 4.1813225746154785 + }, + { + "auxiliary_loss_clip": 0.01209948, + "auxiliary_loss_mlp": 0.01061158, + "balance_loss_clip": 1.0646615, + "balance_loss_mlp": 1.03468144, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.177193143370097, + "language_loss": 0.84160519, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86431628, + "num_input_tokens_seen": 25863030, + "step": 1215, + "time_per_iteration": 2.554687261581421 + }, + { + "auxiliary_loss_clip": 0.01151998, + "auxiliary_loss_mlp": 0.01064234, + "balance_loss_clip": 1.04988694, + "balance_loss_mlp": 1.03797197, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 2.978857586426779, + "language_loss": 0.81231701, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83447933, + "num_input_tokens_seen": 25888015, + "step": 1216, + "time_per_iteration": 2.701465129852295 + }, + { + "auxiliary_loss_clip": 0.01171596, + "auxiliary_loss_mlp": 0.01059786, + "balance_loss_clip": 1.05592394, + "balance_loss_mlp": 1.03576529, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 3.3097190343585745, + "language_loss": 0.75937968, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78169346, + "num_input_tokens_seen": 25908660, + "step": 1217, + "time_per_iteration": 4.086913108825684 + }, + { + "auxiliary_loss_clip": 0.01172861, + "auxiliary_loss_mlp": 0.01062679, + "balance_loss_clip": 1.06277561, + "balance_loss_mlp": 1.03957665, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.0487836416128906, + "language_loss": 0.86321455, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88556993, + "num_input_tokens_seen": 25927215, + "step": 1218, + "time_per_iteration": 2.568754196166992 + }, + { + "auxiliary_loss_clip": 0.01193541, + "auxiliary_loss_mlp": 0.01065932, + "balance_loss_clip": 1.0597856, + "balance_loss_mlp": 1.04187512, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 1.8732329687584839, + "language_loss": 0.86807901, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.89067376, + "num_input_tokens_seen": 25945500, + "step": 1219, + "time_per_iteration": 2.550708055496216 + }, + { + "auxiliary_loss_clip": 0.01201471, + "auxiliary_loss_mlp": 0.01058205, + "balance_loss_clip": 1.0597012, + "balance_loss_mlp": 1.03467298, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 3.107766431820592, + "language_loss": 0.84570396, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86830074, + "num_input_tokens_seen": 25963105, + "step": 1220, + "time_per_iteration": 3.844134569168091 + }, + { + "auxiliary_loss_clip": 0.01161361, + "auxiliary_loss_mlp": 0.01063694, + "balance_loss_clip": 1.05320716, + "balance_loss_mlp": 1.03900623, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 3.003794781632345, + "language_loss": 0.77139229, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79364276, + "num_input_tokens_seen": 25981690, + "step": 1221, + "time_per_iteration": 2.5806822776794434 + }, + { + "auxiliary_loss_clip": 0.01163581, + "auxiliary_loss_mlp": 0.01070322, + "balance_loss_clip": 1.06029868, + "balance_loss_mlp": 1.04537153, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 1.8830876739476576, + "language_loss": 0.92056954, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94290853, + "num_input_tokens_seen": 25999890, + "step": 1222, + "time_per_iteration": 2.5699634552001953 + }, + { + "auxiliary_loss_clip": 0.01142621, + "auxiliary_loss_mlp": 0.01063415, + "balance_loss_clip": 1.05668592, + "balance_loss_mlp": 1.04016876, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 4.223990420453841, + "language_loss": 0.90669739, + "learning_rate": 3.980156095634242e-06, + "loss": 0.92875767, + "num_input_tokens_seen": 26016445, + "step": 1223, + "time_per_iteration": 2.619011163711548 + }, + { + "auxiliary_loss_clip": 0.01203009, + "auxiliary_loss_mlp": 0.01074268, + "balance_loss_clip": 1.06118882, + "balance_loss_mlp": 1.04928207, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 2.093377139917846, + "language_loss": 0.82092005, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84369278, + "num_input_tokens_seen": 26036080, + "step": 1224, + "time_per_iteration": 2.503087282180786 + }, + { + "auxiliary_loss_clip": 0.01201022, + "auxiliary_loss_mlp": 0.01059985, + "balance_loss_clip": 1.05918956, + "balance_loss_mlp": 1.03379416, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.3745345495293635, + "language_loss": 0.83298552, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.85559559, + "num_input_tokens_seen": 26055805, + "step": 1225, + "time_per_iteration": 2.485989570617676 + }, + { + "auxiliary_loss_clip": 0.01170551, + "auxiliary_loss_mlp": 0.01059861, + "balance_loss_clip": 1.0576508, + "balance_loss_mlp": 1.03581667, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 1.9609372118649957, + "language_loss": 0.90392828, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92623246, + "num_input_tokens_seen": 26073905, + "step": 1226, + "time_per_iteration": 2.5456900596618652 + }, + { + "auxiliary_loss_clip": 0.01211243, + "auxiliary_loss_mlp": 0.01051075, + "balance_loss_clip": 1.06038713, + "balance_loss_mlp": 1.02558851, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 3.0347921420049815, + "language_loss": 0.76527965, + "learning_rate": 3.97993658861193e-06, + "loss": 0.78790283, + "num_input_tokens_seen": 26091700, + "step": 1227, + "time_per_iteration": 2.472271680831909 + }, + { + "auxiliary_loss_clip": 0.01189366, + "auxiliary_loss_mlp": 0.01053497, + "balance_loss_clip": 1.06286407, + "balance_loss_mlp": 1.02967906, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.705206840159258, + "language_loss": 0.85349357, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.8759222, + "num_input_tokens_seen": 26114105, + "step": 1228, + "time_per_iteration": 3.9852311611175537 + }, + { + "auxiliary_loss_clip": 0.01190206, + "auxiliary_loss_mlp": 0.01062415, + "balance_loss_clip": 1.05804908, + "balance_loss_mlp": 1.03802502, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 2.3222658040785613, + "language_loss": 0.79573977, + "learning_rate": 3.97982638461608e-06, + "loss": 0.81826603, + "num_input_tokens_seen": 26131165, + "step": 1229, + "time_per_iteration": 2.4836955070495605 + }, + { + "auxiliary_loss_clip": 0.01192601, + "auxiliary_loss_mlp": 0.00793764, + "balance_loss_clip": 1.05921984, + "balance_loss_mlp": 1.00059676, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 1.9641271953284745, + "language_loss": 0.78137994, + "learning_rate": 3.979771170004287e-06, + "loss": 0.80124366, + "num_input_tokens_seen": 26150040, + "step": 1230, + "time_per_iteration": 2.489763021469116 + }, + { + "auxiliary_loss_clip": 0.01201703, + "auxiliary_loss_mlp": 0.0104958, + "balance_loss_clip": 1.0618062, + "balance_loss_mlp": 1.02465332, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 3.8823152159500633, + "language_loss": 0.81350446, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83601725, + "num_input_tokens_seen": 26169380, + "step": 1231, + "time_per_iteration": 2.499516248703003 + }, + { + "auxiliary_loss_clip": 0.0118067, + "auxiliary_loss_mlp": 0.01068809, + "balance_loss_clip": 1.05591583, + "balance_loss_mlp": 1.0432024, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.2493490414988364, + "language_loss": 0.95102471, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97351956, + "num_input_tokens_seen": 26189420, + "step": 1232, + "time_per_iteration": 2.5545847415924072 + }, + { + "auxiliary_loss_clip": 0.01186612, + "auxiliary_loss_mlp": 0.01056935, + "balance_loss_clip": 1.06117678, + "balance_loss_mlp": 1.03411889, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 1.774803205129442, + "language_loss": 0.81088018, + "learning_rate": 3.979605075738569e-06, + "loss": 0.83331567, + "num_input_tokens_seen": 26209300, + "step": 1233, + "time_per_iteration": 2.5241293907165527 + }, + { + "auxiliary_loss_clip": 0.01206026, + "auxiliary_loss_mlp": 0.01060906, + "balance_loss_clip": 1.05858874, + "balance_loss_mlp": 1.03473949, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.7345209733958122, + "language_loss": 0.7037493, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72641861, + "num_input_tokens_seen": 26228110, + "step": 1234, + "time_per_iteration": 2.5974974632263184 + }, + { + "auxiliary_loss_clip": 0.01173074, + "auxiliary_loss_mlp": 0.01069848, + "balance_loss_clip": 1.05648637, + "balance_loss_mlp": 1.04327619, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 2.016615903442107, + "language_loss": 0.76924384, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79167306, + "num_input_tokens_seen": 26247020, + "step": 1235, + "time_per_iteration": 2.5600759983062744 + }, + { + "auxiliary_loss_clip": 0.01198102, + "auxiliary_loss_mlp": 0.01058578, + "balance_loss_clip": 1.05982041, + "balance_loss_mlp": 1.03437901, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 2.231023077187181, + "language_loss": 0.8309747, + "learning_rate": 3.979438305871464e-06, + "loss": 0.85354149, + "num_input_tokens_seen": 26265750, + "step": 1236, + "time_per_iteration": 2.4760143756866455 + }, + { + "auxiliary_loss_clip": 0.01155583, + "auxiliary_loss_mlp": 0.00793074, + "balance_loss_clip": 1.05606484, + "balance_loss_mlp": 1.00065231, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 2.6453430450422344, + "language_loss": 0.75871259, + "learning_rate": 3.979382565791951e-06, + "loss": 0.7781992, + "num_input_tokens_seen": 26287905, + "step": 1237, + "time_per_iteration": 2.6262307167053223 + }, + { + "auxiliary_loss_clip": 0.01138509, + "auxiliary_loss_mlp": 0.00796008, + "balance_loss_clip": 1.05078077, + "balance_loss_mlp": 1.00061631, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.975472778013899, + "language_loss": 0.77125257, + "learning_rate": 3.979326750654053e-06, + "loss": 0.79059774, + "num_input_tokens_seen": 26311795, + "step": 1238, + "time_per_iteration": 2.697720766067505 + }, + { + "auxiliary_loss_clip": 0.01176795, + "auxiliary_loss_mlp": 0.01061574, + "balance_loss_clip": 1.05761957, + "balance_loss_mlp": 1.03726768, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.110661602298105, + "language_loss": 0.86684722, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88923097, + "num_input_tokens_seen": 26330330, + "step": 1239, + "time_per_iteration": 2.587674140930176 + }, + { + "auxiliary_loss_clip": 0.01164262, + "auxiliary_loss_mlp": 0.01055611, + "balance_loss_clip": 1.05348361, + "balance_loss_mlp": 1.02996898, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 2.8313802134519825, + "language_loss": 0.88809121, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91028994, + "num_input_tokens_seen": 26348865, + "step": 1240, + "time_per_iteration": 2.6008262634277344 + }, + { + "auxiliary_loss_clip": 0.01178024, + "auxiliary_loss_mlp": 0.0106546, + "balance_loss_clip": 1.06138515, + "balance_loss_mlp": 1.03965163, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 1.856348478095502, + "language_loss": 0.88949865, + "learning_rate": 3.979158854911225e-06, + "loss": 0.91193342, + "num_input_tokens_seen": 26368210, + "step": 1241, + "time_per_iteration": 2.5744612216949463 + }, + { + "auxiliary_loss_clip": 0.01069082, + "auxiliary_loss_mlp": 0.01003264, + "balance_loss_clip": 1.02737725, + "balance_loss_mlp": 0.99983072, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.8942656867301869, + "language_loss": 0.63045979, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65118325, + "num_input_tokens_seen": 26424890, + "step": 1242, + "time_per_iteration": 3.1455636024475098 + }, + { + "auxiliary_loss_clip": 0.01168754, + "auxiliary_loss_mlp": 0.0106857, + "balance_loss_clip": 1.05432439, + "balance_loss_mlp": 1.03891087, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 3.4266902914379527, + "language_loss": 0.62755406, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.64992732, + "num_input_tokens_seen": 26446405, + "step": 1243, + "time_per_iteration": 2.5883090496063232 + }, + { + "auxiliary_loss_clip": 0.0119017, + "auxiliary_loss_mlp": 0.01058283, + "balance_loss_clip": 1.05807781, + "balance_loss_mlp": 1.03355849, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 2.6170822088526027, + "language_loss": 0.76532596, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78781044, + "num_input_tokens_seen": 26466070, + "step": 1244, + "time_per_iteration": 2.5277063846588135 + }, + { + "auxiliary_loss_clip": 0.01181619, + "auxiliary_loss_mlp": 0.00794603, + "balance_loss_clip": 1.05905902, + "balance_loss_mlp": 1.00069952, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 3.5461382025739985, + "language_loss": 0.6896106, + "learning_rate": 3.978933943232123e-06, + "loss": 0.70937282, + "num_input_tokens_seen": 26479350, + "step": 1245, + "time_per_iteration": 2.4919209480285645 + }, + { + "auxiliary_loss_clip": 0.01203115, + "auxiliary_loss_mlp": 0.01060689, + "balance_loss_clip": 1.06024992, + "balance_loss_mlp": 1.0356431, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.8938632391580132, + "language_loss": 0.88438725, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90702534, + "num_input_tokens_seen": 26498255, + "step": 1246, + "time_per_iteration": 2.4910550117492676 + }, + { + "auxiliary_loss_clip": 0.01213698, + "auxiliary_loss_mlp": 0.0107529, + "balance_loss_clip": 1.06152081, + "balance_loss_mlp": 1.04774117, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 2.6423128184619817, + "language_loss": 0.88324952, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90613937, + "num_input_tokens_seen": 26515375, + "step": 1247, + "time_per_iteration": 2.4486513137817383 + }, + { + "auxiliary_loss_clip": 0.01185617, + "auxiliary_loss_mlp": 0.01067, + "balance_loss_clip": 1.06136262, + "balance_loss_mlp": 1.04209745, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.3241816702081612, + "language_loss": 0.65102303, + "learning_rate": 3.978764471530921e-06, + "loss": 0.67354918, + "num_input_tokens_seen": 26533595, + "step": 1248, + "time_per_iteration": 2.502145528793335 + }, + { + "auxiliary_loss_clip": 0.0118749, + "auxiliary_loss_mlp": 0.0079394, + "balance_loss_clip": 1.06082034, + "balance_loss_mlp": 1.00076294, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 13.233381054948012, + "language_loss": 0.74509358, + "learning_rate": 3.978707830891102e-06, + "loss": 0.7649079, + "num_input_tokens_seen": 26549405, + "step": 1249, + "time_per_iteration": 2.470088481903076 + }, + { + "auxiliary_loss_clip": 0.01166052, + "auxiliary_loss_mlp": 0.01079589, + "balance_loss_clip": 1.05435109, + "balance_loss_mlp": 1.05379212, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 3.0139800029190664, + "language_loss": 0.81896985, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84142619, + "num_input_tokens_seen": 26567200, + "step": 1250, + "time_per_iteration": 2.5230932235717773 + }, + { + "auxiliary_loss_clip": 0.01155985, + "auxiliary_loss_mlp": 0.01059883, + "balance_loss_clip": 1.05855894, + "balance_loss_mlp": 1.03525436, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.000730677819673, + "language_loss": 0.67100829, + "learning_rate": 3.978594324515215e-06, + "loss": 0.69316697, + "num_input_tokens_seen": 26586190, + "step": 1251, + "time_per_iteration": 2.6723954677581787 + }, + { + "auxiliary_loss_clip": 0.01072431, + "auxiliary_loss_mlp": 0.01022007, + "balance_loss_clip": 1.03725863, + "balance_loss_mlp": 1.01807308, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.9361572126417257, + "language_loss": 0.70375943, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72470379, + "num_input_tokens_seen": 26650710, + "step": 1252, + "time_per_iteration": 3.179225206375122 + }, + { + "auxiliary_loss_clip": 0.01204033, + "auxiliary_loss_mlp": 0.01068634, + "balance_loss_clip": 1.06086957, + "balance_loss_mlp": 1.04418421, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.9757047609508342, + "language_loss": 0.80043793, + "learning_rate": 3.97848051802535e-06, + "loss": 0.82316458, + "num_input_tokens_seen": 26669000, + "step": 1253, + "time_per_iteration": 2.4970362186431885 + }, + { + "auxiliary_loss_clip": 0.01167421, + "auxiliary_loss_mlp": 0.01064513, + "balance_loss_clip": 1.05754697, + "balance_loss_mlp": 1.04054046, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 5.27802657197448, + "language_loss": 0.93518937, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95750874, + "num_input_tokens_seen": 26683075, + "step": 1254, + "time_per_iteration": 4.126815557479858 + }, + { + "auxiliary_loss_clip": 0.01176476, + "auxiliary_loss_mlp": 0.01064444, + "balance_loss_clip": 1.06261969, + "balance_loss_mlp": 1.04074478, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 2.074481716797264, + "language_loss": 0.87933379, + "learning_rate": 3.97836641143877e-06, + "loss": 0.90174294, + "num_input_tokens_seen": 26701875, + "step": 1255, + "time_per_iteration": 2.5990397930145264 + }, + { + "auxiliary_loss_clip": 0.01202313, + "auxiliary_loss_mlp": 0.01068765, + "balance_loss_clip": 1.06144357, + "balance_loss_mlp": 1.04371881, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 2.004912064912058, + "language_loss": 0.79779124, + "learning_rate": 3.978309245614618e-06, + "loss": 0.82050204, + "num_input_tokens_seen": 26719050, + "step": 1256, + "time_per_iteration": 3.9153337478637695 + }, + { + "auxiliary_loss_clip": 0.01068638, + "auxiliary_loss_mlp": 0.01003773, + "balance_loss_clip": 1.03230071, + "balance_loss_mlp": 0.99979115, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.776306989510213, + "language_loss": 0.57985419, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60057831, + "num_input_tokens_seen": 26780650, + "step": 1257, + "time_per_iteration": 3.1726176738739014 + }, + { + "auxiliary_loss_clip": 0.01155523, + "auxiliary_loss_mlp": 0.01061687, + "balance_loss_clip": 1.06279683, + "balance_loss_mlp": 1.03745198, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 6.817024707215891, + "language_loss": 0.90192401, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92409611, + "num_input_tokens_seen": 26798725, + "step": 1258, + "time_per_iteration": 2.6116437911987305 + }, + { + "auxiliary_loss_clip": 0.01174925, + "auxiliary_loss_mlp": 0.01062157, + "balance_loss_clip": 1.06428444, + "balance_loss_mlp": 1.03615761, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 3.812333854216531, + "language_loss": 0.81584764, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83821845, + "num_input_tokens_seen": 26817005, + "step": 1259, + "time_per_iteration": 2.5310590267181396 + }, + { + "auxiliary_loss_clip": 0.01197861, + "auxiliary_loss_mlp": 0.0106225, + "balance_loss_clip": 1.06468868, + "balance_loss_mlp": 1.03865838, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 2.179491671227446, + "language_loss": 0.76131153, + "learning_rate": 3.978079832162885e-06, + "loss": 0.78391266, + "num_input_tokens_seen": 26836655, + "step": 1260, + "time_per_iteration": 3.927548885345459 + }, + { + "auxiliary_loss_clip": 0.01161273, + "auxiliary_loss_mlp": 0.01068091, + "balance_loss_clip": 1.05424261, + "balance_loss_mlp": 1.04237771, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 3.6713054424400906, + "language_loss": 0.84985614, + "learning_rate": 3.978022291272044e-06, + "loss": 0.87214983, + "num_input_tokens_seen": 26854925, + "step": 1261, + "time_per_iteration": 2.5325300693511963 + }, + { + "auxiliary_loss_clip": 0.01210092, + "auxiliary_loss_mlp": 0.01065847, + "balance_loss_clip": 1.06630266, + "balance_loss_mlp": 1.04254198, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 2.0425067604006473, + "language_loss": 0.82243633, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84519577, + "num_input_tokens_seen": 26876170, + "step": 1262, + "time_per_iteration": 2.5070290565490723 + }, + { + "auxiliary_loss_clip": 0.01203404, + "auxiliary_loss_mlp": 0.0106215, + "balance_loss_clip": 1.06149745, + "balance_loss_mlp": 1.03752136, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.655479509747497, + "language_loss": 0.8240875, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84674299, + "num_input_tokens_seen": 26895005, + "step": 1263, + "time_per_iteration": 2.464107036590576 + }, + { + "auxiliary_loss_clip": 0.01160095, + "auxiliary_loss_mlp": 0.01057568, + "balance_loss_clip": 1.06027436, + "balance_loss_mlp": 1.03411961, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.037852768981042, + "language_loss": 0.75981283, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78198946, + "num_input_tokens_seen": 26913930, + "step": 1264, + "time_per_iteration": 2.5957300662994385 + }, + { + "auxiliary_loss_clip": 0.0118184, + "auxiliary_loss_mlp": 0.01061786, + "balance_loss_clip": 1.06035805, + "balance_loss_mlp": 1.03750265, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.791837708202825, + "language_loss": 0.80942595, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83186221, + "num_input_tokens_seen": 26931485, + "step": 1265, + "time_per_iteration": 2.5055744647979736 + }, + { + "auxiliary_loss_clip": 0.01141879, + "auxiliary_loss_mlp": 0.01071846, + "balance_loss_clip": 1.04884243, + "balance_loss_mlp": 1.04569149, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.6328178787224346, + "language_loss": 0.6553598, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67749709, + "num_input_tokens_seen": 26951670, + "step": 1266, + "time_per_iteration": 2.5807266235351562 + }, + { + "auxiliary_loss_clip": 0.01160183, + "auxiliary_loss_mlp": 0.01064615, + "balance_loss_clip": 1.05765724, + "balance_loss_mlp": 1.04060626, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.2267039684330534, + "language_loss": 0.79111469, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81336272, + "num_input_tokens_seen": 26970335, + "step": 1267, + "time_per_iteration": 2.548067092895508 + }, + { + "auxiliary_loss_clip": 0.01180654, + "auxiliary_loss_mlp": 0.01054261, + "balance_loss_clip": 1.05976534, + "balance_loss_mlp": 1.03192163, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.3728264036569238, + "language_loss": 0.73105597, + "learning_rate": 3.977617404968205e-06, + "loss": 0.75340515, + "num_input_tokens_seen": 26986025, + "step": 1268, + "time_per_iteration": 3.895848035812378 + }, + { + "auxiliary_loss_clip": 0.01186559, + "auxiliary_loss_mlp": 0.01063035, + "balance_loss_clip": 1.05613685, + "balance_loss_mlp": 1.03875196, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.631068667676171, + "language_loss": 0.82277226, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84526825, + "num_input_tokens_seen": 27004045, + "step": 1269, + "time_per_iteration": 2.471877336502075 + }, + { + "auxiliary_loss_clip": 0.01193403, + "auxiliary_loss_mlp": 0.01060328, + "balance_loss_clip": 1.06166291, + "balance_loss_mlp": 1.03604555, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 5.150902222805539, + "language_loss": 0.88803601, + "learning_rate": 3.977501048211088e-06, + "loss": 0.91057336, + "num_input_tokens_seen": 27022070, + "step": 1270, + "time_per_iteration": 2.4784581661224365 + }, + { + "auxiliary_loss_clip": 0.01187611, + "auxiliary_loss_mlp": 0.01062267, + "balance_loss_clip": 1.05810201, + "balance_loss_mlp": 1.03834152, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.3397211852188793, + "language_loss": 0.70809352, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73059237, + "num_input_tokens_seen": 27041755, + "step": 1271, + "time_per_iteration": 2.544335126876831 + }, + { + "auxiliary_loss_clip": 0.01152581, + "auxiliary_loss_mlp": 0.01075357, + "balance_loss_clip": 1.05693805, + "balance_loss_mlp": 1.05106258, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.595113836162177, + "language_loss": 0.82993472, + "learning_rate": 3.977384391505823e-06, + "loss": 0.8522141, + "num_input_tokens_seen": 27061540, + "step": 1272, + "time_per_iteration": 2.5773837566375732 + }, + { + "auxiliary_loss_clip": 0.01173873, + "auxiliary_loss_mlp": 0.00793301, + "balance_loss_clip": 1.05565858, + "balance_loss_mlp": 1.000844, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 1.5763513720559597, + "language_loss": 0.80117559, + "learning_rate": 3.977325950678162e-06, + "loss": 0.82084727, + "num_input_tokens_seen": 27081395, + "step": 1273, + "time_per_iteration": 2.5542736053466797 + }, + { + "auxiliary_loss_clip": 0.01179201, + "auxiliary_loss_mlp": 0.0106771, + "balance_loss_clip": 1.06102848, + "balance_loss_mlp": 1.04322505, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 3.190576321853015, + "language_loss": 0.81138754, + "learning_rate": 3.977267434870103e-06, + "loss": 0.8338567, + "num_input_tokens_seen": 27101175, + "step": 1274, + "time_per_iteration": 2.578134059906006 + }, + { + "auxiliary_loss_clip": 0.01179279, + "auxiliary_loss_mlp": 0.01074833, + "balance_loss_clip": 1.05832458, + "balance_loss_mlp": 1.04896462, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 1.788569481261662, + "language_loss": 0.73031616, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75285727, + "num_input_tokens_seen": 27124505, + "step": 1275, + "time_per_iteration": 2.6372873783111572 + }, + { + "auxiliary_loss_clip": 0.01203842, + "auxiliary_loss_mlp": 0.01068825, + "balance_loss_clip": 1.06113744, + "balance_loss_mlp": 1.04350471, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 3.042316727054189, + "language_loss": 0.79461068, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81733733, + "num_input_tokens_seen": 27140960, + "step": 1276, + "time_per_iteration": 2.4442813396453857 + }, + { + "auxiliary_loss_clip": 0.01194634, + "auxiliary_loss_mlp": 0.0105863, + "balance_loss_clip": 1.06005287, + "balance_loss_mlp": 1.03553867, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.3990784601471717, + "language_loss": 0.59766459, + "learning_rate": 3.97709143758574e-06, + "loss": 0.62019718, + "num_input_tokens_seen": 27160985, + "step": 1277, + "time_per_iteration": 2.5689351558685303 + }, + { + "auxiliary_loss_clip": 0.01198288, + "auxiliary_loss_mlp": 0.01063646, + "balance_loss_clip": 1.06161869, + "balance_loss_mlp": 1.03948247, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 3.22240381818765, + "language_loss": 0.74639642, + "learning_rate": 3.977032621878305e-06, + "loss": 0.76901579, + "num_input_tokens_seen": 27178390, + "step": 1278, + "time_per_iteration": 2.4840502738952637 + }, + { + "auxiliary_loss_clip": 0.01159806, + "auxiliary_loss_mlp": 0.01065062, + "balance_loss_clip": 1.05672765, + "balance_loss_mlp": 1.04159009, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 2.6049002351579746, + "language_loss": 0.88414145, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90639013, + "num_input_tokens_seen": 27197505, + "step": 1279, + "time_per_iteration": 2.550356864929199 + }, + { + "auxiliary_loss_clip": 0.01167019, + "auxiliary_loss_mlp": 0.01063951, + "balance_loss_clip": 1.0539211, + "balance_loss_mlp": 1.03973973, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 3.5497760312591473, + "language_loss": 0.82011527, + "learning_rate": 3.976914765557845e-06, + "loss": 0.84242499, + "num_input_tokens_seen": 27214260, + "step": 1280, + "time_per_iteration": 2.5388238430023193 + }, + { + "auxiliary_loss_clip": 0.01185773, + "auxiliary_loss_mlp": 0.0106284, + "balance_loss_clip": 1.05920696, + "balance_loss_mlp": 1.03861701, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 3.0077254471963917, + "language_loss": 0.76351017, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78599632, + "num_input_tokens_seen": 27232525, + "step": 1281, + "time_per_iteration": 2.459705352783203 + }, + { + "auxiliary_loss_clip": 0.01164997, + "auxiliary_loss_mlp": 0.01063952, + "balance_loss_clip": 1.0542326, + "balance_loss_mlp": 1.03940642, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 2.011635143382921, + "language_loss": 0.75279063, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77508008, + "num_input_tokens_seen": 27249800, + "step": 1282, + "time_per_iteration": 2.5242655277252197 + }, + { + "auxiliary_loss_clip": 0.01200497, + "auxiliary_loss_mlp": 0.01072914, + "balance_loss_clip": 1.0607456, + "balance_loss_mlp": 1.04880941, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 1.8936065369068784, + "language_loss": 0.83846807, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86120212, + "num_input_tokens_seen": 27268895, + "step": 1283, + "time_per_iteration": 2.4358253479003906 + }, + { + "auxiliary_loss_clip": 0.01186065, + "auxiliary_loss_mlp": 0.010659, + "balance_loss_clip": 1.05712938, + "balance_loss_mlp": 1.03961456, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.0731323292373327, + "language_loss": 0.75176436, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77428401, + "num_input_tokens_seen": 27288180, + "step": 1284, + "time_per_iteration": 2.457888603210449 + }, + { + "auxiliary_loss_clip": 0.01173567, + "auxiliary_loss_mlp": 0.01072421, + "balance_loss_clip": 1.05478895, + "balance_loss_mlp": 1.04900837, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 3.0626051740188247, + "language_loss": 0.76218164, + "learning_rate": 3.976618812911817e-06, + "loss": 0.7846415, + "num_input_tokens_seen": 27311815, + "step": 1285, + "time_per_iteration": 2.6657018661499023 + }, + { + "auxiliary_loss_clip": 0.01207251, + "auxiliary_loss_mlp": 0.01070103, + "balance_loss_clip": 1.06585979, + "balance_loss_mlp": 1.0471313, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 1.6920949290368716, + "language_loss": 0.83918804, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.8619616, + "num_input_tokens_seen": 27331890, + "step": 1286, + "time_per_iteration": 2.483884572982788 + }, + { + "auxiliary_loss_clip": 0.01176282, + "auxiliary_loss_mlp": 0.01060008, + "balance_loss_clip": 1.05933511, + "balance_loss_mlp": 1.03589225, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 2.388877504430929, + "language_loss": 0.76998127, + "learning_rate": 3.97649990716259e-06, + "loss": 0.79234415, + "num_input_tokens_seen": 27348320, + "step": 1287, + "time_per_iteration": 2.488919258117676 + }, + { + "auxiliary_loss_clip": 0.01174659, + "auxiliary_loss_mlp": 0.0105722, + "balance_loss_clip": 1.05575264, + "balance_loss_mlp": 1.03347349, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 2.6913807733903945, + "language_loss": 0.84728914, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86960793, + "num_input_tokens_seen": 27367670, + "step": 1288, + "time_per_iteration": 2.5555338859558105 + }, + { + "auxiliary_loss_clip": 0.01204859, + "auxiliary_loss_mlp": 0.01060555, + "balance_loss_clip": 1.06073785, + "balance_loss_mlp": 1.03738117, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 3.1657304720959822, + "language_loss": 0.8529312, + "learning_rate": 3.976380701617068e-06, + "loss": 0.87558532, + "num_input_tokens_seen": 27385485, + "step": 1289, + "time_per_iteration": 2.461897850036621 + }, + { + "auxiliary_loss_clip": 0.01202171, + "auxiliary_loss_mlp": 0.01052865, + "balance_loss_clip": 1.06016171, + "balance_loss_mlp": 1.02998853, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.9724765909980204, + "language_loss": 0.8539722, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87652254, + "num_input_tokens_seen": 27405110, + "step": 1290, + "time_per_iteration": 2.488818407058716 + }, + { + "auxiliary_loss_clip": 0.01167458, + "auxiliary_loss_mlp": 0.01062888, + "balance_loss_clip": 1.05874658, + "balance_loss_mlp": 1.0376513, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.5410433357598277, + "language_loss": 0.90971112, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.93201458, + "num_input_tokens_seen": 27422855, + "step": 1291, + "time_per_iteration": 2.4986557960510254 + }, + { + "auxiliary_loss_clip": 0.01076878, + "auxiliary_loss_mlp": 0.01008212, + "balance_loss_clip": 1.03392076, + "balance_loss_mlp": 1.00496984, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.890438827529014, + "language_loss": 0.6505419, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.6713928, + "num_input_tokens_seen": 27487190, + "step": 1292, + "time_per_iteration": 3.2012736797332764 + }, + { + "auxiliary_loss_clip": 0.01189907, + "auxiliary_loss_mlp": 0.0106085, + "balance_loss_clip": 1.06045198, + "balance_loss_mlp": 1.03729415, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.6989163159364766, + "language_loss": 0.8775959, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.90010345, + "num_input_tokens_seen": 27510465, + "step": 1293, + "time_per_iteration": 2.5797359943389893 + }, + { + "auxiliary_loss_clip": 0.01120923, + "auxiliary_loss_mlp": 0.01074906, + "balance_loss_clip": 1.05035174, + "balance_loss_mlp": 1.04593825, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.1357094814671713, + "language_loss": 0.84764767, + "learning_rate": 3.976081376263239e-06, + "loss": 0.86960602, + "num_input_tokens_seen": 27528645, + "step": 1294, + "time_per_iteration": 4.093553066253662 + }, + { + "auxiliary_loss_clip": 0.01156492, + "auxiliary_loss_mlp": 0.01059221, + "balance_loss_clip": 1.05616832, + "balance_loss_mlp": 1.03453231, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.457746966290446, + "language_loss": 0.7903837, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81254083, + "num_input_tokens_seen": 27546165, + "step": 1295, + "time_per_iteration": 2.56471586227417 + }, + { + "auxiliary_loss_clip": 0.01151707, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_clip": 1.05844879, + "balance_loss_mlp": 1.0291723, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.799498236588625, + "language_loss": 0.88012874, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90218097, + "num_input_tokens_seen": 27566520, + "step": 1296, + "time_per_iteration": 3.948199510574341 + }, + { + "auxiliary_loss_clip": 0.01207515, + "auxiliary_loss_mlp": 0.01063266, + "balance_loss_clip": 1.06449735, + "balance_loss_mlp": 1.03724241, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 2.8107153708493002, + "language_loss": 0.96015263, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98286051, + "num_input_tokens_seen": 27581960, + "step": 1297, + "time_per_iteration": 2.4754292964935303 + }, + { + "auxiliary_loss_clip": 0.01171137, + "auxiliary_loss_mlp": 0.01054623, + "balance_loss_clip": 1.05550361, + "balance_loss_mlp": 1.03072131, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.3882748169351338, + "language_loss": 0.76383293, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78609055, + "num_input_tokens_seen": 27601415, + "step": 1298, + "time_per_iteration": 2.5757980346679688 + }, + { + "auxiliary_loss_clip": 0.01151012, + "auxiliary_loss_mlp": 0.00795214, + "balance_loss_clip": 1.06348181, + "balance_loss_mlp": 1.00074601, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.7431270505351644, + "language_loss": 0.80747658, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.82693887, + "num_input_tokens_seen": 27621490, + "step": 1299, + "time_per_iteration": 4.010985612869263 + }, + { + "auxiliary_loss_clip": 0.01163917, + "auxiliary_loss_mlp": 0.01066995, + "balance_loss_clip": 1.06337917, + "balance_loss_mlp": 1.04228306, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 2.0023239168762648, + "language_loss": 0.86597079, + "learning_rate": 3.975719713068202e-06, + "loss": 0.88827991, + "num_input_tokens_seen": 27640600, + "step": 1300, + "time_per_iteration": 2.6211001873016357 + }, + { + "auxiliary_loss_clip": 0.01202103, + "auxiliary_loss_mlp": 0.01062048, + "balance_loss_clip": 1.06130874, + "balance_loss_mlp": 1.03731203, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 1.8988069658445064, + "language_loss": 0.71958333, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74222481, + "num_input_tokens_seen": 27663070, + "step": 1301, + "time_per_iteration": 2.6914710998535156 + }, + { + "auxiliary_loss_clip": 0.01194386, + "auxiliary_loss_mlp": 0.01066479, + "balance_loss_clip": 1.06058466, + "balance_loss_mlp": 1.04220796, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.8679221828059511, + "language_loss": 0.71022916, + "learning_rate": 3.97559855928952e-06, + "loss": 0.7328378, + "num_input_tokens_seen": 27686425, + "step": 1302, + "time_per_iteration": 2.7125751972198486 + }, + { + "auxiliary_loss_clip": 0.01164675, + "auxiliary_loss_mlp": 0.00794286, + "balance_loss_clip": 1.06173384, + "balance_loss_mlp": 1.00080776, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.010259854625311, + "language_loss": 0.81692314, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.83651274, + "num_input_tokens_seen": 27704900, + "step": 1303, + "time_per_iteration": 2.568211317062378 + }, + { + "auxiliary_loss_clip": 0.01185596, + "auxiliary_loss_mlp": 0.01067433, + "balance_loss_clip": 1.06049609, + "balance_loss_mlp": 1.04295969, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 2.424631051628164, + "language_loss": 0.74940431, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77193457, + "num_input_tokens_seen": 27724890, + "step": 1304, + "time_per_iteration": 2.4815609455108643 + }, + { + "auxiliary_loss_clip": 0.01204122, + "auxiliary_loss_mlp": 0.01064204, + "balance_loss_clip": 1.06514573, + "balance_loss_mlp": 1.03919411, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.762341484004092, + "language_loss": 0.7660706, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78875381, + "num_input_tokens_seen": 27743115, + "step": 1305, + "time_per_iteration": 2.475579023361206 + }, + { + "auxiliary_loss_clip": 0.01139498, + "auxiliary_loss_mlp": 0.01058077, + "balance_loss_clip": 1.05784726, + "balance_loss_mlp": 1.03343606, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 1.9865168878582788, + "language_loss": 0.84795702, + "learning_rate": 3.975355352771841e-06, + "loss": 0.86993277, + "num_input_tokens_seen": 27763570, + "step": 1306, + "time_per_iteration": 2.642615556716919 + }, + { + "auxiliary_loss_clip": 0.01194087, + "auxiliary_loss_mlp": 0.0104792, + "balance_loss_clip": 1.06583881, + "balance_loss_mlp": 1.02457881, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 3.059429643439424, + "language_loss": 0.9075346, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92995465, + "num_input_tokens_seen": 27780030, + "step": 1307, + "time_per_iteration": 3.9137918949127197 + }, + { + "auxiliary_loss_clip": 0.01146101, + "auxiliary_loss_mlp": 0.01060884, + "balance_loss_clip": 1.05466151, + "balance_loss_mlp": 1.03556347, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 3.1798742816086487, + "language_loss": 0.83516574, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85723567, + "num_input_tokens_seen": 27796225, + "step": 1308, + "time_per_iteration": 2.5680458545684814 + }, + { + "auxiliary_loss_clip": 0.01155441, + "auxiliary_loss_mlp": 0.01059021, + "balance_loss_clip": 1.05535483, + "balance_loss_mlp": 1.03504777, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.4353959247126484, + "language_loss": 0.77333105, + "learning_rate": 3.975172161365958e-06, + "loss": 0.79547572, + "num_input_tokens_seen": 27815975, + "step": 1309, + "time_per_iteration": 2.5713438987731934 + }, + { + "auxiliary_loss_clip": 0.01199693, + "auxiliary_loss_mlp": 0.010658, + "balance_loss_clip": 1.06224144, + "balance_loss_mlp": 1.03900206, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 2.4048561312331524, + "language_loss": 0.80229259, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82494748, + "num_input_tokens_seen": 27832255, + "step": 1310, + "time_per_iteration": 2.46100115776062 + }, + { + "auxiliary_loss_clip": 0.01173502, + "auxiliary_loss_mlp": 0.00792643, + "balance_loss_clip": 1.06165612, + "balance_loss_mlp": 1.00086236, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.7315804075588193, + "language_loss": 0.73207998, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75174141, + "num_input_tokens_seen": 27852180, + "step": 1311, + "time_per_iteration": 2.54887318611145 + }, + { + "auxiliary_loss_clip": 0.01188389, + "auxiliary_loss_mlp": 0.01081425, + "balance_loss_clip": 1.06186819, + "balance_loss_mlp": 1.05604529, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.815353507231622, + "language_loss": 0.85782462, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88052273, + "num_input_tokens_seen": 27871435, + "step": 1312, + "time_per_iteration": 2.49800968170166 + }, + { + "auxiliary_loss_clip": 0.01178629, + "auxiliary_loss_mlp": 0.01067068, + "balance_loss_clip": 1.06259823, + "balance_loss_mlp": 1.04379821, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.792962369169269, + "language_loss": 0.82121646, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84367341, + "num_input_tokens_seen": 27890625, + "step": 1313, + "time_per_iteration": 2.5247607231140137 + }, + { + "auxiliary_loss_clip": 0.01181453, + "auxiliary_loss_mlp": 0.00795414, + "balance_loss_clip": 1.05774307, + "balance_loss_mlp": 1.00089693, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 2.849354973056333, + "language_loss": 0.72990632, + "learning_rate": 3.97486534441264e-06, + "loss": 0.74967504, + "num_input_tokens_seen": 27906530, + "step": 1314, + "time_per_iteration": 2.4590084552764893 + }, + { + "auxiliary_loss_clip": 0.01158928, + "auxiliary_loss_mlp": 0.00795304, + "balance_loss_clip": 1.05719268, + "balance_loss_mlp": 1.00089514, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 2.021330062179839, + "language_loss": 0.79859447, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81813681, + "num_input_tokens_seen": 27926725, + "step": 1315, + "time_per_iteration": 2.5954346656799316 + }, + { + "auxiliary_loss_clip": 0.01190938, + "auxiliary_loss_mlp": 0.01066687, + "balance_loss_clip": 1.05871391, + "balance_loss_mlp": 1.0414381, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.6714793968644488, + "language_loss": 0.7386198, + "learning_rate": 3.974742093405362e-06, + "loss": 0.76119602, + "num_input_tokens_seen": 27947875, + "step": 1316, + "time_per_iteration": 2.535975694656372 + }, + { + "auxiliary_loss_clip": 0.01165156, + "auxiliary_loss_mlp": 0.01070703, + "balance_loss_clip": 1.05856061, + "balance_loss_mlp": 1.04535902, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 4.423929301242326, + "language_loss": 0.65622246, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67858112, + "num_input_tokens_seen": 27965040, + "step": 1317, + "time_per_iteration": 2.5697219371795654 + }, + { + "auxiliary_loss_clip": 0.01180111, + "auxiliary_loss_mlp": 0.01067389, + "balance_loss_clip": 1.06453419, + "balance_loss_mlp": 1.0407455, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 3.003971543700719, + "language_loss": 0.729361, + "learning_rate": 3.974618542868415e-06, + "loss": 0.751836, + "num_input_tokens_seen": 27985330, + "step": 1318, + "time_per_iteration": 2.6327338218688965 + }, + { + "auxiliary_loss_clip": 0.01142647, + "auxiliary_loss_mlp": 0.01064308, + "balance_loss_clip": 1.05655551, + "balance_loss_mlp": 1.04118133, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.7047134809363331, + "language_loss": 0.90290117, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92497069, + "num_input_tokens_seen": 28007615, + "step": 1319, + "time_per_iteration": 2.6684367656707764 + }, + { + "auxiliary_loss_clip": 0.01178759, + "auxiliary_loss_mlp": 0.01065683, + "balance_loss_clip": 1.05756235, + "balance_loss_mlp": 1.04120898, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.677681771882241, + "language_loss": 0.80252802, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82497245, + "num_input_tokens_seen": 28027765, + "step": 1320, + "time_per_iteration": 2.495692253112793 + }, + { + "auxiliary_loss_clip": 0.01182488, + "auxiliary_loss_mlp": 0.01061749, + "balance_loss_clip": 1.06383896, + "balance_loss_mlp": 1.03811002, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.0524804014950084, + "language_loss": 0.69258481, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71502721, + "num_input_tokens_seen": 28044225, + "step": 1321, + "time_per_iteration": 2.5935235023498535 + }, + { + "auxiliary_loss_clip": 0.01188774, + "auxiliary_loss_mlp": 0.01067852, + "balance_loss_clip": 1.06468964, + "balance_loss_mlp": 1.04459465, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 2.505370543460301, + "language_loss": 0.83672959, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.85929585, + "num_input_tokens_seen": 28062915, + "step": 1322, + "time_per_iteration": 2.4772226810455322 + }, + { + "auxiliary_loss_clip": 0.01202956, + "auxiliary_loss_mlp": 0.01065587, + "balance_loss_clip": 1.06166935, + "balance_loss_mlp": 1.04050493, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 2.026547975133824, + "language_loss": 0.90220004, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92488551, + "num_input_tokens_seen": 28082175, + "step": 1323, + "time_per_iteration": 2.464226007461548 + }, + { + "auxiliary_loss_clip": 0.01167713, + "auxiliary_loss_mlp": 0.01061951, + "balance_loss_clip": 1.0631001, + "balance_loss_mlp": 1.03732276, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 1.8165203241038645, + "language_loss": 0.82291955, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84521616, + "num_input_tokens_seen": 28102645, + "step": 1324, + "time_per_iteration": 2.5717313289642334 + }, + { + "auxiliary_loss_clip": 0.01180108, + "auxiliary_loss_mlp": 0.01059308, + "balance_loss_clip": 1.06406689, + "balance_loss_mlp": 1.03409505, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.5726176595703074, + "language_loss": 0.79709327, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81948739, + "num_input_tokens_seen": 28122805, + "step": 1325, + "time_per_iteration": 2.547548532485962 + }, + { + "auxiliary_loss_clip": 0.0112361, + "auxiliary_loss_mlp": 0.00798271, + "balance_loss_clip": 1.0488776, + "balance_loss_mlp": 1.00082946, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.203816257115718, + "language_loss": 0.88280898, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90202779, + "num_input_tokens_seen": 28140530, + "step": 1326, + "time_per_iteration": 2.8440823554992676 + }, + { + "auxiliary_loss_clip": 0.01196751, + "auxiliary_loss_mlp": 0.01060164, + "balance_loss_clip": 1.06008637, + "balance_loss_mlp": 1.03561926, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 2.4633404126819713, + "language_loss": 0.83181566, + "learning_rate": 3.974058859276032e-06, + "loss": 0.85438478, + "num_input_tokens_seen": 28159640, + "step": 1327, + "time_per_iteration": 2.6721744537353516 + }, + { + "auxiliary_loss_clip": 0.01205539, + "auxiliary_loss_mlp": 0.01057154, + "balance_loss_clip": 1.06511617, + "balance_loss_mlp": 1.03227568, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.2878109307454477, + "language_loss": 0.78026533, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.80289227, + "num_input_tokens_seen": 28177050, + "step": 1328, + "time_per_iteration": 2.4398512840270996 + }, + { + "auxiliary_loss_clip": 0.01202301, + "auxiliary_loss_mlp": 0.01057139, + "balance_loss_clip": 1.06959271, + "balance_loss_mlp": 1.03227186, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 3.392443768864589, + "language_loss": 0.73612177, + "learning_rate": 3.973933661662101e-06, + "loss": 0.75871617, + "num_input_tokens_seen": 28193245, + "step": 1329, + "time_per_iteration": 2.456404209136963 + }, + { + "auxiliary_loss_clip": 0.01167592, + "auxiliary_loss_mlp": 0.0106553, + "balance_loss_clip": 1.06017566, + "balance_loss_mlp": 1.04190302, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 3.623687694460766, + "language_loss": 0.81265128, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83498251, + "num_input_tokens_seen": 28213570, + "step": 1330, + "time_per_iteration": 2.548180341720581 + }, + { + "auxiliary_loss_clip": 0.01203502, + "auxiliary_loss_mlp": 0.00792626, + "balance_loss_clip": 1.06337357, + "balance_loss_mlp": 1.00083518, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 5.525145887799788, + "language_loss": 0.89098084, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.91094208, + "num_input_tokens_seen": 28229980, + "step": 1331, + "time_per_iteration": 2.4360883235931396 + }, + { + "auxiliary_loss_clip": 0.01196516, + "auxiliary_loss_mlp": 0.00793079, + "balance_loss_clip": 1.0612843, + "balance_loss_mlp": 1.00078642, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 2.46356947769657, + "language_loss": 0.72982788, + "learning_rate": 3.973745303858942e-06, + "loss": 0.74972391, + "num_input_tokens_seen": 28253840, + "step": 1332, + "time_per_iteration": 2.6548664569854736 + }, + { + "auxiliary_loss_clip": 0.01181142, + "auxiliary_loss_mlp": 0.0105645, + "balance_loss_clip": 1.06184983, + "balance_loss_mlp": 1.03322792, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.8329694832475387, + "language_loss": 0.82616735, + "learning_rate": 3.973682368232138e-06, + "loss": 0.84854329, + "num_input_tokens_seen": 28271675, + "step": 1333, + "time_per_iteration": 3.963465929031372 + }, + { + "auxiliary_loss_clip": 0.01164568, + "auxiliary_loss_mlp": 0.01059942, + "balance_loss_clip": 1.06058407, + "balance_loss_mlp": 1.03559935, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.54137656068077, + "language_loss": 0.7482599, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.77050495, + "num_input_tokens_seen": 28291850, + "step": 1334, + "time_per_iteration": 2.670004367828369 + }, + { + "auxiliary_loss_clip": 0.01177077, + "auxiliary_loss_mlp": 0.01066478, + "balance_loss_clip": 1.06643367, + "balance_loss_mlp": 1.04308939, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 2.3010689051331275, + "language_loss": 0.79884124, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82127678, + "num_input_tokens_seen": 28310780, + "step": 1335, + "time_per_iteration": 3.9491348266601562 + }, + { + "auxiliary_loss_clip": 0.01067298, + "auxiliary_loss_mlp": 0.01002732, + "balance_loss_clip": 1.04074192, + "balance_loss_mlp": 0.99974006, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7410458476561851, + "language_loss": 0.56047022, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58117056, + "num_input_tokens_seen": 28369985, + "step": 1336, + "time_per_iteration": 3.1739120483398438 + }, + { + "auxiliary_loss_clip": 0.01181666, + "auxiliary_loss_mlp": 0.01061599, + "balance_loss_clip": 1.06200242, + "balance_loss_mlp": 1.03867543, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.1713990833791392, + "language_loss": 0.67746365, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.69989628, + "num_input_tokens_seen": 28388670, + "step": 1337, + "time_per_iteration": 2.5779953002929688 + }, + { + "auxiliary_loss_clip": 0.01175582, + "auxiliary_loss_mlp": 0.01075494, + "balance_loss_clip": 1.0636946, + "balance_loss_mlp": 1.0506748, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.7978072301973647, + "language_loss": 0.87042153, + "learning_rate": 3.973366567512453e-06, + "loss": 0.89293242, + "num_input_tokens_seen": 28411845, + "step": 1338, + "time_per_iteration": 4.035330772399902 + }, + { + "auxiliary_loss_clip": 0.01143622, + "auxiliary_loss_mlp": 0.0107327, + "balance_loss_clip": 1.05202532, + "balance_loss_mlp": 1.04643559, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.3845195109755752, + "language_loss": 0.87380344, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89597237, + "num_input_tokens_seen": 28427875, + "step": 1339, + "time_per_iteration": 2.571910858154297 + }, + { + "auxiliary_loss_clip": 0.01189495, + "auxiliary_loss_mlp": 0.01057, + "balance_loss_clip": 1.06384945, + "balance_loss_mlp": 1.0348742, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 2.0385274846658716, + "language_loss": 0.89446747, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91693234, + "num_input_tokens_seen": 28446615, + "step": 1340, + "time_per_iteration": 2.4961061477661133 + }, + { + "auxiliary_loss_clip": 0.01078231, + "auxiliary_loss_mlp": 0.01021857, + "balance_loss_clip": 1.03006756, + "balance_loss_mlp": 1.01832795, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8844209458952098, + "language_loss": 0.64830816, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66930902, + "num_input_tokens_seen": 28505290, + "step": 1341, + "time_per_iteration": 2.9893200397491455 + }, + { + "auxiliary_loss_clip": 0.01196824, + "auxiliary_loss_mlp": 0.01063413, + "balance_loss_clip": 1.06302047, + "balance_loss_mlp": 1.03856945, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 1.9781761991404025, + "language_loss": 0.89654738, + "learning_rate": 3.973112579977733e-06, + "loss": 0.91914976, + "num_input_tokens_seen": 28522735, + "step": 1342, + "time_per_iteration": 2.4820804595947266 + }, + { + "auxiliary_loss_clip": 0.01178925, + "auxiliary_loss_mlp": 0.01063456, + "balance_loss_clip": 1.06493962, + "balance_loss_mlp": 1.03876793, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.21295204246261, + "language_loss": 0.76689315, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78931695, + "num_input_tokens_seen": 28539460, + "step": 1343, + "time_per_iteration": 2.5151259899139404 + }, + { + "auxiliary_loss_clip": 0.01066774, + "auxiliary_loss_mlp": 0.010079, + "balance_loss_clip": 1.0279026, + "balance_loss_mlp": 1.00468159, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8060294467105165, + "language_loss": 0.5746057, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59535247, + "num_input_tokens_seen": 28599855, + "step": 1344, + "time_per_iteration": 2.984255075454712 + }, + { + "auxiliary_loss_clip": 0.01162579, + "auxiliary_loss_mlp": 0.01057413, + "balance_loss_clip": 1.06006813, + "balance_loss_mlp": 1.03258133, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.3635049506765187, + "language_loss": 0.86522508, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88742495, + "num_input_tokens_seen": 28617585, + "step": 1345, + "time_per_iteration": 2.5509114265441895 + }, + { + "auxiliary_loss_clip": 0.0120371, + "auxiliary_loss_mlp": 0.01056532, + "balance_loss_clip": 1.06671762, + "balance_loss_mlp": 1.03402507, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.6939647912700575, + "language_loss": 0.87230301, + "learning_rate": 3.972857395313042e-06, + "loss": 0.89490545, + "num_input_tokens_seen": 28636355, + "step": 1346, + "time_per_iteration": 2.470588445663452 + }, + { + "auxiliary_loss_clip": 0.0119189, + "auxiliary_loss_mlp": 0.01059177, + "balance_loss_clip": 1.06494546, + "balance_loss_mlp": 1.03509688, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.6240662427497017, + "language_loss": 0.92752421, + "learning_rate": 3.972793412113439e-06, + "loss": 0.95003486, + "num_input_tokens_seen": 28656260, + "step": 1347, + "time_per_iteration": 3.950834274291992 + }, + { + "auxiliary_loss_clip": 0.01191993, + "auxiliary_loss_mlp": 0.01063326, + "balance_loss_clip": 1.06486547, + "balance_loss_mlp": 1.03848279, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 1.9460583058162784, + "language_loss": 0.89563608, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91818929, + "num_input_tokens_seen": 28675865, + "step": 1348, + "time_per_iteration": 2.5211851596832275 + }, + { + "auxiliary_loss_clip": 0.0113817, + "auxiliary_loss_mlp": 0.01055789, + "balance_loss_clip": 1.06089258, + "balance_loss_mlp": 1.03362787, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 1.6400344639305637, + "language_loss": 0.76673567, + "learning_rate": 3.97266522129109e-06, + "loss": 0.78867531, + "num_input_tokens_seen": 28696255, + "step": 1349, + "time_per_iteration": 2.667999744415283 + }, + { + "auxiliary_loss_clip": 0.01204527, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_clip": 1.06454408, + "balance_loss_mlp": 1.0400269, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 2.3761969284047875, + "language_loss": 0.88137227, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90405965, + "num_input_tokens_seen": 28713905, + "step": 1350, + "time_per_iteration": 2.49067759513855 + }, + { + "auxiliary_loss_clip": 0.01167017, + "auxiliary_loss_mlp": 0.00791452, + "balance_loss_clip": 1.05993128, + "balance_loss_mlp": 1.0007683, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 1.915166836730395, + "language_loss": 0.82518888, + "learning_rate": 3.972536731254092e-06, + "loss": 0.84477353, + "num_input_tokens_seen": 28732075, + "step": 1351, + "time_per_iteration": 2.5601308345794678 + }, + { + "auxiliary_loss_clip": 0.01199091, + "auxiliary_loss_mlp": 0.01052305, + "balance_loss_clip": 1.06042647, + "balance_loss_mlp": 1.02753377, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 2.4244455544577903, + "language_loss": 0.75748861, + "learning_rate": 3.972472374036189e-06, + "loss": 0.78000247, + "num_input_tokens_seen": 28751150, + "step": 1352, + "time_per_iteration": 2.505892276763916 + }, + { + "auxiliary_loss_clip": 0.01194973, + "auxiliary_loss_mlp": 0.00794964, + "balance_loss_clip": 1.06544209, + "balance_loss_mlp": 1.00070238, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 2.3377205081807992, + "language_loss": 0.83268082, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85258019, + "num_input_tokens_seen": 28773360, + "step": 1353, + "time_per_iteration": 2.544276475906372 + }, + { + "auxiliary_loss_clip": 0.01074031, + "auxiliary_loss_mlp": 0.01007846, + "balance_loss_clip": 1.02907205, + "balance_loss_mlp": 1.00446033, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8617625248541092, + "language_loss": 0.59733272, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61815155, + "num_input_tokens_seen": 28833390, + "step": 1354, + "time_per_iteration": 3.078716516494751 + }, + { + "auxiliary_loss_clip": 0.01152711, + "auxiliary_loss_mlp": 0.01059032, + "balance_loss_clip": 1.05599141, + "balance_loss_mlp": 1.03585792, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.8413312813713443, + "language_loss": 0.82733595, + "learning_rate": 3.972278853614154e-06, + "loss": 0.84945345, + "num_input_tokens_seen": 28852430, + "step": 1355, + "time_per_iteration": 2.582401752471924 + }, + { + "auxiliary_loss_clip": 0.01189682, + "auxiliary_loss_mlp": 0.01066127, + "balance_loss_clip": 1.06119895, + "balance_loss_mlp": 1.04060447, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.8461803460940047, + "language_loss": 0.71146202, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73402011, + "num_input_tokens_seen": 28870685, + "step": 1356, + "time_per_iteration": 2.4931838512420654 + }, + { + "auxiliary_loss_clip": 0.0119484, + "auxiliary_loss_mlp": 0.01058315, + "balance_loss_clip": 1.0638926, + "balance_loss_mlp": 1.0342108, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 1.7962099294661817, + "language_loss": 0.70651382, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72904539, + "num_input_tokens_seen": 28889860, + "step": 1357, + "time_per_iteration": 2.5393450260162354 + }, + { + "auxiliary_loss_clip": 0.01188043, + "auxiliary_loss_mlp": 0.0106165, + "balance_loss_clip": 1.06264925, + "balance_loss_mlp": 1.03834498, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.169303763483565, + "language_loss": 0.84164679, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86414373, + "num_input_tokens_seen": 28905865, + "step": 1358, + "time_per_iteration": 2.490936756134033 + }, + { + "auxiliary_loss_clip": 0.01180043, + "auxiliary_loss_mlp": 0.01062846, + "balance_loss_clip": 1.06325293, + "balance_loss_mlp": 1.03759801, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 2.0352413623136627, + "language_loss": 1.02301931, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04544818, + "num_input_tokens_seen": 28925250, + "step": 1359, + "time_per_iteration": 2.552682876586914 + }, + { + "auxiliary_loss_clip": 0.01133196, + "auxiliary_loss_mlp": 0.01061763, + "balance_loss_clip": 1.05195975, + "balance_loss_mlp": 1.0375638, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 1.997261767504377, + "language_loss": 0.83767986, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85962951, + "num_input_tokens_seen": 28943445, + "step": 1360, + "time_per_iteration": 2.727053642272949 + }, + { + "auxiliary_loss_clip": 0.01201483, + "auxiliary_loss_mlp": 0.01072866, + "balance_loss_clip": 1.06180763, + "balance_loss_mlp": 1.04938161, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.708932402854641, + "language_loss": 0.72176659, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74451011, + "num_input_tokens_seen": 28962695, + "step": 1361, + "time_per_iteration": 2.4966418743133545 + }, + { + "auxiliary_loss_clip": 0.01168282, + "auxiliary_loss_mlp": 0.01064143, + "balance_loss_clip": 1.05557299, + "balance_loss_mlp": 1.03890646, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 3.293598807726063, + "language_loss": 0.7653929, + "learning_rate": 3.971824688461976e-06, + "loss": 0.78771716, + "num_input_tokens_seen": 28982120, + "step": 1362, + "time_per_iteration": 2.5469589233398438 + }, + { + "auxiliary_loss_clip": 0.01201039, + "auxiliary_loss_mlp": 0.0105671, + "balance_loss_clip": 1.0649997, + "balance_loss_mlp": 1.03422689, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.7112753832258827, + "language_loss": 0.72730327, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74988079, + "num_input_tokens_seen": 28998100, + "step": 1363, + "time_per_iteration": 2.4419426918029785 + }, + { + "auxiliary_loss_clip": 0.01201877, + "auxiliary_loss_mlp": 0.01073045, + "balance_loss_clip": 1.0665493, + "balance_loss_mlp": 1.04730821, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 1.799534932884765, + "language_loss": 0.7731688, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79591805, + "num_input_tokens_seen": 29017095, + "step": 1364, + "time_per_iteration": 2.481529712677002 + }, + { + "auxiliary_loss_clip": 0.01139089, + "auxiliary_loss_mlp": 0.01065643, + "balance_loss_clip": 1.05272818, + "balance_loss_mlp": 1.03929782, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 2.0257806794741637, + "language_loss": 0.82048237, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84252965, + "num_input_tokens_seen": 29037240, + "step": 1365, + "time_per_iteration": 2.6001932621002197 + }, + { + "auxiliary_loss_clip": 0.01196043, + "auxiliary_loss_mlp": 0.01069757, + "balance_loss_clip": 1.06891572, + "balance_loss_mlp": 1.04645169, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 27.216594413242042, + "language_loss": 0.82257277, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84523076, + "num_input_tokens_seen": 29056250, + "step": 1366, + "time_per_iteration": 2.5138633251190186 + }, + { + "auxiliary_loss_clip": 0.01154043, + "auxiliary_loss_mlp": 0.01076181, + "balance_loss_clip": 1.05240715, + "balance_loss_mlp": 1.05238712, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 1.6387924687715913, + "language_loss": 0.81613797, + "learning_rate": 3.97149804157902e-06, + "loss": 0.83844024, + "num_input_tokens_seen": 29073380, + "step": 1367, + "time_per_iteration": 2.527020215988159 + }, + { + "auxiliary_loss_clip": 0.01203474, + "auxiliary_loss_mlp": 0.01064463, + "balance_loss_clip": 1.0639708, + "balance_loss_mlp": 1.04056144, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.1198904737676414, + "language_loss": 0.83696848, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.85964781, + "num_input_tokens_seen": 29091330, + "step": 1368, + "time_per_iteration": 2.452029228210449 + }, + { + "auxiliary_loss_clip": 0.0115652, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.06038797, + "balance_loss_mlp": 1.03250432, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.6924517005244204, + "language_loss": 0.8161478, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83825713, + "num_input_tokens_seen": 29110375, + "step": 1369, + "time_per_iteration": 2.6023929119110107 + }, + { + "auxiliary_loss_clip": 0.01139344, + "auxiliary_loss_mlp": 0.00792801, + "balance_loss_clip": 1.05425072, + "balance_loss_mlp": 1.00069916, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.289054293113324, + "language_loss": 0.74867296, + "learning_rate": 3.971301156316582e-06, + "loss": 0.7679944, + "num_input_tokens_seen": 29129395, + "step": 1370, + "time_per_iteration": 2.66086745262146 + }, + { + "auxiliary_loss_clip": 0.01151763, + "auxiliary_loss_mlp": 0.01066476, + "balance_loss_clip": 1.06120157, + "balance_loss_mlp": 1.04245591, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.672025214212037, + "language_loss": 0.74482512, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76700747, + "num_input_tokens_seen": 29148650, + "step": 1371, + "time_per_iteration": 2.5970840454101562 + }, + { + "auxiliary_loss_clip": 0.01099724, + "auxiliary_loss_mlp": 0.01063552, + "balance_loss_clip": 1.05148137, + "balance_loss_mlp": 1.03881669, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 5.996463526571552, + "language_loss": 0.70948726, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73111999, + "num_input_tokens_seen": 29170785, + "step": 1372, + "time_per_iteration": 4.52979302406311 + }, + { + "auxiliary_loss_clip": 0.01164268, + "auxiliary_loss_mlp": 0.01057955, + "balance_loss_clip": 1.05571961, + "balance_loss_mlp": 1.03454268, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 2.811276858573368, + "language_loss": 0.88373828, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.9059605, + "num_input_tokens_seen": 29185210, + "step": 1373, + "time_per_iteration": 2.9920969009399414 + }, + { + "auxiliary_loss_clip": 0.01153856, + "auxiliary_loss_mlp": 0.01062533, + "balance_loss_clip": 1.05484128, + "balance_loss_mlp": 1.03892922, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.8534582323379465, + "language_loss": 0.82246804, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84463191, + "num_input_tokens_seen": 29205210, + "step": 1374, + "time_per_iteration": 4.238830804824829 + }, + { + "auxiliary_loss_clip": 0.01044391, + "auxiliary_loss_mlp": 0.01005415, + "balance_loss_clip": 1.02874923, + "balance_loss_mlp": 1.00167203, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8230931177611689, + "language_loss": 0.6061101, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62660819, + "num_input_tokens_seen": 29265350, + "step": 1375, + "time_per_iteration": 3.212329864501953 + }, + { + "auxiliary_loss_clip": 0.0106046, + "auxiliary_loss_mlp": 0.01014723, + "balance_loss_clip": 1.0231998, + "balance_loss_mlp": 1.01155174, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9107007911429694, + "language_loss": 0.62166351, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64241534, + "num_input_tokens_seen": 29321475, + "step": 1376, + "time_per_iteration": 3.0221166610717773 + }, + { + "auxiliary_loss_clip": 0.01159227, + "auxiliary_loss_mlp": 0.01068059, + "balance_loss_clip": 1.05989265, + "balance_loss_mlp": 1.04480171, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 2.7189472640454713, + "language_loss": 0.82590365, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84817648, + "num_input_tokens_seen": 29341405, + "step": 1377, + "time_per_iteration": 4.071829080581665 + }, + { + "auxiliary_loss_clip": 0.01171784, + "auxiliary_loss_mlp": 0.01056874, + "balance_loss_clip": 1.06042814, + "balance_loss_mlp": 1.03361595, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 3.305291338885858, + "language_loss": 0.84703749, + "learning_rate": 3.970772840048147e-06, + "loss": 0.86932409, + "num_input_tokens_seen": 29361955, + "step": 1378, + "time_per_iteration": 2.5941405296325684 + }, + { + "auxiliary_loss_clip": 0.01183866, + "auxiliary_loss_mlp": 0.0106044, + "balance_loss_clip": 1.05947852, + "balance_loss_mlp": 1.03641987, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 2.1871212896547254, + "language_loss": 0.87698519, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89942825, + "num_input_tokens_seen": 29382395, + "step": 1379, + "time_per_iteration": 2.5768706798553467 + }, + { + "auxiliary_loss_clip": 0.01157976, + "auxiliary_loss_mlp": 0.01064019, + "balance_loss_clip": 1.05498505, + "balance_loss_mlp": 1.04099941, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 3.7451724727690365, + "language_loss": 0.78587157, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80809152, + "num_input_tokens_seen": 29404460, + "step": 1380, + "time_per_iteration": 2.7958571910858154 + }, + { + "auxiliary_loss_clip": 0.01182819, + "auxiliary_loss_mlp": 0.01064362, + "balance_loss_clip": 1.06355667, + "balance_loss_mlp": 1.04041278, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.2329670782389823, + "language_loss": 0.85863602, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88110787, + "num_input_tokens_seen": 29422675, + "step": 1381, + "time_per_iteration": 2.5514931678771973 + }, + { + "auxiliary_loss_clip": 0.01193912, + "auxiliary_loss_mlp": 0.00792501, + "balance_loss_clip": 1.06284845, + "balance_loss_mlp": 1.00086939, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 5.088853928498281, + "language_loss": 0.88180196, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90166616, + "num_input_tokens_seen": 29439840, + "step": 1382, + "time_per_iteration": 2.475792646408081 + }, + { + "auxiliary_loss_clip": 0.01160471, + "auxiliary_loss_mlp": 0.01057121, + "balance_loss_clip": 1.06021988, + "balance_loss_mlp": 1.03491247, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.3038748476074162, + "language_loss": 0.77399504, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79617107, + "num_input_tokens_seen": 29457360, + "step": 1383, + "time_per_iteration": 2.5654263496398926 + }, + { + "auxiliary_loss_clip": 0.01190262, + "auxiliary_loss_mlp": 0.01056561, + "balance_loss_clip": 1.06029582, + "balance_loss_mlp": 1.03350592, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 2.56837291619552, + "language_loss": 0.83061945, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85308778, + "num_input_tokens_seen": 29477040, + "step": 1384, + "time_per_iteration": 2.507277250289917 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.01058714, + "balance_loss_clip": 1.05484986, + "balance_loss_mlp": 1.03099775, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.934804982353076, + "language_loss": 0.85229313, + "learning_rate": 3.970306639845e-06, + "loss": 0.87436742, + "num_input_tokens_seen": 29492010, + "step": 1385, + "time_per_iteration": 2.6179323196411133 + }, + { + "auxiliary_loss_clip": 0.01158137, + "auxiliary_loss_mlp": 0.01060214, + "balance_loss_clip": 1.05992699, + "balance_loss_mlp": 1.0363965, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 3.010758503038589, + "language_loss": 0.68858182, + "learning_rate": 3.970239740938835e-06, + "loss": 0.71076536, + "num_input_tokens_seen": 29511850, + "step": 1386, + "time_per_iteration": 2.564462900161743 + }, + { + "auxiliary_loss_clip": 0.01174772, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_clip": 1.05518317, + "balance_loss_mlp": 1.03337932, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.6689799003155221, + "language_loss": 0.81958151, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84189707, + "num_input_tokens_seen": 29531415, + "step": 1387, + "time_per_iteration": 3.899775981903076 + }, + { + "auxiliary_loss_clip": 0.01178553, + "auxiliary_loss_mlp": 0.01067008, + "balance_loss_clip": 1.06168342, + "balance_loss_mlp": 1.04247475, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 3.311574280722923, + "language_loss": 0.77076572, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79322135, + "num_input_tokens_seen": 29549525, + "step": 1388, + "time_per_iteration": 2.5147411823272705 + }, + { + "auxiliary_loss_clip": 0.01132234, + "auxiliary_loss_mlp": 0.01062478, + "balance_loss_clip": 1.05824077, + "balance_loss_mlp": 1.03728867, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 4.033657818643421, + "language_loss": 0.79614699, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81809402, + "num_input_tokens_seen": 29568705, + "step": 1389, + "time_per_iteration": 2.700643301010132 + }, + { + "auxiliary_loss_clip": 0.01175583, + "auxiliary_loss_mlp": 0.01063338, + "balance_loss_clip": 1.0587461, + "balance_loss_mlp": 1.03957939, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 3.6792004391616837, + "language_loss": 0.87597698, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89836621, + "num_input_tokens_seen": 29585855, + "step": 1390, + "time_per_iteration": 3.0163276195526123 + }, + { + "auxiliary_loss_clip": 0.01163742, + "auxiliary_loss_mlp": 0.01063665, + "balance_loss_clip": 1.05723321, + "balance_loss_mlp": 1.03761816, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 3.1642264811597363, + "language_loss": 0.86959183, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89186585, + "num_input_tokens_seen": 29607280, + "step": 1391, + "time_per_iteration": 2.5939433574676514 + }, + { + "auxiliary_loss_clip": 0.01158378, + "auxiliary_loss_mlp": 0.01073977, + "balance_loss_clip": 1.05668557, + "balance_loss_mlp": 1.05004025, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 2.558630849633882, + "language_loss": 0.87793171, + "learning_rate": 3.969836778645371e-06, + "loss": 0.90025532, + "num_input_tokens_seen": 29624130, + "step": 1392, + "time_per_iteration": 2.6270275115966797 + }, + { + "auxiliary_loss_clip": 0.01187326, + "auxiliary_loss_mlp": 0.01064349, + "balance_loss_clip": 1.06118274, + "balance_loss_mlp": 1.04078174, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.7204995594692196, + "language_loss": 0.80780399, + "learning_rate": 3.969769356810819e-06, + "loss": 0.83032072, + "num_input_tokens_seen": 29643210, + "step": 1393, + "time_per_iteration": 2.545454502105713 + }, + { + "auxiliary_loss_clip": 0.01200825, + "auxiliary_loss_mlp": 0.01058605, + "balance_loss_clip": 1.06708169, + "balance_loss_mlp": 1.03613448, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.922452011109715, + "language_loss": 0.84496236, + "learning_rate": 3.969701860282415e-06, + "loss": 0.86755669, + "num_input_tokens_seen": 29663920, + "step": 1394, + "time_per_iteration": 2.5265438556671143 + }, + { + "auxiliary_loss_clip": 0.01146716, + "auxiliary_loss_mlp": 0.01059222, + "balance_loss_clip": 1.06027341, + "balance_loss_mlp": 1.03577399, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 2.1640249254121056, + "language_loss": 0.82800376, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85006309, + "num_input_tokens_seen": 29683825, + "step": 1395, + "time_per_iteration": 2.603213310241699 + }, + { + "auxiliary_loss_clip": 0.01188443, + "auxiliary_loss_mlp": 0.00793591, + "balance_loss_clip": 1.06279612, + "balance_loss_mlp": 1.00075126, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 3.0201724113304023, + "language_loss": 0.83025175, + "learning_rate": 3.969566643154293e-06, + "loss": 0.85007203, + "num_input_tokens_seen": 29698775, + "step": 1396, + "time_per_iteration": 2.482184648513794 + }, + { + "auxiliary_loss_clip": 0.01186614, + "auxiliary_loss_mlp": 0.01062762, + "balance_loss_clip": 1.06445718, + "balance_loss_mlp": 1.03758478, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 2.1990534263018295, + "language_loss": 0.76895708, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79145086, + "num_input_tokens_seen": 29719430, + "step": 1397, + "time_per_iteration": 2.6419196128845215 + }, + { + "auxiliary_loss_clip": 0.01155291, + "auxiliary_loss_mlp": 0.01050913, + "balance_loss_clip": 1.06080139, + "balance_loss_mlp": 1.02702379, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.0677280043096213, + "language_loss": 0.77923316, + "learning_rate": 3.969431127281516e-06, + "loss": 0.80129516, + "num_input_tokens_seen": 29739685, + "step": 1398, + "time_per_iteration": 2.600210189819336 + }, + { + "auxiliary_loss_clip": 0.01191052, + "auxiliary_loss_mlp": 0.0105467, + "balance_loss_clip": 1.05988562, + "balance_loss_mlp": 1.03157902, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 2.341675324291026, + "language_loss": 0.94924527, + "learning_rate": 3.969363257322304e-06, + "loss": 0.97170252, + "num_input_tokens_seen": 29756165, + "step": 1399, + "time_per_iteration": 2.437361001968384 + }, + { + "auxiliary_loss_clip": 0.01173855, + "auxiliary_loss_mlp": 0.01068717, + "balance_loss_clip": 1.05483842, + "balance_loss_mlp": 1.043051, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 2.7588264553998, + "language_loss": 0.81747156, + "learning_rate": 3.96929531268464e-06, + "loss": 0.83989727, + "num_input_tokens_seen": 29776425, + "step": 1400, + "time_per_iteration": 2.524796724319458 + }, + { + "auxiliary_loss_clip": 0.01169087, + "auxiliary_loss_mlp": 0.01060576, + "balance_loss_clip": 1.05910647, + "balance_loss_mlp": 1.03687739, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 1.87426431161583, + "language_loss": 0.86614072, + "learning_rate": 3.969227293371099e-06, + "loss": 0.88843733, + "num_input_tokens_seen": 29796440, + "step": 1401, + "time_per_iteration": 2.59269118309021 + }, + { + "auxiliary_loss_clip": 0.01195693, + "auxiliary_loss_mlp": 0.01062661, + "balance_loss_clip": 1.05877876, + "balance_loss_mlp": 1.03815174, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.50560430288914, + "language_loss": 0.87257683, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89516032, + "num_input_tokens_seen": 29814755, + "step": 1402, + "time_per_iteration": 2.5332846641540527 + }, + { + "auxiliary_loss_clip": 0.01147523, + "auxiliary_loss_mlp": 0.00792423, + "balance_loss_clip": 1.05131733, + "balance_loss_mlp": 1.00083995, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.2792599326561516, + "language_loss": 0.88739085, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90679038, + "num_input_tokens_seen": 29834785, + "step": 1403, + "time_per_iteration": 2.734802722930908 + }, + { + "auxiliary_loss_clip": 0.01166594, + "auxiliary_loss_mlp": 0.0106103, + "balance_loss_clip": 1.0572834, + "balance_loss_mlp": 1.03630567, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1.931543087288917, + "language_loss": 0.80280662, + "learning_rate": 3.969022787401033e-06, + "loss": 0.8250829, + "num_input_tokens_seen": 29854695, + "step": 1404, + "time_per_iteration": 2.5188968181610107 + }, + { + "auxiliary_loss_clip": 0.01182482, + "auxiliary_loss_mlp": 0.01075068, + "balance_loss_clip": 1.06227684, + "balance_loss_mlp": 1.05058265, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.8792016095498822, + "language_loss": 0.8309257, + "learning_rate": 3.968954469409811e-06, + "loss": 0.85350126, + "num_input_tokens_seen": 29872180, + "step": 1405, + "time_per_iteration": 2.50286865234375 + }, + { + "auxiliary_loss_clip": 0.01181453, + "auxiliary_loss_mlp": 0.01056378, + "balance_loss_clip": 1.05474305, + "balance_loss_mlp": 1.03384781, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 2.458180841839169, + "language_loss": 0.8031584, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82553673, + "num_input_tokens_seen": 29893205, + "step": 1406, + "time_per_iteration": 2.536987066268921 + }, + { + "auxiliary_loss_clip": 0.01172281, + "auxiliary_loss_mlp": 0.01072879, + "balance_loss_clip": 1.05868912, + "balance_loss_mlp": 1.04913282, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 2.218608257828336, + "language_loss": 0.7943297, + "learning_rate": 3.96881760944111e-06, + "loss": 0.81678128, + "num_input_tokens_seen": 29911970, + "step": 1407, + "time_per_iteration": 2.5424065589904785 + }, + { + "auxiliary_loss_clip": 0.01183735, + "auxiliary_loss_mlp": 0.01056873, + "balance_loss_clip": 1.05839181, + "balance_loss_mlp": 1.03303134, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 2.652867277764469, + "language_loss": 0.9160217, + "learning_rate": 3.968749067468819e-06, + "loss": 0.93842781, + "num_input_tokens_seen": 29929925, + "step": 1408, + "time_per_iteration": 2.457052707672119 + }, + { + "auxiliary_loss_clip": 0.01069985, + "auxiliary_loss_mlp": 0.01016219, + "balance_loss_clip": 1.02960587, + "balance_loss_mlp": 1.01309574, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.914497711422582, + "language_loss": 0.61793941, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63880146, + "num_input_tokens_seen": 29985950, + "step": 1409, + "time_per_iteration": 3.136484146118164 + }, + { + "auxiliary_loss_clip": 0.0119068, + "auxiliary_loss_mlp": 0.01064137, + "balance_loss_clip": 1.06038642, + "balance_loss_mlp": 1.04160595, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 2.693227048213835, + "language_loss": 0.8679167, + "learning_rate": 3.968611759561355e-06, + "loss": 0.8904649, + "num_input_tokens_seen": 30004330, + "step": 1410, + "time_per_iteration": 2.46341872215271 + }, + { + "auxiliary_loss_clip": 0.01182017, + "auxiliary_loss_mlp": 0.01054916, + "balance_loss_clip": 1.05924702, + "balance_loss_mlp": 1.02927375, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.3128978285384734, + "language_loss": 0.7468583, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76922762, + "num_input_tokens_seen": 30022555, + "step": 1411, + "time_per_iteration": 3.901576519012451 + }, + { + "auxiliary_loss_clip": 0.01087834, + "auxiliary_loss_mlp": 0.01005547, + "balance_loss_clip": 1.02863431, + "balance_loss_mlp": 1.00209033, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9087461124327998, + "language_loss": 0.56714833, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58808208, + "num_input_tokens_seen": 30077220, + "step": 1412, + "time_per_iteration": 2.952892780303955 + }, + { + "auxiliary_loss_clip": 0.01159186, + "auxiliary_loss_mlp": 0.01061719, + "balance_loss_clip": 1.0542922, + "balance_loss_mlp": 1.03695905, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.5466536142366554, + "language_loss": 0.8954137, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91762275, + "num_input_tokens_seen": 30094600, + "step": 1413, + "time_per_iteration": 3.8820254802703857 + }, + { + "auxiliary_loss_clip": 0.0116474, + "auxiliary_loss_mlp": 0.01055183, + "balance_loss_clip": 1.05744648, + "balance_loss_mlp": 1.03116226, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.0673999910111824, + "language_loss": 0.88068676, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90288591, + "num_input_tokens_seen": 30114475, + "step": 1414, + "time_per_iteration": 2.5286476612091064 + }, + { + "auxiliary_loss_clip": 0.01170529, + "auxiliary_loss_mlp": 0.01060452, + "balance_loss_clip": 1.05920458, + "balance_loss_mlp": 1.03805315, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 2.1251212842657377, + "language_loss": 0.77550215, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79781199, + "num_input_tokens_seen": 30133350, + "step": 1415, + "time_per_iteration": 2.5164525508880615 + }, + { + "auxiliary_loss_clip": 0.01181234, + "auxiliary_loss_mlp": 0.01055834, + "balance_loss_clip": 1.05836582, + "balance_loss_mlp": 1.0336616, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 3.4378667947042976, + "language_loss": 0.7076416, + "learning_rate": 3.968198044323587e-06, + "loss": 0.7300123, + "num_input_tokens_seen": 30159005, + "step": 1416, + "time_per_iteration": 2.862006187438965 + }, + { + "auxiliary_loss_clip": 0.01175585, + "auxiliary_loss_mlp": 0.01068961, + "balance_loss_clip": 1.06033862, + "balance_loss_mlp": 1.04354501, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 3.337668637371872, + "language_loss": 0.75187325, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77431864, + "num_input_tokens_seen": 30179450, + "step": 1417, + "time_per_iteration": 4.15311336517334 + }, + { + "auxiliary_loss_clip": 0.01171698, + "auxiliary_loss_mlp": 0.01052729, + "balance_loss_clip": 1.05830705, + "balance_loss_mlp": 1.02868462, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 2.2352777034914637, + "language_loss": 0.82240653, + "learning_rate": 3.968059542142265e-06, + "loss": 0.8446508, + "num_input_tokens_seen": 30197235, + "step": 1418, + "time_per_iteration": 2.4974348545074463 + }, + { + "auxiliary_loss_clip": 0.01047021, + "auxiliary_loss_mlp": 0.01016104, + "balance_loss_clip": 1.02421284, + "balance_loss_mlp": 1.01318312, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8981315393708561, + "language_loss": 0.56610131, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58673251, + "num_input_tokens_seen": 30257410, + "step": 1419, + "time_per_iteration": 3.068135976791382 + }, + { + "auxiliary_loss_clip": 0.01191646, + "auxiliary_loss_mlp": 0.01057488, + "balance_loss_clip": 1.05679917, + "balance_loss_mlp": 1.03355122, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.421638733156774, + "language_loss": 0.70302087, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72551221, + "num_input_tokens_seen": 30277865, + "step": 1420, + "time_per_iteration": 2.505315065383911 + }, + { + "auxiliary_loss_clip": 0.01156588, + "auxiliary_loss_mlp": 0.01051295, + "balance_loss_clip": 1.05322886, + "balance_loss_mlp": 1.02778709, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.7999063795041144, + "language_loss": 0.88021576, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90229464, + "num_input_tokens_seen": 30298545, + "step": 1421, + "time_per_iteration": 2.5944252014160156 + }, + { + "auxiliary_loss_clip": 0.01089642, + "auxiliary_loss_mlp": 0.01011989, + "balance_loss_clip": 1.03094387, + "balance_loss_mlp": 1.00893772, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7994870052362985, + "language_loss": 0.63482368, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65584004, + "num_input_tokens_seen": 30361725, + "step": 1422, + "time_per_iteration": 3.020129919052124 + }, + { + "auxiliary_loss_clip": 0.01150537, + "auxiliary_loss_mlp": 0.01058748, + "balance_loss_clip": 1.05784762, + "balance_loss_mlp": 1.03556204, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 2.245683011566345, + "language_loss": 0.82994318, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85203606, + "num_input_tokens_seen": 30382180, + "step": 1423, + "time_per_iteration": 2.597221851348877 + }, + { + "auxiliary_loss_clip": 0.01157424, + "auxiliary_loss_mlp": 0.01067239, + "balance_loss_clip": 1.05270195, + "balance_loss_mlp": 1.04318273, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 12.351374810314393, + "language_loss": 0.7531743, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77542096, + "num_input_tokens_seen": 30402980, + "step": 1424, + "time_per_iteration": 2.6055922508239746 + }, + { + "auxiliary_loss_clip": 0.01139492, + "auxiliary_loss_mlp": 0.01058568, + "balance_loss_clip": 1.05782509, + "balance_loss_mlp": 1.03526235, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.7665386516950763, + "language_loss": 0.75864285, + "learning_rate": 3.96757243383196e-06, + "loss": 0.78062344, + "num_input_tokens_seen": 30420800, + "step": 1425, + "time_per_iteration": 2.539958953857422 + }, + { + "auxiliary_loss_clip": 0.01191377, + "auxiliary_loss_mlp": 0.01058022, + "balance_loss_clip": 1.05912209, + "balance_loss_mlp": 1.03454983, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 4.28674782855721, + "language_loss": 0.92830479, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95079875, + "num_input_tokens_seen": 30439620, + "step": 1426, + "time_per_iteration": 3.8476502895355225 + }, + { + "auxiliary_loss_clip": 0.01145584, + "auxiliary_loss_mlp": 0.0107222, + "balance_loss_clip": 1.05447245, + "balance_loss_mlp": 1.04605317, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.303936821065985, + "language_loss": 0.75480497, + "learning_rate": 3.967432588494471e-06, + "loss": 0.77698302, + "num_input_tokens_seen": 30457300, + "step": 1427, + "time_per_iteration": 2.5654633045196533 + }, + { + "auxiliary_loss_clip": 0.01191743, + "auxiliary_loss_mlp": 0.01064464, + "balance_loss_clip": 1.05995452, + "balance_loss_mlp": 1.04190969, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 3.098766618417918, + "language_loss": 0.81796265, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84052473, + "num_input_tokens_seen": 30471580, + "step": 1428, + "time_per_iteration": 2.416731119155884 + }, + { + "auxiliary_loss_clip": 0.01179157, + "auxiliary_loss_mlp": 0.01066594, + "balance_loss_clip": 1.05914879, + "balance_loss_mlp": 1.04119086, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.148325700302025, + "language_loss": 0.80233204, + "learning_rate": 3.967292444736023e-06, + "loss": 0.82478958, + "num_input_tokens_seen": 30492720, + "step": 1429, + "time_per_iteration": 2.5376412868499756 + }, + { + "auxiliary_loss_clip": 0.01171439, + "auxiliary_loss_mlp": 0.01061589, + "balance_loss_clip": 1.06135631, + "balance_loss_mlp": 1.03840327, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 3.2254680825876263, + "language_loss": 0.88011181, + "learning_rate": 3.967222260955578e-06, + "loss": 0.9024421, + "num_input_tokens_seen": 30509535, + "step": 1430, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01151067, + "auxiliary_loss_mlp": 0.01078259, + "balance_loss_clip": 1.06118965, + "balance_loss_mlp": 1.05376148, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.9100366520777716, + "language_loss": 0.81971103, + "learning_rate": 3.96715200257787e-06, + "loss": 0.8420043, + "num_input_tokens_seen": 30529490, + "step": 1431, + "time_per_iteration": 2.5642924308776855 + }, + { + "auxiliary_loss_clip": 0.01152245, + "auxiliary_loss_mlp": 0.01059814, + "balance_loss_clip": 1.05929375, + "balance_loss_mlp": 1.03532887, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 2.1255623059402287, + "language_loss": 0.7773298, + "learning_rate": 3.967081669605559e-06, + "loss": 0.79945046, + "num_input_tokens_seen": 30550205, + "step": 1432, + "time_per_iteration": 2.591872215270996 + }, + { + "auxiliary_loss_clip": 0.011678, + "auxiliary_loss_mlp": 0.01065777, + "balance_loss_clip": 1.0535605, + "balance_loss_mlp": 1.04028988, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 2.1719369888919595, + "language_loss": 0.73166746, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75400317, + "num_input_tokens_seen": 30568830, + "step": 1433, + "time_per_iteration": 2.5032613277435303 + }, + { + "auxiliary_loss_clip": 0.01152759, + "auxiliary_loss_mlp": 0.007948, + "balance_loss_clip": 1.05417538, + "balance_loss_mlp": 1.00085926, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 3.0120675393808547, + "language_loss": 0.85570908, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.8751846, + "num_input_tokens_seen": 30585730, + "step": 1434, + "time_per_iteration": 2.5267646312713623 + }, + { + "auxiliary_loss_clip": 0.0117234, + "auxiliary_loss_mlp": 0.01061073, + "balance_loss_clip": 1.05539632, + "balance_loss_mlp": 1.03673017, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 2.670206128746562, + "language_loss": 0.78758013, + "learning_rate": 3.966870223147707e-06, + "loss": 0.80991429, + "num_input_tokens_seen": 30603180, + "step": 1435, + "time_per_iteration": 2.4905638694763184 + }, + { + "auxiliary_loss_clip": 0.01052294, + "auxiliary_loss_mlp": 0.01020138, + "balance_loss_clip": 1.02270794, + "balance_loss_mlp": 1.01734853, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8922197008582136, + "language_loss": 0.57989103, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60061538, + "num_input_tokens_seen": 30668895, + "step": 1436, + "time_per_iteration": 3.216421365737915 + }, + { + "auxiliary_loss_clip": 0.01177213, + "auxiliary_loss_mlp": 0.01062197, + "balance_loss_clip": 1.05690444, + "balance_loss_mlp": 1.03558958, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.498072947412779, + "language_loss": 0.69386268, + "learning_rate": 3.966728885918437e-06, + "loss": 0.7162568, + "num_input_tokens_seen": 30688955, + "step": 1437, + "time_per_iteration": 2.537828207015991 + }, + { + "auxiliary_loss_clip": 0.01124903, + "auxiliary_loss_mlp": 0.01057878, + "balance_loss_clip": 1.05293202, + "balance_loss_mlp": 1.03371453, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.1918568083425805, + "language_loss": 0.7260558, + "learning_rate": 3.966658105434627e-06, + "loss": 0.74788362, + "num_input_tokens_seen": 30706095, + "step": 1438, + "time_per_iteration": 2.6206045150756836 + }, + { + "auxiliary_loss_clip": 0.01181433, + "auxiliary_loss_mlp": 0.01059632, + "balance_loss_clip": 1.06108189, + "balance_loss_mlp": 1.03444266, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.679708097383088, + "language_loss": 0.64181453, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66422522, + "num_input_tokens_seen": 30729025, + "step": 1439, + "time_per_iteration": 2.6049487590789795 + }, + { + "auxiliary_loss_clip": 0.01157718, + "auxiliary_loss_mlp": 0.01060047, + "balance_loss_clip": 1.05599058, + "balance_loss_mlp": 1.03422666, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 2.026124849477577, + "language_loss": 0.87686145, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89903909, + "num_input_tokens_seen": 30746155, + "step": 1440, + "time_per_iteration": 2.5479276180267334 + }, + { + "auxiliary_loss_clip": 0.01157803, + "auxiliary_loss_mlp": 0.00796626, + "balance_loss_clip": 1.05883908, + "balance_loss_mlp": 1.00089121, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.204288053309198, + "language_loss": 0.83536929, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85491359, + "num_input_tokens_seen": 30761410, + "step": 1441, + "time_per_iteration": 2.57977557182312 + }, + { + "auxiliary_loss_clip": 0.01081894, + "auxiliary_loss_mlp": 0.01004212, + "balance_loss_clip": 1.02611399, + "balance_loss_mlp": 1.00120831, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 1.9867775515592059, + "language_loss": 0.6044693, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62533045, + "num_input_tokens_seen": 30823010, + "step": 1442, + "time_per_iteration": 3.1478917598724365 + }, + { + "auxiliary_loss_clip": 0.01168826, + "auxiliary_loss_mlp": 0.01058002, + "balance_loss_clip": 1.05843282, + "balance_loss_mlp": 1.03330207, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 3.9494504695051824, + "language_loss": 0.79182506, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81409335, + "num_input_tokens_seen": 30841980, + "step": 1443, + "time_per_iteration": 2.5170791149139404 + }, + { + "auxiliary_loss_clip": 0.01186816, + "auxiliary_loss_mlp": 0.01055904, + "balance_loss_clip": 1.06032133, + "balance_loss_mlp": 1.03119183, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.7459819793474234, + "language_loss": 0.8234908, + "learning_rate": 3.966231856532584e-06, + "loss": 0.84591806, + "num_input_tokens_seen": 30863280, + "step": 1444, + "time_per_iteration": 2.5355727672576904 + }, + { + "auxiliary_loss_clip": 0.01202436, + "auxiliary_loss_mlp": 0.0105593, + "balance_loss_clip": 1.06369591, + "balance_loss_mlp": 1.03150415, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 3.580754389177093, + "language_loss": 0.86629647, + "learning_rate": 3.966160554074189e-06, + "loss": 0.88888013, + "num_input_tokens_seen": 30881710, + "step": 1445, + "time_per_iteration": 2.498595952987671 + }, + { + "auxiliary_loss_clip": 0.01186491, + "auxiliary_loss_mlp": 0.01059697, + "balance_loss_clip": 1.06498468, + "balance_loss_mlp": 1.03669012, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.8953178516923113, + "language_loss": 0.81460571, + "learning_rate": 3.96608917705879e-06, + "loss": 0.83706754, + "num_input_tokens_seen": 30900225, + "step": 1446, + "time_per_iteration": 2.4816901683807373 + }, + { + "auxiliary_loss_clip": 0.01061315, + "auxiliary_loss_mlp": 0.01006557, + "balance_loss_clip": 1.02469277, + "balance_loss_mlp": 1.00386322, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 1.1257572671846114, + "language_loss": 0.54721034, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56788909, + "num_input_tokens_seen": 30959580, + "step": 1447, + "time_per_iteration": 3.108811616897583 + }, + { + "auxiliary_loss_clip": 0.01156229, + "auxiliary_loss_mlp": 0.01064, + "balance_loss_clip": 1.05863714, + "balance_loss_mlp": 1.04057527, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.1536757254623913, + "language_loss": 0.8454026, + "learning_rate": 3.965946199367804e-06, + "loss": 0.86760485, + "num_input_tokens_seen": 30976775, + "step": 1448, + "time_per_iteration": 2.5142838954925537 + }, + { + "auxiliary_loss_clip": 0.0119879, + "auxiliary_loss_mlp": 0.01064957, + "balance_loss_clip": 1.06250107, + "balance_loss_mlp": 1.03989923, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 4.929456115842483, + "language_loss": 0.80057985, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82321733, + "num_input_tokens_seen": 30990495, + "step": 1449, + "time_per_iteration": 3.9225869178771973 + }, + { + "auxiliary_loss_clip": 0.01144756, + "auxiliary_loss_mlp": 0.01059443, + "balance_loss_clip": 1.05793047, + "balance_loss_mlp": 1.03471935, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 1.5632379084119095, + "language_loss": 0.7108013, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73284328, + "num_input_tokens_seen": 31014080, + "step": 1450, + "time_per_iteration": 2.67783260345459 + }, + { + "auxiliary_loss_clip": 0.01130613, + "auxiliary_loss_mlp": 0.01059312, + "balance_loss_clip": 1.05592036, + "balance_loss_mlp": 1.0354228, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.9637603821299203, + "language_loss": 0.83541906, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85731834, + "num_input_tokens_seen": 31031210, + "step": 1451, + "time_per_iteration": 2.554810047149658 + }, + { + "auxiliary_loss_clip": 0.01141533, + "auxiliary_loss_mlp": 0.00793112, + "balance_loss_clip": 1.05628371, + "balance_loss_mlp": 1.00081754, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 2.423076311483711, + "language_loss": 0.74313086, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76247734, + "num_input_tokens_seen": 31049710, + "step": 1452, + "time_per_iteration": 2.617260456085205 + }, + { + "auxiliary_loss_clip": 0.01162066, + "auxiliary_loss_mlp": 0.01061046, + "balance_loss_clip": 1.05660701, + "balance_loss_mlp": 1.03447437, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 3.886024689935185, + "language_loss": 0.79459792, + "learning_rate": 3.965587450582556e-06, + "loss": 0.81682909, + "num_input_tokens_seen": 31066160, + "step": 1453, + "time_per_iteration": 3.8788740634918213 + }, + { + "auxiliary_loss_clip": 0.01174974, + "auxiliary_loss_mlp": 0.01065277, + "balance_loss_clip": 1.063779, + "balance_loss_mlp": 1.03996849, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 1.9692910076562467, + "language_loss": 0.71194053, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73434299, + "num_input_tokens_seen": 31085270, + "step": 1454, + "time_per_iteration": 2.5167932510375977 + }, + { + "auxiliary_loss_clip": 0.01078671, + "auxiliary_loss_mlp": 0.01017125, + "balance_loss_clip": 1.03184378, + "balance_loss_mlp": 1.01381111, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7957424463492283, + "language_loss": 0.58620113, + "learning_rate": 3.96544342930248e-06, + "loss": 0.60715908, + "num_input_tokens_seen": 31148445, + "step": 1455, + "time_per_iteration": 3.068344831466675 + }, + { + "auxiliary_loss_clip": 0.01196422, + "auxiliary_loss_mlp": 0.01064288, + "balance_loss_clip": 1.06184149, + "balance_loss_mlp": 1.03948069, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.6761306446646356, + "language_loss": 0.77413893, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79674602, + "num_input_tokens_seen": 31168770, + "step": 1456, + "time_per_iteration": 4.0485005378723145 + }, + { + "auxiliary_loss_clip": 0.01136062, + "auxiliary_loss_mlp": 0.01060636, + "balance_loss_clip": 1.05412006, + "balance_loss_mlp": 1.03537607, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 2.6148939304228596, + "language_loss": 0.72032094, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74228799, + "num_input_tokens_seen": 31189270, + "step": 1457, + "time_per_iteration": 2.690279722213745 + }, + { + "auxiliary_loss_clip": 0.01180913, + "auxiliary_loss_mlp": 0.01047837, + "balance_loss_clip": 1.05870128, + "balance_loss_mlp": 1.02431667, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.6167325863139757, + "language_loss": 0.8647964, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88708395, + "num_input_tokens_seen": 31210385, + "step": 1458, + "time_per_iteration": 2.521008253097534 + }, + { + "auxiliary_loss_clip": 0.01177202, + "auxiliary_loss_mlp": 0.01072082, + "balance_loss_clip": 1.06409347, + "balance_loss_mlp": 1.04776382, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.6759021447345528, + "language_loss": 0.80562842, + "learning_rate": 3.965154492406486e-06, + "loss": 0.82812124, + "num_input_tokens_seen": 31229745, + "step": 1459, + "time_per_iteration": 2.5172958374023438 + }, + { + "auxiliary_loss_clip": 0.01136028, + "auxiliary_loss_mlp": 0.01059454, + "balance_loss_clip": 1.0565598, + "balance_loss_mlp": 1.0342418, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.934119334297433, + "language_loss": 0.8429898, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86494464, + "num_input_tokens_seen": 31248280, + "step": 1460, + "time_per_iteration": 2.6125943660736084 + }, + { + "auxiliary_loss_clip": 0.01181567, + "auxiliary_loss_mlp": 0.01057831, + "balance_loss_clip": 1.05985022, + "balance_loss_mlp": 1.03575397, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 2.7568646474486607, + "language_loss": 0.80912244, + "learning_rate": 3.965009576834394e-06, + "loss": 0.83151644, + "num_input_tokens_seen": 31262190, + "step": 1461, + "time_per_iteration": 2.44469952583313 + }, + { + "auxiliary_loss_clip": 0.01172128, + "auxiliary_loss_mlp": 0.01066946, + "balance_loss_clip": 1.05989099, + "balance_loss_mlp": 1.0437839, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 2.072626893531403, + "language_loss": 0.76514405, + "learning_rate": 3.964937007276932e-06, + "loss": 0.78753483, + "num_input_tokens_seen": 31283690, + "step": 1462, + "time_per_iteration": 2.5696988105773926 + }, + { + "auxiliary_loss_clip": 0.01178802, + "auxiliary_loss_mlp": 0.01065558, + "balance_loss_clip": 1.06135988, + "balance_loss_mlp": 1.04045296, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.3991182194440763, + "language_loss": 0.74442732, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76687098, + "num_input_tokens_seen": 31302505, + "step": 1463, + "time_per_iteration": 2.5125198364257812 + }, + { + "auxiliary_loss_clip": 0.0119509, + "auxiliary_loss_mlp": 0.01061935, + "balance_loss_clip": 1.06437206, + "balance_loss_mlp": 1.03564978, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.06498451983234, + "language_loss": 0.83303845, + "learning_rate": 3.964791644632941e-06, + "loss": 0.8556087, + "num_input_tokens_seen": 31323070, + "step": 1464, + "time_per_iteration": 2.5237314701080322 + }, + { + "auxiliary_loss_clip": 0.01175117, + "auxiliary_loss_mlp": 0.01064881, + "balance_loss_clip": 1.05874801, + "balance_loss_mlp": 1.04125357, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 2.094203503300819, + "language_loss": 0.78218961, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80458951, + "num_input_tokens_seen": 31341880, + "step": 1465, + "time_per_iteration": 2.5164735317230225 + }, + { + "auxiliary_loss_clip": 0.01201115, + "auxiliary_loss_mlp": 0.01070302, + "balance_loss_clip": 1.06305027, + "balance_loss_mlp": 1.04742551, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 1.8959208562734524, + "language_loss": 0.85227978, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.87499398, + "num_input_tokens_seen": 31361995, + "step": 1466, + "time_per_iteration": 3.8619236946105957 + }, + { + "auxiliary_loss_clip": 0.01123434, + "auxiliary_loss_mlp": 0.0079458, + "balance_loss_clip": 1.04932833, + "balance_loss_mlp": 1.0009346, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 2.426190968835191, + "language_loss": 0.83750093, + "learning_rate": 3.964573041885641e-06, + "loss": 0.85668111, + "num_input_tokens_seen": 31381515, + "step": 1467, + "time_per_iteration": 2.631863594055176 + }, + { + "auxiliary_loss_clip": 0.01180674, + "auxiliary_loss_mlp": 0.01058303, + "balance_loss_clip": 1.06048965, + "balance_loss_mlp": 1.03444898, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.8039075074508055, + "language_loss": 0.75561988, + "learning_rate": 3.964500025305907e-06, + "loss": 0.77800965, + "num_input_tokens_seen": 31400345, + "step": 1468, + "time_per_iteration": 2.4972662925720215 + }, + { + "auxiliary_loss_clip": 0.01179562, + "auxiliary_loss_mlp": 0.01055084, + "balance_loss_clip": 1.06172776, + "balance_loss_mlp": 1.03320885, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.5737794014167252, + "language_loss": 0.8054083, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82775474, + "num_input_tokens_seen": 31419620, + "step": 1469, + "time_per_iteration": 2.5162596702575684 + }, + { + "auxiliary_loss_clip": 0.01199766, + "auxiliary_loss_mlp": 0.01063621, + "balance_loss_clip": 1.06304979, + "balance_loss_mlp": 1.04079211, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 1.8559534116152485, + "language_loss": 0.77527189, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.7979058, + "num_input_tokens_seen": 31437970, + "step": 1470, + "time_per_iteration": 2.453503131866455 + }, + { + "auxiliary_loss_clip": 0.01192614, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_clip": 1.06158495, + "balance_loss_mlp": 1.04093802, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.9134805269049284, + "language_loss": 0.84458601, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86716402, + "num_input_tokens_seen": 31457040, + "step": 1471, + "time_per_iteration": 2.4761784076690674 + }, + { + "auxiliary_loss_clip": 0.0115838, + "auxiliary_loss_mlp": 0.01051705, + "balance_loss_clip": 1.05842328, + "balance_loss_mlp": 1.03103399, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.6058471544971202, + "language_loss": 0.83591855, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85801947, + "num_input_tokens_seen": 31477520, + "step": 1472, + "time_per_iteration": 2.6029930114746094 + }, + { + "auxiliary_loss_clip": 0.01177269, + "auxiliary_loss_mlp": 0.01061029, + "balance_loss_clip": 1.06133187, + "balance_loss_mlp": 1.03648365, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.6017040062416994, + "language_loss": 0.83053124, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85291421, + "num_input_tokens_seen": 31495575, + "step": 1473, + "time_per_iteration": 2.6001882553100586 + }, + { + "auxiliary_loss_clip": 0.01126545, + "auxiliary_loss_mlp": 0.01062677, + "balance_loss_clip": 1.05212569, + "balance_loss_mlp": 1.04090953, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.6507551224236112, + "language_loss": 0.78835118, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81024337, + "num_input_tokens_seen": 31520020, + "step": 1474, + "time_per_iteration": 2.721923589706421 + }, + { + "auxiliary_loss_clip": 0.01151031, + "auxiliary_loss_mlp": 0.01067106, + "balance_loss_clip": 1.05754566, + "balance_loss_mlp": 1.04082072, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.727185862887265, + "language_loss": 0.79095978, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81314117, + "num_input_tokens_seen": 31539265, + "step": 1475, + "time_per_iteration": 2.5845816135406494 + }, + { + "auxiliary_loss_clip": 0.01194895, + "auxiliary_loss_mlp": 0.01053178, + "balance_loss_clip": 1.06268716, + "balance_loss_mlp": 1.03040934, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 2.078517712037706, + "language_loss": 0.74299777, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76547849, + "num_input_tokens_seen": 31563425, + "step": 1476, + "time_per_iteration": 2.6712000370025635 + }, + { + "auxiliary_loss_clip": 0.01173301, + "auxiliary_loss_mlp": 0.01062133, + "balance_loss_clip": 1.06202531, + "balance_loss_mlp": 1.03920865, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.4358519944261796, + "language_loss": 0.74241567, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76477003, + "num_input_tokens_seen": 31584525, + "step": 1477, + "time_per_iteration": 2.6371896266937256 + }, + { + "auxiliary_loss_clip": 0.01193245, + "auxiliary_loss_mlp": 0.01055035, + "balance_loss_clip": 1.06085944, + "balance_loss_mlp": 1.03158641, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.3809823258004132, + "language_loss": 0.87092286, + "learning_rate": 3.963765762794739e-06, + "loss": 0.89340568, + "num_input_tokens_seen": 31603325, + "step": 1478, + "time_per_iteration": 2.485524892807007 + }, + { + "auxiliary_loss_clip": 0.01178402, + "auxiliary_loss_mlp": 0.01062429, + "balance_loss_clip": 1.05833459, + "balance_loss_mlp": 1.04031563, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 1.750623793194506, + "language_loss": 0.77825862, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80066693, + "num_input_tokens_seen": 31624820, + "step": 1479, + "time_per_iteration": 2.5429556369781494 + }, + { + "auxiliary_loss_clip": 0.01166701, + "auxiliary_loss_mlp": 0.01055925, + "balance_loss_clip": 1.05802214, + "balance_loss_mlp": 1.03091455, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.039274404270091, + "language_loss": 0.78165537, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80388165, + "num_input_tokens_seen": 31646080, + "step": 1480, + "time_per_iteration": 2.5828263759613037 + }, + { + "auxiliary_loss_clip": 0.01183412, + "auxiliary_loss_mlp": 0.01059356, + "balance_loss_clip": 1.06040883, + "balance_loss_mlp": 1.0345726, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 1.7129640044967438, + "language_loss": 0.66386968, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68629742, + "num_input_tokens_seen": 31665770, + "step": 1481, + "time_per_iteration": 2.505575180053711 + }, + { + "auxiliary_loss_clip": 0.01149067, + "auxiliary_loss_mlp": 0.01053301, + "balance_loss_clip": 1.05721378, + "balance_loss_mlp": 1.03063917, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 1.898365858058204, + "language_loss": 0.96045506, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98247874, + "num_input_tokens_seen": 31683805, + "step": 1482, + "time_per_iteration": 2.6105363368988037 + }, + { + "auxiliary_loss_clip": 0.01159058, + "auxiliary_loss_mlp": 0.01055514, + "balance_loss_clip": 1.05957901, + "balance_loss_mlp": 1.03222096, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 2.08239759558702, + "language_loss": 0.79059494, + "learning_rate": 3.96339583888261e-06, + "loss": 0.81274068, + "num_input_tokens_seen": 31704630, + "step": 1483, + "time_per_iteration": 2.6289024353027344 + }, + { + "auxiliary_loss_clip": 0.01173695, + "auxiliary_loss_mlp": 0.01087573, + "balance_loss_clip": 1.05808008, + "balance_loss_mlp": 1.06318319, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 5.687951151433048, + "language_loss": 0.85552931, + "learning_rate": 3.963321630732448e-06, + "loss": 0.878142, + "num_input_tokens_seen": 31723255, + "step": 1484, + "time_per_iteration": 2.464850902557373 + }, + { + "auxiliary_loss_clip": 0.01203064, + "auxiliary_loss_mlp": 0.01066522, + "balance_loss_clip": 1.06589675, + "balance_loss_mlp": 1.04246593, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.7609979131382167, + "language_loss": 0.80418491, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82688081, + "num_input_tokens_seen": 31747045, + "step": 1485, + "time_per_iteration": 2.5269126892089844 + }, + { + "auxiliary_loss_clip": 0.01179234, + "auxiliary_loss_mlp": 0.01064382, + "balance_loss_clip": 1.0617882, + "balance_loss_mlp": 1.04033792, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 1.7620782591677004, + "language_loss": 0.83074248, + "learning_rate": 3.96317299108688e-06, + "loss": 0.85317862, + "num_input_tokens_seen": 31766615, + "step": 1486, + "time_per_iteration": 2.488455295562744 + }, + { + "auxiliary_loss_clip": 0.01149102, + "auxiliary_loss_mlp": 0.01062695, + "balance_loss_clip": 1.0602392, + "balance_loss_mlp": 1.03919911, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 2.0479219988153816, + "language_loss": 0.76445293, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78657091, + "num_input_tokens_seen": 31785855, + "step": 1487, + "time_per_iteration": 2.564807415008545 + }, + { + "auxiliary_loss_clip": 0.01166156, + "auxiliary_loss_mlp": 0.01065347, + "balance_loss_clip": 1.05488384, + "balance_loss_mlp": 1.04012227, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.2180393625022354, + "language_loss": 0.82828557, + "learning_rate": 3.963024053666449e-06, + "loss": 0.8506006, + "num_input_tokens_seen": 31804210, + "step": 1488, + "time_per_iteration": 2.5132486820220947 + }, + { + "auxiliary_loss_clip": 0.01178977, + "auxiliary_loss_mlp": 0.01057079, + "balance_loss_clip": 1.05933046, + "balance_loss_mlp": 1.03393996, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 2.728099612114276, + "language_loss": 0.71893525, + "learning_rate": 3.962949473297718e-06, + "loss": 0.74129581, + "num_input_tokens_seen": 31826150, + "step": 1489, + "time_per_iteration": 4.173608779907227 + }, + { + "auxiliary_loss_clip": 0.01159025, + "auxiliary_loss_mlp": 0.01053525, + "balance_loss_clip": 1.05721307, + "balance_loss_mlp": 1.03010011, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.8377841873935443, + "language_loss": 0.89728022, + "learning_rate": 3.962874818493745e-06, + "loss": 0.9194057, + "num_input_tokens_seen": 31848060, + "step": 1490, + "time_per_iteration": 2.6631338596343994 + }, + { + "auxiliary_loss_clip": 0.01189423, + "auxiliary_loss_mlp": 0.01067836, + "balance_loss_clip": 1.06075168, + "balance_loss_mlp": 1.04418457, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 2.60496175768383, + "language_loss": 0.73219204, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75476462, + "num_input_tokens_seen": 31870040, + "step": 1491, + "time_per_iteration": 2.547516107559204 + }, + { + "auxiliary_loss_clip": 0.01195131, + "auxiliary_loss_mlp": 0.00791269, + "balance_loss_clip": 1.06387842, + "balance_loss_mlp": 1.00089645, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.6754676760563025, + "language_loss": 0.77071202, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79057604, + "num_input_tokens_seen": 31890400, + "step": 1492, + "time_per_iteration": 3.9952738285064697 + }, + { + "auxiliary_loss_clip": 0.01194148, + "auxiliary_loss_mlp": 0.01063325, + "balance_loss_clip": 1.06464052, + "balance_loss_mlp": 1.03991246, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 3.8974303076666432, + "language_loss": 0.71216983, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73474455, + "num_input_tokens_seen": 31913435, + "step": 1493, + "time_per_iteration": 2.5701191425323486 + }, + { + "auxiliary_loss_clip": 0.0119525, + "auxiliary_loss_mlp": 0.01062613, + "balance_loss_clip": 1.06254196, + "balance_loss_mlp": 1.03930783, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 2.2981145545600534, + "language_loss": 0.87131578, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89389443, + "num_input_tokens_seen": 31932435, + "step": 1494, + "time_per_iteration": 2.5113613605499268 + }, + { + "auxiliary_loss_clip": 0.01093074, + "auxiliary_loss_mlp": 0.01069756, + "balance_loss_clip": 1.05000198, + "balance_loss_mlp": 1.04485285, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 2.0448219120745676, + "language_loss": 0.82937402, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85100234, + "num_input_tokens_seen": 31950125, + "step": 1495, + "time_per_iteration": 2.6507480144500732 + }, + { + "auxiliary_loss_clip": 0.01180686, + "auxiliary_loss_mlp": 0.01061678, + "balance_loss_clip": 1.06331468, + "balance_loss_mlp": 1.03849173, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 1.9574230123680496, + "language_loss": 0.69882119, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72124487, + "num_input_tokens_seen": 31968050, + "step": 1496, + "time_per_iteration": 3.88201642036438 + }, + { + "auxiliary_loss_clip": 0.01174213, + "auxiliary_loss_mlp": 0.0105242, + "balance_loss_clip": 1.06125927, + "balance_loss_mlp": 1.03158188, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.6261436857032667, + "language_loss": 0.80115068, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82341701, + "num_input_tokens_seen": 31985675, + "step": 1497, + "time_per_iteration": 2.5264830589294434 + }, + { + "auxiliary_loss_clip": 0.01134762, + "auxiliary_loss_mlp": 0.01055899, + "balance_loss_clip": 1.05687082, + "balance_loss_mlp": 1.03206885, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 2.0046797052837046, + "language_loss": 0.8266778, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84858441, + "num_input_tokens_seen": 32005180, + "step": 1498, + "time_per_iteration": 2.637603998184204 + }, + { + "auxiliary_loss_clip": 0.01177275, + "auxiliary_loss_mlp": 0.01063349, + "balance_loss_clip": 1.06259143, + "balance_loss_mlp": 1.04121172, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 3.3744039182562355, + "language_loss": 0.78921664, + "learning_rate": 3.962199576140195e-06, + "loss": 0.81162286, + "num_input_tokens_seen": 32022970, + "step": 1499, + "time_per_iteration": 2.502952814102173 + }, + { + "auxiliary_loss_clip": 0.01171209, + "auxiliary_loss_mlp": 0.00792306, + "balance_loss_clip": 1.06068051, + "balance_loss_mlp": 1.00081325, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.6741015666129218, + "language_loss": 0.93469906, + "learning_rate": 3.962124177139981e-06, + "loss": 0.9543342, + "num_input_tokens_seen": 32043055, + "step": 1500, + "time_per_iteration": 2.5477123260498047 + }, + { + "auxiliary_loss_clip": 0.01151627, + "auxiliary_loss_mlp": 0.01057161, + "balance_loss_clip": 1.0564425, + "balance_loss_mlp": 1.03206706, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 2.543729088909604, + "language_loss": 0.74819499, + "learning_rate": 3.962048703735822e-06, + "loss": 0.77028286, + "num_input_tokens_seen": 32061900, + "step": 1501, + "time_per_iteration": 2.5636985301971436 + }, + { + "auxiliary_loss_clip": 0.01072518, + "auxiliary_loss_mlp": 0.01005949, + "balance_loss_clip": 1.04700804, + "balance_loss_mlp": 1.00329065, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7295244629164899, + "language_loss": 0.58299017, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60377479, + "num_input_tokens_seen": 32122745, + "step": 1502, + "time_per_iteration": 3.1247830390930176 + }, + { + "auxiliary_loss_clip": 0.01155565, + "auxiliary_loss_mlp": 0.01061839, + "balance_loss_clip": 1.05224276, + "balance_loss_mlp": 1.03742504, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.302340259353168, + "language_loss": 0.69316626, + "learning_rate": 3.961897533727119e-06, + "loss": 0.71534032, + "num_input_tokens_seen": 32145125, + "step": 1503, + "time_per_iteration": 2.6750710010528564 + }, + { + "auxiliary_loss_clip": 0.01149358, + "auxiliary_loss_mlp": 0.01059971, + "balance_loss_clip": 1.05781949, + "balance_loss_mlp": 1.03697526, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.127301288640993, + "language_loss": 0.86748838, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88958168, + "num_input_tokens_seen": 32166255, + "step": 1504, + "time_per_iteration": 2.629204034805298 + }, + { + "auxiliary_loss_clip": 0.01154708, + "auxiliary_loss_mlp": 0.01065952, + "balance_loss_clip": 1.05739832, + "balance_loss_mlp": 1.03936791, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 2.0783572033431525, + "language_loss": 0.72724807, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74945462, + "num_input_tokens_seen": 32184010, + "step": 1505, + "time_per_iteration": 4.008904457092285 + }, + { + "auxiliary_loss_clip": 0.01146055, + "auxiliary_loss_mlp": 0.0106037, + "balance_loss_clip": 1.05756426, + "balance_loss_mlp": 1.03681421, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.6966224867501407, + "language_loss": 0.80619192, + "learning_rate": 3.961670220756114e-06, + "loss": 0.82825619, + "num_input_tokens_seen": 32201635, + "step": 1506, + "time_per_iteration": 2.5487985610961914 + }, + { + "auxiliary_loss_clip": 0.01149174, + "auxiliary_loss_mlp": 0.01054623, + "balance_loss_clip": 1.05833995, + "balance_loss_mlp": 1.03277159, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 1.9873474935278548, + "language_loss": 0.75975317, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78179109, + "num_input_tokens_seen": 32221940, + "step": 1507, + "time_per_iteration": 2.6516573429107666 + }, + { + "auxiliary_loss_clip": 0.01064362, + "auxiliary_loss_mlp": 0.01002407, + "balance_loss_clip": 1.03076339, + "balance_loss_mlp": 0.99947494, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7288766456602073, + "language_loss": 0.57649207, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59715974, + "num_input_tokens_seen": 32276495, + "step": 1508, + "time_per_iteration": 2.947296619415283 + }, + { + "auxiliary_loss_clip": 0.01174493, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.06137323, + "balance_loss_mlp": 1.03319085, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 1.8810588399709778, + "language_loss": 0.85154212, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87385106, + "num_input_tokens_seen": 32294130, + "step": 1509, + "time_per_iteration": 2.507607936859131 + }, + { + "auxiliary_loss_clip": 0.01173127, + "auxiliary_loss_mlp": 0.01066089, + "balance_loss_clip": 1.06100321, + "balance_loss_mlp": 1.04149675, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.146136897212983, + "language_loss": 0.84621519, + "learning_rate": 3.961366095394002e-06, + "loss": 0.8686074, + "num_input_tokens_seen": 32313555, + "step": 1510, + "time_per_iteration": 2.55082368850708 + }, + { + "auxiliary_loss_clip": 0.01160606, + "auxiliary_loss_mlp": 0.01065759, + "balance_loss_clip": 1.05845213, + "balance_loss_mlp": 1.0421195, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 2.027996909829249, + "language_loss": 0.85267782, + "learning_rate": 3.961289878108262e-06, + "loss": 0.87494147, + "num_input_tokens_seen": 32331430, + "step": 1511, + "time_per_iteration": 2.535367727279663 + }, + { + "auxiliary_loss_clip": 0.01152715, + "auxiliary_loss_mlp": 0.01054635, + "balance_loss_clip": 1.05629396, + "balance_loss_mlp": 1.0316875, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.643150172163249, + "language_loss": 0.84885764, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87093115, + "num_input_tokens_seen": 32353705, + "step": 1512, + "time_per_iteration": 2.622995376586914 + }, + { + "auxiliary_loss_clip": 0.01165133, + "auxiliary_loss_mlp": 0.01057425, + "balance_loss_clip": 1.06511497, + "balance_loss_mlp": 1.03600359, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.312543460676918, + "language_loss": 0.87061381, + "learning_rate": 3.961137220422749e-06, + "loss": 0.89283931, + "num_input_tokens_seen": 32370520, + "step": 1513, + "time_per_iteration": 2.535895586013794 + }, + { + "auxiliary_loss_clip": 0.01173167, + "auxiliary_loss_mlp": 0.01050951, + "balance_loss_clip": 1.05960298, + "balance_loss_mlp": 1.03014874, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.6838315065246408, + "language_loss": 0.86287665, + "learning_rate": 3.961060780028764e-06, + "loss": 0.88511777, + "num_input_tokens_seen": 32389105, + "step": 1514, + "time_per_iteration": 2.5022904872894287 + }, + { + "auxiliary_loss_clip": 0.01136309, + "auxiliary_loss_mlp": 0.01062924, + "balance_loss_clip": 1.05857968, + "balance_loss_mlp": 1.04164529, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.8882989236503043, + "language_loss": 0.90288627, + "learning_rate": 3.960984265271159e-06, + "loss": 0.9248786, + "num_input_tokens_seen": 32408065, + "step": 1515, + "time_per_iteration": 2.6163699626922607 + }, + { + "auxiliary_loss_clip": 0.01167511, + "auxiliary_loss_mlp": 0.01055738, + "balance_loss_clip": 1.05944157, + "balance_loss_mlp": 1.03238463, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 1.8093542242176766, + "language_loss": 0.85447246, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87670493, + "num_input_tokens_seen": 32427225, + "step": 1516, + "time_per_iteration": 2.5906484127044678 + }, + { + "auxiliary_loss_clip": 0.01164322, + "auxiliary_loss_mlp": 0.01055234, + "balance_loss_clip": 1.05523896, + "balance_loss_mlp": 1.03232193, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 2.0246454015512176, + "language_loss": 0.80659246, + "learning_rate": 3.960831012676692e-06, + "loss": 0.82878798, + "num_input_tokens_seen": 32450510, + "step": 1517, + "time_per_iteration": 2.652411699295044 + }, + { + "auxiliary_loss_clip": 0.01180613, + "auxiliary_loss_mlp": 0.01072531, + "balance_loss_clip": 1.05974257, + "balance_loss_mlp": 1.04952371, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.6862336941853298, + "language_loss": 0.77680063, + "learning_rate": 3.960754274845642e-06, + "loss": 0.79933214, + "num_input_tokens_seen": 32468425, + "step": 1518, + "time_per_iteration": 2.508291244506836 + }, + { + "auxiliary_loss_clip": 0.01170603, + "auxiliary_loss_mlp": 0.01062134, + "balance_loss_clip": 1.05537868, + "balance_loss_mlp": 1.03959155, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 2.023461461305392, + "language_loss": 0.8614006, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88372791, + "num_input_tokens_seen": 32487510, + "step": 1519, + "time_per_iteration": 2.499220848083496 + }, + { + "auxiliary_loss_clip": 0.01167419, + "auxiliary_loss_mlp": 0.01052686, + "balance_loss_clip": 1.05750048, + "balance_loss_mlp": 1.02908301, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.100414027720377, + "language_loss": 0.72912121, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75132227, + "num_input_tokens_seen": 32507250, + "step": 1520, + "time_per_iteration": 2.5521275997161865 + }, + { + "auxiliary_loss_clip": 0.01164645, + "auxiliary_loss_mlp": 0.01058713, + "balance_loss_clip": 1.05971038, + "balance_loss_mlp": 1.03557432, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 5.16923192252221, + "language_loss": 0.85523486, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87746835, + "num_input_tokens_seen": 32526045, + "step": 1521, + "time_per_iteration": 2.4927217960357666 + }, + { + "auxiliary_loss_clip": 0.01123275, + "auxiliary_loss_mlp": 0.01064901, + "balance_loss_clip": 1.05256855, + "balance_loss_mlp": 1.04116607, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 1.9075040228774625, + "language_loss": 0.84104115, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86292291, + "num_input_tokens_seen": 32546575, + "step": 1522, + "time_per_iteration": 2.6292107105255127 + }, + { + "auxiliary_loss_clip": 0.01185183, + "auxiliary_loss_mlp": 0.01064172, + "balance_loss_clip": 1.05904007, + "balance_loss_mlp": 1.04131973, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 1.8155204798911373, + "language_loss": 0.80912548, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83161902, + "num_input_tokens_seen": 32568795, + "step": 1523, + "time_per_iteration": 2.48669171333313 + }, + { + "auxiliary_loss_clip": 0.01162046, + "auxiliary_loss_mlp": 0.00791947, + "balance_loss_clip": 1.05590224, + "balance_loss_mlp": 1.00076687, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 3.0345704620228138, + "language_loss": 0.74641156, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76595151, + "num_input_tokens_seen": 32587010, + "step": 1524, + "time_per_iteration": 2.474013328552246 + }, + { + "auxiliary_loss_clip": 0.01141177, + "auxiliary_loss_mlp": 0.01060159, + "balance_loss_clip": 1.05446589, + "balance_loss_mlp": 1.03746128, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.432498646262709, + "language_loss": 0.86015624, + "learning_rate": 3.960215028335644e-06, + "loss": 0.8821696, + "num_input_tokens_seen": 32602375, + "step": 1525, + "time_per_iteration": 2.512186288833618 + }, + { + "auxiliary_loss_clip": 0.0116728, + "auxiliary_loss_mlp": 0.0104904, + "balance_loss_clip": 1.0594418, + "balance_loss_mlp": 1.02610433, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.0839167282728623, + "language_loss": 0.75186729, + "learning_rate": 3.96013769577032e-06, + "loss": 0.77403057, + "num_input_tokens_seen": 32621460, + "step": 1526, + "time_per_iteration": 2.5689423084259033 + }, + { + "auxiliary_loss_clip": 0.01190683, + "auxiliary_loss_mlp": 0.010592, + "balance_loss_clip": 1.0613842, + "balance_loss_mlp": 1.03695524, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 1.869546156314806, + "language_loss": 0.77592027, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79841912, + "num_input_tokens_seen": 32640440, + "step": 1527, + "time_per_iteration": 2.4401628971099854 + }, + { + "auxiliary_loss_clip": 0.01177761, + "auxiliary_loss_mlp": 0.01057726, + "balance_loss_clip": 1.05669737, + "balance_loss_mlp": 1.03333592, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 1.7707515281660513, + "language_loss": 0.78534031, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80769515, + "num_input_tokens_seen": 32660020, + "step": 1528, + "time_per_iteration": 4.027719497680664 + }, + { + "auxiliary_loss_clip": 0.01149028, + "auxiliary_loss_mlp": 0.01050558, + "balance_loss_clip": 1.05194736, + "balance_loss_mlp": 1.02827752, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 3.022223920458515, + "language_loss": 0.76742411, + "learning_rate": 3.959905252114384e-06, + "loss": 0.78942001, + "num_input_tokens_seen": 32678170, + "step": 1529, + "time_per_iteration": 2.5223143100738525 + }, + { + "auxiliary_loss_clip": 0.01190812, + "auxiliary_loss_mlp": 0.00793126, + "balance_loss_clip": 1.05733275, + "balance_loss_mlp": 1.00076938, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 1.8435450098519637, + "language_loss": 0.82627356, + "learning_rate": 3.959827622252211e-06, + "loss": 0.84611297, + "num_input_tokens_seen": 32697540, + "step": 1530, + "time_per_iteration": 2.4833621978759766 + }, + { + "auxiliary_loss_clip": 0.01131168, + "auxiliary_loss_mlp": 0.01065559, + "balance_loss_clip": 1.04948199, + "balance_loss_mlp": 1.04243279, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.2310217080771997, + "language_loss": 0.83754468, + "learning_rate": 3.959749918073179e-06, + "loss": 0.85951197, + "num_input_tokens_seen": 32716805, + "step": 1531, + "time_per_iteration": 3.988473415374756 + }, + { + "auxiliary_loss_clip": 0.01139317, + "auxiliary_loss_mlp": 0.01058043, + "balance_loss_clip": 1.04984796, + "balance_loss_mlp": 1.03446293, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 1.972702987498461, + "language_loss": 0.81436646, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83634007, + "num_input_tokens_seen": 32736385, + "step": 1532, + "time_per_iteration": 2.543225049972534 + }, + { + "auxiliary_loss_clip": 0.01164259, + "auxiliary_loss_mlp": 0.01058713, + "balance_loss_clip": 1.05631852, + "balance_loss_mlp": 1.03474045, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 2.0810743715066633, + "language_loss": 0.83906174, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.86129147, + "num_input_tokens_seen": 32757140, + "step": 1533, + "time_per_iteration": 2.5897858142852783 + }, + { + "auxiliary_loss_clip": 0.01150323, + "auxiliary_loss_mlp": 0.01057501, + "balance_loss_clip": 1.05838251, + "balance_loss_mlp": 1.03450549, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 2.227268641274193, + "language_loss": 0.90218914, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92426741, + "num_input_tokens_seen": 32774860, + "step": 1534, + "time_per_iteration": 4.004406929016113 + }, + { + "auxiliary_loss_clip": 0.01158234, + "auxiliary_loss_mlp": 0.01066087, + "balance_loss_clip": 1.05470312, + "balance_loss_mlp": 1.04071939, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.230184597568831, + "language_loss": 0.75954115, + "learning_rate": 3.959438358247424e-06, + "loss": 0.7817843, + "num_input_tokens_seen": 32795250, + "step": 1535, + "time_per_iteration": 2.583728075027466 + }, + { + "auxiliary_loss_clip": 0.01174768, + "auxiliary_loss_mlp": 0.01050408, + "balance_loss_clip": 1.05650246, + "balance_loss_mlp": 1.02889132, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.838036618505859, + "language_loss": 0.81619573, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83844745, + "num_input_tokens_seen": 32813805, + "step": 1536, + "time_per_iteration": 2.4720311164855957 + }, + { + "auxiliary_loss_clip": 0.01186008, + "auxiliary_loss_mlp": 0.01056923, + "balance_loss_clip": 1.05837035, + "balance_loss_mlp": 1.03510725, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.007605867459832, + "language_loss": 0.89177084, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91420019, + "num_input_tokens_seen": 32830960, + "step": 1537, + "time_per_iteration": 2.4258673191070557 + }, + { + "auxiliary_loss_clip": 0.0116125, + "auxiliary_loss_mlp": 0.01063465, + "balance_loss_clip": 1.05370736, + "balance_loss_mlp": 1.04024267, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 1.9974025477132875, + "language_loss": 0.80817264, + "learning_rate": 3.959203908195741e-06, + "loss": 0.83041978, + "num_input_tokens_seen": 32848275, + "step": 1538, + "time_per_iteration": 2.5855307579040527 + }, + { + "auxiliary_loss_clip": 0.01063385, + "auxiliary_loss_mlp": 0.01018711, + "balance_loss_clip": 1.03427696, + "balance_loss_mlp": 1.01586175, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.7350319442310973, + "language_loss": 0.57430905, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59513003, + "num_input_tokens_seen": 32917730, + "step": 1539, + "time_per_iteration": 3.2273812294006348 + }, + { + "auxiliary_loss_clip": 0.01160067, + "auxiliary_loss_mlp": 0.01050192, + "balance_loss_clip": 1.05679357, + "balance_loss_mlp": 1.02731621, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.5727460912729425, + "language_loss": 0.67815429, + "learning_rate": 3.959047236690304e-06, + "loss": 0.70025688, + "num_input_tokens_seen": 32934910, + "step": 1540, + "time_per_iteration": 2.692659854888916 + }, + { + "auxiliary_loss_clip": 0.01149868, + "auxiliary_loss_mlp": 0.01050919, + "balance_loss_clip": 1.05423903, + "balance_loss_mlp": 1.02810264, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.6687570707941806, + "language_loss": 0.83820194, + "learning_rate": 3.958968789505198e-06, + "loss": 0.86020982, + "num_input_tokens_seen": 32953840, + "step": 1541, + "time_per_iteration": 2.5386457443237305 + }, + { + "auxiliary_loss_clip": 0.01093262, + "auxiliary_loss_mlp": 0.01011201, + "balance_loss_clip": 1.04028821, + "balance_loss_mlp": 1.00844717, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8923119178392174, + "language_loss": 0.61938637, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64043099, + "num_input_tokens_seen": 33011410, + "step": 1542, + "time_per_iteration": 3.022493600845337 + }, + { + "auxiliary_loss_clip": 0.01164935, + "auxiliary_loss_mlp": 0.01055386, + "balance_loss_clip": 1.05843258, + "balance_loss_mlp": 1.03358221, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.5380171592123817, + "language_loss": 0.82730341, + "learning_rate": 3.958811672285086e-06, + "loss": 0.84950662, + "num_input_tokens_seen": 33031675, + "step": 1543, + "time_per_iteration": 2.5246481895446777 + }, + { + "auxiliary_loss_clip": 0.01139534, + "auxiliary_loss_mlp": 0.01067165, + "balance_loss_clip": 1.05075765, + "balance_loss_mlp": 1.04444396, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.6732379533666888, + "language_loss": 0.71992052, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74198753, + "num_input_tokens_seen": 33056355, + "step": 1544, + "time_per_iteration": 4.281844615936279 + }, + { + "auxiliary_loss_clip": 0.01168843, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_clip": 1.05387926, + "balance_loss_mlp": 1.02862144, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.678465301199152, + "language_loss": 0.77539343, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79761821, + "num_input_tokens_seen": 33079520, + "step": 1545, + "time_per_iteration": 2.571488618850708 + }, + { + "auxiliary_loss_clip": 0.01144364, + "auxiliary_loss_mlp": 0.01052936, + "balance_loss_clip": 1.05672765, + "balance_loss_mlp": 1.03007185, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 3.336297747914589, + "language_loss": 0.74921429, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.77118731, + "num_input_tokens_seen": 33096135, + "step": 1546, + "time_per_iteration": 2.5111212730407715 + }, + { + "auxiliary_loss_clip": 0.01167312, + "auxiliary_loss_mlp": 0.01055148, + "balance_loss_clip": 1.05585265, + "balance_loss_mlp": 1.03110349, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 2.38034597200775, + "language_loss": 0.84244537, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86466992, + "num_input_tokens_seen": 33115245, + "step": 1547, + "time_per_iteration": 2.5032622814178467 + }, + { + "auxiliary_loss_clip": 0.01147615, + "auxiliary_loss_mlp": 0.01053904, + "balance_loss_clip": 1.05087423, + "balance_loss_mlp": 1.03123021, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 1.8520723887781867, + "language_loss": 0.67115939, + "learning_rate": 3.958417579416199e-06, + "loss": 0.6931746, + "num_input_tokens_seen": 33136640, + "step": 1548, + "time_per_iteration": 2.7064614295959473 + }, + { + "auxiliary_loss_clip": 0.01126483, + "auxiliary_loss_mlp": 0.01060167, + "balance_loss_clip": 1.04851925, + "balance_loss_mlp": 1.03771996, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 1.8088962824417802, + "language_loss": 0.83601594, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.8578825, + "num_input_tokens_seen": 33155060, + "step": 1549, + "time_per_iteration": 2.73105525970459 + }, + { + "auxiliary_loss_clip": 0.01181228, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_clip": 1.06314993, + "balance_loss_mlp": 1.02571917, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.5965788569986605, + "language_loss": 0.75563467, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77792382, + "num_input_tokens_seen": 33175420, + "step": 1550, + "time_per_iteration": 2.6842777729034424 + }, + { + "auxiliary_loss_clip": 0.01154562, + "auxiliary_loss_mlp": 0.0107418, + "balance_loss_clip": 1.05622411, + "balance_loss_mlp": 1.04930091, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.3493347221132788, + "language_loss": 0.83552182, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85780919, + "num_input_tokens_seen": 33194120, + "step": 1551, + "time_per_iteration": 2.7099874019622803 + }, + { + "auxiliary_loss_clip": 0.01075734, + "auxiliary_loss_mlp": 0.00766389, + "balance_loss_clip": 1.03210068, + "balance_loss_mlp": 1.00013769, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7449183070160537, + "language_loss": 0.61834413, + "learning_rate": 3.958100968362163e-06, + "loss": 0.63676536, + "num_input_tokens_seen": 33261080, + "step": 1552, + "time_per_iteration": 3.2462594509124756 + }, + { + "auxiliary_loss_clip": 0.01070657, + "auxiliary_loss_mlp": 0.01004956, + "balance_loss_clip": 1.03673935, + "balance_loss_mlp": 1.00109327, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8368492667506437, + "language_loss": 0.58971882, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61047494, + "num_input_tokens_seen": 33330235, + "step": 1553, + "time_per_iteration": 3.2628090381622314 + }, + { + "auxiliary_loss_clip": 0.01147933, + "auxiliary_loss_mlp": 0.01060253, + "balance_loss_clip": 1.05166829, + "balance_loss_mlp": 1.03722179, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 2.5641375542280027, + "language_loss": 0.87677389, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89885575, + "num_input_tokens_seen": 33349035, + "step": 1554, + "time_per_iteration": 2.677097797393799 + }, + { + "auxiliary_loss_clip": 0.01154812, + "auxiliary_loss_mlp": 0.01055901, + "balance_loss_clip": 1.05565763, + "balance_loss_mlp": 1.03284621, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 2.6326689384923005, + "language_loss": 0.81504542, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83715254, + "num_input_tokens_seen": 33368060, + "step": 1555, + "time_per_iteration": 2.6704957485198975 + }, + { + "auxiliary_loss_clip": 0.01066278, + "auxiliary_loss_mlp": 0.01003165, + "balance_loss_clip": 1.0290277, + "balance_loss_mlp": 1.00025642, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8773386819893125, + "language_loss": 0.59670997, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61740446, + "num_input_tokens_seen": 33430825, + "step": 1556, + "time_per_iteration": 3.1198155879974365 + }, + { + "auxiliary_loss_clip": 0.01174925, + "auxiliary_loss_mlp": 0.01062222, + "balance_loss_clip": 1.05662477, + "balance_loss_mlp": 1.04000115, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.610907622455541, + "language_loss": 0.84447134, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86684287, + "num_input_tokens_seen": 33454855, + "step": 1557, + "time_per_iteration": 2.645479440689087 + }, + { + "auxiliary_loss_clip": 0.01116845, + "auxiliary_loss_mlp": 0.0106975, + "balance_loss_clip": 1.0537982, + "balance_loss_mlp": 1.04558575, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.6515773580343684, + "language_loss": 0.78140432, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80327022, + "num_input_tokens_seen": 33476000, + "step": 1558, + "time_per_iteration": 2.690361261367798 + }, + { + "auxiliary_loss_clip": 0.01160089, + "auxiliary_loss_mlp": 0.0105764, + "balance_loss_clip": 1.05481517, + "balance_loss_mlp": 1.03533614, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 2.014208780221313, + "language_loss": 0.80347133, + "learning_rate": 3.957544040455379e-06, + "loss": 0.82564867, + "num_input_tokens_seen": 33493845, + "step": 1559, + "time_per_iteration": 2.5224459171295166 + }, + { + "auxiliary_loss_clip": 0.01143735, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_clip": 1.0540303, + "balance_loss_mlp": 1.04560351, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 2.2205618371026787, + "language_loss": 0.76457441, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78668231, + "num_input_tokens_seen": 33510850, + "step": 1560, + "time_per_iteration": 2.5376551151275635 + }, + { + "auxiliary_loss_clip": 0.01139054, + "auxiliary_loss_mlp": 0.01069846, + "balance_loss_clip": 1.05124843, + "balance_loss_mlp": 1.04750586, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.8126012061260448, + "language_loss": 0.80665231, + "learning_rate": 3.95738425007858e-06, + "loss": 0.82874125, + "num_input_tokens_seen": 33530430, + "step": 1561, + "time_per_iteration": 2.6096343994140625 + }, + { + "auxiliary_loss_clip": 0.01171429, + "auxiliary_loss_mlp": 0.01051983, + "balance_loss_clip": 1.05358434, + "balance_loss_mlp": 1.03012037, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.5615982844768523, + "language_loss": 0.61340475, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63563883, + "num_input_tokens_seen": 33551975, + "step": 1562, + "time_per_iteration": 2.5777204036712646 + }, + { + "auxiliary_loss_clip": 0.01162986, + "auxiliary_loss_mlp": 0.01063768, + "balance_loss_clip": 1.0583961, + "balance_loss_mlp": 1.04271543, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.3159779529922355, + "language_loss": 0.85293931, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87520689, + "num_input_tokens_seen": 33569850, + "step": 1563, + "time_per_iteration": 2.4840667247772217 + }, + { + "auxiliary_loss_clip": 0.01166875, + "auxiliary_loss_mlp": 0.01051877, + "balance_loss_clip": 1.06158781, + "balance_loss_mlp": 1.03083634, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.723602365200732, + "language_loss": 0.76089466, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78308219, + "num_input_tokens_seen": 33590510, + "step": 1564, + "time_per_iteration": 2.525744676589966 + }, + { + "auxiliary_loss_clip": 0.01153263, + "auxiliary_loss_mlp": 0.01060913, + "balance_loss_clip": 1.05313063, + "balance_loss_mlp": 1.03907371, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 1.9248033115895935, + "language_loss": 0.8010028, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.82314456, + "num_input_tokens_seen": 33608810, + "step": 1565, + "time_per_iteration": 2.517662286758423 + }, + { + "auxiliary_loss_clip": 0.01156001, + "auxiliary_loss_mlp": 0.01065942, + "balance_loss_clip": 1.05337048, + "balance_loss_mlp": 1.04446018, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 2.0027036561512808, + "language_loss": 0.75426602, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77648544, + "num_input_tokens_seen": 33627265, + "step": 1566, + "time_per_iteration": 2.498882532119751 + }, + { + "auxiliary_loss_clip": 0.01145739, + "auxiliary_loss_mlp": 0.00791087, + "balance_loss_clip": 1.0537672, + "balance_loss_mlp": 1.00061846, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 1.9162809977306445, + "language_loss": 0.78081524, + "learning_rate": 3.956903097664407e-06, + "loss": 0.80018359, + "num_input_tokens_seen": 33644810, + "step": 1567, + "time_per_iteration": 4.040203094482422 + }, + { + "auxiliary_loss_clip": 0.01157319, + "auxiliary_loss_mlp": 0.01053271, + "balance_loss_clip": 1.05423057, + "balance_loss_mlp": 1.03280234, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 1.6797906091082415, + "language_loss": 0.82782412, + "learning_rate": 3.956822645856749e-06, + "loss": 0.84992999, + "num_input_tokens_seen": 33665665, + "step": 1568, + "time_per_iteration": 2.545086622238159 + }, + { + "auxiliary_loss_clip": 0.01186709, + "auxiliary_loss_mlp": 0.01053764, + "balance_loss_clip": 1.05818725, + "balance_loss_mlp": 1.03147125, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 1.811068449316987, + "language_loss": 0.76564991, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78805465, + "num_input_tokens_seen": 33684760, + "step": 1569, + "time_per_iteration": 2.4414665699005127 + }, + { + "auxiliary_loss_clip": 0.01117311, + "auxiliary_loss_mlp": 0.01058066, + "balance_loss_clip": 1.04607022, + "balance_loss_mlp": 1.03386641, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.553642742440088, + "language_loss": 0.85949272, + "learning_rate": 3.956661519635756e-06, + "loss": 0.88124645, + "num_input_tokens_seen": 33700750, + "step": 1570, + "time_per_iteration": 3.926435708999634 + }, + { + "auxiliary_loss_clip": 0.01124471, + "auxiliary_loss_mlp": 0.01050622, + "balance_loss_clip": 1.05169415, + "balance_loss_mlp": 1.02740002, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.6781488569606398, + "language_loss": 0.76192272, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78367364, + "num_input_tokens_seen": 33724430, + "step": 1571, + "time_per_iteration": 2.6570467948913574 + }, + { + "auxiliary_loss_clip": 0.01140299, + "auxiliary_loss_mlp": 0.01053618, + "balance_loss_clip": 1.05349779, + "balance_loss_mlp": 1.0321126, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.6107560473279343, + "language_loss": 0.79300523, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81494439, + "num_input_tokens_seen": 33743455, + "step": 1572, + "time_per_iteration": 2.603290319442749 + }, + { + "auxiliary_loss_clip": 0.01149411, + "auxiliary_loss_mlp": 0.01063333, + "balance_loss_clip": 1.05774832, + "balance_loss_mlp": 1.03965771, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.845596307753666, + "language_loss": 0.87758064, + "learning_rate": 3.956419273835913e-06, + "loss": 0.89970803, + "num_input_tokens_seen": 33763435, + "step": 1573, + "time_per_iteration": 4.065506219863892 + }, + { + "auxiliary_loss_clip": 0.01157506, + "auxiliary_loss_mlp": 0.01062657, + "balance_loss_clip": 1.0543654, + "balance_loss_mlp": 1.03815949, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 1.8958624934267565, + "language_loss": 0.8138324, + "learning_rate": 3.95633837685665e-06, + "loss": 0.836034, + "num_input_tokens_seen": 33784325, + "step": 1574, + "time_per_iteration": 2.5620226860046387 + }, + { + "auxiliary_loss_clip": 0.01155564, + "auxiliary_loss_mlp": 0.01050124, + "balance_loss_clip": 1.05379033, + "balance_loss_mlp": 1.02967978, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 2.020582177531197, + "language_loss": 0.80942029, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83147717, + "num_input_tokens_seen": 33802510, + "step": 1575, + "time_per_iteration": 2.5229716300964355 + }, + { + "auxiliary_loss_clip": 0.01181576, + "auxiliary_loss_mlp": 0.01065737, + "balance_loss_clip": 1.05725086, + "balance_loss_mlp": 1.04381442, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 2.2519886215679477, + "language_loss": 0.86866045, + "learning_rate": 3.956176360347553e-06, + "loss": 0.89113355, + "num_input_tokens_seen": 33819980, + "step": 1576, + "time_per_iteration": 2.4851019382476807 + }, + { + "auxiliary_loss_clip": 0.01062112, + "auxiliary_loss_mlp": 0.01003088, + "balance_loss_clip": 1.02676868, + "balance_loss_mlp": 1.00017917, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9824651459594869, + "language_loss": 0.65772516, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67837715, + "num_input_tokens_seen": 33878925, + "step": 1577, + "time_per_iteration": 3.0789618492126465 + }, + { + "auxiliary_loss_clip": 0.01146057, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_clip": 1.05167437, + "balance_loss_mlp": 1.02854943, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 2.2890915309834394, + "language_loss": 0.79377615, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81573355, + "num_input_tokens_seen": 33897600, + "step": 1578, + "time_per_iteration": 2.5738210678100586 + }, + { + "auxiliary_loss_clip": 0.01179882, + "auxiliary_loss_mlp": 0.01060457, + "balance_loss_clip": 1.05503333, + "balance_loss_mlp": 1.03888035, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.6668022653782018, + "language_loss": 0.78247821, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80488157, + "num_input_tokens_seen": 33917365, + "step": 1579, + "time_per_iteration": 2.5372278690338135 + }, + { + "auxiliary_loss_clip": 0.01130099, + "auxiliary_loss_mlp": 0.0106504, + "balance_loss_clip": 1.04821754, + "balance_loss_mlp": 1.03938651, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 1.8665271762516817, + "language_loss": 0.73206317, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75401461, + "num_input_tokens_seen": 33936680, + "step": 1580, + "time_per_iteration": 2.6054985523223877 + }, + { + "auxiliary_loss_clip": 0.01154688, + "auxiliary_loss_mlp": 0.01053463, + "balance_loss_clip": 1.05548716, + "balance_loss_mlp": 1.03214884, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 1.8066684288696029, + "language_loss": 0.77299428, + "learning_rate": 3.955770021006627e-06, + "loss": 0.79507583, + "num_input_tokens_seen": 33960685, + "step": 1581, + "time_per_iteration": 2.6722211837768555 + }, + { + "auxiliary_loss_clip": 0.01145183, + "auxiliary_loss_mlp": 0.01058678, + "balance_loss_clip": 1.05566859, + "balance_loss_mlp": 1.03807831, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 2.5748874627263825, + "language_loss": 0.87237191, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89441049, + "num_input_tokens_seen": 33980015, + "step": 1582, + "time_per_iteration": 2.561161994934082 + }, + { + "auxiliary_loss_clip": 0.01172247, + "auxiliary_loss_mlp": 0.01059282, + "balance_loss_clip": 1.05483866, + "balance_loss_mlp": 1.03621507, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 2.6515190297396125, + "language_loss": 0.66803294, + "learning_rate": 3.955606966107699e-06, + "loss": 0.69034821, + "num_input_tokens_seen": 33997705, + "step": 1583, + "time_per_iteration": 3.8958373069763184 + }, + { + "auxiliary_loss_clip": 0.01175935, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.06086278, + "balance_loss_mlp": 1.02719975, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.8374531940463028, + "language_loss": 0.70828837, + "learning_rate": 3.95552532742147e-06, + "loss": 0.73054349, + "num_input_tokens_seen": 34017465, + "step": 1584, + "time_per_iteration": 2.603602647781372 + }, + { + "auxiliary_loss_clip": 0.01138074, + "auxiliary_loss_mlp": 0.01051429, + "balance_loss_clip": 1.05427539, + "balance_loss_mlp": 1.03137803, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.5541186872121535, + "language_loss": 0.80700874, + "learning_rate": 3.955443614581525e-06, + "loss": 0.82890373, + "num_input_tokens_seen": 34038550, + "step": 1585, + "time_per_iteration": 2.6725151538848877 + }, + { + "auxiliary_loss_clip": 0.01158281, + "auxiliary_loss_mlp": 0.0105799, + "balance_loss_clip": 1.05465961, + "balance_loss_mlp": 1.03524542, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.7946699703258409, + "language_loss": 0.71772158, + "learning_rate": 3.955361827590961e-06, + "loss": 0.73988426, + "num_input_tokens_seen": 34058665, + "step": 1586, + "time_per_iteration": 2.5921781063079834 + }, + { + "auxiliary_loss_clip": 0.0104148, + "auxiliary_loss_mlp": 0.01015463, + "balance_loss_clip": 1.02561498, + "balance_loss_mlp": 1.01226854, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8217054312831975, + "language_loss": 0.55420333, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57477272, + "num_input_tokens_seen": 34109655, + "step": 1587, + "time_per_iteration": 2.9907262325286865 + }, + { + "auxiliary_loss_clip": 0.01132781, + "auxiliary_loss_mlp": 0.01059561, + "balance_loss_clip": 1.05048096, + "balance_loss_mlp": 1.0370419, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.7151279503503607, + "language_loss": 0.81507254, + "learning_rate": 3.955198031170391e-06, + "loss": 0.8369959, + "num_input_tokens_seen": 34131115, + "step": 1588, + "time_per_iteration": 2.6332335472106934 + }, + { + "auxiliary_loss_clip": 0.01133024, + "auxiliary_loss_mlp": 0.01056704, + "balance_loss_clip": 1.04970169, + "balance_loss_mlp": 1.03497171, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 1.4483719651195655, + "language_loss": 0.81644881, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83834606, + "num_input_tokens_seen": 34151925, + "step": 1589, + "time_per_iteration": 2.6319456100463867 + }, + { + "auxiliary_loss_clip": 0.01125227, + "auxiliary_loss_mlp": 0.00790088, + "balance_loss_clip": 1.05571377, + "balance_loss_mlp": 1.00042248, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 1.5578465068098453, + "language_loss": 0.64810532, + "learning_rate": 3.955033938184601e-06, + "loss": 0.66725844, + "num_input_tokens_seen": 34175395, + "step": 1590, + "time_per_iteration": 2.802513599395752 + }, + { + "auxiliary_loss_clip": 0.0114683, + "auxiliary_loss_mlp": 0.01056494, + "balance_loss_clip": 1.05494964, + "balance_loss_mlp": 1.03508401, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.7587805480952114, + "language_loss": 0.83304977, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85508299, + "num_input_tokens_seen": 34197760, + "step": 1591, + "time_per_iteration": 2.7032668590545654 + }, + { + "auxiliary_loss_clip": 0.01162141, + "auxiliary_loss_mlp": 0.01061088, + "balance_loss_clip": 1.05458307, + "balance_loss_mlp": 1.03924894, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.4615183474267748, + "language_loss": 0.73958135, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76181352, + "num_input_tokens_seen": 34215330, + "step": 1592, + "time_per_iteration": 2.4945335388183594 + }, + { + "auxiliary_loss_clip": 0.01170255, + "auxiliary_loss_mlp": 0.01054284, + "balance_loss_clip": 1.05395377, + "balance_loss_mlp": 1.03338683, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.8140746689779716, + "language_loss": 0.74236357, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76460898, + "num_input_tokens_seen": 34237745, + "step": 1593, + "time_per_iteration": 2.5711050033569336 + }, + { + "auxiliary_loss_clip": 0.0117636, + "auxiliary_loss_mlp": 0.01060253, + "balance_loss_clip": 1.06005764, + "balance_loss_mlp": 1.03914142, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.9226243537285777, + "language_loss": 0.69967383, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72203994, + "num_input_tokens_seen": 34256565, + "step": 1594, + "time_per_iteration": 2.543513536453247 + }, + { + "auxiliary_loss_clip": 0.01171795, + "auxiliary_loss_mlp": 0.0105171, + "balance_loss_clip": 1.0574615, + "balance_loss_mlp": 1.03150403, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.615500947700834, + "language_loss": 0.82860553, + "learning_rate": 3.954622408410747e-06, + "loss": 0.85084057, + "num_input_tokens_seen": 34275970, + "step": 1595, + "time_per_iteration": 2.502314329147339 + }, + { + "auxiliary_loss_clip": 0.01154701, + "auxiliary_loss_mlp": 0.01053144, + "balance_loss_clip": 1.05324662, + "balance_loss_mlp": 1.03056574, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 2.087397241660806, + "language_loss": 0.84946036, + "learning_rate": 3.954539880085045e-06, + "loss": 0.87153888, + "num_input_tokens_seen": 34295490, + "step": 1596, + "time_per_iteration": 2.5228888988494873 + }, + { + "auxiliary_loss_clip": 0.0117116, + "auxiliary_loss_mlp": 0.01059995, + "balance_loss_clip": 1.05956197, + "balance_loss_mlp": 1.0377984, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 1.7311957853656985, + "language_loss": 0.69261676, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71492833, + "num_input_tokens_seen": 34319990, + "step": 1597, + "time_per_iteration": 2.651576519012451 + }, + { + "auxiliary_loss_clip": 0.01171067, + "auxiliary_loss_mlp": 0.00789379, + "balance_loss_clip": 1.05501509, + "balance_loss_mlp": 1.00048077, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.1068607379899316, + "language_loss": 0.74869502, + "learning_rate": 3.954374601087729e-06, + "loss": 0.76829946, + "num_input_tokens_seen": 34339225, + "step": 1598, + "time_per_iteration": 2.5016512870788574 + }, + { + "auxiliary_loss_clip": 0.01177049, + "auxiliary_loss_mlp": 0.01056688, + "balance_loss_clip": 1.06161857, + "balance_loss_mlp": 1.0334543, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.7816984255454562, + "language_loss": 0.69098473, + "learning_rate": 3.954291850422382e-06, + "loss": 0.7133221, + "num_input_tokens_seen": 34361020, + "step": 1599, + "time_per_iteration": 2.5947725772857666 + }, + { + "auxiliary_loss_clip": 0.01153374, + "auxiliary_loss_mlp": 0.01053662, + "balance_loss_clip": 1.05785561, + "balance_loss_mlp": 1.03270519, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.3011375527716127, + "language_loss": 0.84006119, + "learning_rate": 3.954209025650093e-06, + "loss": 0.86213154, + "num_input_tokens_seen": 34378630, + "step": 1600, + "time_per_iteration": 2.546644926071167 + }, + { + "auxiliary_loss_clip": 0.01154079, + "auxiliary_loss_mlp": 0.01061189, + "balance_loss_clip": 1.05673552, + "balance_loss_mlp": 1.03994584, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 3.5630800040465354, + "language_loss": 0.80386758, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82602024, + "num_input_tokens_seen": 34397110, + "step": 1601, + "time_per_iteration": 2.555633068084717 + }, + { + "auxiliary_loss_clip": 0.01180418, + "auxiliary_loss_mlp": 0.01059641, + "balance_loss_clip": 1.05976677, + "balance_loss_mlp": 1.03813553, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.0957812894236056, + "language_loss": 0.82404244, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84644306, + "num_input_tokens_seen": 34414165, + "step": 1602, + "time_per_iteration": 2.488130807876587 + }, + { + "auxiliary_loss_clip": 0.01138748, + "auxiliary_loss_mlp": 0.01056086, + "balance_loss_clip": 1.05498934, + "balance_loss_mlp": 1.03298354, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 2.024729785526405, + "language_loss": 0.62427366, + "learning_rate": 3.953960106722989e-06, + "loss": 0.646222, + "num_input_tokens_seen": 34434445, + "step": 1603, + "time_per_iteration": 2.5865659713745117 + }, + { + "auxiliary_loss_clip": 0.01192301, + "auxiliary_loss_mlp": 0.01054053, + "balance_loss_clip": 1.06404531, + "balance_loss_mlp": 1.03070021, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.803145413669395, + "language_loss": 0.70914143, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73160493, + "num_input_tokens_seen": 34453095, + "step": 1604, + "time_per_iteration": 2.500105381011963 + }, + { + "auxiliary_loss_clip": 0.01172347, + "auxiliary_loss_mlp": 0.01055494, + "balance_loss_clip": 1.05684614, + "balance_loss_mlp": 1.03460848, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.052260879204556, + "language_loss": 0.80037278, + "learning_rate": 3.953793790294527e-06, + "loss": 0.82265127, + "num_input_tokens_seen": 34473680, + "step": 1605, + "time_per_iteration": 2.5506277084350586 + }, + { + "auxiliary_loss_clip": 0.01160308, + "auxiliary_loss_mlp": 0.01048593, + "balance_loss_clip": 1.05722558, + "balance_loss_mlp": 1.02729011, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 1.9479130103932938, + "language_loss": 0.74469459, + "learning_rate": 3.953710520946634e-06, + "loss": 0.7667836, + "num_input_tokens_seen": 34492610, + "step": 1606, + "time_per_iteration": 2.590967893600464 + }, + { + "auxiliary_loss_clip": 0.01170801, + "auxiliary_loss_mlp": 0.01050321, + "balance_loss_clip": 1.05982244, + "balance_loss_mlp": 1.0290184, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.7043043581497508, + "language_loss": 0.75546157, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77767277, + "num_input_tokens_seen": 34511855, + "step": 1607, + "time_per_iteration": 4.0212013721466064 + }, + { + "auxiliary_loss_clip": 0.01140258, + "auxiliary_loss_mlp": 0.01048438, + "balance_loss_clip": 1.05070972, + "balance_loss_mlp": 1.02770782, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 2.283111050827459, + "language_loss": 0.8672145, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88910139, + "num_input_tokens_seen": 34528905, + "step": 1608, + "time_per_iteration": 2.5288562774658203 + }, + { + "auxiliary_loss_clip": 0.01124362, + "auxiliary_loss_mlp": 0.01061066, + "balance_loss_clip": 1.05402339, + "balance_loss_mlp": 1.03790379, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 1.9495030665714963, + "language_loss": 0.71472728, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73658156, + "num_input_tokens_seen": 34548480, + "step": 1609, + "time_per_iteration": 4.204582691192627 + }, + { + "auxiliary_loss_clip": 0.01149569, + "auxiliary_loss_mlp": 0.0105436, + "balance_loss_clip": 1.05748034, + "balance_loss_mlp": 1.03320003, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 2.43810446376779, + "language_loss": 0.85146856, + "learning_rate": 3.953376702737693e-06, + "loss": 0.8735078, + "num_input_tokens_seen": 34565410, + "step": 1610, + "time_per_iteration": 2.5538172721862793 + }, + { + "auxiliary_loss_clip": 0.01160164, + "auxiliary_loss_mlp": 0.01052855, + "balance_loss_clip": 1.05961573, + "balance_loss_mlp": 1.03040779, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.367182292596714, + "language_loss": 0.67250943, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69463956, + "num_input_tokens_seen": 34584840, + "step": 1611, + "time_per_iteration": 2.5747592449188232 + }, + { + "auxiliary_loss_clip": 0.01128078, + "auxiliary_loss_mlp": 0.01055117, + "balance_loss_clip": 1.04903853, + "balance_loss_mlp": 1.03400528, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.8517518865806721, + "language_loss": 0.81197065, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83380264, + "num_input_tokens_seen": 34603360, + "step": 1612, + "time_per_iteration": 4.050261735916138 + }, + { + "auxiliary_loss_clip": 0.01180765, + "auxiliary_loss_mlp": 0.01065992, + "balance_loss_clip": 1.0618031, + "balance_loss_mlp": 1.04420066, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.3148777028240257, + "language_loss": 0.80804038, + "learning_rate": 3.953125561311398e-06, + "loss": 0.83050787, + "num_input_tokens_seen": 34620760, + "step": 1613, + "time_per_iteration": 2.469222068786621 + }, + { + "auxiliary_loss_clip": 0.01146077, + "auxiliary_loss_mlp": 0.01055178, + "balance_loss_clip": 1.0545764, + "balance_loss_mlp": 1.03267121, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 2.9984815886501, + "language_loss": 0.84457779, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86659032, + "num_input_tokens_seen": 34640695, + "step": 1614, + "time_per_iteration": 2.622927188873291 + }, + { + "auxiliary_loss_clip": 0.01075224, + "auxiliary_loss_mlp": 0.00765255, + "balance_loss_clip": 1.03357005, + "balance_loss_mlp": 0.99997675, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.715900130683655, + "language_loss": 0.54681492, + "learning_rate": 3.952957763374992e-06, + "loss": 0.5652197, + "num_input_tokens_seen": 34702395, + "step": 1615, + "time_per_iteration": 3.13012433052063 + }, + { + "auxiliary_loss_clip": 0.01038508, + "auxiliary_loss_mlp": 0.01043042, + "balance_loss_clip": 1.02612329, + "balance_loss_mlp": 1.04012144, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 1.5692030054164459, + "language_loss": 0.58304858, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60386407, + "num_input_tokens_seen": 34768910, + "step": 1616, + "time_per_iteration": 3.3022513389587402 + }, + { + "auxiliary_loss_clip": 0.01153464, + "auxiliary_loss_mlp": 0.01062439, + "balance_loss_clip": 1.05405474, + "balance_loss_mlp": 1.03820419, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.7553179729811879, + "language_loss": 0.69175774, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71391678, + "num_input_tokens_seen": 34787680, + "step": 1617, + "time_per_iteration": 2.557760238647461 + }, + { + "auxiliary_loss_clip": 0.01150948, + "auxiliary_loss_mlp": 0.01058239, + "balance_loss_clip": 1.05295682, + "balance_loss_mlp": 1.03316879, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.9830684514632018, + "language_loss": 0.80637634, + "learning_rate": 3.952705511055698e-06, + "loss": 0.8284682, + "num_input_tokens_seen": 34808330, + "step": 1618, + "time_per_iteration": 2.650507688522339 + }, + { + "auxiliary_loss_clip": 0.01164381, + "auxiliary_loss_mlp": 0.0105505, + "balance_loss_clip": 1.05918765, + "balance_loss_mlp": 1.03470111, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 2.2738281007528864, + "language_loss": 0.92968202, + "learning_rate": 3.952621278851435e-06, + "loss": 0.95187628, + "num_input_tokens_seen": 34830020, + "step": 1619, + "time_per_iteration": 2.679593563079834 + }, + { + "auxiliary_loss_clip": 0.0117373, + "auxiliary_loss_mlp": 0.01053486, + "balance_loss_clip": 1.05972767, + "balance_loss_mlp": 1.03091979, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 2.913442179790009, + "language_loss": 0.88440168, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.90667391, + "num_input_tokens_seen": 34850330, + "step": 1620, + "time_per_iteration": 2.5934834480285645 + }, + { + "auxiliary_loss_clip": 0.01153771, + "auxiliary_loss_mlp": 0.0106011, + "balance_loss_clip": 1.0549885, + "balance_loss_mlp": 1.03635192, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 2.9050568676686734, + "language_loss": 0.77300215, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79514092, + "num_input_tokens_seen": 34871640, + "step": 1621, + "time_per_iteration": 2.555344581604004 + }, + { + "auxiliary_loss_clip": 0.01130585, + "auxiliary_loss_mlp": 0.01070364, + "balance_loss_clip": 1.04757261, + "balance_loss_mlp": 1.04600954, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 2.0002203077207388, + "language_loss": 0.77988565, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80189514, + "num_input_tokens_seen": 34888100, + "step": 1622, + "time_per_iteration": 2.5250275135040283 + }, + { + "auxiliary_loss_clip": 0.0115098, + "auxiliary_loss_mlp": 0.01064091, + "balance_loss_clip": 1.0549016, + "balance_loss_mlp": 1.04105997, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 2.183733939562923, + "language_loss": 0.85782409, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.87997484, + "num_input_tokens_seen": 34910485, + "step": 1623, + "time_per_iteration": 3.9650585651397705 + }, + { + "auxiliary_loss_clip": 0.01174145, + "auxiliary_loss_mlp": 0.01060125, + "balance_loss_clip": 1.05885553, + "balance_loss_mlp": 1.03735614, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.14173767402631, + "language_loss": 0.80094415, + "learning_rate": 3.952199007240184e-06, + "loss": 0.82328689, + "num_input_tokens_seen": 34928615, + "step": 1624, + "time_per_iteration": 2.4656178951263428 + }, + { + "auxiliary_loss_clip": 0.01172799, + "auxiliary_loss_mlp": 0.01049495, + "balance_loss_clip": 1.05518317, + "balance_loss_mlp": 1.02784681, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 4.874141006948073, + "language_loss": 0.85775095, + "learning_rate": 3.952114330822364e-06, + "loss": 0.87997389, + "num_input_tokens_seen": 34946045, + "step": 1625, + "time_per_iteration": 2.4817724227905273 + }, + { + "auxiliary_loss_clip": 0.011782, + "auxiliary_loss_mlp": 0.01058402, + "balance_loss_clip": 1.05884194, + "balance_loss_mlp": 1.03641975, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 2.5385447376871593, + "language_loss": 0.85727751, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87964356, + "num_input_tokens_seen": 34962865, + "step": 1626, + "time_per_iteration": 2.5131142139434814 + }, + { + "auxiliary_loss_clip": 0.01164454, + "auxiliary_loss_mlp": 0.00791424, + "balance_loss_clip": 1.0545181, + "balance_loss_mlp": 1.00047016, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 2.299988995369925, + "language_loss": 0.83429509, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85385388, + "num_input_tokens_seen": 34983505, + "step": 1627, + "time_per_iteration": 2.522826671600342 + }, + { + "auxiliary_loss_clip": 0.01163348, + "auxiliary_loss_mlp": 0.01052245, + "balance_loss_clip": 1.05384767, + "balance_loss_mlp": 1.03060913, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 1.7611235425513743, + "language_loss": 0.84626412, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86842012, + "num_input_tokens_seen": 35001825, + "step": 1628, + "time_per_iteration": 2.5108327865600586 + }, + { + "auxiliary_loss_clip": 0.01168183, + "auxiliary_loss_mlp": 0.01053681, + "balance_loss_clip": 1.05426323, + "balance_loss_mlp": 1.03203273, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.5226892972821573, + "language_loss": 0.75767112, + "learning_rate": 3.951774884939523e-06, + "loss": 0.7798897, + "num_input_tokens_seen": 35023075, + "step": 1629, + "time_per_iteration": 2.5114212036132812 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.01050903, + "balance_loss_clip": 1.05487442, + "balance_loss_mlp": 1.02753782, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.6814834332227748, + "language_loss": 0.78497255, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80670077, + "num_input_tokens_seen": 35043480, + "step": 1630, + "time_per_iteration": 2.5980312824249268 + }, + { + "auxiliary_loss_clip": 0.01165219, + "auxiliary_loss_mlp": 0.01052324, + "balance_loss_clip": 1.06012726, + "balance_loss_mlp": 1.02893555, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.8311904413356315, + "language_loss": 0.86840439, + "learning_rate": 3.951604717916228e-06, + "loss": 0.89057982, + "num_input_tokens_seen": 35061490, + "step": 1631, + "time_per_iteration": 2.508877992630005 + }, + { + "auxiliary_loss_clip": 0.01169532, + "auxiliary_loss_mlp": 0.01055391, + "balance_loss_clip": 1.06342173, + "balance_loss_mlp": 1.03426695, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.0096937773853667, + "language_loss": 0.83485591, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85710514, + "num_input_tokens_seen": 35079670, + "step": 1632, + "time_per_iteration": 2.6362996101379395 + }, + { + "auxiliary_loss_clip": 0.01143371, + "auxiliary_loss_mlp": 0.01056548, + "balance_loss_clip": 1.053509, + "balance_loss_mlp": 1.03516197, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.601855260145, + "language_loss": 0.78539312, + "learning_rate": 3.951434254872751e-06, + "loss": 0.8073923, + "num_input_tokens_seen": 35099205, + "step": 1633, + "time_per_iteration": 2.546217203140259 + }, + { + "auxiliary_loss_clip": 0.01167454, + "auxiliary_loss_mlp": 0.01057377, + "balance_loss_clip": 1.05468655, + "balance_loss_mlp": 1.03520393, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.6676515355333956, + "language_loss": 0.72987723, + "learning_rate": 3.951348912351521e-06, + "loss": 0.7521255, + "num_input_tokens_seen": 35115270, + "step": 1634, + "time_per_iteration": 2.4800024032592773 + }, + { + "auxiliary_loss_clip": 0.0115281, + "auxiliary_loss_mlp": 0.01067766, + "balance_loss_clip": 1.05234647, + "balance_loss_mlp": 1.04462755, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.7483558361781433, + "language_loss": 0.73556089, + "learning_rate": 3.951263495834947e-06, + "loss": 0.7577666, + "num_input_tokens_seen": 35134065, + "step": 1635, + "time_per_iteration": 2.6484220027923584 + }, + { + "auxiliary_loss_clip": 0.01148295, + "auxiliary_loss_mlp": 0.01068445, + "balance_loss_clip": 1.05243528, + "balance_loss_mlp": 1.04398322, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 2.19015959098006, + "language_loss": 0.782107, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80427438, + "num_input_tokens_seen": 35154870, + "step": 1636, + "time_per_iteration": 2.5847184658050537 + }, + { + "auxiliary_loss_clip": 0.01159539, + "auxiliary_loss_mlp": 0.01060221, + "balance_loss_clip": 1.05643737, + "balance_loss_mlp": 1.0386678, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 1.9842131891787749, + "language_loss": 0.70046991, + "learning_rate": 3.951092440828715e-06, + "loss": 0.72266746, + "num_input_tokens_seen": 35171850, + "step": 1637, + "time_per_iteration": 2.520642042160034 + }, + { + "auxiliary_loss_clip": 0.01182458, + "auxiliary_loss_mlp": 0.01058787, + "balance_loss_clip": 1.05792713, + "balance_loss_mlp": 1.03697133, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.2794940839722067, + "language_loss": 0.77430743, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79671991, + "num_input_tokens_seen": 35188795, + "step": 1638, + "time_per_iteration": 2.4612765312194824 + }, + { + "auxiliary_loss_clip": 0.01136477, + "auxiliary_loss_mlp": 0.01050584, + "balance_loss_clip": 1.05597258, + "balance_loss_mlp": 1.02943635, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.3764647152365017, + "language_loss": 0.72463906, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74650967, + "num_input_tokens_seen": 35212100, + "step": 1639, + "time_per_iteration": 2.6480844020843506 + }, + { + "auxiliary_loss_clip": 0.0116851, + "auxiliary_loss_mlp": 0.01048103, + "balance_loss_clip": 1.05630398, + "balance_loss_mlp": 1.02582312, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.94833170590115, + "language_loss": 0.88643616, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90860236, + "num_input_tokens_seen": 35230390, + "step": 1640, + "time_per_iteration": 2.5264103412628174 + }, + { + "auxiliary_loss_clip": 0.0117102, + "auxiliary_loss_mlp": 0.01044139, + "balance_loss_clip": 1.05681753, + "balance_loss_mlp": 1.02305067, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 1.8349023994786526, + "language_loss": 0.80978274, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83193433, + "num_input_tokens_seen": 35250405, + "step": 1641, + "time_per_iteration": 2.4889795780181885 + }, + { + "auxiliary_loss_clip": 0.01171494, + "auxiliary_loss_mlp": 0.01056858, + "balance_loss_clip": 1.05891895, + "balance_loss_mlp": 1.03416085, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 3.2801023530711015, + "language_loss": 0.85245538, + "learning_rate": 3.95066350862165e-06, + "loss": 0.87473887, + "num_input_tokens_seen": 35262820, + "step": 1642, + "time_per_iteration": 2.517317771911621 + }, + { + "auxiliary_loss_clip": 0.01146308, + "auxiliary_loss_mlp": 0.01056224, + "balance_loss_clip": 1.05828357, + "balance_loss_mlp": 1.03464675, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.5668308231607582, + "language_loss": 0.80723858, + "learning_rate": 3.950577500259144e-06, + "loss": 0.82926393, + "num_input_tokens_seen": 35284490, + "step": 1643, + "time_per_iteration": 2.610780954360962 + }, + { + "auxiliary_loss_clip": 0.01170426, + "auxiliary_loss_mlp": 0.01068993, + "balance_loss_clip": 1.05723763, + "balance_loss_mlp": 1.04738021, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 2.3819836940352084, + "language_loss": 0.82364297, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84603715, + "num_input_tokens_seen": 35302815, + "step": 1644, + "time_per_iteration": 2.468858480453491 + }, + { + "auxiliary_loss_clip": 0.01158597, + "auxiliary_loss_mlp": 0.00791997, + "balance_loss_clip": 1.05445933, + "balance_loss_mlp": 1.00045979, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.7391004224606947, + "language_loss": 0.6867106, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70621651, + "num_input_tokens_seen": 35321175, + "step": 1645, + "time_per_iteration": 2.503849983215332 + }, + { + "auxiliary_loss_clip": 0.01061549, + "auxiliary_loss_mlp": 0.01000141, + "balance_loss_clip": 1.03201962, + "balance_loss_mlp": 0.99736369, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.8646365328312495, + "language_loss": 0.60824752, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62886441, + "num_input_tokens_seen": 35381740, + "step": 1646, + "time_per_iteration": 3.035595655441284 + }, + { + "auxiliary_loss_clip": 0.01142499, + "auxiliary_loss_mlp": 0.01053139, + "balance_loss_clip": 1.05578864, + "balance_loss_mlp": 1.0307163, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 1.8123288897000036, + "language_loss": 0.73478007, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75673652, + "num_input_tokens_seen": 35403760, + "step": 1647, + "time_per_iteration": 4.057161092758179 + }, + { + "auxiliary_loss_clip": 0.01162872, + "auxiliary_loss_mlp": 0.01061814, + "balance_loss_clip": 1.06364071, + "balance_loss_mlp": 1.04156041, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 2.1004548068421416, + "language_loss": 0.84232092, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86456776, + "num_input_tokens_seen": 35424050, + "step": 1648, + "time_per_iteration": 2.56074595451355 + }, + { + "auxiliary_loss_clip": 0.01064143, + "auxiliary_loss_mlp": 0.01012389, + "balance_loss_clip": 1.0239749, + "balance_loss_mlp": 1.00917077, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7299989773152543, + "language_loss": 0.55656505, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57733035, + "num_input_tokens_seen": 35481690, + "step": 1649, + "time_per_iteration": 4.397501707077026 + }, + { + "auxiliary_loss_clip": 0.01163279, + "auxiliary_loss_mlp": 0.010438, + "balance_loss_clip": 1.05262339, + "balance_loss_mlp": 1.02305722, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.517140147924967, + "language_loss": 0.89628202, + "learning_rate": 3.949973370853954e-06, + "loss": 0.91835278, + "num_input_tokens_seen": 35498635, + "step": 1650, + "time_per_iteration": 2.660667896270752 + }, + { + "auxiliary_loss_clip": 0.01039109, + "auxiliary_loss_mlp": 0.00763719, + "balance_loss_clip": 1.03536963, + "balance_loss_mlp": 0.99997407, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.8068243113910637, + "language_loss": 0.63760859, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65563691, + "num_input_tokens_seen": 35565720, + "step": 1651, + "time_per_iteration": 4.753629207611084 + }, + { + "auxiliary_loss_clip": 0.01161725, + "auxiliary_loss_mlp": 0.01059977, + "balance_loss_clip": 1.05464423, + "balance_loss_mlp": 1.03817415, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 2.6687775502285125, + "language_loss": 0.88185847, + "learning_rate": 3.949800096914643e-06, + "loss": 0.9040755, + "num_input_tokens_seen": 35586000, + "step": 1652, + "time_per_iteration": 2.51684832572937 + }, + { + "auxiliary_loss_clip": 0.01159581, + "auxiliary_loss_mlp": 0.01054991, + "balance_loss_clip": 1.05862892, + "balance_loss_mlp": 1.03408217, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 2.0836673463583617, + "language_loss": 0.82051361, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84265924, + "num_input_tokens_seen": 35604355, + "step": 1653, + "time_per_iteration": 2.5495259761810303 + }, + { + "auxiliary_loss_clip": 0.01169666, + "auxiliary_loss_mlp": 0.00788747, + "balance_loss_clip": 1.05776036, + "balance_loss_mlp": 1.00050306, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.445831829303043, + "language_loss": 0.79415798, + "learning_rate": 3.949626527228875e-06, + "loss": 0.8137421, + "num_input_tokens_seen": 35625495, + "step": 1654, + "time_per_iteration": 2.553598403930664 + }, + { + "auxiliary_loss_clip": 0.01178112, + "auxiliary_loss_mlp": 0.0106084, + "balance_loss_clip": 1.05954063, + "balance_loss_mlp": 1.04136169, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.5733452004552226, + "language_loss": 0.81087399, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83326352, + "num_input_tokens_seen": 35645030, + "step": 1655, + "time_per_iteration": 2.4776384830474854 + }, + { + "auxiliary_loss_clip": 0.01176024, + "auxiliary_loss_mlp": 0.01055831, + "balance_loss_clip": 1.05579066, + "balance_loss_mlp": 1.03457594, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.7285410718328473, + "language_loss": 0.80968368, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83200228, + "num_input_tokens_seen": 35664305, + "step": 1656, + "time_per_iteration": 2.510178565979004 + }, + { + "auxiliary_loss_clip": 0.01167447, + "auxiliary_loss_mlp": 0.01056124, + "balance_loss_clip": 1.05892372, + "balance_loss_mlp": 1.03516722, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.783652044027898, + "language_loss": 0.88534319, + "learning_rate": 3.949365618233217e-06, + "loss": 0.90757895, + "num_input_tokens_seen": 35684060, + "step": 1657, + "time_per_iteration": 2.5849828720092773 + }, + { + "auxiliary_loss_clip": 0.0115722, + "auxiliary_loss_mlp": 0.01054301, + "balance_loss_clip": 1.05495787, + "balance_loss_mlp": 1.03187835, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.180626762492672, + "language_loss": 0.85286975, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.87498498, + "num_input_tokens_seen": 35703250, + "step": 1658, + "time_per_iteration": 2.5266733169555664 + }, + { + "auxiliary_loss_clip": 0.01069012, + "auxiliary_loss_mlp": 0.01007146, + "balance_loss_clip": 1.02211964, + "balance_loss_mlp": 1.00432062, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9001504415063547, + "language_loss": 0.60819352, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62895513, + "num_input_tokens_seen": 35762165, + "step": 1659, + "time_per_iteration": 3.0498180389404297 + }, + { + "auxiliary_loss_clip": 0.01149703, + "auxiliary_loss_mlp": 0.01047979, + "balance_loss_clip": 1.05094516, + "balance_loss_mlp": 1.02617598, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.9489072911844254, + "language_loss": 0.85449421, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87647104, + "num_input_tokens_seen": 35781520, + "step": 1660, + "time_per_iteration": 2.546248197555542 + }, + { + "auxiliary_loss_clip": 0.01145248, + "auxiliary_loss_mlp": 0.01056595, + "balance_loss_clip": 1.05312049, + "balance_loss_mlp": 1.03375423, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 1.9194332907203977, + "language_loss": 0.79992616, + "learning_rate": 3.949016704705836e-06, + "loss": 0.82194459, + "num_input_tokens_seen": 35799565, + "step": 1661, + "time_per_iteration": 2.5986990928649902 + }, + { + "auxiliary_loss_clip": 0.01166798, + "auxiliary_loss_mlp": 0.01053611, + "balance_loss_clip": 1.05676198, + "balance_loss_mlp": 1.03150928, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 2.1339635659187044, + "language_loss": 0.83648109, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85868526, + "num_input_tokens_seen": 35821085, + "step": 1662, + "time_per_iteration": 2.5638015270233154 + }, + { + "auxiliary_loss_clip": 0.01153563, + "auxiliary_loss_mlp": 0.01057547, + "balance_loss_clip": 1.05229616, + "balance_loss_mlp": 1.03455162, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 1.9873113987988191, + "language_loss": 0.8924017, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.91451281, + "num_input_tokens_seen": 35839840, + "step": 1663, + "time_per_iteration": 4.044901371002197 + }, + { + "auxiliary_loss_clip": 0.01173182, + "auxiliary_loss_mlp": 0.01051808, + "balance_loss_clip": 1.0588994, + "balance_loss_mlp": 1.03051698, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.8648303978788838, + "language_loss": 0.69733196, + "learning_rate": 3.948754243526191e-06, + "loss": 0.71958184, + "num_input_tokens_seen": 35861545, + "step": 1664, + "time_per_iteration": 2.540600061416626 + }, + { + "auxiliary_loss_clip": 0.01135767, + "auxiliary_loss_mlp": 0.01048414, + "balance_loss_clip": 1.05209196, + "balance_loss_mlp": 1.0269804, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.124401292946649, + "language_loss": 0.78831786, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81015968, + "num_input_tokens_seen": 35878295, + "step": 1665, + "time_per_iteration": 2.5353245735168457 + }, + { + "auxiliary_loss_clip": 0.01167091, + "auxiliary_loss_mlp": 0.01064618, + "balance_loss_clip": 1.05979657, + "balance_loss_mlp": 1.04372048, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.9432685952321167, + "language_loss": 0.69955873, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72187579, + "num_input_tokens_seen": 35898990, + "step": 1666, + "time_per_iteration": 2.603055000305176 + }, + { + "auxiliary_loss_clip": 0.01107497, + "auxiliary_loss_mlp": 0.01062836, + "balance_loss_clip": 1.051337, + "balance_loss_mlp": 1.03994739, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 2.1242649286632136, + "language_loss": 0.78908211, + "learning_rate": 3.948491117273956e-06, + "loss": 0.81078541, + "num_input_tokens_seen": 35916225, + "step": 1667, + "time_per_iteration": 2.699059247970581 + }, + { + "auxiliary_loss_clip": 0.01149596, + "auxiliary_loss_mlp": 0.0106012, + "balance_loss_clip": 1.05257201, + "balance_loss_mlp": 1.03592014, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.509423955049542, + "language_loss": 0.76782459, + "learning_rate": 3.948403260744817e-06, + "loss": 0.78992176, + "num_input_tokens_seen": 35934630, + "step": 1668, + "time_per_iteration": 2.6314072608947754 + }, + { + "auxiliary_loss_clip": 0.01179354, + "auxiliary_loss_mlp": 0.01055424, + "balance_loss_clip": 1.05729651, + "balance_loss_mlp": 1.03289354, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.7683592388856917, + "language_loss": 0.78048581, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80283362, + "num_input_tokens_seen": 35953855, + "step": 1669, + "time_per_iteration": 2.584486722946167 + }, + { + "auxiliary_loss_clip": 0.0118566, + "auxiliary_loss_mlp": 0.01063474, + "balance_loss_clip": 1.05958343, + "balance_loss_mlp": 1.04080093, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.2897548890924586, + "language_loss": 0.85587478, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87836611, + "num_input_tokens_seen": 35974555, + "step": 1670, + "time_per_iteration": 2.569464921951294 + }, + { + "auxiliary_loss_clip": 0.01175746, + "auxiliary_loss_mlp": 0.01052938, + "balance_loss_clip": 1.05692422, + "balance_loss_mlp": 1.03219581, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.7214196602199756, + "language_loss": 0.76924145, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79152834, + "num_input_tokens_seen": 35996830, + "step": 1671, + "time_per_iteration": 2.552581310272217 + }, + { + "auxiliary_loss_clip": 0.01067019, + "auxiliary_loss_mlp": 0.01007038, + "balance_loss_clip": 1.0273329, + "balance_loss_mlp": 1.00402153, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7698239096763769, + "language_loss": 0.60795969, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62870026, + "num_input_tokens_seen": 36054465, + "step": 1672, + "time_per_iteration": 3.084040880203247 + }, + { + "auxiliary_loss_clip": 0.01139113, + "auxiliary_loss_mlp": 0.01055933, + "balance_loss_clip": 1.05116463, + "balance_loss_mlp": 1.03371215, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.5969193330388025, + "language_loss": 0.77272439, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79467487, + "num_input_tokens_seen": 36073480, + "step": 1673, + "time_per_iteration": 2.6322007179260254 + }, + { + "auxiliary_loss_clip": 0.01129935, + "auxiliary_loss_mlp": 0.01056838, + "balance_loss_clip": 1.04957247, + "balance_loss_mlp": 1.03430724, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.3034379952098343, + "language_loss": 0.73175371, + "learning_rate": 3.947874570130197e-06, + "loss": 0.7536214, + "num_input_tokens_seen": 36091830, + "step": 1674, + "time_per_iteration": 2.6363542079925537 + }, + { + "auxiliary_loss_clip": 0.01166851, + "auxiliary_loss_mlp": 0.00788151, + "balance_loss_clip": 1.05524683, + "balance_loss_mlp": 1.00062084, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 1.8068065731403018, + "language_loss": 0.79337931, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81292933, + "num_input_tokens_seen": 36111400, + "step": 1675, + "time_per_iteration": 2.57476806640625 + }, + { + "auxiliary_loss_clip": 0.01177028, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_clip": 1.05614805, + "balance_loss_mlp": 1.04698014, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.309546877137194, + "language_loss": 0.81560063, + "learning_rate": 3.947697748980853e-06, + "loss": 0.83804303, + "num_input_tokens_seen": 36129345, + "step": 1676, + "time_per_iteration": 2.525285482406616 + }, + { + "auxiliary_loss_clip": 0.01173129, + "auxiliary_loss_mlp": 0.01057506, + "balance_loss_clip": 1.05993855, + "balance_loss_mlp": 1.0364778, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.2804816803363184, + "language_loss": 0.85864919, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88095558, + "num_input_tokens_seen": 36146255, + "step": 1677, + "time_per_iteration": 2.5210013389587402 + }, + { + "auxiliary_loss_clip": 0.01163514, + "auxiliary_loss_mlp": 0.01056377, + "balance_loss_clip": 1.0562588, + "balance_loss_mlp": 1.03495562, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 2.079411981914071, + "language_loss": 0.86006057, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88225949, + "num_input_tokens_seen": 36164050, + "step": 1678, + "time_per_iteration": 2.5617191791534424 + }, + { + "auxiliary_loss_clip": 0.01155252, + "auxiliary_loss_mlp": 0.01052348, + "balance_loss_clip": 1.05582702, + "balance_loss_mlp": 1.03103304, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.0366698062742126, + "language_loss": 0.89821213, + "learning_rate": 3.947431963338532e-06, + "loss": 0.92028809, + "num_input_tokens_seen": 36183530, + "step": 1679, + "time_per_iteration": 2.586604118347168 + }, + { + "auxiliary_loss_clip": 0.01071645, + "auxiliary_loss_mlp": 0.01002344, + "balance_loss_clip": 1.02477622, + "balance_loss_mlp": 0.99947089, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7885908158016292, + "language_loss": 0.52988553, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55062538, + "num_input_tokens_seen": 36248550, + "step": 1680, + "time_per_iteration": 3.121696949005127 + }, + { + "auxiliary_loss_clip": 0.01177986, + "auxiliary_loss_mlp": 0.00788999, + "balance_loss_clip": 1.05811584, + "balance_loss_mlp": 1.00050354, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.630086813277539, + "language_loss": 0.77298188, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79265171, + "num_input_tokens_seen": 36266065, + "step": 1681, + "time_per_iteration": 2.52884840965271 + }, + { + "auxiliary_loss_clip": 0.01150789, + "auxiliary_loss_mlp": 0.0105548, + "balance_loss_clip": 1.05128241, + "balance_loss_mlp": 1.03051782, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.1693538677607975, + "language_loss": 0.93943262, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96149528, + "num_input_tokens_seen": 36280960, + "step": 1682, + "time_per_iteration": 2.5790629386901855 + }, + { + "auxiliary_loss_clip": 0.01168933, + "auxiliary_loss_mlp": 0.01053914, + "balance_loss_clip": 1.05387473, + "balance_loss_mlp": 1.03255153, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 1.9131519851451735, + "language_loss": 0.87837726, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90060568, + "num_input_tokens_seen": 36299010, + "step": 1683, + "time_per_iteration": 2.621095657348633 + }, + { + "auxiliary_loss_clip": 0.01129648, + "auxiliary_loss_mlp": 0.01062188, + "balance_loss_clip": 1.04928064, + "balance_loss_mlp": 1.04005146, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.942264987982919, + "language_loss": 0.74653691, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76845527, + "num_input_tokens_seen": 36318400, + "step": 1684, + "time_per_iteration": 2.6182868480682373 + }, + { + "auxiliary_loss_clip": 0.01060455, + "auxiliary_loss_mlp": 0.01004732, + "balance_loss_clip": 1.03256738, + "balance_loss_mlp": 1.00197852, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.7545677842809894, + "language_loss": 0.61119884, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.63185072, + "num_input_tokens_seen": 36381815, + "step": 1685, + "time_per_iteration": 3.189068078994751 + }, + { + "auxiliary_loss_clip": 0.01155422, + "auxiliary_loss_mlp": 0.01054111, + "balance_loss_clip": 1.05319405, + "balance_loss_mlp": 1.03217697, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 2.895692939587713, + "language_loss": 0.61508662, + "learning_rate": 3.946809212358516e-06, + "loss": 0.637182, + "num_input_tokens_seen": 36404320, + "step": 1686, + "time_per_iteration": 4.073743104934692 + }, + { + "auxiliary_loss_clip": 0.01146987, + "auxiliary_loss_mlp": 0.01058683, + "balance_loss_clip": 1.05954766, + "balance_loss_mlp": 1.03599751, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 4.499096299633132, + "language_loss": 0.81552637, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83758307, + "num_input_tokens_seen": 36427510, + "step": 1687, + "time_per_iteration": 4.186161041259766 + }, + { + "auxiliary_loss_clip": 0.01171413, + "auxiliary_loss_mlp": 0.01057453, + "balance_loss_clip": 1.05612135, + "balance_loss_mlp": 1.03573263, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.853100039178726, + "language_loss": 0.72592455, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74821317, + "num_input_tokens_seen": 36448230, + "step": 1688, + "time_per_iteration": 2.6118335723876953 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01061749, + "balance_loss_clip": 1.05211627, + "balance_loss_mlp": 1.03999329, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 2.0533441808248547, + "language_loss": 0.870444, + "learning_rate": 3.94654121166582e-06, + "loss": 0.89255655, + "num_input_tokens_seen": 36464395, + "step": 1689, + "time_per_iteration": 2.577509880065918 + }, + { + "auxiliary_loss_clip": 0.01167908, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_clip": 1.05404162, + "balance_loss_mlp": 1.03397024, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 2.85930493855529, + "language_loss": 0.88239229, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90460992, + "num_input_tokens_seen": 36486475, + "step": 1690, + "time_per_iteration": 2.63335919380188 + }, + { + "auxiliary_loss_clip": 0.01157907, + "auxiliary_loss_mlp": 0.01054691, + "balance_loss_clip": 1.05681419, + "balance_loss_mlp": 1.03285146, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 3.659333606566703, + "language_loss": 0.83294046, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85506648, + "num_input_tokens_seen": 36505310, + "step": 1691, + "time_per_iteration": 3.9470527172088623 + }, + { + "auxiliary_loss_clip": 0.01160755, + "auxiliary_loss_mlp": 0.01054743, + "balance_loss_clip": 1.05556452, + "balance_loss_mlp": 1.03273737, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.7187571352749382, + "language_loss": 0.67108333, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69323832, + "num_input_tokens_seen": 36529820, + "step": 1692, + "time_per_iteration": 2.6877622604370117 + }, + { + "auxiliary_loss_clip": 0.01143096, + "auxiliary_loss_mlp": 0.01077647, + "balance_loss_clip": 1.04981458, + "balance_loss_mlp": 1.05546176, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.5721065397001146, + "language_loss": 0.75727582, + "learning_rate": 3.94618284404223e-06, + "loss": 0.7794832, + "num_input_tokens_seen": 36549000, + "step": 1693, + "time_per_iteration": 2.574143171310425 + }, + { + "auxiliary_loss_clip": 0.01135959, + "auxiliary_loss_mlp": 0.01060143, + "balance_loss_clip": 1.05169332, + "balance_loss_mlp": 1.0360744, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.657658746763047, + "language_loss": 0.87303096, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89499193, + "num_input_tokens_seen": 36567515, + "step": 1694, + "time_per_iteration": 2.7049505710601807 + }, + { + "auxiliary_loss_clip": 0.01130458, + "auxiliary_loss_mlp": 0.01057871, + "balance_loss_clip": 1.04888105, + "balance_loss_mlp": 1.03276587, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 1.6394996483446909, + "language_loss": 0.795066, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81694925, + "num_input_tokens_seen": 36586190, + "step": 1695, + "time_per_iteration": 2.6171298027038574 + }, + { + "auxiliary_loss_clip": 0.01125818, + "auxiliary_loss_mlp": 0.01062731, + "balance_loss_clip": 1.04815197, + "balance_loss_mlp": 1.03966367, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 3.080090490157433, + "language_loss": 0.86550725, + "learning_rate": 3.945913293418447e-06, + "loss": 0.8873927, + "num_input_tokens_seen": 36607495, + "step": 1696, + "time_per_iteration": 2.652498960494995 + }, + { + "auxiliary_loss_clip": 0.01161423, + "auxiliary_loss_mlp": 0.01056131, + "balance_loss_clip": 1.05204201, + "balance_loss_mlp": 1.03418422, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 2.150340804878463, + "language_loss": 0.82021117, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84238672, + "num_input_tokens_seen": 36628555, + "step": 1697, + "time_per_iteration": 2.5648231506347656 + }, + { + "auxiliary_loss_clip": 0.01180878, + "auxiliary_loss_mlp": 0.01052825, + "balance_loss_clip": 1.0564096, + "balance_loss_mlp": 1.0301156, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 2.0452470359203367, + "language_loss": 0.80833542, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.83067244, + "num_input_tokens_seen": 36646250, + "step": 1698, + "time_per_iteration": 2.54388689994812 + }, + { + "auxiliary_loss_clip": 0.011481, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_clip": 1.05575895, + "balance_loss_mlp": 1.02640629, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 5.224893983948668, + "language_loss": 0.76066697, + "learning_rate": 3.945643078691637e-06, + "loss": 0.78262532, + "num_input_tokens_seen": 36666675, + "step": 1699, + "time_per_iteration": 2.6046366691589355 + }, + { + "auxiliary_loss_clip": 0.0115818, + "auxiliary_loss_mlp": 0.01048521, + "balance_loss_clip": 1.05896378, + "balance_loss_mlp": 1.02694392, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.61241530187309, + "language_loss": 0.80101025, + "learning_rate": 3.945552859553516e-06, + "loss": 0.8230772, + "num_input_tokens_seen": 36685225, + "step": 1700, + "time_per_iteration": 2.6014561653137207 + }, + { + "auxiliary_loss_clip": 0.0116572, + "auxiliary_loss_mlp": 0.01054607, + "balance_loss_clip": 1.05435205, + "balance_loss_mlp": 1.03252983, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.825413137448046, + "language_loss": 0.76943362, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79163682, + "num_input_tokens_seen": 36705985, + "step": 1701, + "time_per_iteration": 2.592332363128662 + }, + { + "auxiliary_loss_clip": 0.01177115, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.05987, + "balance_loss_mlp": 1.02678668, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 1.9259301644756555, + "language_loss": 0.78031534, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80257106, + "num_input_tokens_seen": 36725815, + "step": 1702, + "time_per_iteration": 2.5605955123901367 + }, + { + "auxiliary_loss_clip": 0.01150653, + "auxiliary_loss_mlp": 0.01051912, + "balance_loss_clip": 1.05345905, + "balance_loss_mlp": 1.03105044, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 1.9719572173860214, + "language_loss": 0.94493735, + "learning_rate": 3.945281759499494e-06, + "loss": 0.96696305, + "num_input_tokens_seen": 36742345, + "step": 1703, + "time_per_iteration": 4.941705703735352 + }, + { + "auxiliary_loss_clip": 0.01035382, + "auxiliary_loss_mlp": 0.01016019, + "balance_loss_clip": 1.03033888, + "balance_loss_mlp": 1.01232386, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8632107987194937, + "language_loss": 0.5502789, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57079297, + "num_input_tokens_seen": 36798775, + "step": 1704, + "time_per_iteration": 3.115715742111206 + }, + { + "auxiliary_loss_clip": 0.01180706, + "auxiliary_loss_mlp": 0.01052481, + "balance_loss_clip": 1.05731535, + "balance_loss_mlp": 1.03020132, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 2.1664678218862297, + "language_loss": 0.83809483, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86042666, + "num_input_tokens_seen": 36816295, + "step": 1705, + "time_per_iteration": 2.484908103942871 + }, + { + "auxiliary_loss_clip": 0.01047415, + "auxiliary_loss_mlp": 0.01003713, + "balance_loss_clip": 1.02968633, + "balance_loss_mlp": 1.00054216, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7658132020584265, + "language_loss": 0.60409176, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62460303, + "num_input_tokens_seen": 36882030, + "step": 1706, + "time_per_iteration": 3.1530637741088867 + }, + { + "auxiliary_loss_clip": 0.01150749, + "auxiliary_loss_mlp": 0.0105076, + "balance_loss_clip": 1.05458629, + "balance_loss_mlp": 1.0283252, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.29168507945468, + "language_loss": 0.8593719, + "learning_rate": 3.94491926006294e-06, + "loss": 0.881387, + "num_input_tokens_seen": 36899245, + "step": 1707, + "time_per_iteration": 2.5958306789398193 + }, + { + "auxiliary_loss_clip": 0.01163395, + "auxiliary_loss_mlp": 0.01049332, + "balance_loss_clip": 1.05653167, + "balance_loss_mlp": 1.02854168, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.5574241751106017, + "language_loss": 0.73236525, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75449246, + "num_input_tokens_seen": 36920950, + "step": 1708, + "time_per_iteration": 2.568849802017212 + }, + { + "auxiliary_loss_clip": 0.01151176, + "auxiliary_loss_mlp": 0.00790191, + "balance_loss_clip": 1.05506837, + "balance_loss_mlp": 1.00062799, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.7367682338185229, + "language_loss": 0.91332102, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93273473, + "num_input_tokens_seen": 36938900, + "step": 1709, + "time_per_iteration": 2.574213743209839 + }, + { + "auxiliary_loss_clip": 0.01124691, + "auxiliary_loss_mlp": 0.01055006, + "balance_loss_clip": 1.05067277, + "balance_loss_mlp": 1.03252363, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 2.1203939042485622, + "language_loss": 0.88573235, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90752935, + "num_input_tokens_seen": 36957010, + "step": 1710, + "time_per_iteration": 2.6529431343078613 + }, + { + "auxiliary_loss_clip": 0.01166649, + "auxiliary_loss_mlp": 0.01054208, + "balance_loss_clip": 1.05479193, + "balance_loss_mlp": 1.03260732, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.9038617442920547, + "language_loss": 0.79354727, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81575584, + "num_input_tokens_seen": 36977690, + "step": 1711, + "time_per_iteration": 2.5678658485412598 + }, + { + "auxiliary_loss_clip": 0.01156116, + "auxiliary_loss_mlp": 0.01056511, + "balance_loss_clip": 1.06044149, + "balance_loss_mlp": 1.03420734, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 1.9610864480894483, + "language_loss": 0.73807353, + "learning_rate": 3.944464476383668e-06, + "loss": 0.7601999, + "num_input_tokens_seen": 36997300, + "step": 1712, + "time_per_iteration": 2.641038417816162 + }, + { + "auxiliary_loss_clip": 0.01130952, + "auxiliary_loss_mlp": 0.01058215, + "balance_loss_clip": 1.05389118, + "balance_loss_mlp": 1.03588676, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 2.597412209884836, + "language_loss": 0.86902726, + "learning_rate": 3.94437329843114e-06, + "loss": 0.89091897, + "num_input_tokens_seen": 37016110, + "step": 1713, + "time_per_iteration": 2.601247787475586 + }, + { + "auxiliary_loss_clip": 0.01163596, + "auxiliary_loss_mlp": 0.0105564, + "balance_loss_clip": 1.05449724, + "balance_loss_mlp": 1.03521919, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 1.6314550495832527, + "language_loss": 0.72659647, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74878889, + "num_input_tokens_seen": 37036405, + "step": 1714, + "time_per_iteration": 2.562166213989258 + }, + { + "auxiliary_loss_clip": 0.01168192, + "auxiliary_loss_mlp": 0.01059537, + "balance_loss_clip": 1.05485415, + "balance_loss_mlp": 1.03641081, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 1.8062506730474521, + "language_loss": 0.90815586, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93043309, + "num_input_tokens_seen": 37057580, + "step": 1715, + "time_per_iteration": 2.5955770015716553 + }, + { + "auxiliary_loss_clip": 0.01166439, + "auxiliary_loss_mlp": 0.010547, + "balance_loss_clip": 1.0530566, + "balance_loss_mlp": 1.03315938, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 2.26858622803554, + "language_loss": 0.75612122, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77833259, + "num_input_tokens_seen": 37079120, + "step": 1716, + "time_per_iteration": 2.6535401344299316 + }, + { + "auxiliary_loss_clip": 0.01158163, + "auxiliary_loss_mlp": 0.01064442, + "balance_loss_clip": 1.05351543, + "balance_loss_mlp": 1.04219759, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 2.0400830411665534, + "language_loss": 0.85564828, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87787431, + "num_input_tokens_seen": 37099710, + "step": 1717, + "time_per_iteration": 2.6705539226531982 + }, + { + "auxiliary_loss_clip": 0.0112593, + "auxiliary_loss_mlp": 0.01063707, + "balance_loss_clip": 1.04886174, + "balance_loss_mlp": 1.04083025, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 1.9177537745585396, + "language_loss": 0.82957298, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85146928, + "num_input_tokens_seen": 37117775, + "step": 1718, + "time_per_iteration": 2.566765308380127 + }, + { + "auxiliary_loss_clip": 0.01165673, + "auxiliary_loss_mlp": 0.01047662, + "balance_loss_clip": 1.05724561, + "balance_loss_mlp": 1.02645516, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 1.665216593618503, + "language_loss": 0.73135453, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75348794, + "num_input_tokens_seen": 37140280, + "step": 1719, + "time_per_iteration": 2.694629669189453 + }, + { + "auxiliary_loss_clip": 0.01161854, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_clip": 1.05459237, + "balance_loss_mlp": 1.03028619, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 1.992297003775664, + "language_loss": 0.93187541, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.95400083, + "num_input_tokens_seen": 37158350, + "step": 1720, + "time_per_iteration": 2.5486457347869873 + }, + { + "auxiliary_loss_clip": 0.01135431, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_clip": 1.04819775, + "balance_loss_mlp": 1.0260253, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 1.8483853083601571, + "language_loss": 0.79792529, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81976289, + "num_input_tokens_seen": 37177120, + "step": 1721, + "time_per_iteration": 2.595365285873413 + }, + { + "auxiliary_loss_clip": 0.01129582, + "auxiliary_loss_mlp": 0.01062574, + "balance_loss_clip": 1.05292332, + "balance_loss_mlp": 1.03684843, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 1.7272273333039831, + "language_loss": 0.81019717, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.83211875, + "num_input_tokens_seen": 37195895, + "step": 1722, + "time_per_iteration": 2.659193754196167 + }, + { + "auxiliary_loss_clip": 0.01055753, + "auxiliary_loss_mlp": 0.01013536, + "balance_loss_clip": 1.02813399, + "balance_loss_mlp": 1.01141405, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9312637352853675, + "language_loss": 0.67082739, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69152033, + "num_input_tokens_seen": 37247270, + "step": 1723, + "time_per_iteration": 2.9061343669891357 + }, + { + "auxiliary_loss_clip": 0.01167961, + "auxiliary_loss_mlp": 0.01055294, + "balance_loss_clip": 1.05398071, + "balance_loss_mlp": 1.03405058, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.911400226851673, + "language_loss": 0.78158391, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80381644, + "num_input_tokens_seen": 37265595, + "step": 1724, + "time_per_iteration": 2.5401546955108643 + }, + { + "auxiliary_loss_clip": 0.01149953, + "auxiliary_loss_mlp": 0.01058579, + "balance_loss_clip": 1.05538702, + "balance_loss_mlp": 1.03728843, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 2.002519367820746, + "language_loss": 0.74830639, + "learning_rate": 3.943273412987676e-06, + "loss": 0.7703917, + "num_input_tokens_seen": 37286660, + "step": 1725, + "time_per_iteration": 4.199955940246582 + }, + { + "auxiliary_loss_clip": 0.01137588, + "auxiliary_loss_mlp": 0.01060117, + "balance_loss_clip": 1.05310881, + "balance_loss_mlp": 1.03759849, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 6.706749727560831, + "language_loss": 0.74800599, + "learning_rate": 3.943181276805054e-06, + "loss": 0.76998299, + "num_input_tokens_seen": 37304915, + "step": 1726, + "time_per_iteration": 4.105961322784424 + }, + { + "auxiliary_loss_clip": 0.01149723, + "auxiliary_loss_mlp": 0.01059689, + "balance_loss_clip": 1.05180025, + "balance_loss_mlp": 1.03713429, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 1.8057972068798496, + "language_loss": 0.73788404, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.75997818, + "num_input_tokens_seen": 37325265, + "step": 1727, + "time_per_iteration": 2.6490092277526855 + }, + { + "auxiliary_loss_clip": 0.01158869, + "auxiliary_loss_mlp": 0.01056231, + "balance_loss_clip": 1.05379534, + "balance_loss_mlp": 1.03402257, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.837721980558425, + "language_loss": 0.84486353, + "learning_rate": 3.942996783386422e-06, + "loss": 0.86701459, + "num_input_tokens_seen": 37341650, + "step": 1728, + "time_per_iteration": 2.5047435760498047 + }, + { + "auxiliary_loss_clip": 0.01154996, + "auxiliary_loss_mlp": 0.01052492, + "balance_loss_clip": 1.05490375, + "balance_loss_mlp": 1.03089154, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.4477123023583727, + "language_loss": 0.70175362, + "learning_rate": 3.942904426157406e-06, + "loss": 0.72382843, + "num_input_tokens_seen": 37360270, + "step": 1729, + "time_per_iteration": 2.6496028900146484 + }, + { + "auxiliary_loss_clip": 0.01160864, + "auxiliary_loss_mlp": 0.01065731, + "balance_loss_clip": 1.05587077, + "balance_loss_mlp": 1.04199636, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.4514809508515794, + "language_loss": 0.8121745, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.83444047, + "num_input_tokens_seen": 37375225, + "step": 1730, + "time_per_iteration": 4.012651681900024 + }, + { + "auxiliary_loss_clip": 0.0109008, + "auxiliary_loss_mlp": 0.01050492, + "balance_loss_clip": 1.04901159, + "balance_loss_mlp": 1.02960634, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 1.752854691111023, + "language_loss": 0.75952041, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78092611, + "num_input_tokens_seen": 37395165, + "step": 1731, + "time_per_iteration": 2.8350515365600586 + }, + { + "auxiliary_loss_clip": 0.01129959, + "auxiliary_loss_mlp": 0.01045547, + "balance_loss_clip": 1.05334055, + "balance_loss_mlp": 1.02547216, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.733877865681966, + "language_loss": 0.82336575, + "learning_rate": 3.9426269124336e-06, + "loss": 0.84512085, + "num_input_tokens_seen": 37414845, + "step": 1732, + "time_per_iteration": 2.731889009475708 + }, + { + "auxiliary_loss_clip": 0.01139085, + "auxiliary_loss_mlp": 0.01055568, + "balance_loss_clip": 1.05689752, + "balance_loss_mlp": 1.03534997, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 1.9779216957338024, + "language_loss": 0.83053309, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85247964, + "num_input_tokens_seen": 37432490, + "step": 1733, + "time_per_iteration": 2.601238250732422 + }, + { + "auxiliary_loss_clip": 0.0115438, + "auxiliary_loss_mlp": 0.01053233, + "balance_loss_clip": 1.05672002, + "balance_loss_mlp": 1.03243113, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.664784533320177, + "language_loss": 0.76242638, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78450251, + "num_input_tokens_seen": 37449435, + "step": 1734, + "time_per_iteration": 2.582468032836914 + }, + { + "auxiliary_loss_clip": 0.01136099, + "auxiliary_loss_mlp": 0.01046632, + "balance_loss_clip": 1.05190277, + "balance_loss_mlp": 1.02613997, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.9074229514550893, + "language_loss": 0.75139558, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.77322292, + "num_input_tokens_seen": 37469105, + "step": 1735, + "time_per_iteration": 2.6715738773345947 + }, + { + "auxiliary_loss_clip": 0.0116648, + "auxiliary_loss_mlp": 0.01052353, + "balance_loss_clip": 1.05610943, + "balance_loss_mlp": 1.03094268, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 8.695819173147676, + "language_loss": 0.78571337, + "learning_rate": 3.94225586284712e-06, + "loss": 0.80790168, + "num_input_tokens_seen": 37490540, + "step": 1736, + "time_per_iteration": 2.6440227031707764 + }, + { + "auxiliary_loss_clip": 0.01167978, + "auxiliary_loss_mlp": 0.01058993, + "balance_loss_clip": 1.05848515, + "balance_loss_mlp": 1.0371902, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.7766723989724773, + "language_loss": 0.70739686, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72966659, + "num_input_tokens_seen": 37511905, + "step": 1737, + "time_per_iteration": 2.6217782497406006 + }, + { + "auxiliary_loss_clip": 0.01151042, + "auxiliary_loss_mlp": 0.01064331, + "balance_loss_clip": 1.0479176, + "balance_loss_mlp": 1.03967834, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.765251892963146, + "language_loss": 0.81870794, + "learning_rate": 3.942069896136581e-06, + "loss": 0.84086168, + "num_input_tokens_seen": 37533635, + "step": 1738, + "time_per_iteration": 2.7427544593811035 + }, + { + "auxiliary_loss_clip": 0.0118327, + "auxiliary_loss_mlp": 0.01063202, + "balance_loss_clip": 1.05700469, + "balance_loss_mlp": 1.04000413, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 1.9335992027377569, + "language_loss": 0.75305939, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77552414, + "num_input_tokens_seen": 37552035, + "step": 1739, + "time_per_iteration": 2.537738084793091 + }, + { + "auxiliary_loss_clip": 0.01146042, + "auxiliary_loss_mlp": 0.01057621, + "balance_loss_clip": 1.05356896, + "balance_loss_mlp": 1.03575778, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.7344583682708439, + "language_loss": 0.77544534, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79748201, + "num_input_tokens_seen": 37571540, + "step": 1740, + "time_per_iteration": 2.617509603500366 + }, + { + "auxiliary_loss_clip": 0.01155092, + "auxiliary_loss_mlp": 0.01054745, + "balance_loss_clip": 1.06022263, + "balance_loss_mlp": 1.03297734, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.2618305475897764, + "language_loss": 0.86112869, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88322699, + "num_input_tokens_seen": 37588265, + "step": 1741, + "time_per_iteration": 2.595384359359741 + }, + { + "auxiliary_loss_clip": 0.0115545, + "auxiliary_loss_mlp": 0.0105531, + "balance_loss_clip": 1.05448651, + "balance_loss_mlp": 1.03186202, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 2.149451759866613, + "language_loss": 0.74825668, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77036428, + "num_input_tokens_seen": 37606860, + "step": 1742, + "time_per_iteration": 4.176445960998535 + }, + { + "auxiliary_loss_clip": 0.01128387, + "auxiliary_loss_mlp": 0.01062684, + "balance_loss_clip": 1.05529785, + "balance_loss_mlp": 1.04158449, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 2.079206566831589, + "language_loss": 0.87416452, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89607525, + "num_input_tokens_seen": 37625210, + "step": 1743, + "time_per_iteration": 2.648805618286133 + }, + { + "auxiliary_loss_clip": 0.01137284, + "auxiliary_loss_mlp": 0.01052903, + "balance_loss_clip": 1.05259228, + "balance_loss_mlp": 1.02907336, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.3411527798965626, + "language_loss": 0.76038426, + "learning_rate": 3.941510228674391e-06, + "loss": 0.78228611, + "num_input_tokens_seen": 37644110, + "step": 1744, + "time_per_iteration": 2.6350467205047607 + }, + { + "auxiliary_loss_clip": 0.01166897, + "auxiliary_loss_mlp": 0.01056907, + "balance_loss_clip": 1.05786586, + "balance_loss_mlp": 1.03701067, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.0808371447058547, + "language_loss": 0.7882635, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81050152, + "num_input_tokens_seen": 37665800, + "step": 1745, + "time_per_iteration": 2.719745397567749 + }, + { + "auxiliary_loss_clip": 0.01179017, + "auxiliary_loss_mlp": 0.01068318, + "balance_loss_clip": 1.05616665, + "balance_loss_mlp": 1.04650271, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.610087024585272, + "language_loss": 0.83000231, + "learning_rate": 3.941323083837794e-06, + "loss": 0.85247558, + "num_input_tokens_seen": 37685095, + "step": 1746, + "time_per_iteration": 2.5664680004119873 + }, + { + "auxiliary_loss_clip": 0.01156883, + "auxiliary_loss_mlp": 0.01062349, + "balance_loss_clip": 1.05620921, + "balance_loss_mlp": 1.04120088, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.5856931777054324, + "language_loss": 0.70258319, + "learning_rate": 3.941229400994971e-06, + "loss": 0.72477555, + "num_input_tokens_seen": 37707445, + "step": 1747, + "time_per_iteration": 2.8110556602478027 + }, + { + "auxiliary_loss_clip": 0.01158115, + "auxiliary_loss_mlp": 0.01062617, + "balance_loss_clip": 1.05945039, + "balance_loss_mlp": 1.0407654, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.455186046854418, + "language_loss": 0.84057486, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86278218, + "num_input_tokens_seen": 37728325, + "step": 1748, + "time_per_iteration": 2.6956310272216797 + }, + { + "auxiliary_loss_clip": 0.01173114, + "auxiliary_loss_mlp": 0.01051666, + "balance_loss_clip": 1.05335236, + "balance_loss_mlp": 1.02904046, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 2.0546338574822722, + "language_loss": 0.71788895, + "learning_rate": 3.941041814478041e-06, + "loss": 0.7401368, + "num_input_tokens_seen": 37748910, + "step": 1749, + "time_per_iteration": 2.560124158859253 + }, + { + "auxiliary_loss_clip": 0.01156888, + "auxiliary_loss_mlp": 0.01065965, + "balance_loss_clip": 1.05391288, + "balance_loss_mlp": 1.04420972, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 1.943711678053668, + "language_loss": 0.81800115, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84022963, + "num_input_tokens_seen": 37765745, + "step": 1750, + "time_per_iteration": 2.6097183227539062 + }, + { + "auxiliary_loss_clip": 0.01153091, + "auxiliary_loss_mlp": 0.01059819, + "balance_loss_clip": 1.05721736, + "balance_loss_mlp": 1.03794372, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.293064097438184, + "language_loss": 0.92616206, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94829118, + "num_input_tokens_seen": 37780520, + "step": 1751, + "time_per_iteration": 2.5793137550354004 + }, + { + "auxiliary_loss_clip": 0.01164309, + "auxiliary_loss_mlp": 0.01050551, + "balance_loss_clip": 1.05678129, + "balance_loss_mlp": 1.02939188, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 1.990898043919489, + "language_loss": 0.78976035, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81190896, + "num_input_tokens_seen": 37799515, + "step": 1752, + "time_per_iteration": 2.5604124069213867 + }, + { + "auxiliary_loss_clip": 0.01114484, + "auxiliary_loss_mlp": 0.01059404, + "balance_loss_clip": 1.04932964, + "balance_loss_mlp": 1.03591955, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 2.156400300443941, + "language_loss": 0.75487006, + "learning_rate": 3.940665758218686e-06, + "loss": 0.77660894, + "num_input_tokens_seen": 37818695, + "step": 1753, + "time_per_iteration": 2.6833736896514893 + }, + { + "auxiliary_loss_clip": 0.01139697, + "auxiliary_loss_mlp": 0.01059194, + "balance_loss_clip": 1.05534041, + "balance_loss_mlp": 1.03653228, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.121907480472037, + "language_loss": 0.83823061, + "learning_rate": 3.940571560169328e-06, + "loss": 0.8602196, + "num_input_tokens_seen": 37837860, + "step": 1754, + "time_per_iteration": 2.654599666595459 + }, + { + "auxiliary_loss_clip": 0.01131146, + "auxiliary_loss_mlp": 0.01051658, + "balance_loss_clip": 1.05292606, + "balance_loss_mlp": 1.02822185, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.5283631324744915, + "language_loss": 0.69048226, + "learning_rate": 3.940477288533302e-06, + "loss": 0.71231031, + "num_input_tokens_seen": 37856260, + "step": 1755, + "time_per_iteration": 2.677927255630493 + }, + { + "auxiliary_loss_clip": 0.01162481, + "auxiliary_loss_mlp": 0.01063888, + "balance_loss_clip": 1.05399442, + "balance_loss_mlp": 1.04164386, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 2.243752463518738, + "language_loss": 0.76608282, + "learning_rate": 3.940382943314182e-06, + "loss": 0.78834653, + "num_input_tokens_seen": 37876960, + "step": 1756, + "time_per_iteration": 2.662473201751709 + }, + { + "auxiliary_loss_clip": 0.01180275, + "auxiliary_loss_mlp": 0.01064573, + "balance_loss_clip": 1.05767941, + "balance_loss_mlp": 1.04375958, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.6926788596486282, + "language_loss": 0.7988379, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82128632, + "num_input_tokens_seen": 37897070, + "step": 1757, + "time_per_iteration": 2.546818494796753 + }, + { + "auxiliary_loss_clip": 0.01148322, + "auxiliary_loss_mlp": 0.01057281, + "balance_loss_clip": 1.05208254, + "balance_loss_mlp": 1.03597832, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.569553608158069, + "language_loss": 0.78742921, + "learning_rate": 3.940194032140976e-06, + "loss": 0.80948532, + "num_input_tokens_seen": 37923635, + "step": 1758, + "time_per_iteration": 2.991955518722534 + }, + { + "auxiliary_loss_clip": 0.01162027, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_clip": 1.05724859, + "balance_loss_mlp": 1.0282445, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 2.1187548474276534, + "language_loss": 0.91685933, + "learning_rate": 3.940099466194054e-06, + "loss": 0.93897772, + "num_input_tokens_seen": 37942650, + "step": 1759, + "time_per_iteration": 2.6118459701538086 + }, + { + "auxiliary_loss_clip": 0.01155636, + "auxiliary_loss_mlp": 0.010545, + "balance_loss_clip": 1.05396473, + "balance_loss_mlp": 1.03144526, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.380608940994575, + "language_loss": 0.77426791, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79636925, + "num_input_tokens_seen": 37960660, + "step": 1760, + "time_per_iteration": 2.6519179344177246 + }, + { + "auxiliary_loss_clip": 0.01163092, + "auxiliary_loss_mlp": 0.01068171, + "balance_loss_clip": 1.05521607, + "balance_loss_mlp": 1.0440197, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.4336009466891206, + "language_loss": 0.8940388, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91635144, + "num_input_tokens_seen": 37978625, + "step": 1761, + "time_per_iteration": 2.625154972076416 + }, + { + "auxiliary_loss_clip": 0.01108244, + "auxiliary_loss_mlp": 0.00793484, + "balance_loss_clip": 1.04664528, + "balance_loss_mlp": 1.00047135, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.012858962011309, + "language_loss": 0.78028482, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.7993021, + "num_input_tokens_seen": 38000005, + "step": 1762, + "time_per_iteration": 2.8852591514587402 + }, + { + "auxiliary_loss_clip": 0.01054949, + "auxiliary_loss_mlp": 0.01019965, + "balance_loss_clip": 1.03022754, + "balance_loss_mlp": 1.01713932, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.7560026207896515, + "language_loss": 0.60523981, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62598896, + "num_input_tokens_seen": 38066165, + "step": 1763, + "time_per_iteration": 3.3564815521240234 + }, + { + "auxiliary_loss_clip": 0.0115479, + "auxiliary_loss_mlp": 0.01052535, + "balance_loss_clip": 1.05514836, + "balance_loss_mlp": 1.03167319, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 1.7989960193312127, + "language_loss": 0.80078626, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82285953, + "num_input_tokens_seen": 38086150, + "step": 1764, + "time_per_iteration": 4.155576467514038 + }, + { + "auxiliary_loss_clip": 0.01138897, + "auxiliary_loss_mlp": 0.01061933, + "balance_loss_clip": 1.05014038, + "balance_loss_mlp": 1.03701854, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 2.7738788708377404, + "language_loss": 0.79804468, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82005292, + "num_input_tokens_seen": 38104205, + "step": 1765, + "time_per_iteration": 2.5937130451202393 + }, + { + "auxiliary_loss_clip": 0.01163515, + "auxiliary_loss_mlp": 0.01054409, + "balance_loss_clip": 1.0559206, + "balance_loss_mlp": 1.03259408, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.9074456246690272, + "language_loss": 0.77116352, + "learning_rate": 3.939435444841306e-06, + "loss": 0.79334271, + "num_input_tokens_seen": 38122005, + "step": 1766, + "time_per_iteration": 4.074598073959351 + }, + { + "auxiliary_loss_clip": 0.011828, + "auxiliary_loss_mlp": 0.01062627, + "balance_loss_clip": 1.06135798, + "balance_loss_mlp": 1.04075205, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.7382599835416654, + "language_loss": 0.77870107, + "learning_rate": 3.939340290444895e-06, + "loss": 0.80115533, + "num_input_tokens_seen": 38143365, + "step": 1767, + "time_per_iteration": 2.6036555767059326 + }, + { + "auxiliary_loss_clip": 0.01006519, + "auxiliary_loss_mlp": 0.01005442, + "balance_loss_clip": 1.02817535, + "balance_loss_mlp": 1.00224769, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6910944019173448, + "language_loss": 0.5790484, + "learning_rate": 3.939245062508506e-06, + "loss": 0.599168, + "num_input_tokens_seen": 38210035, + "step": 1768, + "time_per_iteration": 3.875718116760254 + }, + { + "auxiliary_loss_clip": 0.01142392, + "auxiliary_loss_mlp": 0.0104367, + "balance_loss_clip": 1.0527916, + "balance_loss_mlp": 1.02333319, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.4147933974203857, + "language_loss": 0.86498606, + "learning_rate": 3.939149761035749e-06, + "loss": 0.88684666, + "num_input_tokens_seen": 38231230, + "step": 1769, + "time_per_iteration": 4.782676696777344 + }, + { + "auxiliary_loss_clip": 0.01139689, + "auxiliary_loss_mlp": 0.00790111, + "balance_loss_clip": 1.05339313, + "balance_loss_mlp": 1.00045466, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.868621364603988, + "language_loss": 0.61926013, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.63855815, + "num_input_tokens_seen": 38253890, + "step": 1770, + "time_per_iteration": 4.27446985244751 + }, + { + "auxiliary_loss_clip": 0.01065413, + "auxiliary_loss_mlp": 0.01005279, + "balance_loss_clip": 1.02916014, + "balance_loss_mlp": 1.00306213, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8855391904218706, + "language_loss": 0.57057232, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59127927, + "num_input_tokens_seen": 38304290, + "step": 1771, + "time_per_iteration": 3.0315463542938232 + }, + { + "auxiliary_loss_clip": 0.01141733, + "auxiliary_loss_mlp": 0.01063732, + "balance_loss_clip": 1.05332041, + "balance_loss_mlp": 1.04326344, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 2.622994640747814, + "language_loss": 0.88393074, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90598536, + "num_input_tokens_seen": 38324725, + "step": 1772, + "time_per_iteration": 2.6578779220581055 + }, + { + "auxiliary_loss_clip": 0.01183263, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_clip": 1.05758131, + "balance_loss_mlp": 1.03201962, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.9409064407998726, + "language_loss": 0.76253498, + "learning_rate": 3.93876781985337e-06, + "loss": 0.7849136, + "num_input_tokens_seen": 38340735, + "step": 1773, + "time_per_iteration": 2.5085318088531494 + }, + { + "auxiliary_loss_clip": 0.01127271, + "auxiliary_loss_mlp": 0.01065744, + "balance_loss_clip": 1.05096102, + "balance_loss_mlp": 1.04154468, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 1.9644523732712762, + "language_loss": 0.82493925, + "learning_rate": 3.938672150753041e-06, + "loss": 0.84686935, + "num_input_tokens_seen": 38361315, + "step": 1774, + "time_per_iteration": 2.8033511638641357 + }, + { + "auxiliary_loss_clip": 0.01151785, + "auxiliary_loss_mlp": 0.00790759, + "balance_loss_clip": 1.05551577, + "balance_loss_mlp": 1.00044644, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 3.461855544531955, + "language_loss": 0.76492035, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78434581, + "num_input_tokens_seen": 38377425, + "step": 1775, + "time_per_iteration": 2.6319680213928223 + }, + { + "auxiliary_loss_clip": 0.01069779, + "auxiliary_loss_mlp": 0.01004452, + "balance_loss_clip": 1.02561402, + "balance_loss_mlp": 1.00227058, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8371622627067166, + "language_loss": 0.574444, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59518623, + "num_input_tokens_seen": 38440275, + "step": 1776, + "time_per_iteration": 3.100587844848633 + }, + { + "auxiliary_loss_clip": 0.01149844, + "auxiliary_loss_mlp": 0.01060836, + "balance_loss_clip": 1.05126452, + "balance_loss_mlp": 1.03680301, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 2.0409723278540715, + "language_loss": 0.83484912, + "learning_rate": 3.938384702378727e-06, + "loss": 0.85695595, + "num_input_tokens_seen": 38461820, + "step": 1777, + "time_per_iteration": 2.640610933303833 + }, + { + "auxiliary_loss_clip": 0.01114721, + "auxiliary_loss_mlp": 0.00790639, + "balance_loss_clip": 1.04944456, + "balance_loss_mlp": 1.00045061, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 1.8268065489118674, + "language_loss": 0.87241507, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89146864, + "num_input_tokens_seen": 38482235, + "step": 1778, + "time_per_iteration": 2.6951727867126465 + }, + { + "auxiliary_loss_clip": 0.01151882, + "auxiliary_loss_mlp": 0.00789508, + "balance_loss_clip": 1.07103992, + "balance_loss_mlp": 1.00039494, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 2.202937205700062, + "language_loss": 0.84230745, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86172134, + "num_input_tokens_seen": 38500690, + "step": 1779, + "time_per_iteration": 2.706355571746826 + }, + { + "auxiliary_loss_clip": 0.01139427, + "auxiliary_loss_mlp": 0.00790054, + "balance_loss_clip": 1.05172658, + "balance_loss_mlp": 1.00048089, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 4.465665319848064, + "language_loss": 0.67390978, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69320458, + "num_input_tokens_seen": 38518405, + "step": 1780, + "time_per_iteration": 2.628316640853882 + }, + { + "auxiliary_loss_clip": 0.01160646, + "auxiliary_loss_mlp": 0.01050963, + "balance_loss_clip": 1.05531311, + "balance_loss_mlp": 1.03000629, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.4546858330096657, + "language_loss": 0.91940141, + "learning_rate": 3.938000408844265e-06, + "loss": 0.94151747, + "num_input_tokens_seen": 38535060, + "step": 1781, + "time_per_iteration": 4.139793395996094 + }, + { + "auxiliary_loss_clip": 0.01129529, + "auxiliary_loss_mlp": 0.01054964, + "balance_loss_clip": 1.05291343, + "balance_loss_mlp": 1.03448343, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 2.1660157395891213, + "language_loss": 0.79327738, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81512225, + "num_input_tokens_seen": 38552855, + "step": 1782, + "time_per_iteration": 2.6742563247680664 + }, + { + "auxiliary_loss_clip": 0.01156608, + "auxiliary_loss_mlp": 0.01058304, + "balance_loss_clip": 1.05443871, + "balance_loss_mlp": 1.03685772, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 4.000193145095136, + "language_loss": 0.79075485, + "learning_rate": 3.937807821127436e-06, + "loss": 0.812904, + "num_input_tokens_seen": 38570075, + "step": 1783, + "time_per_iteration": 2.5747549533843994 + }, + { + "auxiliary_loss_clip": 0.0116148, + "auxiliary_loss_mlp": 0.01060999, + "balance_loss_clip": 1.05637908, + "balance_loss_mlp": 1.03893304, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 1.8447425999471398, + "language_loss": 0.86452752, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88675231, + "num_input_tokens_seen": 38587970, + "step": 1784, + "time_per_iteration": 2.6569366455078125 + }, + { + "auxiliary_loss_clip": 0.01149089, + "auxiliary_loss_mlp": 0.01060859, + "balance_loss_clip": 1.05330253, + "balance_loss_mlp": 1.03830433, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 2.5443287682055997, + "language_loss": 1.01097846, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03307796, + "num_input_tokens_seen": 38605840, + "step": 1785, + "time_per_iteration": 2.640477418899536 + }, + { + "auxiliary_loss_clip": 0.01167503, + "auxiliary_loss_mlp": 0.01059039, + "balance_loss_clip": 1.05876362, + "balance_loss_mlp": 1.03839159, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.7537125015203787, + "language_loss": 0.84933406, + "learning_rate": 3.937518388447339e-06, + "loss": 0.8715995, + "num_input_tokens_seen": 38627070, + "step": 1786, + "time_per_iteration": 2.6270267963409424 + }, + { + "auxiliary_loss_clip": 0.01177437, + "auxiliary_loss_mlp": 0.01056548, + "balance_loss_clip": 1.05455041, + "balance_loss_mlp": 1.03296828, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 2.0139438321667664, + "language_loss": 0.788203, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81054282, + "num_input_tokens_seen": 38645840, + "step": 1787, + "time_per_iteration": 2.516249656677246 + }, + { + "auxiliary_loss_clip": 0.01171375, + "auxiliary_loss_mlp": 0.01048265, + "balance_loss_clip": 1.0564611, + "balance_loss_mlp": 1.02637827, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 2.685508692825638, + "language_loss": 0.82466197, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84685832, + "num_input_tokens_seen": 38664770, + "step": 1788, + "time_per_iteration": 2.5640714168548584 + }, + { + "auxiliary_loss_clip": 0.0118068, + "auxiliary_loss_mlp": 0.01069263, + "balance_loss_clip": 1.05798256, + "balance_loss_mlp": 1.04869902, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.2796312533963197, + "language_loss": 0.78017688, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80267626, + "num_input_tokens_seen": 38683865, + "step": 1789, + "time_per_iteration": 2.483795642852783 + }, + { + "auxiliary_loss_clip": 0.01181341, + "auxiliary_loss_mlp": 0.01065713, + "balance_loss_clip": 1.05815792, + "balance_loss_mlp": 1.04163301, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 4.288177340301529, + "language_loss": 0.74668002, + "learning_rate": 3.937131449631859e-06, + "loss": 0.76915056, + "num_input_tokens_seen": 38702485, + "step": 1790, + "time_per_iteration": 2.587360143661499 + }, + { + "auxiliary_loss_clip": 0.0117492, + "auxiliary_loss_mlp": 0.00790678, + "balance_loss_clip": 1.058483, + "balance_loss_mlp": 1.00053811, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 3.8886300621752996, + "language_loss": 0.78318369, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80283964, + "num_input_tokens_seen": 38722475, + "step": 1791, + "time_per_iteration": 2.576131820678711 + }, + { + "auxiliary_loss_clip": 0.01135671, + "auxiliary_loss_mlp": 0.01060789, + "balance_loss_clip": 1.05354357, + "balance_loss_mlp": 1.03791237, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.6567974260797664, + "language_loss": 0.70828331, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73024791, + "num_input_tokens_seen": 38743285, + "step": 1792, + "time_per_iteration": 2.6692328453063965 + }, + { + "auxiliary_loss_clip": 0.01146977, + "auxiliary_loss_mlp": 0.01051283, + "balance_loss_clip": 1.0541178, + "balance_loss_mlp": 1.02793026, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 2.1846525909501175, + "language_loss": 0.76199532, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78397793, + "num_input_tokens_seen": 38763035, + "step": 1793, + "time_per_iteration": 2.634636640548706 + }, + { + "auxiliary_loss_clip": 0.01117598, + "auxiliary_loss_mlp": 0.01062256, + "balance_loss_clip": 1.05111289, + "balance_loss_mlp": 1.03935599, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.6065133083789347, + "language_loss": 0.85061598, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87241447, + "num_input_tokens_seen": 38784900, + "step": 1794, + "time_per_iteration": 2.6340794563293457 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.01052543, + "balance_loss_clip": 1.04788637, + "balance_loss_mlp": 1.02946448, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.9790616040466207, + "language_loss": 0.75026584, + "learning_rate": 3.936646123375246e-06, + "loss": 0.7719478, + "num_input_tokens_seen": 38804695, + "step": 1795, + "time_per_iteration": 2.651160955429077 + }, + { + "auxiliary_loss_clip": 0.01126692, + "auxiliary_loss_mlp": 0.01058308, + "balance_loss_clip": 1.04790092, + "balance_loss_mlp": 1.03497899, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.840321703044773, + "language_loss": 0.81789839, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83974838, + "num_input_tokens_seen": 38822395, + "step": 1796, + "time_per_iteration": 2.622974395751953 + }, + { + "auxiliary_loss_clip": 0.01140361, + "auxiliary_loss_mlp": 0.01080245, + "balance_loss_clip": 1.05124056, + "balance_loss_mlp": 1.05428123, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 3.8973699824118273, + "language_loss": 0.73795974, + "learning_rate": 3.936451478782111e-06, + "loss": 0.76016575, + "num_input_tokens_seen": 38839865, + "step": 1797, + "time_per_iteration": 2.5474541187286377 + }, + { + "auxiliary_loss_clip": 0.01161174, + "auxiliary_loss_mlp": 0.01057976, + "balance_loss_clip": 1.05544257, + "balance_loss_mlp": 1.03723359, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 7.268428776559952, + "language_loss": 0.81293637, + "learning_rate": 3.936354046338046e-06, + "loss": 0.83512789, + "num_input_tokens_seen": 38857300, + "step": 1798, + "time_per_iteration": 2.5784342288970947 + }, + { + "auxiliary_loss_clip": 0.01136668, + "auxiliary_loss_mlp": 0.01055245, + "balance_loss_clip": 1.04974174, + "balance_loss_mlp": 1.03203511, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.554589566994558, + "language_loss": 0.8544414, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87636054, + "num_input_tokens_seen": 38874960, + "step": 1799, + "time_per_iteration": 2.583448886871338 + }, + { + "auxiliary_loss_clip": 0.01154523, + "auxiliary_loss_mlp": 0.0106183, + "balance_loss_clip": 1.05589867, + "balance_loss_mlp": 1.04059839, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 1.849150370938147, + "language_loss": 0.77013671, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79230022, + "num_input_tokens_seen": 38893610, + "step": 1800, + "time_per_iteration": 2.5372753143310547 + }, + { + "auxiliary_loss_clip": 0.01175759, + "auxiliary_loss_mlp": 0.01051312, + "balance_loss_clip": 1.05699313, + "balance_loss_mlp": 1.0309391, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 1.7745109128260488, + "language_loss": 0.72568023, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.74795091, + "num_input_tokens_seen": 38913485, + "step": 1801, + "time_per_iteration": 2.5196115970611572 + }, + { + "auxiliary_loss_clip": 0.01185383, + "auxiliary_loss_mlp": 0.01052383, + "balance_loss_clip": 1.05851161, + "balance_loss_mlp": 1.03144956, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 2.94781789634261, + "language_loss": 0.66641873, + "learning_rate": 3.935963582331381e-06, + "loss": 0.6887964, + "num_input_tokens_seen": 38935650, + "step": 1802, + "time_per_iteration": 2.5593748092651367 + }, + { + "auxiliary_loss_clip": 0.01155844, + "auxiliary_loss_mlp": 0.01059716, + "balance_loss_clip": 1.05405486, + "balance_loss_mlp": 1.03800845, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.8953939077777198, + "language_loss": 0.81430697, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83646262, + "num_input_tokens_seen": 38954130, + "step": 1803, + "time_per_iteration": 3.935957908630371 + }, + { + "auxiliary_loss_clip": 0.01162166, + "auxiliary_loss_mlp": 0.01059302, + "balance_loss_clip": 1.05638289, + "balance_loss_mlp": 1.03689027, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 1.6030821598022085, + "language_loss": 0.91217327, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93438792, + "num_input_tokens_seen": 38972905, + "step": 1804, + "time_per_iteration": 2.511613130569458 + }, + { + "auxiliary_loss_clip": 0.01133446, + "auxiliary_loss_mlp": 0.01055467, + "balance_loss_clip": 1.05484068, + "balance_loss_mlp": 1.03243566, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 2.0768722251566762, + "language_loss": 0.76256883, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78445798, + "num_input_tokens_seen": 38993255, + "step": 1805, + "time_per_iteration": 4.066768169403076 + }, + { + "auxiliary_loss_clip": 0.01150757, + "auxiliary_loss_mlp": 0.01049825, + "balance_loss_clip": 1.0562923, + "balance_loss_mlp": 1.02920175, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 2.0079145023726976, + "language_loss": 0.85953987, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88154566, + "num_input_tokens_seen": 39012610, + "step": 1806, + "time_per_iteration": 2.6104023456573486 + }, + { + "auxiliary_loss_clip": 0.01167974, + "auxiliary_loss_mlp": 0.00789841, + "balance_loss_clip": 1.0546267, + "balance_loss_mlp": 1.00052047, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 8.326253138353062, + "language_loss": 0.80893493, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82851309, + "num_input_tokens_seen": 39030120, + "step": 1807, + "time_per_iteration": 2.491464138031006 + }, + { + "auxiliary_loss_clip": 0.01139557, + "auxiliary_loss_mlp": 0.01055174, + "balance_loss_clip": 1.05312788, + "balance_loss_mlp": 1.03500378, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.6686135769161918, + "language_loss": 0.78960526, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.81155252, + "num_input_tokens_seen": 39049875, + "step": 1808, + "time_per_iteration": 2.6029434204101562 + }, + { + "auxiliary_loss_clip": 0.01158396, + "auxiliary_loss_mlp": 0.01058649, + "balance_loss_clip": 1.05971026, + "balance_loss_mlp": 1.03751326, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.664868528863115, + "language_loss": 0.79258823, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81475866, + "num_input_tokens_seen": 39068935, + "step": 1809, + "time_per_iteration": 4.067451477050781 + }, + { + "auxiliary_loss_clip": 0.01179884, + "auxiliary_loss_mlp": 0.0105887, + "balance_loss_clip": 1.05867195, + "balance_loss_mlp": 1.03745961, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.820506048212518, + "language_loss": 0.85030955, + "learning_rate": 3.935179130783046e-06, + "loss": 0.87269706, + "num_input_tokens_seen": 39087370, + "step": 1810, + "time_per_iteration": 2.4912214279174805 + }, + { + "auxiliary_loss_clip": 0.01127555, + "auxiliary_loss_mlp": 0.01058848, + "balance_loss_clip": 1.04568744, + "balance_loss_mlp": 1.03518558, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.753988421935799, + "language_loss": 0.63162339, + "learning_rate": 3.935080744080564e-06, + "loss": 0.65348744, + "num_input_tokens_seen": 39106635, + "step": 1811, + "time_per_iteration": 2.6121976375579834 + }, + { + "auxiliary_loss_clip": 0.01152017, + "auxiliary_loss_mlp": 0.0105059, + "balance_loss_clip": 1.05398369, + "balance_loss_mlp": 1.02870357, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 2.0083973755602504, + "language_loss": 0.74176192, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76378798, + "num_input_tokens_seen": 39126335, + "step": 1812, + "time_per_iteration": 2.5792033672332764 + }, + { + "auxiliary_loss_clip": 0.01144611, + "auxiliary_loss_mlp": 0.01052226, + "balance_loss_clip": 1.05317211, + "balance_loss_mlp": 1.03032684, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 8.7937166949973, + "language_loss": 0.72628158, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74824989, + "num_input_tokens_seen": 39144820, + "step": 1813, + "time_per_iteration": 2.5260202884674072 + }, + { + "auxiliary_loss_clip": 0.0114059, + "auxiliary_loss_mlp": 0.01052726, + "balance_loss_clip": 1.0518899, + "balance_loss_mlp": 1.03153086, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 1.7002698575266966, + "language_loss": 0.82563907, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84757221, + "num_input_tokens_seen": 39165945, + "step": 1814, + "time_per_iteration": 2.6070778369903564 + }, + { + "auxiliary_loss_clip": 0.01142786, + "auxiliary_loss_mlp": 0.01055383, + "balance_loss_clip": 1.05678582, + "balance_loss_mlp": 1.03473616, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.9798192660730454, + "language_loss": 0.84046984, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86245155, + "num_input_tokens_seen": 39183520, + "step": 1815, + "time_per_iteration": 2.6458394527435303 + }, + { + "auxiliary_loss_clip": 0.01150398, + "auxiliary_loss_mlp": 0.01052937, + "balance_loss_clip": 1.05875516, + "balance_loss_mlp": 1.03040671, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.3322695340257114, + "language_loss": 0.71862137, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.74065471, + "num_input_tokens_seen": 39201190, + "step": 1816, + "time_per_iteration": 2.5723016262054443 + }, + { + "auxiliary_loss_clip": 0.01167442, + "auxiliary_loss_mlp": 0.01060679, + "balance_loss_clip": 1.05560398, + "balance_loss_mlp": 1.03837466, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.4061864197617537, + "language_loss": 0.72830284, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75058407, + "num_input_tokens_seen": 39221210, + "step": 1817, + "time_per_iteration": 2.581696033477783 + }, + { + "auxiliary_loss_clip": 0.01120182, + "auxiliary_loss_mlp": 0.01054661, + "balance_loss_clip": 1.05510259, + "balance_loss_mlp": 1.03251159, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.7233108248418396, + "language_loss": 0.67648876, + "learning_rate": 3.934389982775706e-06, + "loss": 0.69823724, + "num_input_tokens_seen": 39242025, + "step": 1818, + "time_per_iteration": 2.697012186050415 + }, + { + "auxiliary_loss_clip": 0.01157029, + "auxiliary_loss_mlp": 0.01060348, + "balance_loss_clip": 1.06025004, + "balance_loss_mlp": 1.03846169, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.349912929034194, + "language_loss": 0.72978675, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75196052, + "num_input_tokens_seen": 39259870, + "step": 1819, + "time_per_iteration": 2.5515871047973633 + }, + { + "auxiliary_loss_clip": 0.01144886, + "auxiliary_loss_mlp": 0.00788446, + "balance_loss_clip": 1.05566919, + "balance_loss_mlp": 1.00048327, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 4.325036768376122, + "language_loss": 0.73520875, + "learning_rate": 3.934191962176335e-06, + "loss": 0.75454199, + "num_input_tokens_seen": 39278500, + "step": 1820, + "time_per_iteration": 2.567979574203491 + }, + { + "auxiliary_loss_clip": 0.01183832, + "auxiliary_loss_mlp": 0.01053124, + "balance_loss_clip": 1.06304061, + "balance_loss_mlp": 1.02999735, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.3846624158620155, + "language_loss": 0.8284986, + "learning_rate": 3.934092841857642e-06, + "loss": 0.85086811, + "num_input_tokens_seen": 39294800, + "step": 1821, + "time_per_iteration": 3.981693983078003 + }, + { + "auxiliary_loss_clip": 0.01149995, + "auxiliary_loss_mlp": 0.01052237, + "balance_loss_clip": 1.05635619, + "balance_loss_mlp": 1.03145862, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 15.263825190802061, + "language_loss": 0.7632246, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78524697, + "num_input_tokens_seen": 39314625, + "step": 1822, + "time_per_iteration": 2.6077771186828613 + }, + { + "auxiliary_loss_clip": 0.01149109, + "auxiliary_loss_mlp": 0.01050708, + "balance_loss_clip": 1.05548811, + "balance_loss_mlp": 1.03004849, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 2.2690219893346946, + "language_loss": 0.79669696, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81869513, + "num_input_tokens_seen": 39336465, + "step": 1823, + "time_per_iteration": 2.650688648223877 + }, + { + "auxiliary_loss_clip": 0.01152484, + "auxiliary_loss_mlp": 0.01047781, + "balance_loss_clip": 1.05562878, + "balance_loss_mlp": 1.02650237, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.6406845446534102, + "language_loss": 0.7968787, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81888133, + "num_input_tokens_seen": 39357930, + "step": 1824, + "time_per_iteration": 2.5979342460632324 + }, + { + "auxiliary_loss_clip": 0.01144702, + "auxiliary_loss_mlp": 0.01054561, + "balance_loss_clip": 1.05441082, + "balance_loss_mlp": 1.03336573, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 1.8144731726232992, + "language_loss": 0.87595499, + "learning_rate": 3.933695627210554e-06, + "loss": 0.89794755, + "num_input_tokens_seen": 39376380, + "step": 1825, + "time_per_iteration": 2.56832218170166 + }, + { + "auxiliary_loss_clip": 0.01132616, + "auxiliary_loss_mlp": 0.01052646, + "balance_loss_clip": 1.05034709, + "balance_loss_mlp": 1.03175998, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.8416337522474673, + "language_loss": 0.76509726, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78694987, + "num_input_tokens_seen": 39399935, + "step": 1826, + "time_per_iteration": 2.738478899002075 + }, + { + "auxiliary_loss_clip": 0.01086137, + "auxiliary_loss_mlp": 0.01001929, + "balance_loss_clip": 1.04940557, + "balance_loss_mlp": 0.99894869, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8247314005659275, + "language_loss": 0.54975301, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57063371, + "num_input_tokens_seen": 39460685, + "step": 1827, + "time_per_iteration": 3.074347734451294 + }, + { + "auxiliary_loss_clip": 0.01093891, + "auxiliary_loss_mlp": 0.01005648, + "balance_loss_clip": 1.04818392, + "balance_loss_mlp": 1.00278652, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7330655965145717, + "language_loss": 0.55374467, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57474005, + "num_input_tokens_seen": 39524765, + "step": 1828, + "time_per_iteration": 3.0687246322631836 + }, + { + "auxiliary_loss_clip": 0.01158803, + "auxiliary_loss_mlp": 0.01056038, + "balance_loss_clip": 1.05855441, + "balance_loss_mlp": 1.03448462, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.8812945255579576, + "language_loss": 0.83862305, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86077148, + "num_input_tokens_seen": 39543640, + "step": 1829, + "time_per_iteration": 2.608607769012451 + }, + { + "auxiliary_loss_clip": 0.0113081, + "auxiliary_loss_mlp": 0.01055856, + "balance_loss_clip": 1.05748987, + "balance_loss_mlp": 1.03365922, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 1.808203303784673, + "language_loss": 0.88716167, + "learning_rate": 3.933197459096614e-06, + "loss": 0.90902829, + "num_input_tokens_seen": 39567525, + "step": 1830, + "time_per_iteration": 2.8256630897521973 + }, + { + "auxiliary_loss_clip": 0.01060707, + "auxiliary_loss_mlp": 0.01008219, + "balance_loss_clip": 1.03641081, + "balance_loss_mlp": 1.00544119, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.7070588952168099, + "language_loss": 0.55519515, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.5758844, + "num_input_tokens_seen": 39628470, + "step": 1831, + "time_per_iteration": 3.1167213916778564 + }, + { + "auxiliary_loss_clip": 0.01156858, + "auxiliary_loss_mlp": 0.01070157, + "balance_loss_clip": 1.05845499, + "balance_loss_mlp": 1.04669702, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.0651589554930005, + "language_loss": 0.90982598, + "learning_rate": 3.932997678675282e-06, + "loss": 0.93209612, + "num_input_tokens_seen": 39646670, + "step": 1832, + "time_per_iteration": 2.617433786392212 + }, + { + "auxiliary_loss_clip": 0.0107535, + "auxiliary_loss_mlp": 0.01004544, + "balance_loss_clip": 1.03959787, + "balance_loss_mlp": 1.00188589, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7802060303194274, + "language_loss": 0.59930331, + "learning_rate": 3.932897678513523e-06, + "loss": 0.62010229, + "num_input_tokens_seen": 39712915, + "step": 1833, + "time_per_iteration": 3.1012496948242188 + }, + { + "auxiliary_loss_clip": 0.01169149, + "auxiliary_loss_mlp": 0.01049863, + "balance_loss_clip": 1.05594397, + "balance_loss_mlp": 1.02852428, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 3.8484767931816206, + "language_loss": 0.80120862, + "learning_rate": 3.93279760505609e-06, + "loss": 0.82339871, + "num_input_tokens_seen": 39730650, + "step": 1834, + "time_per_iteration": 2.5457255840301514 + }, + { + "auxiliary_loss_clip": 0.01145597, + "auxiliary_loss_mlp": 0.01057424, + "balance_loss_clip": 1.06100667, + "balance_loss_mlp": 1.0339998, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.787120860291665, + "language_loss": 0.90597486, + "learning_rate": 3.932697458306779e-06, + "loss": 0.9280051, + "num_input_tokens_seen": 39751065, + "step": 1835, + "time_per_iteration": 2.731062173843384 + }, + { + "auxiliary_loss_clip": 0.01131109, + "auxiliary_loss_mlp": 0.01069035, + "balance_loss_clip": 1.0531044, + "balance_loss_mlp": 1.0449667, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.215794584476227, + "language_loss": 0.64015615, + "learning_rate": 3.932597238269386e-06, + "loss": 0.66215754, + "num_input_tokens_seen": 39769245, + "step": 1836, + "time_per_iteration": 2.6456105709075928 + }, + { + "auxiliary_loss_clip": 0.01141844, + "auxiliary_loss_mlp": 0.01058118, + "balance_loss_clip": 1.05314052, + "balance_loss_mlp": 1.03760183, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 3.2712051330109975, + "language_loss": 0.72941887, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75141847, + "num_input_tokens_seen": 39790830, + "step": 1837, + "time_per_iteration": 2.6965126991271973 + }, + { + "auxiliary_loss_clip": 0.01166375, + "auxiliary_loss_mlp": 0.01060238, + "balance_loss_clip": 1.05638421, + "balance_loss_mlp": 1.03930449, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 9.959523023886538, + "language_loss": 0.78583384, + "learning_rate": 3.93239657834556e-06, + "loss": 0.80809999, + "num_input_tokens_seen": 39809475, + "step": 1838, + "time_per_iteration": 2.5298519134521484 + }, + { + "auxiliary_loss_clip": 0.01152507, + "auxiliary_loss_mlp": 0.01063497, + "balance_loss_clip": 1.05605841, + "balance_loss_mlp": 1.04257619, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 1.932347455020667, + "language_loss": 0.71451569, + "learning_rate": 3.932296138466736e-06, + "loss": 0.7366758, + "num_input_tokens_seen": 39826355, + "step": 1839, + "time_per_iteration": 2.5431206226348877 + }, + { + "auxiliary_loss_clip": 0.01186876, + "auxiliary_loss_mlp": 0.00789896, + "balance_loss_clip": 1.06305373, + "balance_loss_mlp": 1.00052571, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.2680260495806692, + "language_loss": 0.78794563, + "learning_rate": 3.93219562531505e-06, + "loss": 0.80771327, + "num_input_tokens_seen": 39845335, + "step": 1840, + "time_per_iteration": 2.4875640869140625 + }, + { + "auxiliary_loss_clip": 0.0115663, + "auxiliary_loss_mlp": 0.01056823, + "balance_loss_clip": 1.05277276, + "balance_loss_mlp": 1.03419685, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 1.8406483582570121, + "language_loss": 0.87821865, + "learning_rate": 3.932095038894311e-06, + "loss": 0.90035319, + "num_input_tokens_seen": 39865065, + "step": 1841, + "time_per_iteration": 2.5408692359924316 + }, + { + "auxiliary_loss_clip": 0.01129868, + "auxiliary_loss_mlp": 0.01058943, + "balance_loss_clip": 1.05054617, + "balance_loss_mlp": 1.03686559, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 2.09754435730924, + "language_loss": 0.90540522, + "learning_rate": 3.931994379208334e-06, + "loss": 0.9272933, + "num_input_tokens_seen": 39882780, + "step": 1842, + "time_per_iteration": 4.091450929641724 + }, + { + "auxiliary_loss_clip": 0.01155439, + "auxiliary_loss_mlp": 0.01057444, + "balance_loss_clip": 1.05345929, + "balance_loss_mlp": 1.0367254, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.1016674691042945, + "language_loss": 0.85934639, + "learning_rate": 3.931893646260937e-06, + "loss": 0.88147521, + "num_input_tokens_seen": 39900295, + "step": 1843, + "time_per_iteration": 2.5503251552581787 + }, + { + "auxiliary_loss_clip": 0.01121368, + "auxiliary_loss_mlp": 0.00793809, + "balance_loss_clip": 1.05198467, + "balance_loss_mlp": 1.00065255, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.5815527516483945, + "language_loss": 0.74958426, + "learning_rate": 3.931792840055941e-06, + "loss": 0.76873606, + "num_input_tokens_seen": 39922075, + "step": 1844, + "time_per_iteration": 2.6817572116851807 + }, + { + "auxiliary_loss_clip": 0.01178728, + "auxiliary_loss_mlp": 0.01056414, + "balance_loss_clip": 1.05756915, + "balance_loss_mlp": 1.03325188, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 1.9869060294999543, + "language_loss": 0.75492132, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77727282, + "num_input_tokens_seen": 39940115, + "step": 1845, + "time_per_iteration": 3.9306013584136963 + }, + { + "auxiliary_loss_clip": 0.01152332, + "auxiliary_loss_mlp": 0.01056073, + "balance_loss_clip": 1.05485082, + "balance_loss_mlp": 1.03556943, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.5952924044671073, + "language_loss": 0.76100934, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.78309333, + "num_input_tokens_seen": 39959920, + "step": 1846, + "time_per_iteration": 2.558405637741089 + }, + { + "auxiliary_loss_clip": 0.01172659, + "auxiliary_loss_mlp": 0.01050784, + "balance_loss_clip": 1.05947411, + "balance_loss_mlp": 1.02994657, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.5648900832803583, + "language_loss": 0.86055672, + "learning_rate": 3.931489981933584e-06, + "loss": 0.88279116, + "num_input_tokens_seen": 39974755, + "step": 1847, + "time_per_iteration": 2.506427764892578 + }, + { + "auxiliary_loss_clip": 0.0117927, + "auxiliary_loss_mlp": 0.0105721, + "balance_loss_clip": 1.05658746, + "balance_loss_mlp": 1.03557348, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 2.0057276908033512, + "language_loss": 0.768646, + "learning_rate": 3.931388882736438e-06, + "loss": 0.79101074, + "num_input_tokens_seen": 39993355, + "step": 1848, + "time_per_iteration": 2.4605278968811035 + }, + { + "auxiliary_loss_clip": 0.01172971, + "auxiliary_loss_mlp": 0.0105443, + "balance_loss_clip": 1.06507826, + "balance_loss_mlp": 1.03415275, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 1.877147796188161, + "language_loss": 0.77987719, + "learning_rate": 3.931287710300832e-06, + "loss": 0.8021512, + "num_input_tokens_seen": 40012410, + "step": 1849, + "time_per_iteration": 3.8956141471862793 + }, + { + "auxiliary_loss_clip": 0.0113586, + "auxiliary_loss_mlp": 0.00791065, + "balance_loss_clip": 1.05210698, + "balance_loss_mlp": 1.00054753, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 3.1417655229977126, + "language_loss": 0.72099066, + "learning_rate": 3.931186464630601e-06, + "loss": 0.74025983, + "num_input_tokens_seen": 40029315, + "step": 1850, + "time_per_iteration": 2.551762342453003 + }, + { + "auxiliary_loss_clip": 0.01167125, + "auxiliary_loss_mlp": 0.0105629, + "balance_loss_clip": 1.05670369, + "balance_loss_mlp": 1.03392625, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 4.768986641288343, + "language_loss": 0.81195486, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83418894, + "num_input_tokens_seen": 40045765, + "step": 1851, + "time_per_iteration": 2.58686900138855 + }, + { + "auxiliary_loss_clip": 0.01157661, + "auxiliary_loss_mlp": 0.01057916, + "balance_loss_clip": 1.05434179, + "balance_loss_mlp": 1.03700638, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 2.6214791944069544, + "language_loss": 0.88574296, + "learning_rate": 3.930983753601631e-06, + "loss": 0.90789872, + "num_input_tokens_seen": 40061660, + "step": 1852, + "time_per_iteration": 2.4976861476898193 + }, + { + "auxiliary_loss_clip": 0.01164936, + "auxiliary_loss_mlp": 0.01058761, + "balance_loss_clip": 1.05647457, + "balance_loss_mlp": 1.03646898, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 2.0050479996126214, + "language_loss": 0.71945798, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74169499, + "num_input_tokens_seen": 40080180, + "step": 1853, + "time_per_iteration": 2.4944236278533936 + }, + { + "auxiliary_loss_clip": 0.0108835, + "auxiliary_loss_mlp": 0.01000463, + "balance_loss_clip": 1.04430151, + "balance_loss_mlp": 0.9976142, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7788028388218661, + "language_loss": 0.53662992, + "learning_rate": 3.930780749680273e-06, + "loss": 0.55751806, + "num_input_tokens_seen": 40138910, + "step": 1854, + "time_per_iteration": 3.0251777172088623 + }, + { + "auxiliary_loss_clip": 0.01156082, + "auxiliary_loss_mlp": 0.01055283, + "balance_loss_clip": 1.05255294, + "balance_loss_mlp": 1.03288412, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.9709736096300596, + "language_loss": 0.84764361, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.86975724, + "num_input_tokens_seen": 40157745, + "step": 1855, + "time_per_iteration": 2.629657030105591 + }, + { + "auxiliary_loss_clip": 0.011456, + "auxiliary_loss_mlp": 0.01063163, + "balance_loss_clip": 1.05123591, + "balance_loss_mlp": 1.04220653, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.055149580549347, + "language_loss": 0.81636083, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.83844852, + "num_input_tokens_seen": 40175375, + "step": 1856, + "time_per_iteration": 2.553480386734009 + }, + { + "auxiliary_loss_clip": 0.01164835, + "auxiliary_loss_mlp": 0.01046133, + "balance_loss_clip": 1.05685282, + "balance_loss_mlp": 1.02364993, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 1.9208861170591627, + "language_loss": 0.83102727, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85313696, + "num_input_tokens_seen": 40195715, + "step": 1857, + "time_per_iteration": 2.583911895751953 + }, + { + "auxiliary_loss_clip": 0.01140306, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.05162621, + "balance_loss_mlp": 1.02539158, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.764827768518248, + "language_loss": 0.83321834, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85507751, + "num_input_tokens_seen": 40213975, + "step": 1858, + "time_per_iteration": 2.538135290145874 + }, + { + "auxiliary_loss_clip": 0.01136188, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.04949391, + "balance_loss_mlp": 1.04387474, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.626757369439611, + "language_loss": 0.91007292, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93210566, + "num_input_tokens_seen": 40233905, + "step": 1859, + "time_per_iteration": 2.6012790203094482 + }, + { + "auxiliary_loss_clip": 0.0116522, + "auxiliary_loss_mlp": 0.0104764, + "balance_loss_clip": 1.05453277, + "balance_loss_mlp": 1.02640867, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.0446558221302804, + "language_loss": 0.81790996, + "learning_rate": 3.930169980870018e-06, + "loss": 0.8400386, + "num_input_tokens_seen": 40252810, + "step": 1860, + "time_per_iteration": 4.004274845123291 + }, + { + "auxiliary_loss_clip": 0.01148418, + "auxiliary_loss_mlp": 0.01053224, + "balance_loss_clip": 1.05479288, + "balance_loss_mlp": 1.03251743, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.0710394428329644, + "language_loss": 0.75136256, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77337903, + "num_input_tokens_seen": 40272000, + "step": 1861, + "time_per_iteration": 2.5907626152038574 + }, + { + "auxiliary_loss_clip": 0.01175527, + "auxiliary_loss_mlp": 0.0105178, + "balance_loss_clip": 1.0565393, + "balance_loss_mlp": 1.03169298, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 1.8470671115722501, + "language_loss": 0.89317679, + "learning_rate": 3.929965805687474e-06, + "loss": 0.9154498, + "num_input_tokens_seen": 40290660, + "step": 1862, + "time_per_iteration": 2.491556167602539 + }, + { + "auxiliary_loss_clip": 0.01162962, + "auxiliary_loss_mlp": 0.01061748, + "balance_loss_clip": 1.06159031, + "balance_loss_mlp": 1.04051685, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.238995063327083, + "language_loss": 0.86920369, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89145076, + "num_input_tokens_seen": 40307820, + "step": 1863, + "time_per_iteration": 2.5610547065734863 + }, + { + "auxiliary_loss_clip": 0.01159561, + "auxiliary_loss_mlp": 0.01048972, + "balance_loss_clip": 1.0567677, + "balance_loss_mlp": 1.02528524, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 1.7925506074273452, + "language_loss": 0.6465134, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66859877, + "num_input_tokens_seen": 40327430, + "step": 1864, + "time_per_iteration": 2.541024923324585 + }, + { + "auxiliary_loss_clip": 0.01111495, + "auxiliary_loss_mlp": 0.01052745, + "balance_loss_clip": 1.0531584, + "balance_loss_mlp": 1.03299153, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 1.934476526680671, + "language_loss": 0.74077439, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76241672, + "num_input_tokens_seen": 40344545, + "step": 1865, + "time_per_iteration": 2.6272270679473877 + }, + { + "auxiliary_loss_clip": 0.01121633, + "auxiliary_loss_mlp": 0.01057245, + "balance_loss_clip": 1.0603292, + "balance_loss_mlp": 1.03331983, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.3367808507531747, + "language_loss": 0.85006654, + "learning_rate": 3.929556577139446e-06, + "loss": 0.87185526, + "num_input_tokens_seen": 40362300, + "step": 1866, + "time_per_iteration": 2.788203001022339 + }, + { + "auxiliary_loss_clip": 0.01097892, + "auxiliary_loss_mlp": 0.00791029, + "balance_loss_clip": 1.04438055, + "balance_loss_mlp": 1.00048387, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.6196850999224477, + "language_loss": 0.81092072, + "learning_rate": 3.929454087070286e-06, + "loss": 0.8298099, + "num_input_tokens_seen": 40384720, + "step": 1867, + "time_per_iteration": 2.9450159072875977 + }, + { + "auxiliary_loss_clip": 0.01180402, + "auxiliary_loss_mlp": 0.01062242, + "balance_loss_clip": 1.06056559, + "balance_loss_mlp": 1.04217911, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 3.3749988729896514, + "language_loss": 0.87053734, + "learning_rate": 3.929351523836035e-06, + "loss": 0.89296377, + "num_input_tokens_seen": 40404000, + "step": 1868, + "time_per_iteration": 2.7104318141937256 + }, + { + "auxiliary_loss_clip": 0.01162331, + "auxiliary_loss_mlp": 0.00787777, + "balance_loss_clip": 1.06332552, + "balance_loss_mlp": 1.00042367, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.499387721427469, + "language_loss": 0.68110585, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70060694, + "num_input_tokens_seen": 40418665, + "step": 1869, + "time_per_iteration": 2.5927987098693848 + }, + { + "auxiliary_loss_clip": 0.01137113, + "auxiliary_loss_mlp": 0.01066253, + "balance_loss_clip": 1.05374122, + "balance_loss_mlp": 1.04338896, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 1.660571942935114, + "language_loss": 0.77366078, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79569447, + "num_input_tokens_seen": 40437870, + "step": 1870, + "time_per_iteration": 2.5993523597717285 + }, + { + "auxiliary_loss_clip": 0.01133691, + "auxiliary_loss_mlp": 0.01058588, + "balance_loss_clip": 1.05813646, + "balance_loss_mlp": 1.03684473, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 3.2026448233914917, + "language_loss": 0.75982332, + "learning_rate": 3.929043395181631e-06, + "loss": 0.78174615, + "num_input_tokens_seen": 40455570, + "step": 1871, + "time_per_iteration": 2.6004831790924072 + }, + { + "auxiliary_loss_clip": 0.01110715, + "auxiliary_loss_mlp": 0.01054853, + "balance_loss_clip": 1.05619121, + "balance_loss_mlp": 1.03511167, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 7.347308747135719, + "language_loss": 0.81685311, + "learning_rate": 3.928940539325929e-06, + "loss": 0.83850878, + "num_input_tokens_seen": 40473600, + "step": 1872, + "time_per_iteration": 2.6458818912506104 + }, + { + "auxiliary_loss_clip": 0.01181961, + "auxiliary_loss_mlp": 0.01059048, + "balance_loss_clip": 1.06120253, + "balance_loss_mlp": 1.0384959, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.471650145934481, + "language_loss": 0.83951235, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.86192244, + "num_input_tokens_seen": 40490025, + "step": 1873, + "time_per_iteration": 2.493483066558838 + }, + { + "auxiliary_loss_clip": 0.01150683, + "auxiliary_loss_mlp": 0.01060788, + "balance_loss_clip": 1.05991793, + "balance_loss_mlp": 1.03749418, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 2.0168533436932354, + "language_loss": 0.92299461, + "learning_rate": 3.928734608181575e-06, + "loss": 0.94510937, + "num_input_tokens_seen": 40511580, + "step": 1874, + "time_per_iteration": 2.6261487007141113 + }, + { + "auxiliary_loss_clip": 0.01147722, + "auxiliary_loss_mlp": 0.01064727, + "balance_loss_clip": 1.05879855, + "balance_loss_mlp": 1.04398417, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.496725841895783, + "language_loss": 0.75507939, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77720392, + "num_input_tokens_seen": 40530155, + "step": 1875, + "time_per_iteration": 2.604264259338379 + }, + { + "auxiliary_loss_clip": 0.01172029, + "auxiliary_loss_mlp": 0.01062584, + "balance_loss_clip": 1.06581581, + "balance_loss_mlp": 1.04284275, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.8887322958585373, + "language_loss": 0.71989042, + "learning_rate": 3.928528384485984e-06, + "loss": 0.74223661, + "num_input_tokens_seen": 40549500, + "step": 1876, + "time_per_iteration": 2.5825932025909424 + }, + { + "auxiliary_loss_clip": 0.01154793, + "auxiliary_loss_mlp": 0.01055473, + "balance_loss_clip": 1.06034207, + "balance_loss_mlp": 1.03532624, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 2.3345314884180612, + "language_loss": 0.77120805, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.7933107, + "num_input_tokens_seen": 40567475, + "step": 1877, + "time_per_iteration": 2.564706325531006 + }, + { + "auxiliary_loss_clip": 0.01172613, + "auxiliary_loss_mlp": 0.01061512, + "balance_loss_clip": 1.05958843, + "balance_loss_mlp": 1.03972065, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 3.4041226561217206, + "language_loss": 0.88391173, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90625298, + "num_input_tokens_seen": 40583280, + "step": 1878, + "time_per_iteration": 2.548037052154541 + }, + { + "auxiliary_loss_clip": 0.01146137, + "auxiliary_loss_mlp": 0.01050443, + "balance_loss_clip": 1.06093335, + "balance_loss_mlp": 1.03012943, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.4201103076754067, + "language_loss": 0.80899942, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83096522, + "num_input_tokens_seen": 40603080, + "step": 1879, + "time_per_iteration": 2.5954737663269043 + }, + { + "auxiliary_loss_clip": 0.01158775, + "auxiliary_loss_mlp": 0.01061243, + "balance_loss_clip": 1.05675399, + "balance_loss_mlp": 1.03958249, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 1.8898153245100069, + "language_loss": 0.70412952, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72632968, + "num_input_tokens_seen": 40623255, + "step": 1880, + "time_per_iteration": 2.6673436164855957 + }, + { + "auxiliary_loss_clip": 0.01158773, + "auxiliary_loss_mlp": 0.01051521, + "balance_loss_clip": 1.06023836, + "balance_loss_mlp": 1.03077888, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.9890117232694808, + "language_loss": 0.72357339, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74567634, + "num_input_tokens_seen": 40641570, + "step": 1881, + "time_per_iteration": 2.5426506996154785 + }, + { + "auxiliary_loss_clip": 0.0114945, + "auxiliary_loss_mlp": 0.00790794, + "balance_loss_clip": 1.05878782, + "balance_loss_mlp": 1.00048566, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.4902958546830343, + "language_loss": 0.7432152, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76261765, + "num_input_tokens_seen": 40658775, + "step": 1882, + "time_per_iteration": 4.034909248352051 + }, + { + "auxiliary_loss_clip": 0.01179097, + "auxiliary_loss_mlp": 0.01055507, + "balance_loss_clip": 1.05986428, + "balance_loss_mlp": 1.0330956, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 1.954690265152739, + "language_loss": 0.79383844, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81618452, + "num_input_tokens_seen": 40679555, + "step": 1883, + "time_per_iteration": 2.5244951248168945 + }, + { + "auxiliary_loss_clip": 0.01144944, + "auxiliary_loss_mlp": 0.010608, + "balance_loss_clip": 1.05671942, + "balance_loss_mlp": 1.03885365, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 3.816636005386573, + "language_loss": 0.77386433, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79592174, + "num_input_tokens_seen": 40697295, + "step": 1884, + "time_per_iteration": 3.915804386138916 + }, + { + "auxiliary_loss_clip": 0.01065712, + "auxiliary_loss_mlp": 0.01021723, + "balance_loss_clip": 1.03096223, + "balance_loss_mlp": 1.01945806, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.8153076625875907, + "language_loss": 0.55188382, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5727582, + "num_input_tokens_seen": 40758095, + "step": 1885, + "time_per_iteration": 2.972829818725586 + }, + { + "auxiliary_loss_clip": 0.01090805, + "auxiliary_loss_mlp": 0.01052048, + "balance_loss_clip": 1.04978943, + "balance_loss_mlp": 1.03171122, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 3.448660969780395, + "language_loss": 0.90617806, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92760658, + "num_input_tokens_seen": 40777140, + "step": 1886, + "time_per_iteration": 2.8583128452301025 + }, + { + "auxiliary_loss_clip": 0.01116702, + "auxiliary_loss_mlp": 0.01050971, + "balance_loss_clip": 1.04973984, + "balance_loss_mlp": 1.02966881, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.9955528117499717, + "language_loss": 0.84944594, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87112266, + "num_input_tokens_seen": 40797505, + "step": 1887, + "time_per_iteration": 3.1684646606445312 + }, + { + "auxiliary_loss_clip": 0.01140605, + "auxiliary_loss_mlp": 0.01060207, + "balance_loss_clip": 1.05636179, + "balance_loss_mlp": 1.04033446, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 2.8281922948923817, + "language_loss": 0.76534182, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78734994, + "num_input_tokens_seen": 40812970, + "step": 1888, + "time_per_iteration": 2.5803732872009277 + }, + { + "auxiliary_loss_clip": 0.01136594, + "auxiliary_loss_mlp": 0.01058186, + "balance_loss_clip": 1.05829632, + "balance_loss_mlp": 1.03584671, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 1.9497679663926808, + "language_loss": 0.68146962, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70341742, + "num_input_tokens_seen": 40837745, + "step": 1889, + "time_per_iteration": 4.115885019302368 + }, + { + "auxiliary_loss_clip": 0.01179205, + "auxiliary_loss_mlp": 0.01051278, + "balance_loss_clip": 1.06061447, + "balance_loss_mlp": 1.0302614, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 1.684075744404259, + "language_loss": 0.84553623, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86784101, + "num_input_tokens_seen": 40856490, + "step": 1890, + "time_per_iteration": 2.5070602893829346 + }, + { + "auxiliary_loss_clip": 0.01148256, + "auxiliary_loss_mlp": 0.0105464, + "balance_loss_clip": 1.05423069, + "balance_loss_mlp": 1.03315854, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.1237298654530274, + "language_loss": 0.64960027, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67162919, + "num_input_tokens_seen": 40874070, + "step": 1891, + "time_per_iteration": 2.546318292617798 + }, + { + "auxiliary_loss_clip": 0.01144592, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.05299747, + "balance_loss_mlp": 1.02110052, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.046758111873564, + "language_loss": 0.88291633, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90477741, + "num_input_tokens_seen": 40892425, + "step": 1892, + "time_per_iteration": 2.583527088165283 + }, + { + "auxiliary_loss_clip": 0.01124322, + "auxiliary_loss_mlp": 0.01066965, + "balance_loss_clip": 1.06061697, + "balance_loss_mlp": 1.04408836, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.268645745834048, + "language_loss": 0.7320196, + "learning_rate": 3.926763675749339e-06, + "loss": 0.75393248, + "num_input_tokens_seen": 40912190, + "step": 1893, + "time_per_iteration": 2.67035174369812 + }, + { + "auxiliary_loss_clip": 0.01174173, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.05503154, + "balance_loss_mlp": 1.0361948, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 1.9166089922972995, + "language_loss": 0.79864019, + "learning_rate": 3.92665921159591e-06, + "loss": 0.82095563, + "num_input_tokens_seen": 40928395, + "step": 1894, + "time_per_iteration": 2.4883739948272705 + }, + { + "auxiliary_loss_clip": 0.01152243, + "auxiliary_loss_mlp": 0.01049825, + "balance_loss_clip": 1.05744994, + "balance_loss_mlp": 1.02897501, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 2.7274814720262737, + "language_loss": 0.7988736, + "learning_rate": 3.926554674383371e-06, + "loss": 0.8208943, + "num_input_tokens_seen": 40946555, + "step": 1895, + "time_per_iteration": 2.6512622833251953 + }, + { + "auxiliary_loss_clip": 0.01077485, + "auxiliary_loss_mlp": 0.00998564, + "balance_loss_clip": 1.03282714, + "balance_loss_mlp": 0.9959057, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.7982667827522315, + "language_loss": 0.63365048, + "learning_rate": 3.926450064115686e-06, + "loss": 0.6544109, + "num_input_tokens_seen": 41004910, + "step": 1896, + "time_per_iteration": 3.1248323917388916 + }, + { + "auxiliary_loss_clip": 0.01148628, + "auxiliary_loss_mlp": 0.0105663, + "balance_loss_clip": 1.05979478, + "balance_loss_mlp": 1.03367066, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.864366556311423, + "language_loss": 0.85042977, + "learning_rate": 3.926345380796821e-06, + "loss": 0.8724823, + "num_input_tokens_seen": 41026385, + "step": 1897, + "time_per_iteration": 2.5657858848571777 + }, + { + "auxiliary_loss_clip": 0.01178024, + "auxiliary_loss_mlp": 0.00788925, + "balance_loss_clip": 1.05856359, + "balance_loss_mlp": 1.00051737, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.327665840173554, + "language_loss": 0.79736882, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.8170383, + "num_input_tokens_seen": 41045315, + "step": 1898, + "time_per_iteration": 2.537583827972412 + }, + { + "auxiliary_loss_clip": 0.01118196, + "auxiliary_loss_mlp": 0.0105755, + "balance_loss_clip": 1.04978824, + "balance_loss_mlp": 1.03463757, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 1.9007181847981656, + "language_loss": 0.73375577, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75551319, + "num_input_tokens_seen": 41063390, + "step": 1899, + "time_per_iteration": 2.744248867034912 + }, + { + "auxiliary_loss_clip": 0.01035912, + "auxiliary_loss_mlp": 0.01000395, + "balance_loss_clip": 1.02813053, + "balance_loss_mlp": 0.99771243, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9482571840407764, + "language_loss": 0.63419819, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65456116, + "num_input_tokens_seen": 41124180, + "step": 1900, + "time_per_iteration": 4.778116464614868 + }, + { + "auxiliary_loss_clip": 0.0111673, + "auxiliary_loss_mlp": 0.01058601, + "balance_loss_clip": 1.05218935, + "balance_loss_mlp": 1.03761983, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.7062337161697443, + "language_loss": 0.78045052, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80220377, + "num_input_tokens_seen": 41143485, + "step": 1901, + "time_per_iteration": 2.6519482135772705 + }, + { + "auxiliary_loss_clip": 0.01171748, + "auxiliary_loss_mlp": 0.01054487, + "balance_loss_clip": 1.06265771, + "balance_loss_mlp": 1.03437591, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 2.024315124124593, + "language_loss": 0.84022021, + "learning_rate": 3.925820868573839e-06, + "loss": 0.86248255, + "num_input_tokens_seen": 41161695, + "step": 1902, + "time_per_iteration": 2.538619041442871 + }, + { + "auxiliary_loss_clip": 0.01160328, + "auxiliary_loss_mlp": 0.01058466, + "balance_loss_clip": 1.05564833, + "balance_loss_mlp": 1.03525555, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.8361108185730248, + "language_loss": 0.77874124, + "learning_rate": 3.925715747031356e-06, + "loss": 0.80092919, + "num_input_tokens_seen": 41181715, + "step": 1903, + "time_per_iteration": 2.534442186355591 + }, + { + "auxiliary_loss_clip": 0.01145921, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.05547249, + "balance_loss_mlp": 1.02307522, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.3416208838509345, + "language_loss": 0.75778663, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77965307, + "num_input_tokens_seen": 41201770, + "step": 1904, + "time_per_iteration": 2.5766942501068115 + }, + { + "auxiliary_loss_clip": 0.01150505, + "auxiliary_loss_mlp": 0.01055344, + "balance_loss_clip": 1.05374825, + "balance_loss_mlp": 1.03363633, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.3025028569907353, + "language_loss": 0.92186946, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.943928, + "num_input_tokens_seen": 41220590, + "step": 1905, + "time_per_iteration": 2.53835391998291 + }, + { + "auxiliary_loss_clip": 0.01163163, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_clip": 1.05517948, + "balance_loss_mlp": 1.0234158, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.9096003057496778, + "language_loss": 0.77987641, + "learning_rate": 3.925399944279861e-06, + "loss": 0.80195552, + "num_input_tokens_seen": 41237250, + "step": 1906, + "time_per_iteration": 2.556370496749878 + }, + { + "auxiliary_loss_clip": 0.01176815, + "auxiliary_loss_mlp": 0.010521, + "balance_loss_clip": 1.05754435, + "balance_loss_mlp": 1.03115511, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.3342416115853433, + "language_loss": 0.82055521, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84284437, + "num_input_tokens_seen": 41256680, + "step": 1907, + "time_per_iteration": 2.4806699752807617 + }, + { + "auxiliary_loss_clip": 0.01138914, + "auxiliary_loss_mlp": 0.01066498, + "balance_loss_clip": 1.0550406, + "balance_loss_mlp": 1.04548192, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 2.239196241720458, + "language_loss": 0.84837008, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87042427, + "num_input_tokens_seen": 41270955, + "step": 1908, + "time_per_iteration": 2.5775980949401855 + }, + { + "auxiliary_loss_clip": 0.01027783, + "auxiliary_loss_mlp": 0.01024527, + "balance_loss_clip": 1.02303648, + "balance_loss_mlp": 1.02085495, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9385447055674249, + "language_loss": 0.61037284, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63089597, + "num_input_tokens_seen": 41319180, + "step": 1909, + "time_per_iteration": 2.9043357372283936 + }, + { + "auxiliary_loss_clip": 0.01178825, + "auxiliary_loss_mlp": 0.01045188, + "balance_loss_clip": 1.05979609, + "balance_loss_mlp": 1.02547026, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 2.0353822459375355, + "language_loss": 0.79288948, + "learning_rate": 3.924977851804197e-06, + "loss": 0.81512964, + "num_input_tokens_seen": 41337480, + "step": 1910, + "time_per_iteration": 2.4507837295532227 + }, + { + "auxiliary_loss_clip": 0.01154211, + "auxiliary_loss_mlp": 0.01050981, + "balance_loss_clip": 1.05955172, + "balance_loss_mlp": 1.02985728, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 2.41235688454919, + "language_loss": 0.76953816, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79159009, + "num_input_tokens_seen": 41354650, + "step": 1911, + "time_per_iteration": 2.547003746032715 + }, + { + "auxiliary_loss_clip": 0.01156394, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_clip": 1.05794644, + "balance_loss_mlp": 1.02905631, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 2.3336074278566605, + "language_loss": 0.79297233, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81503391, + "num_input_tokens_seen": 41376935, + "step": 1912, + "time_per_iteration": 2.6180617809295654 + }, + { + "auxiliary_loss_clip": 0.01178111, + "auxiliary_loss_mlp": 0.00787147, + "balance_loss_clip": 1.06011724, + "balance_loss_mlp": 1.0002811, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 2.030598254446951, + "language_loss": 0.77925682, + "learning_rate": 3.924660515982246e-06, + "loss": 0.79890943, + "num_input_tokens_seen": 41396105, + "step": 1913, + "time_per_iteration": 2.5066099166870117 + }, + { + "auxiliary_loss_clip": 0.01165769, + "auxiliary_loss_mlp": 0.01049237, + "balance_loss_clip": 1.05666971, + "balance_loss_mlp": 1.02764821, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 2.371024618031955, + "language_loss": 0.70397252, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72612262, + "num_input_tokens_seen": 41415600, + "step": 1914, + "time_per_iteration": 2.5484611988067627 + }, + { + "auxiliary_loss_clip": 0.01012723, + "auxiliary_loss_mlp": 0.01006321, + "balance_loss_clip": 1.03563905, + "balance_loss_mlp": 1.00363874, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.9357980481105419, + "language_loss": 0.61003411, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63022459, + "num_input_tokens_seen": 41478760, + "step": 1915, + "time_per_iteration": 3.432157039642334 + }, + { + "auxiliary_loss_clip": 0.01162918, + "auxiliary_loss_mlp": 0.01052688, + "balance_loss_clip": 1.06031561, + "balance_loss_mlp": 1.03154063, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.7143009833317167, + "language_loss": 0.93483245, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95698851, + "num_input_tokens_seen": 41495720, + "step": 1916, + "time_per_iteration": 2.9268112182617188 + }, + { + "auxiliary_loss_clip": 0.01158413, + "auxiliary_loss_mlp": 0.01054609, + "balance_loss_clip": 1.05977106, + "balance_loss_mlp": 1.03112531, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 2.0157731922041067, + "language_loss": 0.72656035, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.74869055, + "num_input_tokens_seen": 41513585, + "step": 1917, + "time_per_iteration": 2.5720624923706055 + }, + { + "auxiliary_loss_clip": 0.01140704, + "auxiliary_loss_mlp": 0.01053645, + "balance_loss_clip": 1.06093013, + "balance_loss_mlp": 1.03187776, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.0969018430689452, + "language_loss": 0.74734461, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76928806, + "num_input_tokens_seen": 41533390, + "step": 1918, + "time_per_iteration": 2.6697754859924316 + }, + { + "auxiliary_loss_clip": 0.0114751, + "auxiliary_loss_mlp": 0.01049467, + "balance_loss_clip": 1.0579778, + "balance_loss_mlp": 1.02846277, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.1609339937375167, + "language_loss": 0.86267936, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88464916, + "num_input_tokens_seen": 41551015, + "step": 1919, + "time_per_iteration": 2.5914146900177 + }, + { + "auxiliary_loss_clip": 0.01133394, + "auxiliary_loss_mlp": 0.01063146, + "balance_loss_clip": 1.05137539, + "balance_loss_mlp": 1.03924489, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 1.8259372946050207, + "language_loss": 0.8663013, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88826668, + "num_input_tokens_seen": 41568055, + "step": 1920, + "time_per_iteration": 2.571202039718628 + }, + { + "auxiliary_loss_clip": 0.01163714, + "auxiliary_loss_mlp": 0.01053988, + "balance_loss_clip": 1.05892301, + "balance_loss_mlp": 1.03211355, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.292949919586134, + "language_loss": 0.7953546, + "learning_rate": 3.923811076152589e-06, + "loss": 0.81753159, + "num_input_tokens_seen": 41587435, + "step": 1921, + "time_per_iteration": 2.558208703994751 + }, + { + "auxiliary_loss_clip": 0.01169868, + "auxiliary_loss_mlp": 0.01055186, + "balance_loss_clip": 1.05598998, + "balance_loss_mlp": 1.03232217, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 2.024649564688404, + "language_loss": 0.7838974, + "learning_rate": 3.923704567851557e-06, + "loss": 0.80614793, + "num_input_tokens_seen": 41604975, + "step": 1922, + "time_per_iteration": 3.9560394287109375 + }, + { + "auxiliary_loss_clip": 0.01092272, + "auxiliary_loss_mlp": 0.01056487, + "balance_loss_clip": 1.04826188, + "balance_loss_mlp": 1.0351485, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.9102703672335517, + "language_loss": 0.8432079, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86469549, + "num_input_tokens_seen": 41626155, + "step": 1923, + "time_per_iteration": 4.195303440093994 + }, + { + "auxiliary_loss_clip": 0.01170076, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_clip": 1.06149077, + "balance_loss_mlp": 1.0349704, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 1.9958719694398275, + "language_loss": 0.8089413, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.83121926, + "num_input_tokens_seen": 41644805, + "step": 1924, + "time_per_iteration": 2.505502939224243 + }, + { + "auxiliary_loss_clip": 0.01049134, + "auxiliary_loss_mlp": 0.01010493, + "balance_loss_clip": 1.02376056, + "balance_loss_mlp": 1.00767922, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 1.4854659416285658, + "language_loss": 0.61240774, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63300401, + "num_input_tokens_seen": 41709345, + "step": 1925, + "time_per_iteration": 3.1417033672332764 + }, + { + "auxiliary_loss_clip": 0.01153893, + "auxiliary_loss_mlp": 0.0107828, + "balance_loss_clip": 1.05289161, + "balance_loss_mlp": 1.05551064, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 2.472658242961548, + "language_loss": 0.74869287, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77101463, + "num_input_tokens_seen": 41730210, + "step": 1926, + "time_per_iteration": 2.5251874923706055 + }, + { + "auxiliary_loss_clip": 0.01120729, + "auxiliary_loss_mlp": 0.0079222, + "balance_loss_clip": 1.0487293, + "balance_loss_mlp": 1.00039124, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.664990321402828, + "language_loss": 0.72276115, + "learning_rate": 3.923170932221222e-06, + "loss": 0.74189067, + "num_input_tokens_seen": 41750270, + "step": 1927, + "time_per_iteration": 2.676698923110962 + }, + { + "auxiliary_loss_clip": 0.01134802, + "auxiliary_loss_mlp": 0.01059014, + "balance_loss_clip": 1.05017328, + "balance_loss_mlp": 1.03682876, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 1.602124482311988, + "language_loss": 0.86777711, + "learning_rate": 3.92306398629845e-06, + "loss": 0.88971531, + "num_input_tokens_seen": 41772975, + "step": 1928, + "time_per_iteration": 2.594370126724243 + }, + { + "auxiliary_loss_clip": 0.01128618, + "auxiliary_loss_mlp": 0.01058507, + "balance_loss_clip": 1.05106521, + "balance_loss_mlp": 1.03659678, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.7790131998342602, + "language_loss": 0.77471578, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79658705, + "num_input_tokens_seen": 41791765, + "step": 1929, + "time_per_iteration": 4.270169496536255 + }, + { + "auxiliary_loss_clip": 0.01173422, + "auxiliary_loss_mlp": 0.01067582, + "balance_loss_clip": 1.05737853, + "balance_loss_mlp": 1.04811549, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.6859634026900834, + "language_loss": 0.7681818, + "learning_rate": 3.922849875688626e-06, + "loss": 0.79059184, + "num_input_tokens_seen": 41815615, + "step": 1930, + "time_per_iteration": 2.6056387424468994 + }, + { + "auxiliary_loss_clip": 0.01142222, + "auxiliary_loss_mlp": 0.01057137, + "balance_loss_clip": 1.04893994, + "balance_loss_mlp": 1.03577471, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 1.6853009737034128, + "language_loss": 0.72025985, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74225342, + "num_input_tokens_seen": 41834810, + "step": 1931, + "time_per_iteration": 2.5567400455474854 + }, + { + "auxiliary_loss_clip": 0.01145438, + "auxiliary_loss_mlp": 0.01061475, + "balance_loss_clip": 1.05237246, + "balance_loss_mlp": 1.03806269, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.4888675079296723, + "language_loss": 0.82425553, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84632462, + "num_input_tokens_seen": 41854975, + "step": 1932, + "time_per_iteration": 2.579721450805664 + }, + { + "auxiliary_loss_clip": 0.01023924, + "auxiliary_loss_mlp": 0.01049916, + "balance_loss_clip": 1.01934254, + "balance_loss_mlp": 1.04602933, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7827035055082493, + "language_loss": 0.61103976, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63177818, + "num_input_tokens_seen": 41911105, + "step": 1933, + "time_per_iteration": 3.0238239765167236 + }, + { + "auxiliary_loss_clip": 0.01105922, + "auxiliary_loss_mlp": 0.00789588, + "balance_loss_clip": 1.04649758, + "balance_loss_mlp": 1.00036359, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.5058598218612023, + "language_loss": 0.8639046, + "learning_rate": 3.922420779525586e-06, + "loss": 0.88285977, + "num_input_tokens_seen": 41931750, + "step": 1934, + "time_per_iteration": 2.6607162952423096 + }, + { + "auxiliary_loss_clip": 0.01123297, + "auxiliary_loss_mlp": 0.01062427, + "balance_loss_clip": 1.05374289, + "balance_loss_mlp": 1.03851414, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.144550003215016, + "language_loss": 0.65829146, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.68014866, + "num_input_tokens_seen": 41949400, + "step": 1935, + "time_per_iteration": 2.607598066329956 + }, + { + "auxiliary_loss_clip": 0.01179847, + "auxiliary_loss_mlp": 0.01050503, + "balance_loss_clip": 1.05791306, + "balance_loss_mlp": 1.03119135, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 2.1171782460491015, + "language_loss": 0.75684816, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77915162, + "num_input_tokens_seen": 41968100, + "step": 1936, + "time_per_iteration": 2.8032686710357666 + }, + { + "auxiliary_loss_clip": 0.01177202, + "auxiliary_loss_mlp": 0.01053097, + "balance_loss_clip": 1.05641246, + "balance_loss_mlp": 1.03116262, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 1.9017282566749611, + "language_loss": 0.84369898, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86600196, + "num_input_tokens_seen": 41986375, + "step": 1937, + "time_per_iteration": 2.475137948989868 + }, + { + "auxiliary_loss_clip": 0.011498, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_clip": 1.0535816, + "balance_loss_mlp": 1.02832651, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 1.8466810912860052, + "language_loss": 0.75983083, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78181207, + "num_input_tokens_seen": 42006055, + "step": 1938, + "time_per_iteration": 2.6125638484954834 + }, + { + "auxiliary_loss_clip": 0.01180046, + "auxiliary_loss_mlp": 0.01059418, + "balance_loss_clip": 1.05835891, + "balance_loss_mlp": 1.03830564, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 4.1295115310420645, + "language_loss": 0.79637706, + "learning_rate": 3.921882769138696e-06, + "loss": 0.81877172, + "num_input_tokens_seen": 42024995, + "step": 1939, + "time_per_iteration": 3.964932680130005 + }, + { + "auxiliary_loss_clip": 0.01148617, + "auxiliary_loss_mlp": 0.01053764, + "balance_loss_clip": 1.05302477, + "balance_loss_mlp": 1.03184152, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 2.9326637301079894, + "language_loss": 0.86208946, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88411331, + "num_input_tokens_seen": 42042640, + "step": 1940, + "time_per_iteration": 2.5870211124420166 + }, + { + "auxiliary_loss_clip": 0.01151142, + "auxiliary_loss_mlp": 0.01063123, + "balance_loss_clip": 1.05604935, + "balance_loss_mlp": 1.04300058, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.5300998649636999, + "language_loss": 0.75707459, + "learning_rate": 3.921667054809449e-06, + "loss": 0.77921724, + "num_input_tokens_seen": 42067005, + "step": 1941, + "time_per_iteration": 2.731602907180786 + }, + { + "auxiliary_loss_clip": 0.01148081, + "auxiliary_loss_mlp": 0.00788458, + "balance_loss_clip": 1.05372739, + "balance_loss_mlp": 1.00038576, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.2155113577374133, + "language_loss": 0.8888998, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90826523, + "num_input_tokens_seen": 42082295, + "step": 1942, + "time_per_iteration": 2.524832010269165 + }, + { + "auxiliary_loss_clip": 0.01163016, + "auxiliary_loss_mlp": 0.01051186, + "balance_loss_clip": 1.05643845, + "balance_loss_mlp": 1.03171873, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 3.53212509218543, + "language_loss": 0.68458301, + "learning_rate": 3.921451049000975e-06, + "loss": 0.70672506, + "num_input_tokens_seen": 42105295, + "step": 1943, + "time_per_iteration": 2.648345470428467 + }, + { + "auxiliary_loss_clip": 0.0115008, + "auxiliary_loss_mlp": 0.01050006, + "balance_loss_clip": 1.05486274, + "balance_loss_mlp": 1.02891755, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 1.8946407771873837, + "language_loss": 0.6952377, + "learning_rate": 3.921342936802265e-06, + "loss": 0.71723855, + "num_input_tokens_seen": 42125520, + "step": 1944, + "time_per_iteration": 2.69633412361145 + }, + { + "auxiliary_loss_clip": 0.01154459, + "auxiliary_loss_mlp": 0.0104895, + "balance_loss_clip": 1.05063009, + "balance_loss_mlp": 1.029531, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 1.56856905950453, + "language_loss": 0.82390118, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84593523, + "num_input_tokens_seen": 42146335, + "step": 1945, + "time_per_iteration": 2.5421764850616455 + }, + { + "auxiliary_loss_clip": 0.01139528, + "auxiliary_loss_mlp": 0.01055575, + "balance_loss_clip": 1.04887474, + "balance_loss_mlp": 1.03505898, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.717402648392821, + "language_loss": 0.76006138, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.78201234, + "num_input_tokens_seen": 42165320, + "step": 1946, + "time_per_iteration": 2.604285478591919 + }, + { + "auxiliary_loss_clip": 0.01133639, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_clip": 1.05023527, + "balance_loss_mlp": 1.02748299, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.551343747733575, + "language_loss": 0.69104064, + "learning_rate": 3.921018163077448e-06, + "loss": 0.71284842, + "num_input_tokens_seen": 42182955, + "step": 1947, + "time_per_iteration": 2.5713398456573486 + }, + { + "auxiliary_loss_clip": 0.01151765, + "auxiliary_loss_mlp": 0.01059623, + "balance_loss_clip": 1.0568769, + "balance_loss_mlp": 1.03905916, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 2.1565321893870473, + "language_loss": 0.85395533, + "learning_rate": 3.920909759473295e-06, + "loss": 0.87606925, + "num_input_tokens_seen": 42200760, + "step": 1948, + "time_per_iteration": 2.545053243637085 + }, + { + "auxiliary_loss_clip": 0.01052717, + "auxiliary_loss_mlp": 0.00762155, + "balance_loss_clip": 1.02684569, + "balance_loss_mlp": 0.99992353, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8258104987802959, + "language_loss": 0.65113437, + "learning_rate": 3.920801283028054e-06, + "loss": 0.66928309, + "num_input_tokens_seen": 42265745, + "step": 1949, + "time_per_iteration": 3.1296536922454834 + }, + { + "auxiliary_loss_clip": 0.01154793, + "auxiliary_loss_mlp": 0.0105525, + "balance_loss_clip": 1.05576658, + "balance_loss_mlp": 1.03524673, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.652554031648199, + "language_loss": 0.71876031, + "learning_rate": 3.920692733745835e-06, + "loss": 0.7408607, + "num_input_tokens_seen": 42286245, + "step": 1950, + "time_per_iteration": 2.575782299041748 + }, + { + "auxiliary_loss_clip": 0.01171163, + "auxiliary_loss_mlp": 0.01055248, + "balance_loss_clip": 1.06042194, + "balance_loss_mlp": 1.03495848, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 3.5600042846631967, + "language_loss": 0.76867664, + "learning_rate": 3.920584111630755e-06, + "loss": 0.7909407, + "num_input_tokens_seen": 42302710, + "step": 1951, + "time_per_iteration": 2.5013997554779053 + }, + { + "auxiliary_loss_clip": 0.01126029, + "auxiliary_loss_mlp": 0.01062555, + "balance_loss_clip": 1.05207634, + "balance_loss_mlp": 1.04255152, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 4.296258769065146, + "language_loss": 0.76005059, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78193641, + "num_input_tokens_seen": 42324115, + "step": 1952, + "time_per_iteration": 2.676187038421631 + }, + { + "auxiliary_loss_clip": 0.0112403, + "auxiliary_loss_mlp": 0.01065253, + "balance_loss_clip": 1.0465169, + "balance_loss_mlp": 1.044451, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 1.9527842658044863, + "language_loss": 0.71974593, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74163872, + "num_input_tokens_seen": 42342505, + "step": 1953, + "time_per_iteration": 2.5758216381073 + }, + { + "auxiliary_loss_clip": 0.01145063, + "auxiliary_loss_mlp": 0.00791129, + "balance_loss_clip": 1.05137837, + "balance_loss_mlp": 1.00039232, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.402925216745969, + "language_loss": 0.7966392, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81600112, + "num_input_tokens_seen": 42360525, + "step": 1954, + "time_per_iteration": 2.5639760494232178 + }, + { + "auxiliary_loss_clip": 0.01112381, + "auxiliary_loss_mlp": 0.01055454, + "balance_loss_clip": 1.04979253, + "balance_loss_mlp": 1.03491378, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 2.1787824611088413, + "language_loss": 0.85597301, + "learning_rate": 3.920148894924246e-06, + "loss": 0.87765133, + "num_input_tokens_seen": 42377045, + "step": 1955, + "time_per_iteration": 2.636625051498413 + }, + { + "auxiliary_loss_clip": 0.01159354, + "auxiliary_loss_mlp": 0.00787991, + "balance_loss_clip": 1.04994416, + "balance_loss_mlp": 1.00052047, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 3.3125428097310614, + "language_loss": 0.78361839, + "learning_rate": 3.920039908706701e-06, + "loss": 0.80309188, + "num_input_tokens_seen": 42393960, + "step": 1956, + "time_per_iteration": 2.542607307434082 + }, + { + "auxiliary_loss_clip": 0.01154906, + "auxiliary_loss_mlp": 0.01055998, + "balance_loss_clip": 1.05496716, + "balance_loss_mlp": 1.03461146, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 2.1324728538142885, + "language_loss": 0.80481529, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82692432, + "num_input_tokens_seen": 42413160, + "step": 1957, + "time_per_iteration": 2.520036458969116 + }, + { + "auxiliary_loss_clip": 0.01164036, + "auxiliary_loss_mlp": 0.01051958, + "balance_loss_clip": 1.05630755, + "balance_loss_mlp": 1.03275371, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 2.586762538211415, + "language_loss": 0.77884495, + "learning_rate": 3.919821717851428e-06, + "loss": 0.80100489, + "num_input_tokens_seen": 42432590, + "step": 1958, + "time_per_iteration": 2.481658697128296 + }, + { + "auxiliary_loss_clip": 0.01145242, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_clip": 1.05310941, + "balance_loss_mlp": 1.02514076, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 2.1725631331195294, + "language_loss": 0.76450169, + "learning_rate": 3.919712513221976e-06, + "loss": 0.78641605, + "num_input_tokens_seen": 42450135, + "step": 1959, + "time_per_iteration": 2.5086166858673096 + }, + { + "auxiliary_loss_clip": 0.01152948, + "auxiliary_loss_mlp": 0.01051466, + "balance_loss_clip": 1.05321646, + "balance_loss_mlp": 1.03089082, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 2.063595149988874, + "language_loss": 0.70174026, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72378439, + "num_input_tokens_seen": 42470050, + "step": 1960, + "time_per_iteration": 4.080750942230225 + }, + { + "auxiliary_loss_clip": 0.01155197, + "auxiliary_loss_mlp": 0.0104813, + "balance_loss_clip": 1.05512226, + "balance_loss_mlp": 1.02747154, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 2.956726239374607, + "language_loss": 0.81298459, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83501786, + "num_input_tokens_seen": 42484335, + "step": 1961, + "time_per_iteration": 2.4982075691223145 + }, + { + "auxiliary_loss_clip": 0.01158266, + "auxiliary_loss_mlp": 0.00787815, + "balance_loss_clip": 1.05442894, + "balance_loss_mlp": 1.00036514, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 1.926060178934187, + "language_loss": 0.92448777, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94394857, + "num_input_tokens_seen": 42502720, + "step": 1962, + "time_per_iteration": 2.5467402935028076 + }, + { + "auxiliary_loss_clip": 0.01136677, + "auxiliary_loss_mlp": 0.01058498, + "balance_loss_clip": 1.05159271, + "balance_loss_mlp": 1.03832746, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 1.9373984828027473, + "language_loss": 0.87723261, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89918435, + "num_input_tokens_seen": 42519460, + "step": 1963, + "time_per_iteration": 4.011033535003662 + }, + { + "auxiliary_loss_clip": 0.01150714, + "auxiliary_loss_mlp": 0.00787186, + "balance_loss_clip": 1.05342221, + "balance_loss_mlp": 1.00042772, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 1.834469878358167, + "language_loss": 0.84043765, + "learning_rate": 3.919165398222265e-06, + "loss": 0.85981667, + "num_input_tokens_seen": 42539420, + "step": 1964, + "time_per_iteration": 2.565253734588623 + }, + { + "auxiliary_loss_clip": 0.01122515, + "auxiliary_loss_mlp": 0.01067763, + "balance_loss_clip": 1.04990399, + "balance_loss_mlp": 1.04681849, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 2.227677511546627, + "language_loss": 0.82754588, + "learning_rate": 3.919055756880879e-06, + "loss": 0.84944868, + "num_input_tokens_seen": 42558225, + "step": 1965, + "time_per_iteration": 2.568997383117676 + }, + { + "auxiliary_loss_clip": 0.01172918, + "auxiliary_loss_mlp": 0.01052617, + "balance_loss_clip": 1.05470777, + "balance_loss_mlp": 1.03266144, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.7555763619202627, + "language_loss": 0.74552518, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76778054, + "num_input_tokens_seen": 42580790, + "step": 1966, + "time_per_iteration": 2.7178306579589844 + }, + { + "auxiliary_loss_clip": 0.01153458, + "auxiliary_loss_mlp": 0.01055266, + "balance_loss_clip": 1.0578984, + "balance_loss_mlp": 1.03535843, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 2.609089037870072, + "language_loss": 0.7291019, + "learning_rate": 3.918836255889908e-06, + "loss": 0.75118911, + "num_input_tokens_seen": 42597355, + "step": 1967, + "time_per_iteration": 2.704791307449341 + }, + { + "auxiliary_loss_clip": 0.01159084, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_clip": 1.05475926, + "balance_loss_mlp": 1.02790415, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.1498568147701747, + "language_loss": 0.88202584, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90409923, + "num_input_tokens_seen": 42616060, + "step": 1968, + "time_per_iteration": 3.9222774505615234 + }, + { + "auxiliary_loss_clip": 0.01161526, + "auxiliary_loss_mlp": 0.01049447, + "balance_loss_clip": 1.06200528, + "balance_loss_mlp": 1.02915776, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 1.9222071596799626, + "language_loss": 0.66695571, + "learning_rate": 3.918616463849087e-06, + "loss": 0.6890654, + "num_input_tokens_seen": 42636285, + "step": 1969, + "time_per_iteration": 2.597621202468872 + }, + { + "auxiliary_loss_clip": 0.01127666, + "auxiliary_loss_mlp": 0.01055528, + "balance_loss_clip": 1.04907107, + "balance_loss_mlp": 1.03388, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.0742985391114694, + "language_loss": 0.80762649, + "learning_rate": 3.918506458695399e-06, + "loss": 0.82945848, + "num_input_tokens_seen": 42658320, + "step": 1970, + "time_per_iteration": 2.685896396636963 + }, + { + "auxiliary_loss_clip": 0.01061301, + "auxiliary_loss_mlp": 0.01002775, + "balance_loss_clip": 1.02651095, + "balance_loss_mlp": 0.99980658, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.7978886518472535, + "language_loss": 0.66159427, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68223512, + "num_input_tokens_seen": 42721500, + "step": 1971, + "time_per_iteration": 3.0568737983703613 + }, + { + "auxiliary_loss_clip": 0.01147427, + "auxiliary_loss_mlp": 0.01050035, + "balance_loss_clip": 1.05293107, + "balance_loss_mlp": 1.03015149, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 2.298173037790721, + "language_loss": 0.79647291, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81844753, + "num_input_tokens_seen": 42739825, + "step": 1972, + "time_per_iteration": 2.5760138034820557 + }, + { + "auxiliary_loss_clip": 0.01130158, + "auxiliary_loss_mlp": 0.00786605, + "balance_loss_clip": 1.05276275, + "balance_loss_mlp": 1.00035381, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.141868707403321, + "language_loss": 0.72407103, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74323863, + "num_input_tokens_seen": 42758695, + "step": 1973, + "time_per_iteration": 2.6276304721832275 + }, + { + "auxiliary_loss_clip": 0.01133581, + "auxiliary_loss_mlp": 0.01049676, + "balance_loss_clip": 1.05302942, + "balance_loss_mlp": 1.02899289, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 1.6420775052195136, + "language_loss": 0.71851337, + "learning_rate": 3.918065710622832e-06, + "loss": 0.74034595, + "num_input_tokens_seen": 42778510, + "step": 1974, + "time_per_iteration": 2.6196584701538086 + }, + { + "auxiliary_loss_clip": 0.01126074, + "auxiliary_loss_mlp": 0.01042934, + "balance_loss_clip": 1.04989243, + "balance_loss_mlp": 1.02293062, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.4108394087534326, + "language_loss": 0.77532029, + "learning_rate": 3.917955341761128e-06, + "loss": 0.79701042, + "num_input_tokens_seen": 42793995, + "step": 1975, + "time_per_iteration": 2.5338144302368164 + }, + { + "auxiliary_loss_clip": 0.01126023, + "auxiliary_loss_mlp": 0.01053042, + "balance_loss_clip": 1.05602562, + "balance_loss_mlp": 1.03392112, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.4182731462675484, + "language_loss": 0.75058877, + "learning_rate": 3.917844900170364e-06, + "loss": 0.7723794, + "num_input_tokens_seen": 42809000, + "step": 1976, + "time_per_iteration": 2.631248950958252 + }, + { + "auxiliary_loss_clip": 0.01161132, + "auxiliary_loss_mlp": 0.01050154, + "balance_loss_clip": 1.0560261, + "balance_loss_mlp": 1.03065157, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.5364032190738368, + "language_loss": 0.74888134, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77099419, + "num_input_tokens_seen": 42831585, + "step": 1977, + "time_per_iteration": 2.605619192123413 + }, + { + "auxiliary_loss_clip": 0.01173835, + "auxiliary_loss_mlp": 0.01055712, + "balance_loss_clip": 1.05455279, + "balance_loss_mlp": 1.0357089, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 2.184116743639055, + "language_loss": 0.73925829, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76155376, + "num_input_tokens_seen": 42848420, + "step": 1978, + "time_per_iteration": 4.137166261672974 + }, + { + "auxiliary_loss_clip": 0.01135931, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.05763042, + "balance_loss_mlp": 1.02793407, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.7608480607382333, + "language_loss": 0.73505074, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75688767, + "num_input_tokens_seen": 42866645, + "step": 1979, + "time_per_iteration": 2.549551010131836 + }, + { + "auxiliary_loss_clip": 0.01135156, + "auxiliary_loss_mlp": 0.01050761, + "balance_loss_clip": 1.05717731, + "balance_loss_mlp": 1.0312227, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.8375456953863636, + "language_loss": 0.98585558, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00771475, + "num_input_tokens_seen": 42888515, + "step": 1980, + "time_per_iteration": 2.700028896331787 + }, + { + "auxiliary_loss_clip": 0.01151442, + "auxiliary_loss_mlp": 0.01048606, + "balance_loss_clip": 1.05596757, + "balance_loss_mlp": 1.02742314, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.7601236951775365, + "language_loss": 0.86342949, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88542998, + "num_input_tokens_seen": 42909035, + "step": 1981, + "time_per_iteration": 2.5534610748291016 + }, + { + "auxiliary_loss_clip": 0.01154463, + "auxiliary_loss_mlp": 0.01061114, + "balance_loss_clip": 1.05828023, + "balance_loss_mlp": 1.03996658, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.8133637818275605, + "language_loss": 0.85123205, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87338781, + "num_input_tokens_seen": 42927555, + "step": 1982, + "time_per_iteration": 2.5735862255096436 + }, + { + "auxiliary_loss_clip": 0.01139554, + "auxiliary_loss_mlp": 0.01044767, + "balance_loss_clip": 1.04955721, + "balance_loss_mlp": 1.02454901, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 1.9366331920847852, + "language_loss": 0.84994996, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87179315, + "num_input_tokens_seen": 42945300, + "step": 1983, + "time_per_iteration": 2.52999210357666 + }, + { + "auxiliary_loss_clip": 0.01120939, + "auxiliary_loss_mlp": 0.01053016, + "balance_loss_clip": 1.05343795, + "balance_loss_mlp": 1.0324409, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.203109781326909, + "language_loss": 0.7671771, + "learning_rate": 3.916958749701277e-06, + "loss": 0.78891665, + "num_input_tokens_seen": 42961295, + "step": 1984, + "time_per_iteration": 2.6003708839416504 + }, + { + "auxiliary_loss_clip": 0.01156944, + "auxiliary_loss_mlp": 0.01059251, + "balance_loss_clip": 1.05589843, + "balance_loss_mlp": 1.03823423, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.7195274045624713, + "language_loss": 0.83117545, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85333741, + "num_input_tokens_seen": 42980330, + "step": 1985, + "time_per_iteration": 2.5255444049835205 + }, + { + "auxiliary_loss_clip": 0.0114878, + "auxiliary_loss_mlp": 0.01049975, + "balance_loss_clip": 1.05295563, + "balance_loss_mlp": 1.02955461, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 4.103506672348734, + "language_loss": 0.74245965, + "learning_rate": 3.916736485087216e-06, + "loss": 0.76444721, + "num_input_tokens_seen": 42996125, + "step": 1986, + "time_per_iteration": 2.5266664028167725 + }, + { + "auxiliary_loss_clip": 0.01142471, + "auxiliary_loss_mlp": 0.01055564, + "balance_loss_clip": 1.05134964, + "balance_loss_mlp": 1.03559601, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.1304200729712335, + "language_loss": 0.72022694, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74220729, + "num_input_tokens_seen": 43014180, + "step": 1987, + "time_per_iteration": 2.612283945083618 + }, + { + "auxiliary_loss_clip": 0.01149072, + "auxiliary_loss_mlp": 0.01051217, + "balance_loss_clip": 1.05065155, + "balance_loss_mlp": 1.03006923, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 4.642834081350003, + "language_loss": 0.71986842, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74187136, + "num_input_tokens_seen": 43032120, + "step": 1988, + "time_per_iteration": 2.5381247997283936 + }, + { + "auxiliary_loss_clip": 0.01156132, + "auxiliary_loss_mlp": 0.0105801, + "balance_loss_clip": 1.05174518, + "balance_loss_mlp": 1.03642106, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 2.0678046107887695, + "language_loss": 0.81098568, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83312714, + "num_input_tokens_seen": 43052215, + "step": 1989, + "time_per_iteration": 2.5279994010925293 + }, + { + "auxiliary_loss_clip": 0.01133093, + "auxiliary_loss_mlp": 0.01059243, + "balance_loss_clip": 1.04843974, + "balance_loss_mlp": 1.03616381, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 3.4092588695614134, + "language_loss": 0.74901932, + "learning_rate": 3.916291083698784e-06, + "loss": 0.77094269, + "num_input_tokens_seen": 43069720, + "step": 1990, + "time_per_iteration": 2.55549955368042 + }, + { + "auxiliary_loss_clip": 0.01055592, + "auxiliary_loss_mlp": 0.01005508, + "balance_loss_clip": 1.02737045, + "balance_loss_mlp": 1.00233674, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 1.3921132182317122, + "language_loss": 0.55283558, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57344663, + "num_input_tokens_seen": 43123130, + "step": 1991, + "time_per_iteration": 3.112800359725952 + }, + { + "auxiliary_loss_clip": 0.01129818, + "auxiliary_loss_mlp": 0.01052095, + "balance_loss_clip": 1.05416524, + "balance_loss_mlp": 1.0324378, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.378362813525763, + "language_loss": 0.78114104, + "learning_rate": 3.916067946991971e-06, + "loss": 0.80296028, + "num_input_tokens_seen": 43140015, + "step": 1992, + "time_per_iteration": 2.6352427005767822 + }, + { + "auxiliary_loss_clip": 0.01173683, + "auxiliary_loss_mlp": 0.01048972, + "balance_loss_clip": 1.05484474, + "balance_loss_mlp": 1.0285635, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.8957640860934801, + "language_loss": 0.79086792, + "learning_rate": 3.915956269650216e-06, + "loss": 0.81309438, + "num_input_tokens_seen": 43160105, + "step": 1993, + "time_per_iteration": 2.5971598625183105 + }, + { + "auxiliary_loss_clip": 0.01122315, + "auxiliary_loss_mlp": 0.01056837, + "balance_loss_clip": 1.04542804, + "balance_loss_mlp": 1.03678608, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 2.1436810170501115, + "language_loss": 0.82556438, + "learning_rate": 3.915844519655208e-06, + "loss": 0.8473559, + "num_input_tokens_seen": 43179835, + "step": 1994, + "time_per_iteration": 2.6230621337890625 + }, + { + "auxiliary_loss_clip": 0.01148081, + "auxiliary_loss_mlp": 0.0105534, + "balance_loss_clip": 1.05424118, + "balance_loss_mlp": 1.03588498, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.1423640920203644, + "language_loss": 0.88051331, + "learning_rate": 3.915732697011183e-06, + "loss": 0.90254748, + "num_input_tokens_seen": 43197210, + "step": 1995, + "time_per_iteration": 2.55560302734375 + }, + { + "auxiliary_loss_clip": 0.01145844, + "auxiliary_loss_mlp": 0.01052818, + "balance_loss_clip": 1.05626845, + "balance_loss_mlp": 1.03209925, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 1.9464753808475566, + "language_loss": 0.74250853, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76449513, + "num_input_tokens_seen": 43215050, + "step": 1996, + "time_per_iteration": 2.610278606414795 + }, + { + "auxiliary_loss_clip": 0.01137855, + "auxiliary_loss_mlp": 0.01046786, + "balance_loss_clip": 1.05536902, + "balance_loss_mlp": 1.02592468, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 2.0609430576655536, + "language_loss": 0.87941253, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90125889, + "num_input_tokens_seen": 43233900, + "step": 1997, + "time_per_iteration": 2.5905215740203857 + }, + { + "auxiliary_loss_clip": 0.01162106, + "auxiliary_loss_mlp": 0.00788274, + "balance_loss_clip": 1.05478764, + "balance_loss_mlp": 1.00034356, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 2.1219786749377216, + "language_loss": 0.78321284, + "learning_rate": 3.915396793227428e-06, + "loss": 0.80271667, + "num_input_tokens_seen": 43252105, + "step": 1998, + "time_per_iteration": 2.617291212081909 + }, + { + "auxiliary_loss_clip": 0.01160915, + "auxiliary_loss_mlp": 0.00787054, + "balance_loss_clip": 1.05450392, + "balance_loss_mlp": 1.00033081, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 1.7219176382660244, + "language_loss": 0.73413152, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75361121, + "num_input_tokens_seen": 43270315, + "step": 1999, + "time_per_iteration": 2.561990261077881 + }, + { + "auxiliary_loss_clip": 0.01172742, + "auxiliary_loss_mlp": 0.0106368, + "balance_loss_clip": 1.05551434, + "balance_loss_mlp": 1.04284203, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 2.4161561898471633, + "language_loss": 0.75183392, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77419817, + "num_input_tokens_seen": 43289935, + "step": 2000, + "time_per_iteration": 3.9698052406311035 + }, + { + "auxiliary_loss_clip": 0.01145164, + "auxiliary_loss_mlp": 0.0105017, + "balance_loss_clip": 1.05162156, + "balance_loss_mlp": 1.02964234, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.5877813399335559, + "language_loss": 0.85075384, + "learning_rate": 3.915060235755344e-06, + "loss": 0.87270719, + "num_input_tokens_seen": 43309325, + "step": 2001, + "time_per_iteration": 2.5441927909851074 + }, + { + "auxiliary_loss_clip": 0.01149742, + "auxiliary_loss_mlp": 0.01050594, + "balance_loss_clip": 1.05284977, + "balance_loss_mlp": 1.03090072, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.2200803948837584, + "language_loss": 0.74054343, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76254678, + "num_input_tokens_seen": 43327010, + "step": 2002, + "time_per_iteration": 3.996798515319824 + }, + { + "auxiliary_loss_clip": 0.01128066, + "auxiliary_loss_mlp": 0.01053649, + "balance_loss_clip": 1.05197084, + "balance_loss_mlp": 1.03179812, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 2.170454640902419, + "language_loss": 0.77865976, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80047691, + "num_input_tokens_seen": 43345650, + "step": 2003, + "time_per_iteration": 2.588685989379883 + }, + { + "auxiliary_loss_clip": 0.01155277, + "auxiliary_loss_mlp": 0.01049344, + "balance_loss_clip": 1.05238831, + "balance_loss_mlp": 1.0282203, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.752190657598395, + "language_loss": 0.71955234, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74159855, + "num_input_tokens_seen": 43365555, + "step": 2004, + "time_per_iteration": 2.540742874145508 + }, + { + "auxiliary_loss_clip": 0.01154487, + "auxiliary_loss_mlp": 0.01059105, + "balance_loss_clip": 1.0547688, + "balance_loss_mlp": 1.03684819, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 2.017998655166215, + "language_loss": 0.78288943, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80502528, + "num_input_tokens_seen": 43384990, + "step": 2005, + "time_per_iteration": 2.5807254314422607 + }, + { + "auxiliary_loss_clip": 0.01074735, + "auxiliary_loss_mlp": 0.00761174, + "balance_loss_clip": 1.02881861, + "balance_loss_mlp": 0.99991304, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.94024045861772, + "language_loss": 0.58091384, + "learning_rate": 3.914497854306543e-06, + "loss": 0.59927297, + "num_input_tokens_seen": 43436335, + "step": 2006, + "time_per_iteration": 2.83243465423584 + }, + { + "auxiliary_loss_clip": 0.01150261, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.05402076, + "balance_loss_mlp": 1.02677417, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.7360175188930853, + "language_loss": 0.76381737, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78578603, + "num_input_tokens_seen": 43456495, + "step": 2007, + "time_per_iteration": 3.946338176727295 + }, + { + "auxiliary_loss_clip": 0.01140657, + "auxiliary_loss_mlp": 0.01057041, + "balance_loss_clip": 1.05528295, + "balance_loss_mlp": 1.03550041, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 3.6373223185263894, + "language_loss": 0.8356756, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85765266, + "num_input_tokens_seen": 43473085, + "step": 2008, + "time_per_iteration": 2.566761016845703 + }, + { + "auxiliary_loss_clip": 0.01172076, + "auxiliary_loss_mlp": 0.01048983, + "balance_loss_clip": 1.05509973, + "balance_loss_mlp": 1.02851486, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.3230083010283455, + "language_loss": 0.83994633, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86215687, + "num_input_tokens_seen": 43491135, + "step": 2009, + "time_per_iteration": 2.4690215587615967 + }, + { + "auxiliary_loss_clip": 0.01170822, + "auxiliary_loss_mlp": 0.01053607, + "balance_loss_clip": 1.05579782, + "balance_loss_mlp": 1.031196, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.8136126651679327, + "language_loss": 0.84347177, + "learning_rate": 3.914046642358844e-06, + "loss": 0.8657161, + "num_input_tokens_seen": 43510440, + "step": 2010, + "time_per_iteration": 2.4921934604644775 + }, + { + "auxiliary_loss_clip": 0.01143567, + "auxiliary_loss_mlp": 0.00789177, + "balance_loss_clip": 1.05275059, + "balance_loss_mlp": 1.00032425, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.8437611896793933, + "language_loss": 0.84300447, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.86233199, + "num_input_tokens_seen": 43530145, + "step": 2011, + "time_per_iteration": 2.526916027069092 + }, + { + "auxiliary_loss_clip": 0.01146071, + "auxiliary_loss_mlp": 0.0105911, + "balance_loss_clip": 1.05212545, + "balance_loss_mlp": 1.03872502, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 2.0073375571575474, + "language_loss": 0.9625057, + "learning_rate": 3.913820600882834e-06, + "loss": 0.98455751, + "num_input_tokens_seen": 43549315, + "step": 2012, + "time_per_iteration": 2.577059507369995 + }, + { + "auxiliary_loss_clip": 0.01139957, + "auxiliary_loss_mlp": 0.01047637, + "balance_loss_clip": 1.05277491, + "balance_loss_mlp": 1.02669203, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.3791727249245853, + "language_loss": 0.8073442, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82922012, + "num_input_tokens_seen": 43569240, + "step": 2013, + "time_per_iteration": 2.614588499069214 + }, + { + "auxiliary_loss_clip": 0.01119794, + "auxiliary_loss_mlp": 0.01052526, + "balance_loss_clip": 1.04754663, + "balance_loss_mlp": 1.0300436, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 4.890191771947657, + "language_loss": 0.76486516, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.78658831, + "num_input_tokens_seen": 43587710, + "step": 2014, + "time_per_iteration": 2.592043161392212 + }, + { + "auxiliary_loss_clip": 0.01161106, + "auxiliary_loss_mlp": 0.0104411, + "balance_loss_clip": 1.05368304, + "balance_loss_mlp": 1.02247357, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 1.9062109361702095, + "language_loss": 0.8688966, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89094871, + "num_input_tokens_seen": 43606000, + "step": 2015, + "time_per_iteration": 2.530132532119751 + }, + { + "auxiliary_loss_clip": 0.0116668, + "auxiliary_loss_mlp": 0.01048197, + "balance_loss_clip": 1.05273914, + "balance_loss_mlp": 1.02774048, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 1.9862447165437382, + "language_loss": 0.69492507, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71707392, + "num_input_tokens_seen": 43624815, + "step": 2016, + "time_per_iteration": 2.4523675441741943 + }, + { + "auxiliary_loss_clip": 0.01150868, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.05589557, + "balance_loss_mlp": 1.02356005, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 2.524615683920939, + "language_loss": 0.80887502, + "learning_rate": 3.913254227253225e-06, + "loss": 0.83085275, + "num_input_tokens_seen": 43643960, + "step": 2017, + "time_per_iteration": 4.033367156982422 + }, + { + "auxiliary_loss_clip": 0.01153713, + "auxiliary_loss_mlp": 0.01050789, + "balance_loss_clip": 1.05120182, + "balance_loss_mlp": 1.02809143, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 3.950811082720264, + "language_loss": 0.69207418, + "learning_rate": 3.913140734857731e-06, + "loss": 0.7141192, + "num_input_tokens_seen": 43662650, + "step": 2018, + "time_per_iteration": 2.493242025375366 + }, + { + "auxiliary_loss_clip": 0.01136535, + "auxiliary_loss_mlp": 0.01051029, + "balance_loss_clip": 1.0573678, + "balance_loss_mlp": 1.03028703, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 1.7781312670035474, + "language_loss": 0.72472823, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74660379, + "num_input_tokens_seen": 43684205, + "step": 2019, + "time_per_iteration": 2.6261119842529297 + }, + { + "auxiliary_loss_clip": 0.0110328, + "auxiliary_loss_mlp": 0.01060551, + "balance_loss_clip": 1.05013299, + "balance_loss_mlp": 1.03850925, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.7431983510191504, + "language_loss": 0.91751963, + "learning_rate": 3.912913532431586e-06, + "loss": 0.93915796, + "num_input_tokens_seen": 43706320, + "step": 2020, + "time_per_iteration": 2.6835832595825195 + }, + { + "auxiliary_loss_clip": 0.01141235, + "auxiliary_loss_mlp": 0.01052402, + "balance_loss_clip": 1.05139971, + "balance_loss_mlp": 1.0310992, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 2.052007695871934, + "language_loss": 0.77547735, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79741371, + "num_input_tokens_seen": 43724805, + "step": 2021, + "time_per_iteration": 2.562549591064453 + }, + { + "auxiliary_loss_clip": 0.01172153, + "auxiliary_loss_mlp": 0.01048357, + "balance_loss_clip": 1.05729723, + "balance_loss_mlp": 1.02782893, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.0006377196011753, + "language_loss": 0.80353928, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82574445, + "num_input_tokens_seen": 43742320, + "step": 2022, + "time_per_iteration": 2.489333391189575 + }, + { + "auxiliary_loss_clip": 0.01144874, + "auxiliary_loss_mlp": 0.01060678, + "balance_loss_clip": 1.05409217, + "balance_loss_mlp": 1.03714573, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.83849678909607, + "language_loss": 0.85045528, + "learning_rate": 3.912572184769108e-06, + "loss": 0.87251079, + "num_input_tokens_seen": 43760665, + "step": 2023, + "time_per_iteration": 2.509455442428589 + }, + { + "auxiliary_loss_clip": 0.01144942, + "auxiliary_loss_mlp": 0.01053305, + "balance_loss_clip": 1.05514789, + "balance_loss_mlp": 1.03112054, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.2587710166157784, + "language_loss": 0.85137856, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87336099, + "num_input_tokens_seen": 43779020, + "step": 2024, + "time_per_iteration": 2.551295280456543 + }, + { + "auxiliary_loss_clip": 0.01169073, + "auxiliary_loss_mlp": 0.01057404, + "balance_loss_clip": 1.05227566, + "balance_loss_mlp": 1.03654206, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 1.9302823861707765, + "language_loss": 0.71975285, + "learning_rate": 3.912344257028954e-06, + "loss": 0.74201763, + "num_input_tokens_seen": 43798850, + "step": 2025, + "time_per_iteration": 2.528602361679077 + }, + { + "auxiliary_loss_clip": 0.01150369, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_clip": 1.0552597, + "balance_loss_mlp": 1.02692068, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 1.6785979410903853, + "language_loss": 0.75956231, + "learning_rate": 3.912230184382286e-06, + "loss": 0.78154588, + "num_input_tokens_seen": 43820130, + "step": 2026, + "time_per_iteration": 2.5806639194488525 + }, + { + "auxiliary_loss_clip": 0.01149656, + "auxiliary_loss_mlp": 0.01047396, + "balance_loss_clip": 1.05222368, + "balance_loss_mlp": 1.02667785, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 3.1455311700894857, + "language_loss": 0.8921628, + "learning_rate": 3.912116039223659e-06, + "loss": 0.91413331, + "num_input_tokens_seen": 43838485, + "step": 2027, + "time_per_iteration": 2.518023729324341 + }, + { + "auxiliary_loss_clip": 0.01148348, + "auxiliary_loss_mlp": 0.01056591, + "balance_loss_clip": 1.05454564, + "balance_loss_mlp": 1.03779221, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 1.9796066675032544, + "language_loss": 0.75885338, + "learning_rate": 3.912001821557399e-06, + "loss": 0.78090274, + "num_input_tokens_seen": 43859080, + "step": 2028, + "time_per_iteration": 2.587980270385742 + }, + { + "auxiliary_loss_clip": 0.0112504, + "auxiliary_loss_mlp": 0.01053157, + "balance_loss_clip": 1.05348599, + "balance_loss_mlp": 1.03202128, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.3058210622153203, + "language_loss": 0.77315843, + "learning_rate": 3.911887531387839e-06, + "loss": 0.79494035, + "num_input_tokens_seen": 43879030, + "step": 2029, + "time_per_iteration": 2.651611089706421 + }, + { + "auxiliary_loss_clip": 0.01160419, + "auxiliary_loss_mlp": 0.0105406, + "balance_loss_clip": 1.05565906, + "balance_loss_mlp": 1.03335297, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 1.9890512338544601, + "language_loss": 0.79259849, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81474328, + "num_input_tokens_seen": 43898505, + "step": 2030, + "time_per_iteration": 2.554044008255005 + }, + { + "auxiliary_loss_clip": 0.01174351, + "auxiliary_loss_mlp": 0.01051122, + "balance_loss_clip": 1.05944157, + "balance_loss_mlp": 1.02925873, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 3.9073399791936425, + "language_loss": 0.74782664, + "learning_rate": 3.911658733556155e-06, + "loss": 0.7700814, + "num_input_tokens_seen": 43917945, + "step": 2031, + "time_per_iteration": 2.5027356147766113 + }, + { + "auxiliary_loss_clip": 0.01174658, + "auxiliary_loss_mlp": 0.01048432, + "balance_loss_clip": 1.05984712, + "balance_loss_mlp": 1.02917957, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.697163949994477, + "language_loss": 0.75404483, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77627575, + "num_input_tokens_seen": 43937385, + "step": 2032, + "time_per_iteration": 2.4659481048583984 + }, + { + "auxiliary_loss_clip": 0.01154027, + "auxiliary_loss_mlp": 0.01045318, + "balance_loss_clip": 1.054214, + "balance_loss_mlp": 1.02574396, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.5535648149716579, + "language_loss": 0.88946497, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91145849, + "num_input_tokens_seen": 43958130, + "step": 2033, + "time_per_iteration": 2.5348076820373535 + }, + { + "auxiliary_loss_clip": 0.01159091, + "auxiliary_loss_mlp": 0.01050839, + "balance_loss_clip": 1.05968857, + "balance_loss_mlp": 1.03022778, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.6974209813863057, + "language_loss": 0.65265763, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67475694, + "num_input_tokens_seen": 43976800, + "step": 2034, + "time_per_iteration": 2.5362017154693604 + }, + { + "auxiliary_loss_clip": 0.0115358, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_clip": 1.05763555, + "balance_loss_mlp": 1.03132391, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.5619518699488208, + "language_loss": 0.7654227, + "learning_rate": 3.911200268044055e-06, + "loss": 0.7874859, + "num_input_tokens_seen": 43996620, + "step": 2035, + "time_per_iteration": 2.5318236351013184 + }, + { + "auxiliary_loss_clip": 0.01176028, + "auxiliary_loss_mlp": 0.0104683, + "balance_loss_clip": 1.05803013, + "balance_loss_mlp": 1.02608776, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 3.0416748138661673, + "language_loss": 0.71452522, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73675382, + "num_input_tokens_seen": 44016175, + "step": 2036, + "time_per_iteration": 2.463789939880371 + }, + { + "auxiliary_loss_clip": 0.01148506, + "auxiliary_loss_mlp": 0.0105746, + "balance_loss_clip": 1.06241679, + "balance_loss_mlp": 1.03607416, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 2.0009636605528063, + "language_loss": 0.83356833, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85562801, + "num_input_tokens_seen": 44035060, + "step": 2037, + "time_per_iteration": 2.529242753982544 + }, + { + "auxiliary_loss_clip": 0.01154204, + "auxiliary_loss_mlp": 0.01059879, + "balance_loss_clip": 1.05576944, + "balance_loss_mlp": 1.03842139, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 3.7705751959197666, + "language_loss": 0.7984075, + "learning_rate": 3.910855657929267e-06, + "loss": 0.8205483, + "num_input_tokens_seen": 44053330, + "step": 2038, + "time_per_iteration": 2.5728743076324463 + }, + { + "auxiliary_loss_clip": 0.01072468, + "auxiliary_loss_mlp": 0.00760854, + "balance_loss_clip": 1.03612208, + "balance_loss_mlp": 0.9997924, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.825790934159472, + "language_loss": 0.58666921, + "learning_rate": 3.910740642965518e-06, + "loss": 0.6050024, + "num_input_tokens_seen": 44107575, + "step": 2039, + "time_per_iteration": 2.9380440711975098 + }, + { + "auxiliary_loss_clip": 0.01129466, + "auxiliary_loss_mlp": 0.01055055, + "balance_loss_clip": 1.05092323, + "balance_loss_mlp": 1.03101039, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.1570416272741846, + "language_loss": 0.80771101, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82955623, + "num_input_tokens_seen": 44126075, + "step": 2040, + "time_per_iteration": 3.9786877632141113 + }, + { + "auxiliary_loss_clip": 0.01149262, + "auxiliary_loss_mlp": 0.01050287, + "balance_loss_clip": 1.05589354, + "balance_loss_mlp": 1.02972293, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.8478135098402408, + "language_loss": 0.82712954, + "learning_rate": 3.910510395675953e-06, + "loss": 0.84912503, + "num_input_tokens_seen": 44145605, + "step": 2041, + "time_per_iteration": 2.5414674282073975 + }, + { + "auxiliary_loss_clip": 0.01136204, + "auxiliary_loss_mlp": 0.01058827, + "balance_loss_clip": 1.04897404, + "balance_loss_mlp": 1.03679681, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.9026607514709946, + "language_loss": 0.67123407, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69318438, + "num_input_tokens_seen": 44164770, + "step": 2042, + "time_per_iteration": 3.9078171253204346 + }, + { + "auxiliary_loss_clip": 0.01133283, + "auxiliary_loss_mlp": 0.01049892, + "balance_loss_clip": 1.04964733, + "balance_loss_mlp": 1.02872086, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.7529512992467826, + "language_loss": 0.81954908, + "learning_rate": 3.910279858599409e-06, + "loss": 0.84138083, + "num_input_tokens_seen": 44184025, + "step": 2043, + "time_per_iteration": 2.570206880569458 + }, + { + "auxiliary_loss_clip": 0.01148209, + "auxiliary_loss_mlp": 0.01047787, + "balance_loss_clip": 1.05254745, + "balance_loss_mlp": 1.02716351, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 9.915685297964867, + "language_loss": 0.80134219, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82330215, + "num_input_tokens_seen": 44202950, + "step": 2044, + "time_per_iteration": 2.5151524543762207 + }, + { + "auxiliary_loss_clip": 0.01118756, + "auxiliary_loss_mlp": 0.01050277, + "balance_loss_clip": 1.05507159, + "balance_loss_mlp": 1.02890277, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.889530408068875, + "language_loss": 0.78303802, + "learning_rate": 3.910049031770853e-06, + "loss": 0.80472839, + "num_input_tokens_seen": 44221115, + "step": 2045, + "time_per_iteration": 2.6343936920166016 + }, + { + "auxiliary_loss_clip": 0.01166711, + "auxiliary_loss_mlp": 0.01065409, + "balance_loss_clip": 1.06061566, + "balance_loss_mlp": 1.04373646, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 1.8320861801329267, + "language_loss": 0.6682111, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69053227, + "num_input_tokens_seen": 44240575, + "step": 2046, + "time_per_iteration": 2.496155023574829 + }, + { + "auxiliary_loss_clip": 0.01173354, + "auxiliary_loss_mlp": 0.01057097, + "balance_loss_clip": 1.05956292, + "balance_loss_mlp": 1.03608012, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.1083988624068244, + "language_loss": 0.72463769, + "learning_rate": 3.909817915225297e-06, + "loss": 0.74694222, + "num_input_tokens_seen": 44257145, + "step": 2047, + "time_per_iteration": 3.833371162414551 + }, + { + "auxiliary_loss_clip": 0.0115924, + "auxiliary_loss_mlp": 0.01062836, + "balance_loss_clip": 1.05745232, + "balance_loss_mlp": 1.04081845, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.696718603467242, + "language_loss": 0.7651993, + "learning_rate": 3.909702248319597e-06, + "loss": 0.78742003, + "num_input_tokens_seen": 44278035, + "step": 2048, + "time_per_iteration": 2.5323879718780518 + }, + { + "auxiliary_loss_clip": 0.01145629, + "auxiliary_loss_mlp": 0.01050017, + "balance_loss_clip": 1.05427206, + "balance_loss_mlp": 1.03144431, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 1.976111970156717, + "language_loss": 0.85476488, + "learning_rate": 3.909586508997797e-06, + "loss": 0.87672138, + "num_input_tokens_seen": 44296980, + "step": 2049, + "time_per_iteration": 2.554129123687744 + }, + { + "auxiliary_loss_clip": 0.01119008, + "auxiliary_loss_mlp": 0.0105184, + "balance_loss_clip": 1.05375516, + "balance_loss_mlp": 1.03116894, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 1.8488438521193578, + "language_loss": 0.75363469, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77534318, + "num_input_tokens_seen": 44318005, + "step": 2050, + "time_per_iteration": 2.6211488246917725 + }, + { + "auxiliary_loss_clip": 0.01140849, + "auxiliary_loss_mlp": 0.01055415, + "balance_loss_clip": 1.05647182, + "balance_loss_mlp": 1.03468406, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 2.8258680857596876, + "language_loss": 0.80727339, + "learning_rate": 3.909354813123452e-06, + "loss": 0.82923603, + "num_input_tokens_seen": 44335260, + "step": 2051, + "time_per_iteration": 2.5747528076171875 + }, + { + "auxiliary_loss_clip": 0.01173233, + "auxiliary_loss_mlp": 0.00788618, + "balance_loss_clip": 1.05974102, + "balance_loss_mlp": 1.00026631, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 17.22149064672077, + "language_loss": 0.80220103, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82181954, + "num_input_tokens_seen": 44355315, + "step": 2052, + "time_per_iteration": 2.500577926635742 + }, + { + "auxiliary_loss_clip": 0.01165438, + "auxiliary_loss_mlp": 0.01055465, + "balance_loss_clip": 1.05571914, + "balance_loss_mlp": 1.03368545, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 1.9898419722793599, + "language_loss": 0.74123812, + "learning_rate": 3.909122827637406e-06, + "loss": 0.76344711, + "num_input_tokens_seen": 44373020, + "step": 2053, + "time_per_iteration": 2.536297559738159 + }, + { + "auxiliary_loss_clip": 0.01173463, + "auxiliary_loss_mlp": 0.00786547, + "balance_loss_clip": 1.05528748, + "balance_loss_mlp": 1.00025809, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 2.3876318939911068, + "language_loss": 0.74355704, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76315713, + "num_input_tokens_seen": 44397525, + "step": 2054, + "time_per_iteration": 2.6981730461120605 + }, + { + "auxiliary_loss_clip": 0.01150747, + "auxiliary_loss_mlp": 0.01041586, + "balance_loss_clip": 1.05655313, + "balance_loss_mlp": 1.02283406, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 1.8232474493266515, + "language_loss": 0.8484925, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87041587, + "num_input_tokens_seen": 44415890, + "step": 2055, + "time_per_iteration": 2.5853097438812256 + }, + { + "auxiliary_loss_clip": 0.01131898, + "auxiliary_loss_mlp": 0.01049141, + "balance_loss_clip": 1.05749762, + "balance_loss_mlp": 1.02987671, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 2.3338738309521077, + "language_loss": 0.77421021, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79602063, + "num_input_tokens_seen": 44436625, + "step": 2056, + "time_per_iteration": 2.656010866165161 + }, + { + "auxiliary_loss_clip": 0.01159331, + "auxiliary_loss_mlp": 0.01054866, + "balance_loss_clip": 1.05327368, + "balance_loss_mlp": 1.03452897, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 2.1771887525898785, + "language_loss": 0.83331764, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85545957, + "num_input_tokens_seen": 44455265, + "step": 2057, + "time_per_iteration": 4.047022104263306 + }, + { + "auxiliary_loss_clip": 0.01147243, + "auxiliary_loss_mlp": 0.01059021, + "balance_loss_clip": 1.05343866, + "balance_loss_mlp": 1.03744411, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.65346921201157, + "language_loss": 0.78099674, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80305934, + "num_input_tokens_seen": 44475815, + "step": 2058, + "time_per_iteration": 2.572338342666626 + }, + { + "auxiliary_loss_clip": 0.01141706, + "auxiliary_loss_mlp": 0.01058503, + "balance_loss_clip": 1.05280006, + "balance_loss_mlp": 1.03604436, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.098711592380891, + "language_loss": 0.83176655, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85376871, + "num_input_tokens_seen": 44494045, + "step": 2059, + "time_per_iteration": 2.50699782371521 + }, + { + "auxiliary_loss_clip": 0.01138558, + "auxiliary_loss_mlp": 0.01067926, + "balance_loss_clip": 1.05410671, + "balance_loss_mlp": 1.04484713, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.59411845814705, + "language_loss": 0.81481791, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83688277, + "num_input_tokens_seen": 44509120, + "step": 2060, + "time_per_iteration": 2.517298936843872 + }, + { + "auxiliary_loss_clip": 0.01154198, + "auxiliary_loss_mlp": 0.01051497, + "balance_loss_clip": 1.05571389, + "balance_loss_mlp": 1.02976513, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 2.129700116877344, + "language_loss": 0.8629393, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88499629, + "num_input_tokens_seen": 44525780, + "step": 2061, + "time_per_iteration": 2.490999937057495 + }, + { + "auxiliary_loss_clip": 0.01158209, + "auxiliary_loss_mlp": 0.01049787, + "balance_loss_clip": 1.0554496, + "balance_loss_mlp": 1.03071332, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 1.884354291723494, + "language_loss": 0.84972739, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87180734, + "num_input_tokens_seen": 44543125, + "step": 2062, + "time_per_iteration": 2.4845337867736816 + }, + { + "auxiliary_loss_clip": 0.01131014, + "auxiliary_loss_mlp": 0.01059192, + "balance_loss_clip": 1.05458939, + "balance_loss_mlp": 1.03734112, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 2.3640226741633232, + "language_loss": 0.78783572, + "learning_rate": 3.907958557264774e-06, + "loss": 0.8097378, + "num_input_tokens_seen": 44560275, + "step": 2063, + "time_per_iteration": 2.5258708000183105 + }, + { + "auxiliary_loss_clip": 0.01127752, + "auxiliary_loss_mlp": 0.0105613, + "balance_loss_clip": 1.05703425, + "balance_loss_mlp": 1.03450525, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.4654297856300436, + "language_loss": 0.80096436, + "learning_rate": 3.907841732229663e-06, + "loss": 0.8228032, + "num_input_tokens_seen": 44577640, + "step": 2064, + "time_per_iteration": 2.573765277862549 + }, + { + "auxiliary_loss_clip": 0.0114715, + "auxiliary_loss_mlp": 0.01055731, + "balance_loss_clip": 1.05546343, + "balance_loss_mlp": 1.0349772, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.226540783606575, + "language_loss": 0.92842591, + "learning_rate": 3.907724834849002e-06, + "loss": 0.95045471, + "num_input_tokens_seen": 44594860, + "step": 2065, + "time_per_iteration": 2.5590627193450928 + }, + { + "auxiliary_loss_clip": 0.01151124, + "auxiliary_loss_mlp": 0.01049986, + "balance_loss_clip": 1.0515877, + "balance_loss_mlp": 1.02924347, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.8783838450065595, + "language_loss": 0.80754775, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82955885, + "num_input_tokens_seen": 44614780, + "step": 2066, + "time_per_iteration": 2.5524704456329346 + }, + { + "auxiliary_loss_clip": 0.01032954, + "auxiliary_loss_mlp": 0.0100698, + "balance_loss_clip": 1.02535868, + "balance_loss_mlp": 1.0036186, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8708086790424755, + "language_loss": 0.63311493, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65351427, + "num_input_tokens_seen": 44671240, + "step": 2067, + "time_per_iteration": 3.0993430614471436 + }, + { + "auxiliary_loss_clip": 0.0112284, + "auxiliary_loss_mlp": 0.01062543, + "balance_loss_clip": 1.04917657, + "balance_loss_mlp": 1.03986907, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 2.158105524468894, + "language_loss": 0.93128765, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95314145, + "num_input_tokens_seen": 44691050, + "step": 2068, + "time_per_iteration": 2.632009983062744 + }, + { + "auxiliary_loss_clip": 0.01167444, + "auxiliary_loss_mlp": 0.01051028, + "balance_loss_clip": 1.05984843, + "balance_loss_mlp": 1.03247881, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.7762742594649144, + "language_loss": 0.81257153, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83475626, + "num_input_tokens_seen": 44709850, + "step": 2069, + "time_per_iteration": 2.4981863498687744 + }, + { + "auxiliary_loss_clip": 0.011194, + "auxiliary_loss_mlp": 0.01063998, + "balance_loss_clip": 1.05134416, + "balance_loss_mlp": 1.04175353, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.6647236486068235, + "language_loss": 0.77467507, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79650909, + "num_input_tokens_seen": 44731475, + "step": 2070, + "time_per_iteration": 2.6365833282470703 + }, + { + "auxiliary_loss_clip": 0.01165195, + "auxiliary_loss_mlp": 0.01052605, + "balance_loss_clip": 1.05867755, + "balance_loss_mlp": 1.03098083, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.7594267892009468, + "language_loss": 0.8082515, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83042949, + "num_input_tokens_seen": 44749685, + "step": 2071, + "time_per_iteration": 2.538444757461548 + }, + { + "auxiliary_loss_clip": 0.01160588, + "auxiliary_loss_mlp": 0.0105288, + "balance_loss_clip": 1.06030846, + "balance_loss_mlp": 1.03164935, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.7177182785771616, + "language_loss": 0.78198957, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80412424, + "num_input_tokens_seen": 44772165, + "step": 2072, + "time_per_iteration": 2.619974136352539 + }, + { + "auxiliary_loss_clip": 0.01151462, + "auxiliary_loss_mlp": 0.01052715, + "balance_loss_clip": 1.06090426, + "balance_loss_mlp": 1.0326755, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 2.746744421384616, + "language_loss": 0.75159431, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77363604, + "num_input_tokens_seen": 44790580, + "step": 2073, + "time_per_iteration": 2.549799680709839 + }, + { + "auxiliary_loss_clip": 0.01100044, + "auxiliary_loss_mlp": 0.0105378, + "balance_loss_clip": 1.0445528, + "balance_loss_mlp": 1.03220308, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 2.0593877518450436, + "language_loss": 0.90375322, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92529148, + "num_input_tokens_seen": 44806730, + "step": 2074, + "time_per_iteration": 2.5864973068237305 + }, + { + "auxiliary_loss_clip": 0.01109093, + "auxiliary_loss_mlp": 0.01055297, + "balance_loss_clip": 1.0489583, + "balance_loss_mlp": 1.03205085, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.673360612390568, + "language_loss": 0.84022671, + "learning_rate": 3.906551883013728e-06, + "loss": 0.86187065, + "num_input_tokens_seen": 44825550, + "step": 2075, + "time_per_iteration": 2.654832601547241 + }, + { + "auxiliary_loss_clip": 0.01112397, + "auxiliary_loss_mlp": 0.01055905, + "balance_loss_clip": 1.04442465, + "balance_loss_mlp": 1.03394628, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 2.122315448375214, + "language_loss": 0.73557675, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.75725973, + "num_input_tokens_seen": 44844155, + "step": 2076, + "time_per_iteration": 2.592893123626709 + }, + { + "auxiliary_loss_clip": 0.01105828, + "auxiliary_loss_mlp": 0.01041205, + "balance_loss_clip": 1.04816806, + "balance_loss_mlp": 1.02103472, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.9752218317184853, + "language_loss": 0.7578367, + "learning_rate": 3.906316424944469e-06, + "loss": 0.77930701, + "num_input_tokens_seen": 44863780, + "step": 2077, + "time_per_iteration": 2.59157133102417 + }, + { + "auxiliary_loss_clip": 0.01160006, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_clip": 1.0557847, + "balance_loss_mlp": 1.03171206, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 2.555048481326205, + "language_loss": 0.82487881, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84700751, + "num_input_tokens_seen": 44881480, + "step": 2078, + "time_per_iteration": 3.9853291511535645 + }, + { + "auxiliary_loss_clip": 0.01148818, + "auxiliary_loss_mlp": 0.01049597, + "balance_loss_clip": 1.05439496, + "balance_loss_mlp": 1.02792442, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 2.1501632259898584, + "language_loss": 0.75443089, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77641499, + "num_input_tokens_seen": 44900390, + "step": 2079, + "time_per_iteration": 2.5420773029327393 + }, + { + "auxiliary_loss_clip": 0.01170813, + "auxiliary_loss_mlp": 0.01058114, + "balance_loss_clip": 1.06097603, + "balance_loss_mlp": 1.03662086, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.348105199880116, + "language_loss": 0.83711034, + "learning_rate": 3.905962695693935e-06, + "loss": 0.85939962, + "num_input_tokens_seen": 44920375, + "step": 2080, + "time_per_iteration": 4.0212624073028564 + }, + { + "auxiliary_loss_clip": 0.01159464, + "auxiliary_loss_mlp": 0.01062115, + "balance_loss_clip": 1.05675197, + "balance_loss_mlp": 1.04102731, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 5.022266651344761, + "language_loss": 0.85143387, + "learning_rate": 3.9058446413892e-06, + "loss": 0.8736496, + "num_input_tokens_seen": 44938415, + "step": 2081, + "time_per_iteration": 2.4705142974853516 + }, + { + "auxiliary_loss_clip": 0.0115915, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_clip": 1.05530047, + "balance_loss_mlp": 1.02656519, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.6722404065029448, + "language_loss": 0.76693249, + "learning_rate": 3.905726514814646e-06, + "loss": 0.78898287, + "num_input_tokens_seen": 44957135, + "step": 2082, + "time_per_iteration": 2.4931514263153076 + }, + { + "auxiliary_loss_clip": 0.01162388, + "auxiliary_loss_mlp": 0.0105407, + "balance_loss_clip": 1.06393361, + "balance_loss_mlp": 1.03223109, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.662140233609257, + "language_loss": 0.79031932, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81248391, + "num_input_tokens_seen": 44974480, + "step": 2083, + "time_per_iteration": 2.590010404586792 + }, + { + "auxiliary_loss_clip": 0.01148362, + "auxiliary_loss_mlp": 0.01053056, + "balance_loss_clip": 1.05453587, + "balance_loss_mlp": 1.03076386, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.846489522814048, + "language_loss": 0.89952546, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.92153966, + "num_input_tokens_seen": 44990310, + "step": 2084, + "time_per_iteration": 2.512991428375244 + }, + { + "auxiliary_loss_clip": 0.01135833, + "auxiliary_loss_mlp": 0.0105049, + "balance_loss_clip": 1.05393815, + "balance_loss_mlp": 1.0297718, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 1.7505983868593764, + "language_loss": 0.80110592, + "learning_rate": 3.905371701516869e-06, + "loss": 0.82296908, + "num_input_tokens_seen": 45010720, + "step": 2085, + "time_per_iteration": 2.601482391357422 + }, + { + "auxiliary_loss_clip": 0.01174305, + "auxiliary_loss_mlp": 0.01051037, + "balance_loss_clip": 1.05869484, + "balance_loss_mlp": 1.02973402, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.951655777060629, + "language_loss": 0.88225186, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90450525, + "num_input_tokens_seen": 45030360, + "step": 2086, + "time_per_iteration": 3.829864978790283 + }, + { + "auxiliary_loss_clip": 0.01138736, + "auxiliary_loss_mlp": 0.01047783, + "balance_loss_clip": 1.05435681, + "balance_loss_mlp": 1.02804232, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 3.2001651808590683, + "language_loss": 0.87380779, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89567298, + "num_input_tokens_seen": 45045085, + "step": 2087, + "time_per_iteration": 2.4980859756469727 + }, + { + "auxiliary_loss_clip": 0.01150579, + "auxiliary_loss_mlp": 0.01057496, + "balance_loss_clip": 1.05866122, + "balance_loss_mlp": 1.03601432, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 7.047690166190429, + "language_loss": 0.73537087, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75745159, + "num_input_tokens_seen": 45065145, + "step": 2088, + "time_per_iteration": 2.552931070327759 + }, + { + "auxiliary_loss_clip": 0.01062087, + "auxiliary_loss_mlp": 0.0100161, + "balance_loss_clip": 1.02512181, + "balance_loss_mlp": 0.99834388, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.7737664973863669, + "language_loss": 0.61719763, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63783467, + "num_input_tokens_seen": 45126230, + "step": 2089, + "time_per_iteration": 3.0455973148345947 + }, + { + "auxiliary_loss_clip": 0.01149903, + "auxiliary_loss_mlp": 0.01059886, + "balance_loss_clip": 1.05575752, + "balance_loss_mlp": 1.03894091, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 2.1895548359908044, + "language_loss": 0.77834713, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80044508, + "num_input_tokens_seen": 45145545, + "step": 2090, + "time_per_iteration": 2.5586178302764893 + }, + { + "auxiliary_loss_clip": 0.01054062, + "auxiliary_loss_mlp": 0.01012238, + "balance_loss_clip": 1.03524184, + "balance_loss_mlp": 1.00916195, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.8715224752321551, + "language_loss": 0.59511793, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61578089, + "num_input_tokens_seen": 45206845, + "step": 2091, + "time_per_iteration": 3.019606351852417 + }, + { + "auxiliary_loss_clip": 0.01164671, + "auxiliary_loss_mlp": 0.01048899, + "balance_loss_clip": 1.06068516, + "balance_loss_mlp": 1.02925324, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.8897606201330899, + "language_loss": 0.63344371, + "learning_rate": 3.904541275215825e-06, + "loss": 0.65557933, + "num_input_tokens_seen": 45228495, + "step": 2092, + "time_per_iteration": 2.6786389350891113 + }, + { + "auxiliary_loss_clip": 0.01151595, + "auxiliary_loss_mlp": 0.01060905, + "balance_loss_clip": 1.05675745, + "balance_loss_mlp": 1.03864884, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 2.0228874015867238, + "language_loss": 0.8060801, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82820511, + "num_input_tokens_seen": 45245720, + "step": 2093, + "time_per_iteration": 2.5533905029296875 + }, + { + "auxiliary_loss_clip": 0.0115269, + "auxiliary_loss_mlp": 0.01064024, + "balance_loss_clip": 1.05534697, + "balance_loss_mlp": 1.04309106, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.9903363282870328, + "language_loss": 0.75990611, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78207326, + "num_input_tokens_seen": 45265650, + "step": 2094, + "time_per_iteration": 2.522902727127075 + }, + { + "auxiliary_loss_clip": 0.01120573, + "auxiliary_loss_mlp": 0.01059327, + "balance_loss_clip": 1.05024755, + "balance_loss_mlp": 1.03885877, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.6649522200582987, + "language_loss": 0.76605272, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.78785169, + "num_input_tokens_seen": 45287790, + "step": 2095, + "time_per_iteration": 2.7864866256713867 + }, + { + "auxiliary_loss_clip": 0.01147844, + "auxiliary_loss_mlp": 0.01058682, + "balance_loss_clip": 1.05101371, + "balance_loss_mlp": 1.03746319, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.267475915872121, + "language_loss": 0.83110762, + "learning_rate": 3.904065156953232e-06, + "loss": 0.8531729, + "num_input_tokens_seen": 45305720, + "step": 2096, + "time_per_iteration": 3.8837528228759766 + }, + { + "auxiliary_loss_clip": 0.01164712, + "auxiliary_loss_mlp": 0.01051083, + "balance_loss_clip": 1.0563271, + "balance_loss_mlp": 1.03031647, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 2.1580764164665145, + "language_loss": 0.75558889, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77774686, + "num_input_tokens_seen": 45325290, + "step": 2097, + "time_per_iteration": 2.498579263687134 + }, + { + "auxiliary_loss_clip": 0.0115878, + "auxiliary_loss_mlp": 0.01067196, + "balance_loss_clip": 1.05762887, + "balance_loss_mlp": 1.04766917, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 2.1403015298456984, + "language_loss": 0.8723219, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89458168, + "num_input_tokens_seen": 45344465, + "step": 2098, + "time_per_iteration": 2.536301612854004 + }, + { + "auxiliary_loss_clip": 0.01121241, + "auxiliary_loss_mlp": 0.01063915, + "balance_loss_clip": 1.04867184, + "balance_loss_mlp": 1.03962064, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 2.4126853117087186, + "language_loss": 0.69611359, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71796513, + "num_input_tokens_seen": 45362465, + "step": 2099, + "time_per_iteration": 2.5911481380462646 + }, + { + "auxiliary_loss_clip": 0.01149564, + "auxiliary_loss_mlp": 0.01060092, + "balance_loss_clip": 1.05170953, + "balance_loss_mlp": 1.03731167, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 2.382135647158968, + "language_loss": 0.81609446, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83819103, + "num_input_tokens_seen": 45382700, + "step": 2100, + "time_per_iteration": 2.5575499534606934 + }, + { + "auxiliary_loss_clip": 0.01160149, + "auxiliary_loss_mlp": 0.01055116, + "balance_loss_clip": 1.05969167, + "balance_loss_mlp": 1.03384948, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 1.851663473651323, + "language_loss": 0.80692542, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82907808, + "num_input_tokens_seen": 45401005, + "step": 2101, + "time_per_iteration": 2.5652222633361816 + }, + { + "auxiliary_loss_clip": 0.01066769, + "auxiliary_loss_mlp": 0.01005597, + "balance_loss_clip": 1.02283776, + "balance_loss_mlp": 1.00264072, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7118061165278406, + "language_loss": 0.57092154, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59164518, + "num_input_tokens_seen": 45466555, + "step": 2102, + "time_per_iteration": 3.075453281402588 + }, + { + "auxiliary_loss_clip": 0.01141736, + "auxiliary_loss_mlp": 0.01055972, + "balance_loss_clip": 1.05188584, + "balance_loss_mlp": 1.03444278, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 2.310266442431084, + "language_loss": 0.93076038, + "learning_rate": 3.903229170377845e-06, + "loss": 0.95273745, + "num_input_tokens_seen": 45485165, + "step": 2103, + "time_per_iteration": 2.568119764328003 + }, + { + "auxiliary_loss_clip": 0.01150666, + "auxiliary_loss_mlp": 0.01039508, + "balance_loss_clip": 1.05170727, + "balance_loss_mlp": 1.01998115, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 2.3218191623094317, + "language_loss": 0.78648579, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80838752, + "num_input_tokens_seen": 45504630, + "step": 2104, + "time_per_iteration": 2.5601093769073486 + }, + { + "auxiliary_loss_clip": 0.01139084, + "auxiliary_loss_mlp": 0.01054762, + "balance_loss_clip": 1.05585122, + "balance_loss_mlp": 1.03454399, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.897809729655489, + "language_loss": 0.81159586, + "learning_rate": 3.902989667466828e-06, + "loss": 0.8335343, + "num_input_tokens_seen": 45524885, + "step": 2105, + "time_per_iteration": 2.6200859546661377 + }, + { + "auxiliary_loss_clip": 0.0117185, + "auxiliary_loss_mlp": 0.01057018, + "balance_loss_clip": 1.05898881, + "balance_loss_mlp": 1.03416562, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.2751936284305843, + "language_loss": 0.83229303, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85458171, + "num_input_tokens_seen": 45545000, + "step": 2106, + "time_per_iteration": 2.5593364238739014 + }, + { + "auxiliary_loss_clip": 0.01127225, + "auxiliary_loss_mlp": 0.01047457, + "balance_loss_clip": 1.05398583, + "balance_loss_mlp": 1.02592802, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 2.1350813327381615, + "language_loss": 0.73579818, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75754499, + "num_input_tokens_seen": 45564210, + "step": 2107, + "time_per_iteration": 2.640746831893921 + }, + { + "auxiliary_loss_clip": 0.01170496, + "auxiliary_loss_mlp": 0.01047809, + "balance_loss_clip": 1.05622983, + "balance_loss_mlp": 1.02838969, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.2051006503057344, + "language_loss": 0.79135633, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81353939, + "num_input_tokens_seen": 45583030, + "step": 2108, + "time_per_iteration": 2.480803966522217 + }, + { + "auxiliary_loss_clip": 0.01173893, + "auxiliary_loss_mlp": 0.01046267, + "balance_loss_clip": 1.05572104, + "balance_loss_mlp": 1.0253222, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 1.9702602031584666, + "language_loss": 0.75346065, + "learning_rate": 3.902509795742467e-06, + "loss": 0.77566224, + "num_input_tokens_seen": 45602265, + "step": 2109, + "time_per_iteration": 2.4523725509643555 + }, + { + "auxiliary_loss_clip": 0.01118189, + "auxiliary_loss_mlp": 0.01054215, + "balance_loss_clip": 1.04683781, + "balance_loss_mlp": 1.03237581, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 1.6350383555836447, + "language_loss": 0.82930046, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85102457, + "num_input_tokens_seen": 45620595, + "step": 2110, + "time_per_iteration": 2.5457940101623535 + }, + { + "auxiliary_loss_clip": 0.01149597, + "auxiliary_loss_mlp": 0.00788267, + "balance_loss_clip": 1.05362058, + "balance_loss_mlp": 1.00019503, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 2.064066772890186, + "language_loss": 0.78186965, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80124831, + "num_input_tokens_seen": 45641140, + "step": 2111, + "time_per_iteration": 2.5567195415496826 + }, + { + "auxiliary_loss_clip": 0.01135789, + "auxiliary_loss_mlp": 0.01063654, + "balance_loss_clip": 1.05634546, + "balance_loss_mlp": 1.04022956, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.9401987117212065, + "language_loss": 0.77048302, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79247749, + "num_input_tokens_seen": 45662315, + "step": 2112, + "time_per_iteration": 2.6905455589294434 + }, + { + "auxiliary_loss_clip": 0.01134435, + "auxiliary_loss_mlp": 0.01060199, + "balance_loss_clip": 1.05336332, + "balance_loss_mlp": 1.03948057, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 2.105467731701792, + "language_loss": 0.8575142, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87946051, + "num_input_tokens_seen": 45680335, + "step": 2113, + "time_per_iteration": 2.576813220977783 + }, + { + "auxiliary_loss_clip": 0.01143095, + "auxiliary_loss_mlp": 0.01059371, + "balance_loss_clip": 1.05676138, + "balance_loss_mlp": 1.03726983, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.006499153504209, + "language_loss": 0.74101031, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76303506, + "num_input_tokens_seen": 45696240, + "step": 2114, + "time_per_iteration": 2.5416319370269775 + }, + { + "auxiliary_loss_clip": 0.01161534, + "auxiliary_loss_mlp": 0.01058541, + "balance_loss_clip": 1.05990016, + "balance_loss_mlp": 1.03668988, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.7303015401080875, + "language_loss": 0.83599997, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85820079, + "num_input_tokens_seen": 45713695, + "step": 2115, + "time_per_iteration": 2.4762892723083496 + }, + { + "auxiliary_loss_clip": 0.01157838, + "auxiliary_loss_mlp": 0.01058307, + "balance_loss_clip": 1.05927062, + "balance_loss_mlp": 1.03776753, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.9193942014922398, + "language_loss": 0.86589736, + "learning_rate": 3.901667242881065e-06, + "loss": 0.8880589, + "num_input_tokens_seen": 45736655, + "step": 2116, + "time_per_iteration": 2.59867525100708 + }, + { + "auxiliary_loss_clip": 0.01139273, + "auxiliary_loss_mlp": 0.00786486, + "balance_loss_clip": 1.0500443, + "balance_loss_mlp": 1.00014234, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.629082611107055, + "language_loss": 0.70487368, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72413123, + "num_input_tokens_seen": 45758195, + "step": 2117, + "time_per_iteration": 2.659724235534668 + }, + { + "auxiliary_loss_clip": 0.01133225, + "auxiliary_loss_mlp": 0.0105991, + "balance_loss_clip": 1.05065751, + "balance_loss_mlp": 1.03612804, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.7735069150600453, + "language_loss": 0.86668539, + "learning_rate": 3.901425864420852e-06, + "loss": 0.88861674, + "num_input_tokens_seen": 45774280, + "step": 2118, + "time_per_iteration": 3.9841206073760986 + }, + { + "auxiliary_loss_clip": 0.01159455, + "auxiliary_loss_mlp": 0.01051284, + "balance_loss_clip": 1.0541811, + "balance_loss_mlp": 1.0315311, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 2.2813642454351872, + "language_loss": 0.87070131, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89280874, + "num_input_tokens_seen": 45792760, + "step": 2119, + "time_per_iteration": 2.498795986175537 + }, + { + "auxiliary_loss_clip": 0.01151474, + "auxiliary_loss_mlp": 0.00786472, + "balance_loss_clip": 1.05421782, + "balance_loss_mlp": 1.00018215, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.5916180417063073, + "language_loss": 0.8828845, + "learning_rate": 3.901184197551605e-06, + "loss": 0.902264, + "num_input_tokens_seen": 45804300, + "step": 2120, + "time_per_iteration": 3.907395839691162 + }, + { + "auxiliary_loss_clip": 0.01173327, + "auxiliary_loss_mlp": 0.01045279, + "balance_loss_clip": 1.05805635, + "balance_loss_mlp": 1.02500165, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 2.8463730097575404, + "language_loss": 0.75635642, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77854246, + "num_input_tokens_seen": 45823780, + "step": 2121, + "time_per_iteration": 2.4686954021453857 + }, + { + "auxiliary_loss_clip": 0.01116754, + "auxiliary_loss_mlp": 0.01050767, + "balance_loss_clip": 1.04602039, + "balance_loss_mlp": 1.02944088, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.1774186717782387, + "language_loss": 0.8314513, + "learning_rate": 3.900942242309978e-06, + "loss": 0.85312653, + "num_input_tokens_seen": 45840495, + "step": 2122, + "time_per_iteration": 2.5774106979370117 + }, + { + "auxiliary_loss_clip": 0.01153693, + "auxiliary_loss_mlp": 0.01047116, + "balance_loss_clip": 1.05995142, + "balance_loss_mlp": 1.02696919, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 1.8599803989651877, + "language_loss": 0.79137826, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81338632, + "num_input_tokens_seen": 45857735, + "step": 2123, + "time_per_iteration": 2.501103639602661 + }, + { + "auxiliary_loss_clip": 0.0117387, + "auxiliary_loss_mlp": 0.01049224, + "balance_loss_clip": 1.05797446, + "balance_loss_mlp": 1.02881515, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.6449999211162736, + "language_loss": 0.79098451, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81321543, + "num_input_tokens_seen": 45876485, + "step": 2124, + "time_per_iteration": 2.462134599685669 + }, + { + "auxiliary_loss_clip": 0.01166424, + "auxiliary_loss_mlp": 0.00786351, + "balance_loss_clip": 1.05637228, + "balance_loss_mlp": 1.0002712, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 2.316799122028392, + "language_loss": 0.75754815, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77707589, + "num_input_tokens_seen": 45894645, + "step": 2125, + "time_per_iteration": 3.9837186336517334 + }, + { + "auxiliary_loss_clip": 0.01163149, + "auxiliary_loss_mlp": 0.00785466, + "balance_loss_clip": 1.05600739, + "balance_loss_mlp": 1.00023484, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.6604681835483883, + "language_loss": 0.78416944, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80365556, + "num_input_tokens_seen": 45913755, + "step": 2126, + "time_per_iteration": 2.5247931480407715 + }, + { + "auxiliary_loss_clip": 0.0112312, + "auxiliary_loss_mlp": 0.01051509, + "balance_loss_clip": 1.05019593, + "balance_loss_mlp": 1.03237545, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 3.16269872020447, + "language_loss": 0.69315511, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71490133, + "num_input_tokens_seen": 45936095, + "step": 2127, + "time_per_iteration": 2.718956708908081 + }, + { + "auxiliary_loss_clip": 0.01030106, + "auxiliary_loss_mlp": 0.00761537, + "balance_loss_clip": 1.02446103, + "balance_loss_mlp": 0.99978858, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8570470801763956, + "language_loss": 0.62764287, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64555931, + "num_input_tokens_seen": 46004655, + "step": 2128, + "time_per_iteration": 3.1720874309539795 + }, + { + "auxiliary_loss_clip": 0.01151066, + "auxiliary_loss_mlp": 0.01049292, + "balance_loss_clip": 1.05275917, + "balance_loss_mlp": 1.02627301, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.725839997752272, + "language_loss": 0.77619076, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79819429, + "num_input_tokens_seen": 46023610, + "step": 2129, + "time_per_iteration": 2.486788272857666 + }, + { + "auxiliary_loss_clip": 0.01129603, + "auxiliary_loss_mlp": 0.01053063, + "balance_loss_clip": 1.05359399, + "balance_loss_mlp": 1.03024602, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.0167942909957315, + "language_loss": 0.79414928, + "learning_rate": 3.899971538354343e-06, + "loss": 0.8159759, + "num_input_tokens_seen": 46041725, + "step": 2130, + "time_per_iteration": 2.6126604080200195 + }, + { + "auxiliary_loss_clip": 0.01143824, + "auxiliary_loss_mlp": 0.01051353, + "balance_loss_clip": 1.05142128, + "balance_loss_mlp": 1.03005004, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 1.9501830292469708, + "language_loss": 0.7087872, + "learning_rate": 3.899849876099518e-06, + "loss": 0.730739, + "num_input_tokens_seen": 46061095, + "step": 2131, + "time_per_iteration": 2.5889663696289062 + }, + { + "auxiliary_loss_clip": 0.01110381, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.05346417, + "balance_loss_mlp": 1.03020692, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.0018653189584974, + "language_loss": 0.72066754, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74228013, + "num_input_tokens_seen": 46082670, + "step": 2132, + "time_per_iteration": 2.744579553604126 + }, + { + "auxiliary_loss_clip": 0.01109497, + "auxiliary_loss_mlp": 0.01061268, + "balance_loss_clip": 1.04564714, + "balance_loss_mlp": 1.03843939, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.3194998336499597, + "language_loss": 0.8184551, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84016269, + "num_input_tokens_seen": 46102410, + "step": 2133, + "time_per_iteration": 2.597501754760742 + }, + { + "auxiliary_loss_clip": 0.01168607, + "auxiliary_loss_mlp": 0.0105897, + "balance_loss_clip": 1.05661702, + "balance_loss_mlp": 1.03667736, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 4.225062002127416, + "language_loss": 0.79806912, + "learning_rate": 3.899484457098528e-06, + "loss": 0.82034492, + "num_input_tokens_seen": 46121145, + "step": 2134, + "time_per_iteration": 2.5012195110321045 + }, + { + "auxiliary_loss_clip": 0.01159199, + "auxiliary_loss_mlp": 0.0104426, + "balance_loss_clip": 1.05750048, + "balance_loss_mlp": 1.02370846, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 1.8283862004664024, + "language_loss": 0.82657456, + "learning_rate": 3.899362506701421e-06, + "loss": 0.84860909, + "num_input_tokens_seen": 46140740, + "step": 2135, + "time_per_iteration": 2.513110399246216 + }, + { + "auxiliary_loss_clip": 0.01144955, + "auxiliary_loss_mlp": 0.01057411, + "balance_loss_clip": 1.05409229, + "balance_loss_mlp": 1.03572643, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.4109043868473474, + "language_loss": 0.77239335, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79441702, + "num_input_tokens_seen": 46156805, + "step": 2136, + "time_per_iteration": 4.037874221801758 + }, + { + "auxiliary_loss_clip": 0.01020833, + "auxiliary_loss_mlp": 0.01006478, + "balance_loss_clip": 1.01780772, + "balance_loss_mlp": 1.00346208, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.9076934166344164, + "language_loss": 0.59215569, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61242878, + "num_input_tokens_seen": 46222085, + "step": 2137, + "time_per_iteration": 3.2823076248168945 + }, + { + "auxiliary_loss_clip": 0.01156249, + "auxiliary_loss_mlp": 0.01055014, + "balance_loss_clip": 1.05418968, + "balance_loss_mlp": 1.03493905, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 4.315180122085471, + "language_loss": 0.82127023, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84338284, + "num_input_tokens_seen": 46239970, + "step": 2138, + "time_per_iteration": 2.4982001781463623 + }, + { + "auxiliary_loss_clip": 0.01161996, + "auxiliary_loss_mlp": 0.01056176, + "balance_loss_clip": 1.05601716, + "balance_loss_mlp": 1.03276372, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.948286036928586, + "language_loss": 0.79283273, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81501448, + "num_input_tokens_seen": 46257740, + "step": 2139, + "time_per_iteration": 2.509066343307495 + }, + { + "auxiliary_loss_clip": 0.01141596, + "auxiliary_loss_mlp": 0.0105093, + "balance_loss_clip": 1.05470538, + "balance_loss_mlp": 1.03027105, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 2.083399490515879, + "language_loss": 0.85087907, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.87280434, + "num_input_tokens_seen": 46275445, + "step": 2140, + "time_per_iteration": 2.5380935668945312 + }, + { + "auxiliary_loss_clip": 0.0114844, + "auxiliary_loss_mlp": 0.01047919, + "balance_loss_clip": 1.05486929, + "balance_loss_mlp": 1.0284642, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 1.8507500911474526, + "language_loss": 0.86233747, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88430107, + "num_input_tokens_seen": 46291710, + "step": 2141, + "time_per_iteration": 2.493436098098755 + }, + { + "auxiliary_loss_clip": 0.01148856, + "auxiliary_loss_mlp": 0.01050319, + "balance_loss_clip": 1.05369699, + "balance_loss_mlp": 1.02992249, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 6.147553121935503, + "language_loss": 0.67953151, + "learning_rate": 3.898506837508518e-06, + "loss": 0.7015233, + "num_input_tokens_seen": 46311335, + "step": 2142, + "time_per_iteration": 2.583634614944458 + }, + { + "auxiliary_loss_clip": 0.01166532, + "auxiliary_loss_mlp": 0.00788299, + "balance_loss_clip": 1.05804396, + "balance_loss_mlp": 1.0002172, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.0820356617173905, + "language_loss": 0.83046883, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85001719, + "num_input_tokens_seen": 46330985, + "step": 2143, + "time_per_iteration": 2.538635730743408 + }, + { + "auxiliary_loss_clip": 0.01176181, + "auxiliary_loss_mlp": 0.00786869, + "balance_loss_clip": 1.05966473, + "balance_loss_mlp": 1.00026882, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.6245406433935037, + "language_loss": 0.81901491, + "learning_rate": 3.898261712602539e-06, + "loss": 0.8386454, + "num_input_tokens_seen": 46351295, + "step": 2144, + "time_per_iteration": 2.502110481262207 + }, + { + "auxiliary_loss_clip": 0.01135056, + "auxiliary_loss_mlp": 0.01056182, + "balance_loss_clip": 1.04867637, + "balance_loss_mlp": 1.03386617, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 1.9696085987971752, + "language_loss": 0.78296411, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80487645, + "num_input_tokens_seen": 46368600, + "step": 2145, + "time_per_iteration": 2.5289478302001953 + }, + { + "auxiliary_loss_clip": 0.01170067, + "auxiliary_loss_mlp": 0.01053506, + "balance_loss_clip": 1.05277455, + "balance_loss_mlp": 1.03242981, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.272486837190392, + "language_loss": 0.82331336, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84554905, + "num_input_tokens_seen": 46387370, + "step": 2146, + "time_per_iteration": 2.424661159515381 + }, + { + "auxiliary_loss_clip": 0.01141624, + "auxiliary_loss_mlp": 0.01055896, + "balance_loss_clip": 1.05218172, + "balance_loss_mlp": 1.03439045, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.948522591791862, + "language_loss": 0.71229857, + "learning_rate": 3.897893485388149e-06, + "loss": 0.73427379, + "num_input_tokens_seen": 46409570, + "step": 2147, + "time_per_iteration": 2.6334431171417236 + }, + { + "auxiliary_loss_clip": 0.01152021, + "auxiliary_loss_mlp": 0.01055146, + "balance_loss_clip": 1.05755115, + "balance_loss_mlp": 1.03538156, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 1.9780371520869082, + "language_loss": 0.71091521, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73298681, + "num_input_tokens_seen": 46429320, + "step": 2148, + "time_per_iteration": 2.5342323780059814 + }, + { + "auxiliary_loss_clip": 0.01170394, + "auxiliary_loss_mlp": 0.010505, + "balance_loss_clip": 1.05817354, + "balance_loss_mlp": 1.03129542, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.9555400723040424, + "language_loss": 0.79243475, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81464368, + "num_input_tokens_seen": 46450155, + "step": 2149, + "time_per_iteration": 2.471179723739624 + }, + { + "auxiliary_loss_clip": 0.01162739, + "auxiliary_loss_mlp": 0.01051883, + "balance_loss_clip": 1.05647969, + "balance_loss_mlp": 1.03155792, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 2.096720767600995, + "language_loss": 0.75882399, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78097022, + "num_input_tokens_seen": 46470280, + "step": 2150, + "time_per_iteration": 2.564650297164917 + }, + { + "auxiliary_loss_clip": 0.01156101, + "auxiliary_loss_mlp": 0.01053946, + "balance_loss_clip": 1.05448318, + "balance_loss_mlp": 1.03351307, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 2.3273409950486617, + "language_loss": 0.69743961, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.71954012, + "num_input_tokens_seen": 46487605, + "step": 2151, + "time_per_iteration": 2.4948625564575195 + }, + { + "auxiliary_loss_clip": 0.01169865, + "auxiliary_loss_mlp": 0.01048731, + "balance_loss_clip": 1.05831742, + "balance_loss_mlp": 1.02922857, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 1.8914460058401705, + "language_loss": 0.8385272, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86071318, + "num_input_tokens_seen": 46505100, + "step": 2152, + "time_per_iteration": 2.4646668434143066 + }, + { + "auxiliary_loss_clip": 0.01160971, + "auxiliary_loss_mlp": 0.01059319, + "balance_loss_clip": 1.05276275, + "balance_loss_mlp": 1.03983974, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 1.6680562692915881, + "language_loss": 0.78491956, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80712247, + "num_input_tokens_seen": 46524020, + "step": 2153, + "time_per_iteration": 2.4711928367614746 + }, + { + "auxiliary_loss_clip": 0.01116303, + "auxiliary_loss_mlp": 0.00786402, + "balance_loss_clip": 1.04839325, + "balance_loss_mlp": 1.00024319, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 1.7255122829568963, + "language_loss": 0.80168808, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82071507, + "num_input_tokens_seen": 46544640, + "step": 2154, + "time_per_iteration": 2.606466054916382 + }, + { + "auxiliary_loss_clip": 0.01152146, + "auxiliary_loss_mlp": 0.01052847, + "balance_loss_clip": 1.0531702, + "balance_loss_mlp": 1.03287923, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 2.0860340001121296, + "language_loss": 0.8304193, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85246921, + "num_input_tokens_seen": 46561395, + "step": 2155, + "time_per_iteration": 2.473361015319824 + }, + { + "auxiliary_loss_clip": 0.01164725, + "auxiliary_loss_mlp": 0.01059788, + "balance_loss_clip": 1.05531561, + "balance_loss_mlp": 1.03940296, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 3.1098039575996577, + "language_loss": 0.75975662, + "learning_rate": 3.896784917960055e-06, + "loss": 0.78200179, + "num_input_tokens_seen": 46579395, + "step": 2156, + "time_per_iteration": 2.4748191833496094 + }, + { + "auxiliary_loss_clip": 0.01108756, + "auxiliary_loss_mlp": 0.010511, + "balance_loss_clip": 1.05172873, + "balance_loss_mlp": 1.03109694, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.8575683247093286, + "language_loss": 0.86650431, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88810283, + "num_input_tokens_seen": 46597090, + "step": 2157, + "time_per_iteration": 4.055891036987305 + }, + { + "auxiliary_loss_clip": 0.01172021, + "auxiliary_loss_mlp": 0.01058072, + "balance_loss_clip": 1.05396628, + "balance_loss_mlp": 1.03706765, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.707421538950689, + "language_loss": 0.80928093, + "learning_rate": 3.896537778333651e-06, + "loss": 0.83158183, + "num_input_tokens_seen": 46617355, + "step": 2158, + "time_per_iteration": 2.528317451477051 + }, + { + "auxiliary_loss_clip": 0.01174658, + "auxiliary_loss_mlp": 0.01056091, + "balance_loss_clip": 1.05698907, + "balance_loss_mlp": 1.03592134, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.5146232483762296, + "language_loss": 0.75067258, + "learning_rate": 3.896414100642752e-06, + "loss": 0.77298003, + "num_input_tokens_seen": 46633130, + "step": 2159, + "time_per_iteration": 3.7933053970336914 + }, + { + "auxiliary_loss_clip": 0.01124536, + "auxiliary_loss_mlp": 0.01058389, + "balance_loss_clip": 1.04624653, + "balance_loss_mlp": 1.03615689, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 2.0962745887853504, + "language_loss": 0.82820714, + "learning_rate": 3.89629035103964e-06, + "loss": 0.85003632, + "num_input_tokens_seen": 46650575, + "step": 2160, + "time_per_iteration": 2.6002190113067627 + }, + { + "auxiliary_loss_clip": 0.01150104, + "auxiliary_loss_mlp": 0.01045543, + "balance_loss_clip": 1.05712748, + "balance_loss_mlp": 1.02582598, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.8986999423681452, + "language_loss": 0.8201381, + "learning_rate": 3.896166529529008e-06, + "loss": 0.84209454, + "num_input_tokens_seen": 46668780, + "step": 2161, + "time_per_iteration": 2.482530117034912 + }, + { + "auxiliary_loss_clip": 0.0114733, + "auxiliary_loss_mlp": 0.01055497, + "balance_loss_clip": 1.05241227, + "balance_loss_mlp": 1.03426647, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.570316691277414, + "language_loss": 0.82512373, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84715199, + "num_input_tokens_seen": 46687550, + "step": 2162, + "time_per_iteration": 2.579373836517334 + }, + { + "auxiliary_loss_clip": 0.0113983, + "auxiliary_loss_mlp": 0.01055663, + "balance_loss_clip": 1.05576921, + "balance_loss_mlp": 1.03536201, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.377332347315073, + "language_loss": 0.72698355, + "learning_rate": 3.895918670803968e-06, + "loss": 0.74893856, + "num_input_tokens_seen": 46706730, + "step": 2163, + "time_per_iteration": 2.54518723487854 + }, + { + "auxiliary_loss_clip": 0.01173205, + "auxiliary_loss_mlp": 0.00786977, + "balance_loss_clip": 1.05498874, + "balance_loss_mlp": 1.0002197, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 4.111874753095556, + "language_loss": 0.81715763, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83675945, + "num_input_tokens_seen": 46724250, + "step": 2164, + "time_per_iteration": 2.465622901916504 + }, + { + "auxiliary_loss_clip": 0.01126883, + "auxiliary_loss_mlp": 0.01050028, + "balance_loss_clip": 1.05240214, + "balance_loss_mlp": 1.03109789, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.878959300966905, + "language_loss": 0.72350132, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.74527043, + "num_input_tokens_seen": 46744105, + "step": 2165, + "time_per_iteration": 4.008230924606323 + }, + { + "auxiliary_loss_clip": 0.01114331, + "auxiliary_loss_mlp": 0.01049274, + "balance_loss_clip": 1.05307508, + "balance_loss_mlp": 1.02779198, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 1.9714991783484044, + "language_loss": 0.74862719, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.7702632, + "num_input_tokens_seen": 46764250, + "step": 2166, + "time_per_iteration": 2.633329153060913 + }, + { + "auxiliary_loss_clip": 0.01173614, + "auxiliary_loss_mlp": 0.01048968, + "balance_loss_clip": 1.05638218, + "balance_loss_mlp": 1.0297513, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.6282374093467433, + "language_loss": 0.83232814, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85455394, + "num_input_tokens_seen": 46786865, + "step": 2167, + "time_per_iteration": 2.5302646160125732 + }, + { + "auxiliary_loss_clip": 0.01112257, + "auxiliary_loss_mlp": 0.01059597, + "balance_loss_clip": 1.04581797, + "balance_loss_mlp": 1.03872299, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 2.496269409164221, + "language_loss": 0.83311164, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85483015, + "num_input_tokens_seen": 46807030, + "step": 2168, + "time_per_iteration": 2.6005730628967285 + }, + { + "auxiliary_loss_clip": 0.01084501, + "auxiliary_loss_mlp": 0.0106447, + "balance_loss_clip": 1.0432198, + "balance_loss_mlp": 1.04047322, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 2.2129965468817496, + "language_loss": 0.80036116, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.8218509, + "num_input_tokens_seen": 46826280, + "step": 2169, + "time_per_iteration": 2.6370255947113037 + }, + { + "auxiliary_loss_clip": 0.01173081, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.05746794, + "balance_loss_mlp": 1.02235448, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 5.1373609929228685, + "language_loss": 0.6676954, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.6898526, + "num_input_tokens_seen": 46846505, + "step": 2170, + "time_per_iteration": 2.5292670726776123 + }, + { + "auxiliary_loss_clip": 0.01145631, + "auxiliary_loss_mlp": 0.0104886, + "balance_loss_clip": 1.05648088, + "balance_loss_mlp": 1.02796292, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.7603186152386903, + "language_loss": 0.67171252, + "learning_rate": 3.8949243605434e-06, + "loss": 0.6936574, + "num_input_tokens_seen": 46867380, + "step": 2171, + "time_per_iteration": 2.6889216899871826 + }, + { + "auxiliary_loss_clip": 0.01162154, + "auxiliary_loss_mlp": 0.01043598, + "balance_loss_clip": 1.05659056, + "balance_loss_mlp": 1.02239072, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 1.8808026432651368, + "language_loss": 0.72053403, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74259156, + "num_input_tokens_seen": 46886810, + "step": 2172, + "time_per_iteration": 2.5138072967529297 + }, + { + "auxiliary_loss_clip": 0.01133789, + "auxiliary_loss_mlp": 0.01048449, + "balance_loss_clip": 1.05736959, + "balance_loss_mlp": 1.02798104, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.9504632276505358, + "language_loss": 0.75537741, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77719986, + "num_input_tokens_seen": 46905620, + "step": 2173, + "time_per_iteration": 2.540163040161133 + }, + { + "auxiliary_loss_clip": 0.01134332, + "auxiliary_loss_mlp": 0.01054841, + "balance_loss_clip": 1.05520284, + "balance_loss_mlp": 1.03291821, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 7.1738739562743135, + "language_loss": 0.70546138, + "learning_rate": 3.894550308446551e-06, + "loss": 0.7273531, + "num_input_tokens_seen": 46925120, + "step": 2174, + "time_per_iteration": 4.077038526535034 + }, + { + "auxiliary_loss_clip": 0.01048982, + "auxiliary_loss_mlp": 0.01032984, + "balance_loss_clip": 1.02552557, + "balance_loss_mlp": 1.03017068, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.809058849544303, + "language_loss": 0.59043157, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61125124, + "num_input_tokens_seen": 46988195, + "step": 2175, + "time_per_iteration": 3.214470624923706 + }, + { + "auxiliary_loss_clip": 0.01161776, + "auxiliary_loss_mlp": 0.01052391, + "balance_loss_clip": 1.05538321, + "balance_loss_mlp": 1.03261399, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.499842798124225, + "language_loss": 0.80184478, + "learning_rate": 3.894300581166417e-06, + "loss": 0.82398647, + "num_input_tokens_seen": 47004720, + "step": 2176, + "time_per_iteration": 2.4782116413116455 + }, + { + "auxiliary_loss_clip": 0.01169844, + "auxiliary_loss_mlp": 0.01056206, + "balance_loss_clip": 1.05567336, + "balance_loss_mlp": 1.03436685, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.833753038904338, + "language_loss": 0.74300629, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76526678, + "num_input_tokens_seen": 47024255, + "step": 2177, + "time_per_iteration": 2.5684757232666016 + }, + { + "auxiliary_loss_clip": 0.01133836, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.05357075, + "balance_loss_mlp": 1.0261929, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 1.8313773093819214, + "language_loss": 0.82198739, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84380424, + "num_input_tokens_seen": 47042465, + "step": 2178, + "time_per_iteration": 2.5338094234466553 + }, + { + "auxiliary_loss_clip": 0.01170408, + "auxiliary_loss_mlp": 0.0104853, + "balance_loss_clip": 1.05688894, + "balance_loss_mlp": 1.0279547, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.6521617406982343, + "language_loss": 0.75144589, + "learning_rate": 3.893925451517562e-06, + "loss": 0.77363527, + "num_input_tokens_seen": 47060370, + "step": 2179, + "time_per_iteration": 2.431851387023926 + }, + { + "auxiliary_loss_clip": 0.01129588, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_clip": 1.05016041, + "balance_loss_mlp": 1.02806401, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.065694843632352, + "language_loss": 0.843458, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86523724, + "num_input_tokens_seen": 47081415, + "step": 2180, + "time_per_iteration": 2.568575143814087 + }, + { + "auxiliary_loss_clip": 0.01164159, + "auxiliary_loss_mlp": 0.01057363, + "balance_loss_clip": 1.06023526, + "balance_loss_mlp": 1.03664446, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 2.0083700547383936, + "language_loss": 0.89886141, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92107671, + "num_input_tokens_seen": 47099860, + "step": 2181, + "time_per_iteration": 2.4912078380584717 + }, + { + "auxiliary_loss_clip": 0.01154717, + "auxiliary_loss_mlp": 0.01053763, + "balance_loss_clip": 1.05228388, + "balance_loss_mlp": 1.03359234, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 1.9163071956007542, + "language_loss": 0.68728292, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70936769, + "num_input_tokens_seen": 47118540, + "step": 2182, + "time_per_iteration": 2.493262767791748 + }, + { + "auxiliary_loss_clip": 0.01122241, + "auxiliary_loss_mlp": 0.01055062, + "balance_loss_clip": 1.04797399, + "balance_loss_mlp": 1.0336287, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 1.9052062178882778, + "language_loss": 0.7828002, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80457324, + "num_input_tokens_seen": 47136710, + "step": 2183, + "time_per_iteration": 2.5510411262512207 + }, + { + "auxiliary_loss_clip": 0.01166318, + "auxiliary_loss_mlp": 0.0104531, + "balance_loss_clip": 1.05298638, + "balance_loss_mlp": 1.02468657, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 2.752829173533398, + "language_loss": 0.85836554, + "learning_rate": 3.893298799142636e-06, + "loss": 0.88048184, + "num_input_tokens_seen": 47157155, + "step": 2184, + "time_per_iteration": 2.4765682220458984 + }, + { + "auxiliary_loss_clip": 0.01135749, + "auxiliary_loss_mlp": 0.01048307, + "balance_loss_clip": 1.05214894, + "balance_loss_mlp": 1.02729094, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 1.9163251424302596, + "language_loss": 0.82370472, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84554529, + "num_input_tokens_seen": 47176820, + "step": 2185, + "time_per_iteration": 2.5566463470458984 + }, + { + "auxiliary_loss_clip": 0.01142453, + "auxiliary_loss_mlp": 0.01047466, + "balance_loss_clip": 1.05101919, + "balance_loss_mlp": 1.02659237, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 2.396609341627887, + "language_loss": 0.73253274, + "learning_rate": 3.893047635600818e-06, + "loss": 0.75443196, + "num_input_tokens_seen": 47195855, + "step": 2186, + "time_per_iteration": 2.497530221939087 + }, + { + "auxiliary_loss_clip": 0.01156589, + "auxiliary_loss_mlp": 0.01047445, + "balance_loss_clip": 1.05432487, + "balance_loss_mlp": 1.025594, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.2174933689889893, + "language_loss": 0.80462039, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82666075, + "num_input_tokens_seen": 47214535, + "step": 2187, + "time_per_iteration": 2.489164113998413 + }, + { + "auxiliary_loss_clip": 0.01034194, + "auxiliary_loss_mlp": 0.01003378, + "balance_loss_clip": 1.02787876, + "balance_loss_mlp": 1.00067234, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8371840802677641, + "language_loss": 0.59022796, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61060369, + "num_input_tokens_seen": 47270300, + "step": 2188, + "time_per_iteration": 3.1616032123565674 + }, + { + "auxiliary_loss_clip": 0.01099168, + "auxiliary_loss_mlp": 0.01061005, + "balance_loss_clip": 1.04818797, + "balance_loss_mlp": 1.03922522, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 1.7878088819898568, + "language_loss": 0.74298084, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76458251, + "num_input_tokens_seen": 47290720, + "step": 2189, + "time_per_iteration": 2.654794216156006 + }, + { + "auxiliary_loss_clip": 0.01160722, + "auxiliary_loss_mlp": 0.01048539, + "balance_loss_clip": 1.05602002, + "balance_loss_mlp": 1.02801132, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 2.4461017937172707, + "language_loss": 0.72926283, + "learning_rate": 3.892544447140657e-06, + "loss": 0.75135547, + "num_input_tokens_seen": 47311820, + "step": 2190, + "time_per_iteration": 2.493030309677124 + }, + { + "auxiliary_loss_clip": 0.01159251, + "auxiliary_loss_mlp": 0.01054197, + "balance_loss_clip": 1.05655444, + "balance_loss_mlp": 1.03434825, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 7.034774842535896, + "language_loss": 0.7464143, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76854873, + "num_input_tokens_seen": 47331605, + "step": 2191, + "time_per_iteration": 2.509902000427246 + }, + { + "auxiliary_loss_clip": 0.01128348, + "auxiliary_loss_mlp": 0.01051706, + "balance_loss_clip": 1.05074573, + "balance_loss_mlp": 1.03053474, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 2.0603861070534557, + "language_loss": 0.79272199, + "learning_rate": 3.892292422298637e-06, + "loss": 0.81452256, + "num_input_tokens_seen": 47350455, + "step": 2192, + "time_per_iteration": 2.5402841567993164 + }, + { + "auxiliary_loss_clip": 0.01116095, + "auxiliary_loss_mlp": 0.01051068, + "balance_loss_clip": 1.04557431, + "balance_loss_mlp": 1.02986085, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.881227467203402, + "language_loss": 0.85269082, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87436241, + "num_input_tokens_seen": 47368225, + "step": 2193, + "time_per_iteration": 2.565570831298828 + }, + { + "auxiliary_loss_clip": 0.01044989, + "auxiliary_loss_mlp": 0.01005365, + "balance_loss_clip": 1.02735233, + "balance_loss_mlp": 1.00155032, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7591764753149214, + "language_loss": 0.54088545, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56138897, + "num_input_tokens_seen": 47427125, + "step": 2194, + "time_per_iteration": 3.057685375213623 + }, + { + "auxiliary_loss_clip": 0.01168586, + "auxiliary_loss_mlp": 0.01047485, + "balance_loss_clip": 1.05478764, + "balance_loss_mlp": 1.02711225, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.7533395358196873, + "language_loss": 0.72427648, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74643713, + "num_input_tokens_seen": 47450275, + "step": 2195, + "time_per_iteration": 2.4921178817749023 + }, + { + "auxiliary_loss_clip": 0.0113538, + "auxiliary_loss_mlp": 0.00787381, + "balance_loss_clip": 1.05088615, + "balance_loss_mlp": 1.00022221, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.701159954664015, + "language_loss": 0.78153384, + "learning_rate": 3.891787511581859e-06, + "loss": 0.80076146, + "num_input_tokens_seen": 47469155, + "step": 2196, + "time_per_iteration": 4.012857913970947 + }, + { + "auxiliary_loss_clip": 0.01161574, + "auxiliary_loss_mlp": 0.01054647, + "balance_loss_clip": 1.05244303, + "balance_loss_mlp": 1.03445256, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 6.876251065242408, + "language_loss": 0.75169748, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77385968, + "num_input_tokens_seen": 47488405, + "step": 2197, + "time_per_iteration": 2.493170976638794 + }, + { + "auxiliary_loss_clip": 0.0117194, + "auxiliary_loss_mlp": 0.01054161, + "balance_loss_clip": 1.05222404, + "balance_loss_mlp": 1.03365707, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 2.0807756605072227, + "language_loss": 0.79888749, + "learning_rate": 3.891534625783685e-06, + "loss": 0.82114851, + "num_input_tokens_seen": 47505650, + "step": 2198, + "time_per_iteration": 2.420905828475952 + }, + { + "auxiliary_loss_clip": 0.011715, + "auxiliary_loss_mlp": 0.01066477, + "balance_loss_clip": 1.05672145, + "balance_loss_mlp": 1.04683161, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.4744644011148464, + "language_loss": 0.82704079, + "learning_rate": 3.891408075291425e-06, + "loss": 0.84942055, + "num_input_tokens_seen": 47521540, + "step": 2199, + "time_per_iteration": 3.8034231662750244 + }, + { + "auxiliary_loss_clip": 0.01120435, + "auxiliary_loss_mlp": 0.01059857, + "balance_loss_clip": 1.04812312, + "balance_loss_mlp": 1.03937674, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 2.409824103670193, + "language_loss": 0.69135624, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.7131592, + "num_input_tokens_seen": 47543625, + "step": 2200, + "time_per_iteration": 2.6780457496643066 + }, + { + "auxiliary_loss_clip": 0.01169948, + "auxiliary_loss_mlp": 0.01062703, + "balance_loss_clip": 1.05446887, + "balance_loss_mlp": 1.04155564, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 1.7115095986568922, + "language_loss": 0.84694982, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86927629, + "num_input_tokens_seen": 47563740, + "step": 2201, + "time_per_iteration": 2.4456582069396973 + }, + { + "auxiliary_loss_clip": 0.0117365, + "auxiliary_loss_mlp": 0.01062113, + "balance_loss_clip": 1.0556941, + "balance_loss_mlp": 1.04088187, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 2.414012816075658, + "language_loss": 0.86827147, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89062911, + "num_input_tokens_seen": 47582655, + "step": 2202, + "time_per_iteration": 2.470341920852661 + }, + { + "auxiliary_loss_clip": 0.01139533, + "auxiliary_loss_mlp": 0.01050103, + "balance_loss_clip": 1.05453002, + "balance_loss_mlp": 1.03071952, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.564575091283547, + "language_loss": 0.72604513, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74794149, + "num_input_tokens_seen": 47600875, + "step": 2203, + "time_per_iteration": 2.54341721534729 + }, + { + "auxiliary_loss_clip": 0.01123218, + "auxiliary_loss_mlp": 0.01059327, + "balance_loss_clip": 1.04728186, + "balance_loss_mlp": 1.03939509, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.702316883592169, + "language_loss": 0.73216248, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75398791, + "num_input_tokens_seen": 47619250, + "step": 2204, + "time_per_iteration": 3.9496755599975586 + }, + { + "auxiliary_loss_clip": 0.01162355, + "auxiliary_loss_mlp": 0.01054553, + "balance_loss_clip": 1.05516088, + "balance_loss_mlp": 1.03330994, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.8542493687053805, + "language_loss": 0.78658545, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80875444, + "num_input_tokens_seen": 47639445, + "step": 2205, + "time_per_iteration": 2.553670644760132 + }, + { + "auxiliary_loss_clip": 0.01125449, + "auxiliary_loss_mlp": 0.01048961, + "balance_loss_clip": 1.04802287, + "balance_loss_mlp": 1.02844453, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 1.965718736440945, + "language_loss": 0.79050386, + "learning_rate": 3.890520213887941e-06, + "loss": 0.81224799, + "num_input_tokens_seen": 47658740, + "step": 2206, + "time_per_iteration": 2.547389268875122 + }, + { + "auxiliary_loss_clip": 0.01126179, + "auxiliary_loss_mlp": 0.01049214, + "balance_loss_clip": 1.04798937, + "balance_loss_mlp": 1.02997327, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 2.069445352597645, + "language_loss": 0.74852806, + "learning_rate": 3.890393089751208e-06, + "loss": 0.77028197, + "num_input_tokens_seen": 47676880, + "step": 2207, + "time_per_iteration": 2.5374834537506104 + }, + { + "auxiliary_loss_clip": 0.01143049, + "auxiliary_loss_mlp": 0.0104612, + "balance_loss_clip": 1.04978776, + "balance_loss_mlp": 1.02559257, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 2.020522947489435, + "language_loss": 0.83802152, + "learning_rate": 3.890265893930578e-06, + "loss": 0.85991323, + "num_input_tokens_seen": 47696635, + "step": 2208, + "time_per_iteration": 2.570549488067627 + }, + { + "auxiliary_loss_clip": 0.01149631, + "auxiliary_loss_mlp": 0.01053779, + "balance_loss_clip": 1.05457735, + "balance_loss_mlp": 1.03530145, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.9505399022483547, + "language_loss": 0.85319579, + "learning_rate": 3.890138626430876e-06, + "loss": 0.8752299, + "num_input_tokens_seen": 47717760, + "step": 2209, + "time_per_iteration": 2.5448851585388184 + }, + { + "auxiliary_loss_clip": 0.01137547, + "auxiliary_loss_mlp": 0.00787164, + "balance_loss_clip": 1.050179, + "balance_loss_mlp": 1.00014913, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.5430050599245497, + "language_loss": 0.82204175, + "learning_rate": 3.890011287256929e-06, + "loss": 0.84128886, + "num_input_tokens_seen": 47737685, + "step": 2210, + "time_per_iteration": 2.5704190731048584 + }, + { + "auxiliary_loss_clip": 0.01026763, + "auxiliary_loss_mlp": 0.00759796, + "balance_loss_clip": 1.01877069, + "balance_loss_mlp": 0.99962181, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7626687517464933, + "language_loss": 0.5799455, + "learning_rate": 3.889883876413563e-06, + "loss": 0.59781104, + "num_input_tokens_seen": 47802415, + "step": 2211, + "time_per_iteration": 3.2865874767303467 + }, + { + "auxiliary_loss_clip": 0.01056616, + "auxiliary_loss_mlp": 0.01005175, + "balance_loss_clip": 1.02942479, + "balance_loss_mlp": 1.00229025, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.8109897190704899, + "language_loss": 0.55392289, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57454085, + "num_input_tokens_seen": 47871485, + "step": 2212, + "time_per_iteration": 3.156071901321411 + }, + { + "auxiliary_loss_clip": 0.01133527, + "auxiliary_loss_mlp": 0.01053546, + "balance_loss_clip": 1.04936647, + "balance_loss_mlp": 1.0329113, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 3.252039308572412, + "language_loss": 0.74106103, + "learning_rate": 3.889628839737908e-06, + "loss": 0.76293182, + "num_input_tokens_seen": 47888315, + "step": 2213, + "time_per_iteration": 2.5139482021331787 + }, + { + "auxiliary_loss_clip": 0.01115067, + "auxiliary_loss_mlp": 0.01047614, + "balance_loss_clip": 1.04323554, + "balance_loss_mlp": 1.02923155, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 1.7741580624769837, + "language_loss": 0.79325825, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81488502, + "num_input_tokens_seen": 47906600, + "step": 2214, + "time_per_iteration": 4.062878131866455 + }, + { + "auxiliary_loss_clip": 0.01141939, + "auxiliary_loss_mlp": 0.01050434, + "balance_loss_clip": 1.05195546, + "balance_loss_mlp": 1.03009677, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 1.9713663279053735, + "language_loss": 0.69723648, + "learning_rate": 3.889373516442597e-06, + "loss": 0.7191602, + "num_input_tokens_seen": 47927630, + "step": 2215, + "time_per_iteration": 2.615144968032837 + }, + { + "auxiliary_loss_clip": 0.01162178, + "auxiliary_loss_mlp": 0.01048567, + "balance_loss_clip": 1.05524027, + "balance_loss_mlp": 1.02877808, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.7563946497125724, + "language_loss": 0.81072617, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83283365, + "num_input_tokens_seen": 47947935, + "step": 2216, + "time_per_iteration": 2.5161550045013428 + }, + { + "auxiliary_loss_clip": 0.01158418, + "auxiliary_loss_mlp": 0.01056799, + "balance_loss_clip": 1.05669057, + "balance_loss_mlp": 1.03643847, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 3.4567081062441636, + "language_loss": 0.87265301, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89480513, + "num_input_tokens_seen": 47965515, + "step": 2217, + "time_per_iteration": 2.4782145023345947 + }, + { + "auxiliary_loss_clip": 0.01148081, + "auxiliary_loss_mlp": 0.01048509, + "balance_loss_clip": 1.05489016, + "balance_loss_mlp": 1.02718234, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.8955657424532695, + "language_loss": 0.73382545, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75579131, + "num_input_tokens_seen": 47985675, + "step": 2218, + "time_per_iteration": 2.5637707710266113 + }, + { + "auxiliary_loss_clip": 0.01125624, + "auxiliary_loss_mlp": 0.01043617, + "balance_loss_clip": 1.04831171, + "balance_loss_mlp": 1.02288616, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 2.1152017270334835, + "language_loss": 0.87232804, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89402044, + "num_input_tokens_seen": 48004985, + "step": 2219, + "time_per_iteration": 2.620281934738159 + }, + { + "auxiliary_loss_clip": 0.01136246, + "auxiliary_loss_mlp": 0.01054465, + "balance_loss_clip": 1.05076218, + "balance_loss_mlp": 1.03517747, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 1.661743914438212, + "language_loss": 0.77306199, + "learning_rate": 3.888733954497574e-06, + "loss": 0.7949692, + "num_input_tokens_seen": 48024965, + "step": 2220, + "time_per_iteration": 2.5684866905212402 + }, + { + "auxiliary_loss_clip": 0.0114429, + "auxiliary_loss_mlp": 0.01045517, + "balance_loss_clip": 1.05054295, + "balance_loss_mlp": 1.02698028, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.678712857740769, + "language_loss": 0.788472, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81037009, + "num_input_tokens_seen": 48040890, + "step": 2221, + "time_per_iteration": 2.499925374984741 + }, + { + "auxiliary_loss_clip": 0.01061954, + "auxiliary_loss_mlp": 0.01018242, + "balance_loss_clip": 1.02706647, + "balance_loss_mlp": 1.01565552, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9743990458693994, + "language_loss": 0.68960214, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.7104041, + "num_input_tokens_seen": 48091855, + "step": 2222, + "time_per_iteration": 2.9014534950256348 + }, + { + "auxiliary_loss_clip": 0.01131043, + "auxiliary_loss_mlp": 0.01050514, + "balance_loss_clip": 1.0565027, + "balance_loss_mlp": 1.03181028, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 3.9129955630795763, + "language_loss": 0.6732049, + "learning_rate": 3.888349357839982e-06, + "loss": 0.6950205, + "num_input_tokens_seen": 48111350, + "step": 2223, + "time_per_iteration": 2.5704119205474854 + }, + { + "auxiliary_loss_clip": 0.01157611, + "auxiliary_loss_mlp": 0.01058556, + "balance_loss_clip": 1.05417037, + "balance_loss_mlp": 1.03771842, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 3.37015294047872, + "language_loss": 0.82448024, + "learning_rate": 3.88822101573484e-06, + "loss": 0.8466419, + "num_input_tokens_seen": 48129840, + "step": 2224, + "time_per_iteration": 2.462874174118042 + }, + { + "auxiliary_loss_clip": 0.01173842, + "auxiliary_loss_mlp": 0.01043415, + "balance_loss_clip": 1.0556736, + "balance_loss_mlp": 1.0225656, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.178013807775607, + "language_loss": 0.65914023, + "learning_rate": 3.888092602028167e-06, + "loss": 0.6813128, + "num_input_tokens_seen": 48149240, + "step": 2225, + "time_per_iteration": 2.4555718898773193 + }, + { + "auxiliary_loss_clip": 0.01154941, + "auxiliary_loss_mlp": 0.01049241, + "balance_loss_clip": 1.05082381, + "balance_loss_mlp": 1.02874947, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.587438174933198, + "language_loss": 0.89535666, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91739857, + "num_input_tokens_seen": 48166330, + "step": 2226, + "time_per_iteration": 2.4482223987579346 + }, + { + "auxiliary_loss_clip": 0.01150279, + "auxiliary_loss_mlp": 0.01054891, + "balance_loss_clip": 1.0534879, + "balance_loss_mlp": 1.03513813, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 1.9047003823116437, + "language_loss": 0.73695588, + "learning_rate": 3.887835559829712e-06, + "loss": 0.75900757, + "num_input_tokens_seen": 48187600, + "step": 2227, + "time_per_iteration": 2.5317635536193848 + }, + { + "auxiliary_loss_clip": 0.01156911, + "auxiliary_loss_mlp": 0.01049653, + "balance_loss_clip": 1.05223393, + "balance_loss_mlp": 1.0287199, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.2827540638499415, + "language_loss": 0.85356486, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.8756305, + "num_input_tokens_seen": 48204400, + "step": 2228, + "time_per_iteration": 2.4423747062683105 + }, + { + "auxiliary_loss_clip": 0.01136341, + "auxiliary_loss_mlp": 0.01050147, + "balance_loss_clip": 1.0499301, + "balance_loss_mlp": 1.02890396, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 2.053019076490066, + "language_loss": 0.80926967, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83113456, + "num_input_tokens_seen": 48222180, + "step": 2229, + "time_per_iteration": 2.5356006622314453 + }, + { + "auxiliary_loss_clip": 0.01111187, + "auxiliary_loss_mlp": 0.01055271, + "balance_loss_clip": 1.04928064, + "balance_loss_mlp": 1.03467131, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 2.8508035766422295, + "language_loss": 0.74530256, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76696718, + "num_input_tokens_seen": 48243245, + "step": 2230, + "time_per_iteration": 2.662754774093628 + }, + { + "auxiliary_loss_clip": 0.01126921, + "auxiliary_loss_mlp": 0.01058618, + "balance_loss_clip": 1.05170941, + "balance_loss_mlp": 1.03863811, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 2.2008561183438506, + "language_loss": 0.80174708, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82360244, + "num_input_tokens_seen": 48262600, + "step": 2231, + "time_per_iteration": 2.5165913105010986 + }, + { + "auxiliary_loss_clip": 0.01115755, + "auxiliary_loss_mlp": 0.0105656, + "balance_loss_clip": 1.05076432, + "balance_loss_mlp": 1.03307533, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 3.5206471717354053, + "language_loss": 0.72008747, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74181068, + "num_input_tokens_seen": 48285075, + "step": 2232, + "time_per_iteration": 2.641958713531494 + }, + { + "auxiliary_loss_clip": 0.01125598, + "auxiliary_loss_mlp": 0.01048653, + "balance_loss_clip": 1.049541, + "balance_loss_mlp": 1.02777946, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.4360235681808593, + "language_loss": 0.65453303, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67627561, + "num_input_tokens_seen": 48301285, + "step": 2233, + "time_per_iteration": 2.5781896114349365 + }, + { + "auxiliary_loss_clip": 0.01169009, + "auxiliary_loss_mlp": 0.01043308, + "balance_loss_clip": 1.05188584, + "balance_loss_mlp": 1.02275658, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 2.5250325043547304, + "language_loss": 0.81672657, + "learning_rate": 3.886933657403615e-06, + "loss": 0.83884978, + "num_input_tokens_seen": 48317835, + "step": 2234, + "time_per_iteration": 2.428267478942871 + }, + { + "auxiliary_loss_clip": 0.01144418, + "auxiliary_loss_mlp": 0.01054914, + "balance_loss_clip": 1.05118513, + "balance_loss_mlp": 1.03445792, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.855075258099814, + "language_loss": 0.81903124, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84102464, + "num_input_tokens_seen": 48335670, + "step": 2235, + "time_per_iteration": 4.038635730743408 + }, + { + "auxiliary_loss_clip": 0.0115275, + "auxiliary_loss_mlp": 0.01055043, + "balance_loss_clip": 1.0521009, + "balance_loss_mlp": 1.0332756, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.9654306438508606, + "language_loss": 0.86772591, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88980389, + "num_input_tokens_seen": 48357805, + "step": 2236, + "time_per_iteration": 2.5294437408447266 + }, + { + "auxiliary_loss_clip": 0.01171339, + "auxiliary_loss_mlp": 0.01049541, + "balance_loss_clip": 1.05473065, + "balance_loss_mlp": 1.02892947, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.8648353477444115, + "language_loss": 0.77301091, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79521978, + "num_input_tokens_seen": 48377845, + "step": 2237, + "time_per_iteration": 2.4452199935913086 + }, + { + "auxiliary_loss_clip": 0.01154019, + "auxiliary_loss_mlp": 0.01051495, + "balance_loss_clip": 1.05505931, + "balance_loss_mlp": 1.02969182, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 2.236149778110953, + "language_loss": 0.79114091, + "learning_rate": 3.886416710321491e-06, + "loss": 0.81319606, + "num_input_tokens_seen": 48394735, + "step": 2238, + "time_per_iteration": 3.9244141578674316 + }, + { + "auxiliary_loss_clip": 0.01146938, + "auxiliary_loss_mlp": 0.01053045, + "balance_loss_clip": 1.05336356, + "balance_loss_mlp": 1.03164673, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.053356850889276, + "language_loss": 0.68402326, + "learning_rate": 3.886287294705924e-06, + "loss": 0.7060231, + "num_input_tokens_seen": 48414200, + "step": 2239, + "time_per_iteration": 2.570068836212158 + }, + { + "auxiliary_loss_clip": 0.01150749, + "auxiliary_loss_mlp": 0.0105479, + "balance_loss_clip": 1.05182171, + "balance_loss_mlp": 1.03399968, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 8.823062688342928, + "language_loss": 0.81486773, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83692312, + "num_input_tokens_seen": 48431065, + "step": 2240, + "time_per_iteration": 2.4768226146698 + }, + { + "auxiliary_loss_clip": 0.01112949, + "auxiliary_loss_mlp": 0.0105003, + "balance_loss_clip": 1.04608297, + "balance_loss_mlp": 1.02883446, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.745191410931862, + "language_loss": 0.77761638, + "learning_rate": 3.886028248895093e-06, + "loss": 0.79924619, + "num_input_tokens_seen": 48450335, + "step": 2241, + "time_per_iteration": 2.5779800415039062 + }, + { + "auxiliary_loss_clip": 0.01167494, + "auxiliary_loss_mlp": 0.01039281, + "balance_loss_clip": 1.05596793, + "balance_loss_mlp": 1.02106631, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.772837866893453, + "language_loss": 0.83459193, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85665965, + "num_input_tokens_seen": 48468555, + "step": 2242, + "time_per_iteration": 2.4699010848999023 + }, + { + "auxiliary_loss_clip": 0.01172808, + "auxiliary_loss_mlp": 0.01056732, + "balance_loss_clip": 1.0565846, + "balance_loss_mlp": 1.03460646, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 3.033626063147923, + "language_loss": 0.64605099, + "learning_rate": 3.885768917010744e-06, + "loss": 0.66834641, + "num_input_tokens_seen": 48488515, + "step": 2243, + "time_per_iteration": 3.882781744003296 + }, + { + "auxiliary_loss_clip": 0.01132622, + "auxiliary_loss_mlp": 0.01049182, + "balance_loss_clip": 1.04889143, + "balance_loss_mlp": 1.02878571, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 2.244916955865954, + "language_loss": 0.72554708, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74736512, + "num_input_tokens_seen": 48510515, + "step": 2244, + "time_per_iteration": 2.567077875137329 + }, + { + "auxiliary_loss_clip": 0.01154135, + "auxiliary_loss_mlp": 0.01052802, + "balance_loss_clip": 1.05227685, + "balance_loss_mlp": 1.03389549, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.60398206055625, + "language_loss": 0.8593213, + "learning_rate": 3.88550929909221e-06, + "loss": 0.88139069, + "num_input_tokens_seen": 48529940, + "step": 2245, + "time_per_iteration": 2.4716904163360596 + }, + { + "auxiliary_loss_clip": 0.01153752, + "auxiliary_loss_mlp": 0.01049583, + "balance_loss_clip": 1.05206776, + "balance_loss_mlp": 1.02966344, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.6988461449089822, + "language_loss": 0.78805339, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81008673, + "num_input_tokens_seen": 48548190, + "step": 2246, + "time_per_iteration": 2.465383291244507 + }, + { + "auxiliary_loss_clip": 0.01039027, + "auxiliary_loss_mlp": 0.0101389, + "balance_loss_clip": 1.03691232, + "balance_loss_mlp": 1.01052856, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7580966844108473, + "language_loss": 0.60521615, + "learning_rate": 3.885249395178874e-06, + "loss": 0.6257453, + "num_input_tokens_seen": 48613165, + "step": 2247, + "time_per_iteration": 3.312507390975952 + }, + { + "auxiliary_loss_clip": 0.01165995, + "auxiliary_loss_mlp": 0.01060236, + "balance_loss_clip": 1.05742824, + "balance_loss_mlp": 1.03741956, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.2066415216349817, + "language_loss": 0.80816907, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83043134, + "num_input_tokens_seen": 48631705, + "step": 2248, + "time_per_iteration": 2.4813578128814697 + }, + { + "auxiliary_loss_clip": 0.01147823, + "auxiliary_loss_mlp": 0.01046599, + "balance_loss_clip": 1.05378127, + "balance_loss_mlp": 1.02776432, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 2.107969383261332, + "language_loss": 0.76922864, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79117286, + "num_input_tokens_seen": 48649740, + "step": 2249, + "time_per_iteration": 2.547572612762451 + }, + { + "auxiliary_loss_clip": 0.01131557, + "auxiliary_loss_mlp": 0.0105288, + "balance_loss_clip": 1.05471885, + "balance_loss_mlp": 1.03377056, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.8680686325756064, + "language_loss": 0.84613872, + "learning_rate": 3.884859003154862e-06, + "loss": 0.8679831, + "num_input_tokens_seen": 48671565, + "step": 2250, + "time_per_iteration": 2.6063857078552246 + }, + { + "auxiliary_loss_clip": 0.01160178, + "auxiliary_loss_mlp": 0.01047076, + "balance_loss_clip": 1.05528879, + "balance_loss_mlp": 1.02591586, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 2.0068564633602892, + "language_loss": 0.82262707, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84469956, + "num_input_tokens_seen": 48690425, + "step": 2251, + "time_per_iteration": 2.511366128921509 + }, + { + "auxiliary_loss_clip": 0.01169095, + "auxiliary_loss_mlp": 0.01056288, + "balance_loss_clip": 1.05323172, + "balance_loss_mlp": 1.0344007, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.843203349767965, + "language_loss": 0.85776913, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88002294, + "num_input_tokens_seen": 48707505, + "step": 2252, + "time_per_iteration": 2.4530110359191895 + }, + { + "auxiliary_loss_clip": 0.01062149, + "auxiliary_loss_mlp": 0.01013162, + "balance_loss_clip": 1.02656972, + "balance_loss_mlp": 1.00984764, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7610592842007277, + "language_loss": 0.61787552, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63862854, + "num_input_tokens_seen": 48775895, + "step": 2253, + "time_per_iteration": 4.641030550003052 + }, + { + "auxiliary_loss_clip": 0.01156259, + "auxiliary_loss_mlp": 0.01055137, + "balance_loss_clip": 1.05356026, + "balance_loss_mlp": 1.03559816, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 1.7013606234291874, + "language_loss": 0.89305365, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91516757, + "num_input_tokens_seen": 48798370, + "step": 2254, + "time_per_iteration": 2.5474889278411865 + }, + { + "auxiliary_loss_clip": 0.01138469, + "auxiliary_loss_mlp": 0.01058727, + "balance_loss_clip": 1.0469538, + "balance_loss_mlp": 1.03350198, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 1.9640376372852004, + "language_loss": 0.84687161, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86884356, + "num_input_tokens_seen": 48817955, + "step": 2255, + "time_per_iteration": 2.567049026489258 + }, + { + "auxiliary_loss_clip": 0.01170996, + "auxiliary_loss_mlp": 0.01052596, + "balance_loss_clip": 1.05526221, + "balance_loss_mlp": 1.0319016, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 4.762015060883339, + "language_loss": 0.75053859, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77277446, + "num_input_tokens_seen": 48836330, + "step": 2256, + "time_per_iteration": 2.4916775226593018 + }, + { + "auxiliary_loss_clip": 0.01129721, + "auxiliary_loss_mlp": 0.0105483, + "balance_loss_clip": 1.05116224, + "balance_loss_mlp": 1.03270483, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 2.1438040414791186, + "language_loss": 0.83266872, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85451424, + "num_input_tokens_seen": 48851890, + "step": 2257, + "time_per_iteration": 2.5016705989837646 + }, + { + "auxiliary_loss_clip": 0.0115514, + "auxiliary_loss_mlp": 0.0079043, + "balance_loss_clip": 1.05483747, + "balance_loss_mlp": 1.00031161, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.5947981420492723, + "language_loss": 0.81606162, + "learning_rate": 3.883814813262277e-06, + "loss": 0.83551729, + "num_input_tokens_seen": 48865510, + "step": 2258, + "time_per_iteration": 2.4918956756591797 + }, + { + "auxiliary_loss_clip": 0.01161237, + "auxiliary_loss_mlp": 0.01057134, + "balance_loss_clip": 1.0545671, + "balance_loss_mlp": 1.03438926, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.225710525312403, + "language_loss": 0.82398051, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84616423, + "num_input_tokens_seen": 48882360, + "step": 2259, + "time_per_iteration": 2.4529452323913574 + }, + { + "auxiliary_loss_clip": 0.01133913, + "auxiliary_loss_mlp": 0.01059864, + "balance_loss_clip": 1.05333543, + "balance_loss_mlp": 1.04081392, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 2.0675690130900923, + "language_loss": 0.73568046, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75761819, + "num_input_tokens_seen": 48902700, + "step": 2260, + "time_per_iteration": 2.588621139526367 + }, + { + "auxiliary_loss_clip": 0.01148857, + "auxiliary_loss_mlp": 0.01060404, + "balance_loss_clip": 1.05486798, + "balance_loss_mlp": 1.04045999, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.343164579945206, + "language_loss": 0.75453687, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77662957, + "num_input_tokens_seen": 48922525, + "step": 2261, + "time_per_iteration": 2.589540481567383 + }, + { + "auxiliary_loss_clip": 0.01172461, + "auxiliary_loss_mlp": 0.0105269, + "balance_loss_clip": 1.05515051, + "balance_loss_mlp": 1.03185201, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 2.260222761454406, + "language_loss": 0.63136578, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65361726, + "num_input_tokens_seen": 48942510, + "step": 2262, + "time_per_iteration": 2.555925130844116 + }, + { + "auxiliary_loss_clip": 0.011504, + "auxiliary_loss_mlp": 0.01049711, + "balance_loss_clip": 1.05254352, + "balance_loss_mlp": 1.02956486, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.6220087003930743, + "language_loss": 0.82449925, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84650028, + "num_input_tokens_seen": 48962625, + "step": 2263, + "time_per_iteration": 2.5645132064819336 + }, + { + "auxiliary_loss_clip": 0.01102118, + "auxiliary_loss_mlp": 0.01065037, + "balance_loss_clip": 1.05124009, + "balance_loss_mlp": 1.04156482, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.9644571488614584, + "language_loss": 0.87857556, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.9002471, + "num_input_tokens_seen": 48982525, + "step": 2264, + "time_per_iteration": 2.6559717655181885 + }, + { + "auxiliary_loss_clip": 0.01161119, + "auxiliary_loss_mlp": 0.01053786, + "balance_loss_clip": 1.05457354, + "balance_loss_mlp": 1.03167284, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 3.3666289635608124, + "language_loss": 0.70885801, + "learning_rate": 3.882897396711683e-06, + "loss": 0.7310071, + "num_input_tokens_seen": 48997605, + "step": 2265, + "time_per_iteration": 2.464301824569702 + }, + { + "auxiliary_loss_clip": 0.01113855, + "auxiliary_loss_mlp": 0.01044375, + "balance_loss_clip": 1.05338681, + "balance_loss_mlp": 1.02393126, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.3154377607072756, + "language_loss": 0.66969156, + "learning_rate": 3.882766051566027e-06, + "loss": 0.69127381, + "num_input_tokens_seen": 49018535, + "step": 2266, + "time_per_iteration": 2.690749406814575 + }, + { + "auxiliary_loss_clip": 0.0113024, + "auxiliary_loss_mlp": 0.01057838, + "balance_loss_clip": 1.05773783, + "balance_loss_mlp": 1.03840721, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.7060376922362719, + "language_loss": 0.76656401, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78844476, + "num_input_tokens_seen": 49038865, + "step": 2267, + "time_per_iteration": 2.6250545978546143 + }, + { + "auxiliary_loss_clip": 0.011385, + "auxiliary_loss_mlp": 0.01050328, + "balance_loss_clip": 1.04835415, + "balance_loss_mlp": 1.02970529, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 2.1699971802864697, + "language_loss": 0.8203752, + "learning_rate": 3.882503147095667e-06, + "loss": 0.84226352, + "num_input_tokens_seen": 49058010, + "step": 2268, + "time_per_iteration": 2.550572156906128 + }, + { + "auxiliary_loss_clip": 0.01160305, + "auxiliary_loss_mlp": 0.0104771, + "balance_loss_clip": 1.05832088, + "balance_loss_mlp": 1.02635992, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 1.7138688790912247, + "language_loss": 0.76237422, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78445441, + "num_input_tokens_seen": 49080330, + "step": 2269, + "time_per_iteration": 2.6034188270568848 + }, + { + "auxiliary_loss_clip": 0.01143326, + "auxiliary_loss_mlp": 0.01049207, + "balance_loss_clip": 1.05427599, + "balance_loss_mlp": 1.02795219, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 1.8877535267771748, + "language_loss": 0.80779266, + "learning_rate": 3.882239957086477e-06, + "loss": 0.82971799, + "num_input_tokens_seen": 49097035, + "step": 2270, + "time_per_iteration": 2.57667875289917 + }, + { + "auxiliary_loss_clip": 0.01144307, + "auxiliary_loss_mlp": 0.01054579, + "balance_loss_clip": 1.04983866, + "balance_loss_mlp": 1.03289437, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 2.962106550616017, + "language_loss": 0.75796115, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77995002, + "num_input_tokens_seen": 49113945, + "step": 2271, + "time_per_iteration": 2.5156068801879883 + }, + { + "auxiliary_loss_clip": 0.01163104, + "auxiliary_loss_mlp": 0.01056016, + "balance_loss_clip": 1.05382514, + "balance_loss_mlp": 1.03405786, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 3.0506501619986435, + "language_loss": 0.80619001, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82838118, + "num_input_tokens_seen": 49132855, + "step": 2272, + "time_per_iteration": 2.4810495376586914 + }, + { + "auxiliary_loss_clip": 0.01067106, + "auxiliary_loss_mlp": 0.01004606, + "balance_loss_clip": 1.0271765, + "balance_loss_mlp": 1.00201952, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.708724761347095, + "language_loss": 0.60692811, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62764525, + "num_input_tokens_seen": 49198310, + "step": 2273, + "time_per_iteration": 3.168198347091675 + }, + { + "auxiliary_loss_clip": 0.01170231, + "auxiliary_loss_mlp": 0.00788328, + "balance_loss_clip": 1.05565345, + "balance_loss_mlp": 1.0002048, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.6662585667614633, + "language_loss": 0.77605104, + "learning_rate": 3.881712720611336e-06, + "loss": 0.79563665, + "num_input_tokens_seen": 49217250, + "step": 2274, + "time_per_iteration": 2.448814868927002 + }, + { + "auxiliary_loss_clip": 0.01157042, + "auxiliary_loss_mlp": 0.01051941, + "balance_loss_clip": 1.05134892, + "balance_loss_mlp": 1.03023314, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 1.9595969651156009, + "language_loss": 0.78577518, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80786502, + "num_input_tokens_seen": 49236615, + "step": 2275, + "time_per_iteration": 3.951289653778076 + }, + { + "auxiliary_loss_clip": 0.01158283, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.05401313, + "balance_loss_mlp": 1.02334476, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.4037057345836987, + "language_loss": 0.81551194, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83752859, + "num_input_tokens_seen": 49253935, + "step": 2276, + "time_per_iteration": 2.4618568420410156 + }, + { + "auxiliary_loss_clip": 0.01169212, + "auxiliary_loss_mlp": 0.01060679, + "balance_loss_clip": 1.05441785, + "balance_loss_mlp": 1.03599095, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 3.4355698702238504, + "language_loss": 0.69386649, + "learning_rate": 3.881316544012779e-06, + "loss": 0.71616542, + "num_input_tokens_seen": 49273605, + "step": 2277, + "time_per_iteration": 4.023723363876343 + }, + { + "auxiliary_loss_clip": 0.01162721, + "auxiliary_loss_mlp": 0.00789042, + "balance_loss_clip": 1.05368066, + "balance_loss_mlp": 1.00027752, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.12105314921831, + "language_loss": 0.8028878, + "learning_rate": 3.88118434246049e-06, + "loss": 0.8224054, + "num_input_tokens_seen": 49291785, + "step": 2278, + "time_per_iteration": 2.4956367015838623 + }, + { + "auxiliary_loss_clip": 0.0115788, + "auxiliary_loss_mlp": 0.01054793, + "balance_loss_clip": 1.05872226, + "balance_loss_mlp": 1.03359747, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 2.713982243032897, + "language_loss": 0.7480275, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77015424, + "num_input_tokens_seen": 49311405, + "step": 2279, + "time_per_iteration": 2.5899479389190674 + }, + { + "auxiliary_loss_clip": 0.01103998, + "auxiliary_loss_mlp": 0.01058821, + "balance_loss_clip": 1.04718709, + "balance_loss_mlp": 1.03687441, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 2.1203512415170485, + "language_loss": 0.7677964, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78942454, + "num_input_tokens_seen": 49331835, + "step": 2280, + "time_per_iteration": 2.6617746353149414 + }, + { + "auxiliary_loss_clip": 0.01110334, + "auxiliary_loss_mlp": 0.01049445, + "balance_loss_clip": 1.04444206, + "balance_loss_mlp": 1.02941751, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 2.141518110197129, + "language_loss": 0.79849422, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82009196, + "num_input_tokens_seen": 49352290, + "step": 2281, + "time_per_iteration": 2.6854188442230225 + }, + { + "auxiliary_loss_clip": 0.01176151, + "auxiliary_loss_mlp": 0.01060054, + "balance_loss_clip": 1.0558219, + "balance_loss_mlp": 1.03926349, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.7533068435207608, + "language_loss": 0.83581167, + "learning_rate": 3.880654822954518e-06, + "loss": 0.85817373, + "num_input_tokens_seen": 49370285, + "step": 2282, + "time_per_iteration": 2.4373319149017334 + }, + { + "auxiliary_loss_clip": 0.01145764, + "auxiliary_loss_mlp": 0.01060957, + "balance_loss_clip": 1.05054498, + "balance_loss_mlp": 1.04178786, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.9507376125606661, + "language_loss": 0.73682022, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.75888741, + "num_input_tokens_seen": 49389610, + "step": 2283, + "time_per_iteration": 3.8645451068878174 + }, + { + "auxiliary_loss_clip": 0.01163351, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.05674636, + "balance_loss_mlp": 1.03851891, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.1733140582094044, + "language_loss": 0.83956778, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86178786, + "num_input_tokens_seen": 49408390, + "step": 2284, + "time_per_iteration": 2.5541679859161377 + }, + { + "auxiliary_loss_clip": 0.0115311, + "auxiliary_loss_mlp": 0.01058263, + "balance_loss_clip": 1.05381465, + "balance_loss_mlp": 1.03595901, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.754013666221791, + "language_loss": 0.75187397, + "learning_rate": 3.880256934503974e-06, + "loss": 0.77398777, + "num_input_tokens_seen": 49427725, + "step": 2285, + "time_per_iteration": 2.5971405506134033 + }, + { + "auxiliary_loss_clip": 0.01153141, + "auxiliary_loss_mlp": 0.01055792, + "balance_loss_clip": 1.05737662, + "balance_loss_mlp": 1.03475165, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 1.9122053785804793, + "language_loss": 0.74642563, + "learning_rate": 3.880124162414689e-06, + "loss": 0.76851499, + "num_input_tokens_seen": 49449000, + "step": 2286, + "time_per_iteration": 2.592015266418457 + }, + { + "auxiliary_loss_clip": 0.01131199, + "auxiliary_loss_mlp": 0.01052523, + "balance_loss_clip": 1.05244565, + "balance_loss_mlp": 1.03032577, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.7519639174734016, + "language_loss": 0.86702019, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88885742, + "num_input_tokens_seen": 49468360, + "step": 2287, + "time_per_iteration": 2.6526660919189453 + }, + { + "auxiliary_loss_clip": 0.01129744, + "auxiliary_loss_mlp": 0.01060001, + "balance_loss_clip": 1.04903865, + "balance_loss_mlp": 1.03792322, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 1.869795816777136, + "language_loss": 0.68093693, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70283437, + "num_input_tokens_seen": 49493450, + "step": 2288, + "time_per_iteration": 2.7225427627563477 + }, + { + "auxiliary_loss_clip": 0.01114882, + "auxiliary_loss_mlp": 0.01059203, + "balance_loss_clip": 1.05138683, + "balance_loss_mlp": 1.03525376, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 4.3017393660041465, + "language_loss": 0.86384118, + "learning_rate": 3.879725418400005e-06, + "loss": 0.88558209, + "num_input_tokens_seen": 49511220, + "step": 2289, + "time_per_iteration": 2.568692922592163 + }, + { + "auxiliary_loss_clip": 0.01135101, + "auxiliary_loss_mlp": 0.00789669, + "balance_loss_clip": 1.0479784, + "balance_loss_mlp": 1.00029635, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 1.8812420301166217, + "language_loss": 0.75029445, + "learning_rate": 3.879592361162969e-06, + "loss": 0.7695421, + "num_input_tokens_seen": 49529820, + "step": 2290, + "time_per_iteration": 2.545158624649048 + }, + { + "auxiliary_loss_clip": 0.0104988, + "auxiliary_loss_mlp": 0.01010109, + "balance_loss_clip": 1.03133154, + "balance_loss_mlp": 1.00753391, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.717209867591001, + "language_loss": 0.51631624, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53691614, + "num_input_tokens_seen": 49595325, + "step": 2291, + "time_per_iteration": 3.1765410900115967 + }, + { + "auxiliary_loss_clip": 0.01162161, + "auxiliary_loss_mlp": 0.01048107, + "balance_loss_clip": 1.05342388, + "balance_loss_mlp": 1.02704239, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 2.053315424079796, + "language_loss": 0.7125175, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73462015, + "num_input_tokens_seen": 49615850, + "step": 2292, + "time_per_iteration": 3.861560583114624 + }, + { + "auxiliary_loss_clip": 0.01160493, + "auxiliary_loss_mlp": 0.01045559, + "balance_loss_clip": 1.05309093, + "balance_loss_mlp": 1.02548373, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 4.049797761065504, + "language_loss": 0.80063981, + "learning_rate": 3.879192761826071e-06, + "loss": 0.82270032, + "num_input_tokens_seen": 49631860, + "step": 2293, + "time_per_iteration": 2.501896619796753 + }, + { + "auxiliary_loss_clip": 0.01159322, + "auxiliary_loss_mlp": 0.0105002, + "balance_loss_clip": 1.05344582, + "balance_loss_mlp": 1.02925372, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 99.70032578584622, + "language_loss": 0.78504199, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80713546, + "num_input_tokens_seen": 49652145, + "step": 2294, + "time_per_iteration": 2.5727427005767822 + }, + { + "auxiliary_loss_clip": 0.01127576, + "auxiliary_loss_mlp": 0.01047136, + "balance_loss_clip": 1.05055618, + "balance_loss_mlp": 1.02828944, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.4008527662801, + "language_loss": 0.79830742, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82005453, + "num_input_tokens_seen": 49669880, + "step": 2295, + "time_per_iteration": 2.571171283721924 + }, + { + "auxiliary_loss_clip": 0.0115735, + "auxiliary_loss_mlp": 0.01048948, + "balance_loss_clip": 1.05188584, + "balance_loss_mlp": 1.02787209, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.922095430192988, + "language_loss": 0.78016496, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80222797, + "num_input_tokens_seen": 49687255, + "step": 2296, + "time_per_iteration": 2.4991672039031982 + }, + { + "auxiliary_loss_clip": 0.01156893, + "auxiliary_loss_mlp": 0.01064089, + "balance_loss_clip": 1.05267072, + "balance_loss_mlp": 1.04291725, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.9801749023899466, + "language_loss": 0.78670287, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80891269, + "num_input_tokens_seen": 49706650, + "step": 2297, + "time_per_iteration": 2.5106704235076904 + }, + { + "auxiliary_loss_clip": 0.01114914, + "auxiliary_loss_mlp": 0.01048575, + "balance_loss_clip": 1.05300951, + "balance_loss_mlp": 1.02829766, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.0943315652196004, + "language_loss": 0.69595408, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71758902, + "num_input_tokens_seen": 49725715, + "step": 2298, + "time_per_iteration": 2.6328396797180176 + }, + { + "auxiliary_loss_clip": 0.0114041, + "auxiliary_loss_mlp": 0.0105385, + "balance_loss_clip": 1.05001569, + "balance_loss_mlp": 1.03341734, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 2.1335715514785, + "language_loss": 0.86504883, + "learning_rate": 3.878391639291116e-06, + "loss": 0.88699144, + "num_input_tokens_seen": 49744710, + "step": 2299, + "time_per_iteration": 2.5077884197235107 + }, + { + "auxiliary_loss_clip": 0.01170789, + "auxiliary_loss_mlp": 0.0105081, + "balance_loss_clip": 1.05343819, + "balance_loss_mlp": 1.02937651, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.2348324882807766, + "language_loss": 0.75400341, + "learning_rate": 3.878257869538267e-06, + "loss": 0.77621937, + "num_input_tokens_seen": 49764300, + "step": 2300, + "time_per_iteration": 2.5078301429748535 + }, + { + "auxiliary_loss_clip": 0.01135343, + "auxiliary_loss_mlp": 0.01045138, + "balance_loss_clip": 1.05345118, + "balance_loss_mlp": 1.02531314, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 3.2671614472973087, + "language_loss": 0.82817155, + "learning_rate": 3.878124028561692e-06, + "loss": 0.84997642, + "num_input_tokens_seen": 49778380, + "step": 2301, + "time_per_iteration": 2.516742467880249 + }, + { + "auxiliary_loss_clip": 0.01139939, + "auxiliary_loss_mlp": 0.00786474, + "balance_loss_clip": 1.05046964, + "balance_loss_mlp": 1.00028253, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 2.102133698562351, + "language_loss": 0.8558743, + "learning_rate": 3.877990116366466e-06, + "loss": 0.87513846, + "num_input_tokens_seen": 49797460, + "step": 2302, + "time_per_iteration": 2.559415578842163 + }, + { + "auxiliary_loss_clip": 0.01065693, + "auxiliary_loss_mlp": 0.01015031, + "balance_loss_clip": 1.02906311, + "balance_loss_mlp": 1.01257563, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7549798632232926, + "language_loss": 0.65573144, + "learning_rate": 3.877856132957667e-06, + "loss": 0.67653871, + "num_input_tokens_seen": 49868005, + "step": 2303, + "time_per_iteration": 3.216395616531372 + }, + { + "auxiliary_loss_clip": 0.01153799, + "auxiliary_loss_mlp": 0.01039107, + "balance_loss_clip": 1.05244589, + "balance_loss_mlp": 1.01991463, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 1.9974390964043862, + "language_loss": 0.7866627, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80859172, + "num_input_tokens_seen": 49885825, + "step": 2304, + "time_per_iteration": 2.478384494781494 + }, + { + "auxiliary_loss_clip": 0.01159509, + "auxiliary_loss_mlp": 0.01044702, + "balance_loss_clip": 1.05416393, + "balance_loss_mlp": 1.02469826, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.7299363572279436, + "language_loss": 0.77805567, + "learning_rate": 3.877587952519672e-06, + "loss": 0.8000977, + "num_input_tokens_seen": 49905975, + "step": 2305, + "time_per_iteration": 2.510664224624634 + }, + { + "auxiliary_loss_clip": 0.01080826, + "auxiliary_loss_mlp": 0.01054533, + "balance_loss_clip": 1.04017437, + "balance_loss_mlp": 1.03461337, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 1.8004661151682844, + "language_loss": 0.87862384, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89997745, + "num_input_tokens_seen": 49925800, + "step": 2306, + "time_per_iteration": 2.736192226409912 + }, + { + "auxiliary_loss_clip": 0.0107096, + "auxiliary_loss_mlp": 0.01004187, + "balance_loss_clip": 1.02456248, + "balance_loss_mlp": 1.00151694, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8758400882007455, + "language_loss": 0.59013277, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61088425, + "num_input_tokens_seen": 49977620, + "step": 2307, + "time_per_iteration": 3.2623507976531982 + }, + { + "auxiliary_loss_clip": 0.01174987, + "auxiliary_loss_mlp": 0.00787522, + "balance_loss_clip": 1.05558658, + "balance_loss_mlp": 1.00036168, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.7935255047869136, + "language_loss": 0.79972869, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81935382, + "num_input_tokens_seen": 49996650, + "step": 2308, + "time_per_iteration": 2.507844924926758 + }, + { + "auxiliary_loss_clip": 0.01132596, + "auxiliary_loss_mlp": 0.01044997, + "balance_loss_clip": 1.04937148, + "balance_loss_mlp": 1.02499318, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.749328657011273, + "language_loss": 0.78708744, + "learning_rate": 3.877050737304533e-06, + "loss": 0.8088634, + "num_input_tokens_seen": 50015640, + "step": 2309, + "time_per_iteration": 2.57781982421875 + }, + { + "auxiliary_loss_clip": 0.0113207, + "auxiliary_loss_mlp": 0.01044607, + "balance_loss_clip": 1.04790258, + "balance_loss_mlp": 1.02396011, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 3.2186052149419533, + "language_loss": 0.6807546, + "learning_rate": 3.876916255543129e-06, + "loss": 0.70252132, + "num_input_tokens_seen": 50033500, + "step": 2310, + "time_per_iteration": 2.564969301223755 + }, + { + "auxiliary_loss_clip": 0.01170525, + "auxiliary_loss_mlp": 0.01058257, + "balance_loss_clip": 1.05495811, + "balance_loss_mlp": 1.03604889, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.952470425829212, + "language_loss": 0.84019935, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.8624872, + "num_input_tokens_seen": 50050075, + "step": 2311, + "time_per_iteration": 2.4356822967529297 + }, + { + "auxiliary_loss_clip": 0.01176707, + "auxiliary_loss_mlp": 0.0105167, + "balance_loss_clip": 1.05650949, + "balance_loss_mlp": 1.03170264, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 5.998601048978508, + "language_loss": 0.81516582, + "learning_rate": 3.876647078506866e-06, + "loss": 0.83744961, + "num_input_tokens_seen": 50070080, + "step": 2312, + "time_per_iteration": 2.5041134357452393 + }, + { + "auxiliary_loss_clip": 0.01134234, + "auxiliary_loss_mlp": 0.00787699, + "balance_loss_clip": 1.05524731, + "balance_loss_mlp": 1.00048172, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 1.932947590961904, + "language_loss": 0.8668493, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88606864, + "num_input_tokens_seen": 50090040, + "step": 2313, + "time_per_iteration": 2.5938239097595215 + }, + { + "auxiliary_loss_clip": 0.0116959, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_clip": 1.05455172, + "balance_loss_mlp": 1.03318572, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 2.6601209432092805, + "language_loss": 0.80064118, + "learning_rate": 3.876377616820024e-06, + "loss": 0.82287276, + "num_input_tokens_seen": 50110595, + "step": 2314, + "time_per_iteration": 4.111196994781494 + }, + { + "auxiliary_loss_clip": 0.01128452, + "auxiliary_loss_mlp": 0.01051963, + "balance_loss_clip": 1.04859638, + "balance_loss_mlp": 1.03119683, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 2.7561079191165687, + "language_loss": 0.85879827, + "learning_rate": 3.876242779245409e-06, + "loss": 0.88060236, + "num_input_tokens_seen": 50125430, + "step": 2315, + "time_per_iteration": 2.5626728534698486 + }, + { + "auxiliary_loss_clip": 0.01158732, + "auxiliary_loss_mlp": 0.01059297, + "balance_loss_clip": 1.05063009, + "balance_loss_mlp": 1.03752983, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.2763798686562033, + "language_loss": 0.77378821, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79596847, + "num_input_tokens_seen": 50144120, + "step": 2316, + "time_per_iteration": 2.4750640392303467 + }, + { + "auxiliary_loss_clip": 0.01166923, + "auxiliary_loss_mlp": 0.00789051, + "balance_loss_clip": 1.05286932, + "balance_loss_mlp": 1.00040829, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.7350628412431384, + "language_loss": 0.77114993, + "learning_rate": 3.875972890659349e-06, + "loss": 0.79070967, + "num_input_tokens_seen": 50162500, + "step": 2317, + "time_per_iteration": 3.922683000564575 + }, + { + "auxiliary_loss_clip": 0.01150201, + "auxiliary_loss_mlp": 0.01059379, + "balance_loss_clip": 1.05195129, + "balance_loss_mlp": 1.03900659, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 3.1960268183392992, + "language_loss": 0.8023349, + "learning_rate": 3.875837839658139e-06, + "loss": 0.8244307, + "num_input_tokens_seen": 50182415, + "step": 2318, + "time_per_iteration": 2.623262405395508 + }, + { + "auxiliary_loss_clip": 0.01043583, + "auxiliary_loss_mlp": 0.01015982, + "balance_loss_clip": 1.01896882, + "balance_loss_mlp": 1.01332402, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8572661855494902, + "language_loss": 0.59080672, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61140239, + "num_input_tokens_seen": 50245160, + "step": 2319, + "time_per_iteration": 3.191784381866455 + }, + { + "auxiliary_loss_clip": 0.01121252, + "auxiliary_loss_mlp": 0.01055515, + "balance_loss_clip": 1.04437065, + "balance_loss_mlp": 1.03486824, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.487120647158795, + "language_loss": 0.65594637, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67771405, + "num_input_tokens_seen": 50268215, + "step": 2320, + "time_per_iteration": 2.690042495727539 + }, + { + "auxiliary_loss_clip": 0.01099511, + "auxiliary_loss_mlp": 0.01056554, + "balance_loss_clip": 1.04115295, + "balance_loss_mlp": 1.03557301, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.7016288878228918, + "language_loss": 0.70832968, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72989035, + "num_input_tokens_seen": 50288575, + "step": 2321, + "time_per_iteration": 2.628800630569458 + }, + { + "auxiliary_loss_clip": 0.01122551, + "auxiliary_loss_mlp": 0.01064565, + "balance_loss_clip": 1.04322064, + "balance_loss_mlp": 1.04074717, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 6.2834999735802555, + "language_loss": 0.86048746, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88235867, + "num_input_tokens_seen": 50308735, + "step": 2322, + "time_per_iteration": 4.068484306335449 + }, + { + "auxiliary_loss_clip": 0.01121623, + "auxiliary_loss_mlp": 0.01055267, + "balance_loss_clip": 1.04092288, + "balance_loss_mlp": 1.03625357, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 2.1079222905320787, + "language_loss": 0.66668284, + "learning_rate": 3.875161517775226e-06, + "loss": 0.68845171, + "num_input_tokens_seen": 50331025, + "step": 2323, + "time_per_iteration": 2.7061538696289062 + }, + { + "auxiliary_loss_clip": 0.01133424, + "auxiliary_loss_mlp": 0.01053734, + "balance_loss_clip": 1.04788232, + "balance_loss_mlp": 1.03226435, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 1.9992353737572508, + "language_loss": 0.88944525, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91131687, + "num_input_tokens_seen": 50349725, + "step": 2324, + "time_per_iteration": 2.550130844116211 + }, + { + "auxiliary_loss_clip": 0.0115837, + "auxiliary_loss_mlp": 0.0106197, + "balance_loss_clip": 1.05152249, + "balance_loss_mlp": 1.04029775, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 2.5371579675435676, + "language_loss": 0.70792347, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.73012686, + "num_input_tokens_seen": 50367965, + "step": 2325, + "time_per_iteration": 2.514955520629883 + }, + { + "auxiliary_loss_clip": 0.01137716, + "auxiliary_loss_mlp": 0.00788225, + "balance_loss_clip": 1.05091548, + "balance_loss_mlp": 1.00042152, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 2.230835504225761, + "language_loss": 0.82127988, + "learning_rate": 3.874754871328688e-06, + "loss": 0.84053928, + "num_input_tokens_seen": 50385605, + "step": 2326, + "time_per_iteration": 2.5588936805725098 + }, + { + "auxiliary_loss_clip": 0.01154625, + "auxiliary_loss_mlp": 0.01048783, + "balance_loss_clip": 1.05394483, + "balance_loss_mlp": 1.03004313, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.902103884437036, + "language_loss": 0.89059013, + "learning_rate": 3.874619180324534e-06, + "loss": 0.91262424, + "num_input_tokens_seen": 50403985, + "step": 2327, + "time_per_iteration": 2.559536933898926 + }, + { + "auxiliary_loss_clip": 0.01126655, + "auxiliary_loss_mlp": 0.01063536, + "balance_loss_clip": 1.0498687, + "balance_loss_mlp": 1.04139936, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.3104974535296607, + "language_loss": 0.85148168, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87338364, + "num_input_tokens_seen": 50421590, + "step": 2328, + "time_per_iteration": 2.549285888671875 + }, + { + "auxiliary_loss_clip": 0.01155306, + "auxiliary_loss_mlp": 0.01049832, + "balance_loss_clip": 1.0526185, + "balance_loss_mlp": 1.02858877, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.9149575870359812, + "language_loss": 0.7385602, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76061159, + "num_input_tokens_seen": 50443945, + "step": 2329, + "time_per_iteration": 2.5491702556610107 + }, + { + "auxiliary_loss_clip": 0.01153547, + "auxiliary_loss_mlp": 0.01048324, + "balance_loss_clip": 1.04965293, + "balance_loss_mlp": 1.02755797, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 2.202278036260778, + "language_loss": 0.78366393, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80568266, + "num_input_tokens_seen": 50462065, + "step": 2330, + "time_per_iteration": 2.4849178791046143 + }, + { + "auxiliary_loss_clip": 0.01146205, + "auxiliary_loss_mlp": 0.01048697, + "balance_loss_clip": 1.04897189, + "balance_loss_mlp": 1.02956414, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 4.814381496343631, + "language_loss": 0.71911979, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74106878, + "num_input_tokens_seen": 50479565, + "step": 2331, + "time_per_iteration": 3.9731149673461914 + }, + { + "auxiliary_loss_clip": 0.01163218, + "auxiliary_loss_mlp": 0.01051142, + "balance_loss_clip": 1.05463433, + "balance_loss_mlp": 1.03113866, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.5346713654043538, + "language_loss": 0.72544229, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74758589, + "num_input_tokens_seen": 50497305, + "step": 2332, + "time_per_iteration": 2.432058095932007 + }, + { + "auxiliary_loss_clip": 0.01056406, + "auxiliary_loss_mlp": 0.01013763, + "balance_loss_clip": 1.02010274, + "balance_loss_mlp": 1.01083016, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8231860335387353, + "language_loss": 0.56044614, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58114785, + "num_input_tokens_seen": 50549735, + "step": 2333, + "time_per_iteration": 2.9088661670684814 + }, + { + "auxiliary_loss_clip": 0.01129962, + "auxiliary_loss_mlp": 0.01045959, + "balance_loss_clip": 1.05133343, + "balance_loss_mlp": 1.02577734, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.8286153567008512, + "language_loss": 0.82922751, + "learning_rate": 3.873667353183016e-06, + "loss": 0.85098672, + "num_input_tokens_seen": 50570100, + "step": 2334, + "time_per_iteration": 2.593961715698242 + }, + { + "auxiliary_loss_clip": 0.01135245, + "auxiliary_loss_mlp": 0.01047849, + "balance_loss_clip": 1.05027282, + "balance_loss_mlp": 1.02866817, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 2.0056447922539147, + "language_loss": 0.8112396, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83307058, + "num_input_tokens_seen": 50589185, + "step": 2335, + "time_per_iteration": 2.5703444480895996 + }, + { + "auxiliary_loss_clip": 0.01117534, + "auxiliary_loss_mlp": 0.01056149, + "balance_loss_clip": 1.04982769, + "balance_loss_mlp": 1.03191376, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 2.0009882049544916, + "language_loss": 0.82388794, + "learning_rate": 3.873394763046862e-06, + "loss": 0.84562474, + "num_input_tokens_seen": 50609645, + "step": 2336, + "time_per_iteration": 2.6022095680236816 + }, + { + "auxiliary_loss_clip": 0.0115466, + "auxiliary_loss_mlp": 0.01052055, + "balance_loss_clip": 1.05770063, + "balance_loss_mlp": 1.03158712, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.683033308708322, + "language_loss": 0.80706072, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82912791, + "num_input_tokens_seen": 50628385, + "step": 2337, + "time_per_iteration": 2.499124526977539 + }, + { + "auxiliary_loss_clip": 0.01156243, + "auxiliary_loss_mlp": 0.01052887, + "balance_loss_clip": 1.0541656, + "balance_loss_mlp": 1.03192997, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.39184788189156, + "language_loss": 0.79185498, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81394625, + "num_input_tokens_seen": 50647260, + "step": 2338, + "time_per_iteration": 2.4997801780700684 + }, + { + "auxiliary_loss_clip": 0.01163251, + "auxiliary_loss_mlp": 0.01054416, + "balance_loss_clip": 1.0586555, + "balance_loss_mlp": 1.03316092, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.3827619389826697, + "language_loss": 0.79956806, + "learning_rate": 3.87298534506069e-06, + "loss": 0.8217448, + "num_input_tokens_seen": 50666130, + "step": 2339, + "time_per_iteration": 2.500779867172241 + }, + { + "auxiliary_loss_clip": 0.01106416, + "auxiliary_loss_mlp": 0.01070535, + "balance_loss_clip": 1.04872024, + "balance_loss_mlp": 1.04932785, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.850856191924272, + "language_loss": 0.6547699, + "learning_rate": 3.872848730344146e-06, + "loss": 0.67653942, + "num_input_tokens_seen": 50687440, + "step": 2340, + "time_per_iteration": 2.7825522422790527 + }, + { + "auxiliary_loss_clip": 0.01154242, + "auxiliary_loss_mlp": 0.0104826, + "balance_loss_clip": 1.055336, + "balance_loss_mlp": 1.02795887, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 6.076308971182011, + "language_loss": 0.78698039, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80900538, + "num_input_tokens_seen": 50704030, + "step": 2341, + "time_per_iteration": 2.469862699508667 + }, + { + "auxiliary_loss_clip": 0.01171483, + "auxiliary_loss_mlp": 0.01060416, + "balance_loss_clip": 1.05775392, + "balance_loss_mlp": 1.04048455, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 2.099511237845992, + "language_loss": 0.80087811, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82319713, + "num_input_tokens_seen": 50723305, + "step": 2342, + "time_per_iteration": 2.4436755180358887 + }, + { + "auxiliary_loss_clip": 0.01158874, + "auxiliary_loss_mlp": 0.01052355, + "balance_loss_clip": 1.06000793, + "balance_loss_mlp": 1.03363872, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 3.019244224005998, + "language_loss": 0.77891481, + "learning_rate": 3.87243846010358e-06, + "loss": 0.80102712, + "num_input_tokens_seen": 50743270, + "step": 2343, + "time_per_iteration": 2.525792360305786 + }, + { + "auxiliary_loss_clip": 0.01054757, + "auxiliary_loss_mlp": 0.01010354, + "balance_loss_clip": 1.0278244, + "balance_loss_mlp": 1.00745678, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8404381977591018, + "language_loss": 0.61617935, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63683045, + "num_input_tokens_seen": 50802710, + "step": 2344, + "time_per_iteration": 3.0440263748168945 + }, + { + "auxiliary_loss_clip": 0.01155447, + "auxiliary_loss_mlp": 0.01051288, + "balance_loss_clip": 1.05293453, + "balance_loss_mlp": 1.0326798, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.6306082462956593, + "language_loss": 0.64766049, + "learning_rate": 3.872164591585956e-06, + "loss": 0.66972786, + "num_input_tokens_seen": 50822625, + "step": 2345, + "time_per_iteration": 2.5297584533691406 + }, + { + "auxiliary_loss_clip": 0.01166881, + "auxiliary_loss_mlp": 0.01048908, + "balance_loss_clip": 1.05423307, + "balance_loss_mlp": 1.02649689, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.2273362231513, + "language_loss": 0.73300529, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.75516325, + "num_input_tokens_seen": 50842330, + "step": 2346, + "time_per_iteration": 2.562021493911743 + }, + { + "auxiliary_loss_clip": 0.01166262, + "auxiliary_loss_mlp": 0.01055072, + "balance_loss_clip": 1.06363916, + "balance_loss_mlp": 1.03362572, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 1.9063755985715003, + "language_loss": 0.77090931, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.79312265, + "num_input_tokens_seen": 50861035, + "step": 2347, + "time_per_iteration": 2.5426759719848633 + }, + { + "auxiliary_loss_clip": 0.01178664, + "auxiliary_loss_mlp": 0.01054868, + "balance_loss_clip": 1.06253791, + "balance_loss_mlp": 1.0349834, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 2.054246728280064, + "language_loss": 0.76582891, + "learning_rate": 3.8717532563775e-06, + "loss": 0.7881642, + "num_input_tokens_seen": 50880105, + "step": 2348, + "time_per_iteration": 2.55761456489563 + }, + { + "auxiliary_loss_clip": 0.01157383, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_clip": 1.05532336, + "balance_loss_mlp": 1.02556944, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.7842311773074824, + "language_loss": 0.86659497, + "learning_rate": 3.871616002680272e-06, + "loss": 0.88862944, + "num_input_tokens_seen": 50897720, + "step": 2349, + "time_per_iteration": 2.457475423812866 + }, + { + "auxiliary_loss_clip": 0.01163439, + "auxiliary_loss_mlp": 0.01054959, + "balance_loss_clip": 1.06141019, + "balance_loss_mlp": 1.0341692, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.8650001414991535, + "language_loss": 0.88835561, + "learning_rate": 3.871478678011177e-06, + "loss": 0.91053957, + "num_input_tokens_seen": 50918385, + "step": 2350, + "time_per_iteration": 2.542578935623169 + }, + { + "auxiliary_loss_clip": 0.01153966, + "auxiliary_loss_mlp": 0.01048645, + "balance_loss_clip": 1.05690694, + "balance_loss_mlp": 1.02649581, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.749852963730854, + "language_loss": 0.810013, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83203906, + "num_input_tokens_seen": 50938270, + "step": 2351, + "time_per_iteration": 2.505404233932495 + }, + { + "auxiliary_loss_clip": 0.01165737, + "auxiliary_loss_mlp": 0.01048686, + "balance_loss_clip": 1.06011975, + "balance_loss_mlp": 1.02819371, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.2363605245995113, + "language_loss": 0.82918918, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85133338, + "num_input_tokens_seen": 50958155, + "step": 2352, + "time_per_iteration": 2.5617222785949707 + }, + { + "auxiliary_loss_clip": 0.01061085, + "auxiliary_loss_mlp": 0.01003298, + "balance_loss_clip": 1.02560425, + "balance_loss_mlp": 1.00000811, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.9158650855391647, + "language_loss": 0.61976111, + "learning_rate": 3.87106627822478e-06, + "loss": 0.64040494, + "num_input_tokens_seen": 51020705, + "step": 2353, + "time_per_iteration": 4.504199266433716 + }, + { + "auxiliary_loss_clip": 0.01149237, + "auxiliary_loss_mlp": 0.01050715, + "balance_loss_clip": 1.06189585, + "balance_loss_mlp": 1.03040135, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.595619920305791, + "language_loss": 0.86979282, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89179236, + "num_input_tokens_seen": 51039995, + "step": 2354, + "time_per_iteration": 2.5256941318511963 + }, + { + "auxiliary_loss_clip": 0.01134167, + "auxiliary_loss_mlp": 0.01050408, + "balance_loss_clip": 1.05711699, + "balance_loss_mlp": 1.02864075, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.7985035745988536, + "language_loss": 0.74478275, + "learning_rate": 3.870790990270057e-06, + "loss": 0.7666285, + "num_input_tokens_seen": 51059075, + "step": 2355, + "time_per_iteration": 2.551220178604126 + }, + { + "auxiliary_loss_clip": 0.01060181, + "auxiliary_loss_mlp": 0.01005372, + "balance_loss_clip": 1.02488947, + "balance_loss_mlp": 1.00213003, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6803621643210751, + "language_loss": 0.51771325, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53836882, + "num_input_tokens_seen": 51120380, + "step": 2356, + "time_per_iteration": 4.37205958366394 + }, + { + "auxiliary_loss_clip": 0.01175883, + "auxiliary_loss_mlp": 0.01057508, + "balance_loss_clip": 1.06169426, + "balance_loss_mlp": 1.03686047, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 1.9807776103709895, + "language_loss": 0.70633727, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72867119, + "num_input_tokens_seen": 51136950, + "step": 2357, + "time_per_iteration": 2.4372549057006836 + }, + { + "auxiliary_loss_clip": 0.01126286, + "auxiliary_loss_mlp": 0.01057085, + "balance_loss_clip": 1.04927504, + "balance_loss_mlp": 1.03647399, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.838277500629423, + "language_loss": 0.82151675, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84335047, + "num_input_tokens_seen": 51155175, + "step": 2358, + "time_per_iteration": 2.5569279193878174 + }, + { + "auxiliary_loss_clip": 0.01159034, + "auxiliary_loss_mlp": 0.01054826, + "balance_loss_clip": 1.06075466, + "balance_loss_mlp": 1.0334276, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 1.9168864719509684, + "language_loss": 0.71479356, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73693216, + "num_input_tokens_seen": 51174500, + "step": 2359, + "time_per_iteration": 2.5499470233917236 + }, + { + "auxiliary_loss_clip": 0.01126314, + "auxiliary_loss_mlp": 0.00787492, + "balance_loss_clip": 1.05892038, + "balance_loss_mlp": 1.00052941, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 2.5964174110637535, + "language_loss": 0.75462472, + "learning_rate": 3.870101529014526e-06, + "loss": 0.7737627, + "num_input_tokens_seen": 51194270, + "step": 2360, + "time_per_iteration": 2.6166396141052246 + }, + { + "auxiliary_loss_clip": 0.01124826, + "auxiliary_loss_mlp": 0.01054839, + "balance_loss_clip": 1.0579989, + "balance_loss_mlp": 1.03152204, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.351592080795249, + "language_loss": 0.81626755, + "learning_rate": 3.869963423999178e-06, + "loss": 0.83806419, + "num_input_tokens_seen": 51211850, + "step": 2361, + "time_per_iteration": 4.055509567260742 + }, + { + "auxiliary_loss_clip": 0.0115739, + "auxiliary_loss_mlp": 0.01056794, + "balance_loss_clip": 1.05634642, + "balance_loss_mlp": 1.03668308, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 2.917784611732347, + "language_loss": 0.74231058, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76445246, + "num_input_tokens_seen": 51233545, + "step": 2362, + "time_per_iteration": 2.569814920425415 + }, + { + "auxiliary_loss_clip": 0.0116257, + "auxiliary_loss_mlp": 0.01046949, + "balance_loss_clip": 1.05905867, + "balance_loss_mlp": 1.02659988, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 2.4367701568895392, + "language_loss": 0.73779386, + "learning_rate": 3.869687001246122e-06, + "loss": 0.75988907, + "num_input_tokens_seen": 51257615, + "step": 2363, + "time_per_iteration": 2.6664927005767822 + }, + { + "auxiliary_loss_clip": 0.01137415, + "auxiliary_loss_mlp": 0.01050634, + "balance_loss_clip": 1.05112672, + "balance_loss_mlp": 1.0297122, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 2.1387083935682414, + "language_loss": 0.72978485, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75166535, + "num_input_tokens_seen": 51279645, + "step": 2364, + "time_per_iteration": 2.6472883224487305 + }, + { + "auxiliary_loss_clip": 0.01148896, + "auxiliary_loss_mlp": 0.01048755, + "balance_loss_clip": 1.05489421, + "balance_loss_mlp": 1.03007472, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 2.4890652972670666, + "language_loss": 0.9099164, + "learning_rate": 3.869410294898195e-06, + "loss": 0.93189287, + "num_input_tokens_seen": 51299775, + "step": 2365, + "time_per_iteration": 2.588102340698242 + }, + { + "auxiliary_loss_clip": 0.01132665, + "auxiliary_loss_mlp": 0.01054808, + "balance_loss_clip": 1.04735863, + "balance_loss_mlp": 1.03294516, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 2.0711828893138495, + "language_loss": 0.65601057, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67788529, + "num_input_tokens_seen": 51319430, + "step": 2366, + "time_per_iteration": 2.5785090923309326 + }, + { + "auxiliary_loss_clip": 0.01151457, + "auxiliary_loss_mlp": 0.0105812, + "balance_loss_clip": 1.05669761, + "balance_loss_mlp": 1.03674603, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 1.8642867648217263, + "language_loss": 0.80834746, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.83044314, + "num_input_tokens_seen": 51336045, + "step": 2367, + "time_per_iteration": 2.5087172985076904 + }, + { + "auxiliary_loss_clip": 0.01145881, + "auxiliary_loss_mlp": 0.0106295, + "balance_loss_clip": 1.05306685, + "balance_loss_mlp": 1.04022861, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 2.1420256317417303, + "language_loss": 0.82478559, + "learning_rate": 3.868994703727742e-06, + "loss": 0.846874, + "num_input_tokens_seen": 51357030, + "step": 2368, + "time_per_iteration": 2.5787816047668457 + }, + { + "auxiliary_loss_clip": 0.01127491, + "auxiliary_loss_mlp": 0.01050515, + "balance_loss_clip": 1.05319762, + "balance_loss_mlp": 1.02872396, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.953314873548484, + "language_loss": 0.86760324, + "learning_rate": 3.868856031585652e-06, + "loss": 0.88938332, + "num_input_tokens_seen": 51374890, + "step": 2369, + "time_per_iteration": 2.5398929119110107 + }, + { + "auxiliary_loss_clip": 0.01131791, + "auxiliary_loss_mlp": 0.01046545, + "balance_loss_clip": 1.05428839, + "balance_loss_mlp": 1.02614844, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.6056609822523706, + "language_loss": 0.75718045, + "learning_rate": 3.868717288576354e-06, + "loss": 0.7789638, + "num_input_tokens_seen": 51398100, + "step": 2370, + "time_per_iteration": 2.628145217895508 + }, + { + "auxiliary_loss_clip": 0.01156673, + "auxiliary_loss_mlp": 0.00787511, + "balance_loss_clip": 1.05024159, + "balance_loss_mlp": 1.00045037, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.7738065703934993, + "language_loss": 0.8292802, + "learning_rate": 3.868578474705109e-06, + "loss": 0.84872204, + "num_input_tokens_seen": 51418745, + "step": 2371, + "time_per_iteration": 3.8332161903381348 + }, + { + "auxiliary_loss_clip": 0.01173199, + "auxiliary_loss_mlp": 0.01052933, + "balance_loss_clip": 1.05818152, + "balance_loss_mlp": 1.03165412, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 1.9061899543383238, + "language_loss": 0.82981944, + "learning_rate": 3.868439589977181e-06, + "loss": 0.85208082, + "num_input_tokens_seen": 51437455, + "step": 2372, + "time_per_iteration": 2.4456379413604736 + }, + { + "auxiliary_loss_clip": 0.0117121, + "auxiliary_loss_mlp": 0.01054631, + "balance_loss_clip": 1.05649972, + "balance_loss_mlp": 1.033746, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.5726295509314134, + "language_loss": 0.8445552, + "learning_rate": 3.868300634397836e-06, + "loss": 0.8668136, + "num_input_tokens_seen": 51455710, + "step": 2373, + "time_per_iteration": 2.432644844055176 + }, + { + "auxiliary_loss_clip": 0.01142801, + "auxiliary_loss_mlp": 0.01056374, + "balance_loss_clip": 1.05143237, + "balance_loss_mlp": 1.0372653, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.2206387035174586, + "language_loss": 0.86529839, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88729012, + "num_input_tokens_seen": 51471270, + "step": 2374, + "time_per_iteration": 2.4840409755706787 + }, + { + "auxiliary_loss_clip": 0.01162814, + "auxiliary_loss_mlp": 0.01058429, + "balance_loss_clip": 1.0531081, + "balance_loss_mlp": 1.03720939, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 14.112735193673597, + "language_loss": 0.79237252, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81458497, + "num_input_tokens_seen": 51492705, + "step": 2375, + "time_per_iteration": 2.566340446472168 + }, + { + "auxiliary_loss_clip": 0.01159328, + "auxiliary_loss_mlp": 0.01063807, + "balance_loss_clip": 1.05463958, + "balance_loss_mlp": 1.04339886, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 4.348543052947975, + "language_loss": 0.76495916, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78719056, + "num_input_tokens_seen": 51510780, + "step": 2376, + "time_per_iteration": 2.474499464035034 + }, + { + "auxiliary_loss_clip": 0.01158637, + "auxiliary_loss_mlp": 0.01053779, + "balance_loss_clip": 1.05328393, + "balance_loss_mlp": 1.0336926, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 4.811111225071114, + "language_loss": 0.93050945, + "learning_rate": 3.867744103671717e-06, + "loss": 0.95263362, + "num_input_tokens_seen": 51531400, + "step": 2377, + "time_per_iteration": 2.495046377182007 + }, + { + "auxiliary_loss_clip": 0.01149103, + "auxiliary_loss_mlp": 0.01054906, + "balance_loss_clip": 1.05644619, + "balance_loss_mlp": 1.03155327, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.9336698460815933, + "language_loss": 0.9185282, + "learning_rate": 3.867604793914382e-06, + "loss": 0.94056827, + "num_input_tokens_seen": 51548215, + "step": 2378, + "time_per_iteration": 2.514120578765869 + }, + { + "auxiliary_loss_clip": 0.01160829, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_clip": 1.05390596, + "balance_loss_mlp": 1.02931488, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.9033572161982295, + "language_loss": 0.74022818, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76234114, + "num_input_tokens_seen": 51566820, + "step": 2379, + "time_per_iteration": 2.5027925968170166 + }, + { + "auxiliary_loss_clip": 0.01136344, + "auxiliary_loss_mlp": 0.01059959, + "balance_loss_clip": 1.05278349, + "balance_loss_mlp": 1.03877497, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.8895469692831386, + "language_loss": 0.78926075, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81122375, + "num_input_tokens_seen": 51585075, + "step": 2380, + "time_per_iteration": 2.547553777694702 + }, + { + "auxiliary_loss_clip": 0.01123657, + "auxiliary_loss_mlp": 0.01058939, + "balance_loss_clip": 1.0517863, + "balance_loss_mlp": 1.0379703, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.339410843538638, + "language_loss": 0.88031721, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90214318, + "num_input_tokens_seen": 51603185, + "step": 2381, + "time_per_iteration": 2.548872232437134 + }, + { + "auxiliary_loss_clip": 0.011379, + "auxiliary_loss_mlp": 0.01050231, + "balance_loss_clip": 1.05090261, + "balance_loss_mlp": 1.02897561, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.674426594333887, + "language_loss": 0.7683624, + "learning_rate": 3.867046846740299e-06, + "loss": 0.79024374, + "num_input_tokens_seen": 51620880, + "step": 2382, + "time_per_iteration": 2.52264404296875 + }, + { + "auxiliary_loss_clip": 0.01130041, + "auxiliary_loss_mlp": 0.01061423, + "balance_loss_clip": 1.05136967, + "balance_loss_mlp": 1.04152751, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 2.6073016469258374, + "language_loss": 0.76515651, + "learning_rate": 3.866907182937039e-06, + "loss": 0.78707111, + "num_input_tokens_seen": 51640170, + "step": 2383, + "time_per_iteration": 2.6127655506134033 + }, + { + "auxiliary_loss_clip": 0.01141056, + "auxiliary_loss_mlp": 0.01057899, + "balance_loss_clip": 1.05303597, + "balance_loss_mlp": 1.03470123, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.3234611361844144, + "language_loss": 0.87853718, + "learning_rate": 3.866767448340471e-06, + "loss": 0.90052676, + "num_input_tokens_seen": 51656580, + "step": 2384, + "time_per_iteration": 2.589261293411255 + }, + { + "auxiliary_loss_clip": 0.01162904, + "auxiliary_loss_mlp": 0.01058345, + "balance_loss_clip": 1.05539942, + "balance_loss_mlp": 1.03613663, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 2.488411996234358, + "language_loss": 0.79977834, + "learning_rate": 3.866627642955895e-06, + "loss": 0.82199079, + "num_input_tokens_seen": 51674645, + "step": 2385, + "time_per_iteration": 2.4836246967315674 + }, + { + "auxiliary_loss_clip": 0.0115781, + "auxiliary_loss_mlp": 0.01052091, + "balance_loss_clip": 1.05192184, + "balance_loss_mlp": 1.03169417, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 2.0226099778248385, + "language_loss": 0.75537604, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77747506, + "num_input_tokens_seen": 51695770, + "step": 2386, + "time_per_iteration": 2.591961145401001 + }, + { + "auxiliary_loss_clip": 0.01171627, + "auxiliary_loss_mlp": 0.01049809, + "balance_loss_clip": 1.05696952, + "balance_loss_mlp": 1.02863777, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.6933078244482305, + "language_loss": 0.78464502, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80685937, + "num_input_tokens_seen": 51714165, + "step": 2387, + "time_per_iteration": 2.4586708545684814 + }, + { + "auxiliary_loss_clip": 0.01143495, + "auxiliary_loss_mlp": 0.01058987, + "balance_loss_clip": 1.05569029, + "balance_loss_mlp": 1.0369215, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 1.955435656352753, + "language_loss": 0.82187301, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84389782, + "num_input_tokens_seen": 51734440, + "step": 2388, + "time_per_iteration": 2.5580811500549316 + }, + { + "auxiliary_loss_clip": 0.01161313, + "auxiliary_loss_mlp": 0.01052061, + "balance_loss_clip": 1.05697227, + "balance_loss_mlp": 1.03140199, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.0947430521151436, + "language_loss": 0.82411194, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84624571, + "num_input_tokens_seen": 51753730, + "step": 2389, + "time_per_iteration": 2.5567471981048584 + }, + { + "auxiliary_loss_clip": 0.01151062, + "auxiliary_loss_mlp": 0.01057251, + "balance_loss_clip": 1.0552094, + "balance_loss_mlp": 1.03514981, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 5.606601128840227, + "language_loss": 0.83218777, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.85427094, + "num_input_tokens_seen": 51771195, + "step": 2390, + "time_per_iteration": 2.521442174911499 + }, + { + "auxiliary_loss_clip": 0.01155298, + "auxiliary_loss_mlp": 0.01056796, + "balance_loss_clip": 1.05614471, + "balance_loss_mlp": 1.03552926, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 1.6807365210649983, + "language_loss": 0.74801278, + "learning_rate": 3.865787324397324e-06, + "loss": 0.77013373, + "num_input_tokens_seen": 51792290, + "step": 2391, + "time_per_iteration": 2.552682876586914 + }, + { + "auxiliary_loss_clip": 0.01048249, + "auxiliary_loss_mlp": 0.0100597, + "balance_loss_clip": 1.03117156, + "balance_loss_mlp": 1.00294232, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8779766968040227, + "language_loss": 0.61783862, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63838089, + "num_input_tokens_seen": 51843675, + "step": 2392, + "time_per_iteration": 4.421301603317261 + }, + { + "auxiliary_loss_clip": 0.01164088, + "auxiliary_loss_mlp": 0.01061027, + "balance_loss_clip": 1.05571616, + "balance_loss_mlp": 1.03817487, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.18802287476805, + "language_loss": 0.76969194, + "learning_rate": 3.865506652147709e-06, + "loss": 0.79194307, + "num_input_tokens_seen": 51860285, + "step": 2393, + "time_per_iteration": 2.44295072555542 + }, + { + "auxiliary_loss_clip": 0.01178337, + "auxiliary_loss_mlp": 0.01054418, + "balance_loss_clip": 1.06084311, + "balance_loss_mlp": 1.03383029, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 2.086346776351359, + "language_loss": 0.7695415, + "learning_rate": 3.865366209909941e-06, + "loss": 0.79186904, + "num_input_tokens_seen": 51880105, + "step": 2394, + "time_per_iteration": 2.4994699954986572 + }, + { + "auxiliary_loss_clip": 0.01176051, + "auxiliary_loss_mlp": 0.01055352, + "balance_loss_clip": 1.05955791, + "balance_loss_mlp": 1.03421593, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 2.0320310662324776, + "language_loss": 0.85733604, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.87965012, + "num_input_tokens_seen": 51905175, + "step": 2395, + "time_per_iteration": 2.651970148086548 + }, + { + "auxiliary_loss_clip": 0.01134374, + "auxiliary_loss_mlp": 0.01062638, + "balance_loss_clip": 1.05108058, + "balance_loss_mlp": 1.03914165, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.5666429085838791, + "language_loss": 0.8306011, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85257125, + "num_input_tokens_seen": 51924490, + "step": 2396, + "time_per_iteration": 3.924840211868286 + }, + { + "auxiliary_loss_clip": 0.01135965, + "auxiliary_loss_mlp": 0.00789182, + "balance_loss_clip": 1.04988658, + "balance_loss_mlp": 1.00045323, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 7.244387195407027, + "language_loss": 0.83003902, + "learning_rate": 3.864944458808712e-06, + "loss": 0.84929049, + "num_input_tokens_seen": 51940490, + "step": 2397, + "time_per_iteration": 2.509592056274414 + }, + { + "auxiliary_loss_clip": 0.01174271, + "auxiliary_loss_mlp": 0.01049501, + "balance_loss_clip": 1.05682921, + "balance_loss_mlp": 1.02784085, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.7019233380523144, + "language_loss": 0.79877907, + "learning_rate": 3.86480373366343e-06, + "loss": 0.82101679, + "num_input_tokens_seen": 51957910, + "step": 2398, + "time_per_iteration": 2.4521713256835938 + }, + { + "auxiliary_loss_clip": 0.0116027, + "auxiliary_loss_mlp": 0.01054631, + "balance_loss_clip": 1.05561686, + "balance_loss_mlp": 1.03395963, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.283551690610143, + "language_loss": 0.64577144, + "learning_rate": 3.864662937804603e-06, + "loss": 0.66792047, + "num_input_tokens_seen": 51978010, + "step": 2399, + "time_per_iteration": 2.547351837158203 + }, + { + "auxiliary_loss_clip": 0.01143944, + "auxiliary_loss_mlp": 0.01049506, + "balance_loss_clip": 1.05215716, + "balance_loss_mlp": 1.02746427, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.6321654196577995, + "language_loss": 0.81931537, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84124988, + "num_input_tokens_seen": 51998515, + "step": 2400, + "time_per_iteration": 2.5471439361572266 + }, + { + "auxiliary_loss_clip": 0.0115301, + "auxiliary_loss_mlp": 0.01055386, + "balance_loss_clip": 1.05540335, + "balance_loss_mlp": 1.03296304, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.9012245329850488, + "language_loss": 0.74490935, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76699328, + "num_input_tokens_seen": 52019270, + "step": 2401, + "time_per_iteration": 3.9257562160491943 + }, + { + "auxiliary_loss_clip": 0.01144874, + "auxiliary_loss_mlp": 0.01049923, + "balance_loss_clip": 1.05361676, + "balance_loss_mlp": 1.02894235, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.7115044891372264, + "language_loss": 0.80781007, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82975805, + "num_input_tokens_seen": 52039315, + "step": 2402, + "time_per_iteration": 2.5505669116973877 + }, + { + "auxiliary_loss_clip": 0.0112644, + "auxiliary_loss_mlp": 0.01051162, + "balance_loss_clip": 1.04829824, + "balance_loss_mlp": 1.02951396, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.209228777080022, + "language_loss": 0.84409606, + "learning_rate": 3.864099047340673e-06, + "loss": 0.86587214, + "num_input_tokens_seen": 52056555, + "step": 2403, + "time_per_iteration": 2.5555660724639893 + }, + { + "auxiliary_loss_clip": 0.01136869, + "auxiliary_loss_mlp": 0.00790573, + "balance_loss_clip": 1.05247927, + "balance_loss_mlp": 1.00049615, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 1.726528595180001, + "language_loss": 0.70170146, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72097594, + "num_input_tokens_seen": 52075800, + "step": 2404, + "time_per_iteration": 2.5814895629882812 + }, + { + "auxiliary_loss_clip": 0.0114371, + "auxiliary_loss_mlp": 0.01052302, + "balance_loss_clip": 1.05156803, + "balance_loss_mlp": 1.03235841, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.1420312113970956, + "language_loss": 0.7348907, + "learning_rate": 3.863816677966381e-06, + "loss": 0.75685084, + "num_input_tokens_seen": 52092585, + "step": 2405, + "time_per_iteration": 2.499772071838379 + }, + { + "auxiliary_loss_clip": 0.01112175, + "auxiliary_loss_mlp": 0.0105421, + "balance_loss_clip": 1.04464102, + "balance_loss_mlp": 1.03252554, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.3021655518201483, + "language_loss": 0.72919691, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75086069, + "num_input_tokens_seen": 52108990, + "step": 2406, + "time_per_iteration": 2.5760085582733154 + }, + { + "auxiliary_loss_clip": 0.01161107, + "auxiliary_loss_mlp": 0.01059975, + "balance_loss_clip": 1.05502379, + "balance_loss_mlp": 1.03742015, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 4.8882230047813415, + "language_loss": 0.75628638, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77849722, + "num_input_tokens_seen": 52125385, + "step": 2407, + "time_per_iteration": 2.521199941635132 + }, + { + "auxiliary_loss_clip": 0.01169612, + "auxiliary_loss_mlp": 0.01050419, + "balance_loss_clip": 1.05536723, + "balance_loss_mlp": 1.02999866, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.5877132717850875, + "language_loss": 0.79816139, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.82036173, + "num_input_tokens_seen": 52144985, + "step": 2408, + "time_per_iteration": 2.4683878421783447 + }, + { + "auxiliary_loss_clip": 0.01160896, + "auxiliary_loss_mlp": 0.01055669, + "balance_loss_clip": 1.0571847, + "balance_loss_mlp": 1.03362727, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 14.723594199183468, + "language_loss": 0.81932247, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84148812, + "num_input_tokens_seen": 52163885, + "step": 2409, + "time_per_iteration": 2.4847536087036133 + }, + { + "auxiliary_loss_clip": 0.01120614, + "auxiliary_loss_mlp": 0.01062515, + "balance_loss_clip": 1.04803383, + "balance_loss_mlp": 1.04095054, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 1.9101297909251294, + "language_loss": 0.74861908, + "learning_rate": 3.863109517792446e-06, + "loss": 0.77045035, + "num_input_tokens_seen": 52184325, + "step": 2410, + "time_per_iteration": 2.7016148567199707 + }, + { + "auxiliary_loss_clip": 0.0117055, + "auxiliary_loss_mlp": 0.01049651, + "balance_loss_clip": 1.05537367, + "balance_loss_mlp": 1.03047073, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 2.1628490136299643, + "language_loss": 0.80940706, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83160907, + "num_input_tokens_seen": 52202740, + "step": 2411, + "time_per_iteration": 3.9451725482940674 + }, + { + "auxiliary_loss_clip": 0.01145804, + "auxiliary_loss_mlp": 0.01057593, + "balance_loss_clip": 1.05604935, + "balance_loss_mlp": 1.03669548, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 2.02277844798614, + "language_loss": 0.70664185, + "learning_rate": 3.862826159140214e-06, + "loss": 0.72867584, + "num_input_tokens_seen": 52223100, + "step": 2412, + "time_per_iteration": 2.62174129486084 + }, + { + "auxiliary_loss_clip": 0.01153156, + "auxiliary_loss_mlp": 0.01049568, + "balance_loss_clip": 1.05736279, + "balance_loss_mlp": 1.02914786, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 2.084096862609153, + "language_loss": 0.77195615, + "learning_rate": 3.862684373853579e-06, + "loss": 0.7939834, + "num_input_tokens_seen": 52239690, + "step": 2413, + "time_per_iteration": 2.4558892250061035 + }, + { + "auxiliary_loss_clip": 0.01065385, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.03251576, + "balance_loss_mlp": 1.03419352, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9180547267188456, + "language_loss": 0.58850044, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60952449, + "num_input_tokens_seen": 52296705, + "step": 2414, + "time_per_iteration": 2.9953763484954834 + }, + { + "auxiliary_loss_clip": 0.01052077, + "auxiliary_loss_mlp": 0.0102185, + "balance_loss_clip": 1.03151679, + "balance_loss_mlp": 1.01946628, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8535900491861441, + "language_loss": 0.62189686, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64263618, + "num_input_tokens_seen": 52361830, + "step": 2415, + "time_per_iteration": 3.138977527618408 + }, + { + "auxiliary_loss_clip": 0.01155834, + "auxiliary_loss_mlp": 0.01049253, + "balance_loss_clip": 1.05671477, + "balance_loss_mlp": 1.02772403, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 2.340191719593485, + "language_loss": 0.7198801, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74193108, + "num_input_tokens_seen": 52379420, + "step": 2416, + "time_per_iteration": 2.5349905490875244 + }, + { + "auxiliary_loss_clip": 0.01036909, + "auxiliary_loss_mlp": 0.01008111, + "balance_loss_clip": 1.02121246, + "balance_loss_mlp": 1.00550067, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.71853730808446, + "language_loss": 0.60436904, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62481928, + "num_input_tokens_seen": 52446290, + "step": 2417, + "time_per_iteration": 3.158947706222534 + }, + { + "auxiliary_loss_clip": 0.01169741, + "auxiliary_loss_mlp": 0.01063836, + "balance_loss_clip": 1.05239248, + "balance_loss_mlp": 1.0426172, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.97191966972553, + "language_loss": 0.79771876, + "learning_rate": 3.861974388030356e-06, + "loss": 0.82005453, + "num_input_tokens_seen": 52467295, + "step": 2418, + "time_per_iteration": 2.5426228046417236 + }, + { + "auxiliary_loss_clip": 0.01121611, + "auxiliary_loss_mlp": 0.01063496, + "balance_loss_clip": 1.04411149, + "balance_loss_mlp": 1.04287326, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.761311019782608, + "language_loss": 0.71923637, + "learning_rate": 3.861832179025394e-06, + "loss": 0.74108744, + "num_input_tokens_seen": 52487295, + "step": 2419, + "time_per_iteration": 2.5490305423736572 + }, + { + "auxiliary_loss_clip": 0.01144462, + "auxiliary_loss_mlp": 0.01056157, + "balance_loss_clip": 1.05329192, + "balance_loss_mlp": 1.03486633, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.318730930911638, + "language_loss": 0.89482129, + "learning_rate": 3.861689899419569e-06, + "loss": 0.9168275, + "num_input_tokens_seen": 52504220, + "step": 2420, + "time_per_iteration": 2.5308330059051514 + }, + { + "auxiliary_loss_clip": 0.01154052, + "auxiliary_loss_mlp": 0.01065568, + "balance_loss_clip": 1.04956245, + "balance_loss_mlp": 1.0461843, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 1.8439781595737583, + "language_loss": 0.83101845, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85321462, + "num_input_tokens_seen": 52521900, + "step": 2421, + "time_per_iteration": 2.493203639984131 + }, + { + "auxiliary_loss_clip": 0.01106528, + "auxiliary_loss_mlp": 0.0106204, + "balance_loss_clip": 1.0417738, + "balance_loss_mlp": 1.04116607, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.4430331319216316, + "language_loss": 0.81814688, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83983254, + "num_input_tokens_seen": 52540495, + "step": 2422, + "time_per_iteration": 2.6091396808624268 + }, + { + "auxiliary_loss_clip": 0.01057708, + "auxiliary_loss_mlp": 0.00763052, + "balance_loss_clip": 1.04850578, + "balance_loss_mlp": 0.99988669, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9174410557660553, + "language_loss": 0.63364911, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65185666, + "num_input_tokens_seen": 52603305, + "step": 2423, + "time_per_iteration": 3.163137197494507 + }, + { + "auxiliary_loss_clip": 0.01110193, + "auxiliary_loss_mlp": 0.00786384, + "balance_loss_clip": 1.05156565, + "balance_loss_mlp": 1.00050664, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 2.14011119011746, + "language_loss": 0.82351488, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84248066, + "num_input_tokens_seen": 52623435, + "step": 2424, + "time_per_iteration": 2.6376845836639404 + }, + { + "auxiliary_loss_clip": 0.01143939, + "auxiliary_loss_mlp": 0.01054437, + "balance_loss_clip": 1.05268455, + "balance_loss_mlp": 1.03437424, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.3526781206955927, + "language_loss": 0.78679514, + "learning_rate": 3.860977442566429e-06, + "loss": 0.80877888, + "num_input_tokens_seen": 52642255, + "step": 2425, + "time_per_iteration": 2.5468897819519043 + }, + { + "auxiliary_loss_clip": 0.01158198, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_clip": 1.05476165, + "balance_loss_mlp": 1.03244126, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.1435514263149447, + "language_loss": 0.83376968, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85586965, + "num_input_tokens_seen": 52658700, + "step": 2426, + "time_per_iteration": 2.5105042457580566 + }, + { + "auxiliary_loss_clip": 0.0116826, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_clip": 1.05590081, + "balance_loss_mlp": 1.03528655, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 2.5738165557373183, + "language_loss": 0.87592673, + "learning_rate": 3.860691965808173e-06, + "loss": 0.8981576, + "num_input_tokens_seen": 52678140, + "step": 2427, + "time_per_iteration": 2.477922201156616 + }, + { + "auxiliary_loss_clip": 0.01131496, + "auxiliary_loss_mlp": 0.01064872, + "balance_loss_clip": 1.04759908, + "balance_loss_mlp": 1.04130459, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.8074212285434708, + "language_loss": 0.66754454, + "learning_rate": 3.8605491215899e-06, + "loss": 0.68950826, + "num_input_tokens_seen": 52696825, + "step": 2428, + "time_per_iteration": 2.5353245735168457 + }, + { + "auxiliary_loss_clip": 0.01155863, + "auxiliary_loss_mlp": 0.01057944, + "balance_loss_clip": 1.05038571, + "balance_loss_mlp": 1.03691554, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 2.0285331348201123, + "language_loss": 0.83482349, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85696149, + "num_input_tokens_seen": 52715125, + "step": 2429, + "time_per_iteration": 2.4974451065063477 + }, + { + "auxiliary_loss_clip": 0.01124062, + "auxiliary_loss_mlp": 0.01058209, + "balance_loss_clip": 1.04504371, + "balance_loss_mlp": 1.03927827, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.7810830252245162, + "language_loss": 0.79114425, + "learning_rate": 3.860263221502145e-06, + "loss": 0.81296694, + "num_input_tokens_seen": 52734015, + "step": 2430, + "time_per_iteration": 2.547636032104492 + }, + { + "auxiliary_loss_clip": 0.01170983, + "auxiliary_loss_mlp": 0.01053702, + "balance_loss_clip": 1.05603218, + "balance_loss_mlp": 1.03326988, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 1.9569877616970788, + "language_loss": 0.82716703, + "learning_rate": 3.860120165643504e-06, + "loss": 0.84941393, + "num_input_tokens_seen": 52753025, + "step": 2431, + "time_per_iteration": 2.4850900173187256 + }, + { + "auxiliary_loss_clip": 0.0116507, + "auxiliary_loss_mlp": 0.01057138, + "balance_loss_clip": 1.0555042, + "balance_loss_mlp": 1.03488219, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 2.583404156495904, + "language_loss": 0.79035974, + "learning_rate": 3.859977039248921e-06, + "loss": 0.81258178, + "num_input_tokens_seen": 52773420, + "step": 2432, + "time_per_iteration": 3.9220590591430664 + }, + { + "auxiliary_loss_clip": 0.01167047, + "auxiliary_loss_mlp": 0.00790826, + "balance_loss_clip": 1.05302405, + "balance_loss_mlp": 1.00054514, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 2.09432879000412, + "language_loss": 0.80388129, + "learning_rate": 3.859833842323822e-06, + "loss": 0.82345998, + "num_input_tokens_seen": 52792870, + "step": 2433, + "time_per_iteration": 2.4864187240600586 + }, + { + "auxiliary_loss_clip": 0.01127644, + "auxiliary_loss_mlp": 0.01059968, + "balance_loss_clip": 1.05383945, + "balance_loss_mlp": 1.0377239, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 1.9724496186323852, + "language_loss": 0.78024185, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80211794, + "num_input_tokens_seen": 52811615, + "step": 2434, + "time_per_iteration": 2.537454128265381 + }, + { + "auxiliary_loss_clip": 0.01034006, + "auxiliary_loss_mlp": 0.01111866, + "balance_loss_clip": 1.01967287, + "balance_loss_mlp": 1.10874319, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8788035736256292, + "language_loss": 0.58445752, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60591614, + "num_input_tokens_seen": 52873230, + "step": 2435, + "time_per_iteration": 3.0952541828155518 + }, + { + "auxiliary_loss_clip": 0.01161837, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.05095994, + "balance_loss_mlp": 1.02764857, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.149645649793068, + "language_loss": 0.87924182, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90133488, + "num_input_tokens_seen": 52889325, + "step": 2436, + "time_per_iteration": 3.85778546333313 + }, + { + "auxiliary_loss_clip": 0.01159005, + "auxiliary_loss_mlp": 0.00788676, + "balance_loss_clip": 1.05517292, + "balance_loss_mlp": 1.00054789, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 1.950860131434879, + "language_loss": 0.74517584, + "learning_rate": 3.85926034942691e-06, + "loss": 0.76465261, + "num_input_tokens_seen": 52909705, + "step": 2437, + "time_per_iteration": 2.5077946186065674 + }, + { + "auxiliary_loss_clip": 0.01170174, + "auxiliary_loss_mlp": 0.010536, + "balance_loss_clip": 1.05452466, + "balance_loss_mlp": 1.02967429, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 5.671950614433697, + "language_loss": 0.7375865, + "learning_rate": 3.859116799930736e-06, + "loss": 0.75982428, + "num_input_tokens_seen": 52930300, + "step": 2438, + "time_per_iteration": 2.5407519340515137 + }, + { + "auxiliary_loss_clip": 0.0115659, + "auxiliary_loss_mlp": 0.01051202, + "balance_loss_clip": 1.05611098, + "balance_loss_mlp": 1.0312463, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.9266862294427993, + "language_loss": 0.74511868, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76719666, + "num_input_tokens_seen": 52949955, + "step": 2439, + "time_per_iteration": 2.5602381229400635 + }, + { + "auxiliary_loss_clip": 0.01156569, + "auxiliary_loss_mlp": 0.01057145, + "balance_loss_clip": 1.0557611, + "balance_loss_mlp": 1.03621197, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 2.3239249907798434, + "language_loss": 0.74604005, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76817721, + "num_input_tokens_seen": 52972905, + "step": 2440, + "time_per_iteration": 2.7006678581237793 + }, + { + "auxiliary_loss_clip": 0.01162589, + "auxiliary_loss_mlp": 0.01059524, + "balance_loss_clip": 1.05260539, + "balance_loss_mlp": 1.03912699, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 2.2672986933946597, + "language_loss": 0.82882559, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85104674, + "num_input_tokens_seen": 52994850, + "step": 2441, + "time_per_iteration": 3.9415924549102783 + }, + { + "auxiliary_loss_clip": 0.01155841, + "auxiliary_loss_mlp": 0.01054759, + "balance_loss_clip": 1.05617082, + "balance_loss_mlp": 1.03268099, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 1.8741258632342435, + "language_loss": 0.71670544, + "learning_rate": 3.858541897021563e-06, + "loss": 0.73881143, + "num_input_tokens_seen": 53014740, + "step": 2442, + "time_per_iteration": 2.543065309524536 + }, + { + "auxiliary_loss_clip": 0.01138744, + "auxiliary_loss_mlp": 0.01052443, + "balance_loss_clip": 1.05439186, + "balance_loss_mlp": 1.03135514, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 5.4803401509992336, + "language_loss": 0.80766553, + "learning_rate": 3.8583979950904e-06, + "loss": 0.82957739, + "num_input_tokens_seen": 53029780, + "step": 2443, + "time_per_iteration": 2.547086000442505 + }, + { + "auxiliary_loss_clip": 0.01150778, + "auxiliary_loss_mlp": 0.01060385, + "balance_loss_clip": 1.05232191, + "balance_loss_mlp": 1.03709197, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 1.7606997152981587, + "language_loss": 0.82949311, + "learning_rate": 3.858254022688599e-06, + "loss": 0.8516047, + "num_input_tokens_seen": 53048620, + "step": 2444, + "time_per_iteration": 2.507131576538086 + }, + { + "auxiliary_loss_clip": 0.01137259, + "auxiliary_loss_mlp": 0.01065186, + "balance_loss_clip": 1.0528605, + "balance_loss_mlp": 1.04427636, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.7586123479905058, + "language_loss": 0.70946801, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73149252, + "num_input_tokens_seen": 53070055, + "step": 2445, + "time_per_iteration": 2.5737202167510986 + }, + { + "auxiliary_loss_clip": 0.0106428, + "auxiliary_loss_mlp": 0.01012924, + "balance_loss_clip": 1.02324581, + "balance_loss_mlp": 1.01026535, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8231096335950593, + "language_loss": 0.63012403, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65089607, + "num_input_tokens_seen": 53126945, + "step": 2446, + "time_per_iteration": 2.9296376705169678 + }, + { + "auxiliary_loss_clip": 0.01121344, + "auxiliary_loss_mlp": 0.01050997, + "balance_loss_clip": 1.05270064, + "balance_loss_mlp": 1.02999198, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.5865912889410014, + "language_loss": 0.75024545, + "learning_rate": 3.857821682713975e-06, + "loss": 0.77196884, + "num_input_tokens_seen": 53149130, + "step": 2447, + "time_per_iteration": 2.6369385719299316 + }, + { + "auxiliary_loss_clip": 0.01167716, + "auxiliary_loss_mlp": 0.01052024, + "balance_loss_clip": 1.05528319, + "balance_loss_mlp": 1.03235483, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.2421203652666772, + "language_loss": 0.85159522, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87379265, + "num_input_tokens_seen": 53167120, + "step": 2448, + "time_per_iteration": 2.518284320831299 + }, + { + "auxiliary_loss_clip": 0.01062333, + "auxiliary_loss_mlp": 0.01005544, + "balance_loss_clip": 1.02152324, + "balance_loss_mlp": 1.00293291, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7692694633991609, + "language_loss": 0.56878471, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58946347, + "num_input_tokens_seen": 53227945, + "step": 2449, + "time_per_iteration": 2.981935739517212 + }, + { + "auxiliary_loss_clip": 0.01133777, + "auxiliary_loss_mlp": 0.01046774, + "balance_loss_clip": 1.04679668, + "balance_loss_mlp": 1.02523279, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 3.1160289962352716, + "language_loss": 0.85089904, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87270463, + "num_input_tokens_seen": 53244615, + "step": 2450, + "time_per_iteration": 3.943692684173584 + }, + { + "auxiliary_loss_clip": 0.01158916, + "auxiliary_loss_mlp": 0.01059994, + "balance_loss_clip": 1.0527519, + "balance_loss_mlp": 1.03826189, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.1055731200595784, + "language_loss": 0.75051653, + "learning_rate": 3.857244243157052e-06, + "loss": 0.77270567, + "num_input_tokens_seen": 53262205, + "step": 2451, + "time_per_iteration": 2.48467755317688 + }, + { + "auxiliary_loss_clip": 0.01133309, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_clip": 1.04744577, + "balance_loss_mlp": 1.02744865, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.59079977744349, + "language_loss": 0.82302856, + "learning_rate": 3.85709970718691e-06, + "loss": 0.84482872, + "num_input_tokens_seen": 53282445, + "step": 2452, + "time_per_iteration": 2.550046682357788 + }, + { + "auxiliary_loss_clip": 0.01100135, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.05219817, + "balance_loss_mlp": 1.02563429, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.6073222010462132, + "language_loss": 0.74119383, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76264185, + "num_input_tokens_seen": 53299060, + "step": 2453, + "time_per_iteration": 2.638888120651245 + }, + { + "auxiliary_loss_clip": 0.01144017, + "auxiliary_loss_mlp": 0.0105387, + "balance_loss_clip": 1.05267584, + "balance_loss_mlp": 1.0325439, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.4788051734986585, + "language_loss": 0.76358843, + "learning_rate": 3.856810423987889e-06, + "loss": 0.78556728, + "num_input_tokens_seen": 53315970, + "step": 2454, + "time_per_iteration": 2.511618137359619 + }, + { + "auxiliary_loss_clip": 0.01143562, + "auxiliary_loss_mlp": 0.01045906, + "balance_loss_clip": 1.04894531, + "balance_loss_mlp": 1.02589083, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.08029840541338, + "language_loss": 0.83060342, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85249811, + "num_input_tokens_seen": 53332940, + "step": 2455, + "time_per_iteration": 2.4994170665740967 + }, + { + "auxiliary_loss_clip": 0.01132893, + "auxiliary_loss_mlp": 0.01054905, + "balance_loss_clip": 1.05069256, + "balance_loss_mlp": 1.03457975, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.1814318879889, + "language_loss": 0.8437981, + "learning_rate": 3.85652085914712e-06, + "loss": 0.86567611, + "num_input_tokens_seen": 53353295, + "step": 2456, + "time_per_iteration": 2.671914577484131 + }, + { + "auxiliary_loss_clip": 0.01151056, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.04972863, + "balance_loss_mlp": 1.02558208, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.8971340888391715, + "language_loss": 0.84446037, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86642122, + "num_input_tokens_seen": 53373410, + "step": 2457, + "time_per_iteration": 2.4943737983703613 + }, + { + "auxiliary_loss_clip": 0.01153112, + "auxiliary_loss_mlp": 0.01046194, + "balance_loss_clip": 1.05296016, + "balance_loss_mlp": 1.02635801, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.9398490702543987, + "language_loss": 0.75656193, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77855504, + "num_input_tokens_seen": 53391430, + "step": 2458, + "time_per_iteration": 2.4996337890625 + }, + { + "auxiliary_loss_clip": 0.01119278, + "auxiliary_loss_mlp": 0.0105458, + "balance_loss_clip": 1.04868054, + "balance_loss_mlp": 1.03289545, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 1.886407357702389, + "language_loss": 0.83871424, + "learning_rate": 3.856085983903782e-06, + "loss": 0.86045283, + "num_input_tokens_seen": 53409960, + "step": 2459, + "time_per_iteration": 2.6155874729156494 + }, + { + "auxiliary_loss_clip": 0.01129081, + "auxiliary_loss_mlp": 0.01041443, + "balance_loss_clip": 1.04574943, + "balance_loss_mlp": 1.02221417, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 3.8728253451321377, + "language_loss": 0.75847399, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78017926, + "num_input_tokens_seen": 53426160, + "step": 2460, + "time_per_iteration": 2.5115201473236084 + }, + { + "auxiliary_loss_clip": 0.01134685, + "auxiliary_loss_mlp": 0.0104821, + "balance_loss_clip": 1.05305135, + "balance_loss_mlp": 1.02753913, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 3.711054123211538, + "language_loss": 0.81461096, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83643985, + "num_input_tokens_seen": 53448530, + "step": 2461, + "time_per_iteration": 2.6334211826324463 + }, + { + "auxiliary_loss_clip": 0.01158099, + "auxiliary_loss_mlp": 0.01052561, + "balance_loss_clip": 1.0526005, + "balance_loss_mlp": 1.03106785, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 2.7592710615278766, + "language_loss": 0.66160572, + "learning_rate": 3.855650475213761e-06, + "loss": 0.6837123, + "num_input_tokens_seen": 53465915, + "step": 2462, + "time_per_iteration": 2.5174503326416016 + }, + { + "auxiliary_loss_clip": 0.01136329, + "auxiliary_loss_mlp": 0.01055333, + "balance_loss_clip": 1.05032337, + "balance_loss_mlp": 1.03462672, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.6048047610447955, + "language_loss": 0.67506778, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69698441, + "num_input_tokens_seen": 53496055, + "step": 2463, + "time_per_iteration": 2.8435380458831787 + }, + { + "auxiliary_loss_clip": 0.01153665, + "auxiliary_loss_mlp": 0.01055429, + "balance_loss_clip": 1.04940915, + "balance_loss_mlp": 1.0351274, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 2.418669591680718, + "language_loss": 0.76966691, + "learning_rate": 3.855359784245646e-06, + "loss": 0.79175788, + "num_input_tokens_seen": 53513790, + "step": 2464, + "time_per_iteration": 2.563957929611206 + }, + { + "auxiliary_loss_clip": 0.01134254, + "auxiliary_loss_mlp": 0.01057975, + "balance_loss_clip": 1.04811001, + "balance_loss_mlp": 1.03792357, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 2.41799811168739, + "language_loss": 0.79968691, + "learning_rate": 3.855214333225688e-06, + "loss": 0.82160914, + "num_input_tokens_seen": 53533410, + "step": 2465, + "time_per_iteration": 2.5459609031677246 + }, + { + "auxiliary_loss_clip": 0.01169616, + "auxiliary_loss_mlp": 0.01051722, + "balance_loss_clip": 1.05521584, + "balance_loss_mlp": 1.03105152, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 2.636990904480508, + "language_loss": 0.76491654, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78712994, + "num_input_tokens_seen": 53554775, + "step": 2466, + "time_per_iteration": 2.5268616676330566 + }, + { + "auxiliary_loss_clip": 0.010243, + "auxiliary_loss_mlp": 0.01016543, + "balance_loss_clip": 1.03844929, + "balance_loss_mlp": 1.0138725, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.7836956117991611, + "language_loss": 0.60035866, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62076712, + "num_input_tokens_seen": 53609675, + "step": 2467, + "time_per_iteration": 3.2149243354797363 + }, + { + "auxiliary_loss_clip": 0.01148176, + "auxiliary_loss_mlp": 0.01050481, + "balance_loss_clip": 1.05377376, + "balance_loss_mlp": 1.03039443, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 3.9820986173671447, + "language_loss": 0.87886119, + "learning_rate": 3.85477755808841e-06, + "loss": 0.90084773, + "num_input_tokens_seen": 53626950, + "step": 2468, + "time_per_iteration": 2.5649311542510986 + }, + { + "auxiliary_loss_clip": 0.01133409, + "auxiliary_loss_mlp": 0.01053569, + "balance_loss_clip": 1.04862142, + "balance_loss_mlp": 1.03248096, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 1.969227283063607, + "language_loss": 0.76023293, + "learning_rate": 3.854631825701919e-06, + "loss": 0.7821027, + "num_input_tokens_seen": 53644200, + "step": 2469, + "time_per_iteration": 2.573228359222412 + }, + { + "auxiliary_loss_clip": 0.01131882, + "auxiliary_loss_mlp": 0.01052924, + "balance_loss_clip": 1.05120349, + "balance_loss_mlp": 1.03319442, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 4.281716050092176, + "language_loss": 0.76019764, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78204566, + "num_input_tokens_seen": 53659650, + "step": 2470, + "time_per_iteration": 2.548293113708496 + }, + { + "auxiliary_loss_clip": 0.01160871, + "auxiliary_loss_mlp": 0.01051226, + "balance_loss_clip": 1.05244064, + "balance_loss_mlp": 1.03121042, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 2.2679766854405865, + "language_loss": 0.71978867, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74190968, + "num_input_tokens_seen": 53680275, + "step": 2471, + "time_per_iteration": 3.9891445636749268 + }, + { + "auxiliary_loss_clip": 0.01135444, + "auxiliary_loss_mlp": 0.01060606, + "balance_loss_clip": 1.04739261, + "balance_loss_mlp": 1.0384208, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 2.1172270693604816, + "language_loss": 0.8987571, + "learning_rate": 3.854194206597615e-06, + "loss": 0.9207176, + "num_input_tokens_seen": 53698270, + "step": 2472, + "time_per_iteration": 2.5104973316192627 + }, + { + "auxiliary_loss_clip": 0.01127972, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_clip": 1.04891348, + "balance_loss_mlp": 1.03045547, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 2.7111926957839207, + "language_loss": 0.8005085, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82230705, + "num_input_tokens_seen": 53716845, + "step": 2473, + "time_per_iteration": 2.539344549179077 + }, + { + "auxiliary_loss_clip": 0.01158297, + "auxiliary_loss_mlp": 0.01061265, + "balance_loss_clip": 1.05378461, + "balance_loss_mlp": 1.04135728, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.0689164879724826, + "language_loss": 0.77267683, + "learning_rate": 3.853902108962709e-06, + "loss": 0.7948724, + "num_input_tokens_seen": 53734970, + "step": 2474, + "time_per_iteration": 2.511507034301758 + }, + { + "auxiliary_loss_clip": 0.01124489, + "auxiliary_loss_mlp": 0.01063954, + "balance_loss_clip": 1.05124354, + "balance_loss_mlp": 1.04300857, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.737352253274615, + "language_loss": 0.81906313, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84094757, + "num_input_tokens_seen": 53753415, + "step": 2475, + "time_per_iteration": 4.0954670906066895 + }, + { + "auxiliary_loss_clip": 0.01108135, + "auxiliary_loss_mlp": 0.01066632, + "balance_loss_clip": 1.04716718, + "balance_loss_mlp": 1.04649782, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.8894405420200928, + "language_loss": 0.80511659, + "learning_rate": 3.85360973012719e-06, + "loss": 0.8268643, + "num_input_tokens_seen": 53770305, + "step": 2476, + "time_per_iteration": 2.572543144226074 + }, + { + "auxiliary_loss_clip": 0.01149032, + "auxiliary_loss_mlp": 0.01058517, + "balance_loss_clip": 1.05059195, + "balance_loss_mlp": 1.03972983, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.7734057760793742, + "language_loss": 0.77749145, + "learning_rate": 3.853463435273058e-06, + "loss": 0.79956692, + "num_input_tokens_seen": 53788895, + "step": 2477, + "time_per_iteration": 2.5506205558776855 + }, + { + "auxiliary_loss_clip": 0.01043729, + "auxiliary_loss_mlp": 0.01015256, + "balance_loss_clip": 1.01936054, + "balance_loss_mlp": 1.01203763, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8113650306283874, + "language_loss": 0.60138893, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62197888, + "num_input_tokens_seen": 53850260, + "step": 2478, + "time_per_iteration": 3.126466989517212 + }, + { + "auxiliary_loss_clip": 0.0110959, + "auxiliary_loss_mlp": 0.01056814, + "balance_loss_clip": 1.0454607, + "balance_loss_mlp": 1.03735912, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.0822025097108647, + "language_loss": 0.70850563, + "learning_rate": 3.853170634719787e-06, + "loss": 0.73016971, + "num_input_tokens_seen": 53867520, + "step": 2479, + "time_per_iteration": 2.634312152862549 + }, + { + "auxiliary_loss_clip": 0.01138901, + "auxiliary_loss_mlp": 0.01055634, + "balance_loss_clip": 1.04924548, + "balance_loss_mlp": 1.03490376, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.553179547418975, + "language_loss": 0.80754894, + "learning_rate": 3.853024129031751e-06, + "loss": 0.8294943, + "num_input_tokens_seen": 53886620, + "step": 2480, + "time_per_iteration": 3.9969065189361572 + }, + { + "auxiliary_loss_clip": 0.01139394, + "auxiliary_loss_mlp": 0.01049302, + "balance_loss_clip": 1.05310535, + "balance_loss_mlp": 1.02991867, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.059786924361706, + "language_loss": 0.84033805, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86222506, + "num_input_tokens_seen": 53902230, + "step": 2481, + "time_per_iteration": 2.5708165168762207 + }, + { + "auxiliary_loss_clip": 0.01148607, + "auxiliary_loss_mlp": 0.01056411, + "balance_loss_clip": 1.05152225, + "balance_loss_mlp": 1.03569221, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 2.3345747344459715, + "language_loss": 0.77450633, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79655653, + "num_input_tokens_seen": 53919475, + "step": 2482, + "time_per_iteration": 2.502335786819458 + }, + { + "auxiliary_loss_clip": 0.01139919, + "auxiliary_loss_mlp": 0.01042154, + "balance_loss_clip": 1.05478859, + "balance_loss_mlp": 1.02112579, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.2560587435035293, + "language_loss": 0.79041803, + "learning_rate": 3.852584190388713e-06, + "loss": 0.81223869, + "num_input_tokens_seen": 53939150, + "step": 2483, + "time_per_iteration": 2.596137762069702 + }, + { + "auxiliary_loss_clip": 0.01151134, + "auxiliary_loss_mlp": 0.00784696, + "balance_loss_clip": 1.0514946, + "balance_loss_mlp": 1.00048971, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 2.006878434009963, + "language_loss": 0.70054948, + "learning_rate": 3.852437403666595e-06, + "loss": 0.71990776, + "num_input_tokens_seen": 53958735, + "step": 2484, + "time_per_iteration": 2.538827419281006 + }, + { + "auxiliary_loss_clip": 0.01142516, + "auxiliary_loss_mlp": 0.0078718, + "balance_loss_clip": 1.05064118, + "balance_loss_mlp": 1.00050735, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 1.9043537573510723, + "language_loss": 0.84333229, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86262918, + "num_input_tokens_seen": 53975065, + "step": 2485, + "time_per_iteration": 2.5823426246643066 + }, + { + "auxiliary_loss_clip": 0.01146102, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_clip": 1.04950702, + "balance_loss_mlp": 1.03063679, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 2.292429931271819, + "language_loss": 0.84805179, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.87002873, + "num_input_tokens_seen": 53993330, + "step": 2486, + "time_per_iteration": 2.5012927055358887 + }, + { + "auxiliary_loss_clip": 0.01147635, + "auxiliary_loss_mlp": 0.01042555, + "balance_loss_clip": 1.04851747, + "balance_loss_mlp": 1.02540076, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.0240531535420514, + "language_loss": 0.74696815, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76887, + "num_input_tokens_seen": 54010515, + "step": 2487, + "time_per_iteration": 2.496985673904419 + }, + { + "auxiliary_loss_clip": 0.0115378, + "auxiliary_loss_mlp": 0.01049124, + "balance_loss_clip": 1.05059755, + "balance_loss_mlp": 1.02928782, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 2.443163590600751, + "language_loss": 0.71576071, + "learning_rate": 3.8518495543877e-06, + "loss": 0.73778975, + "num_input_tokens_seen": 54031315, + "step": 2488, + "time_per_iteration": 2.6532511711120605 + }, + { + "auxiliary_loss_clip": 0.01139783, + "auxiliary_loss_mlp": 0.01053799, + "balance_loss_clip": 1.05279338, + "balance_loss_mlp": 1.03408182, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.7344329719347544, + "language_loss": 0.70714509, + "learning_rate": 3.851702416498235e-06, + "loss": 0.72908086, + "num_input_tokens_seen": 54045965, + "step": 2489, + "time_per_iteration": 4.053692579269409 + }, + { + "auxiliary_loss_clip": 0.01140477, + "auxiliary_loss_mlp": 0.01051384, + "balance_loss_clip": 1.04823399, + "balance_loss_mlp": 1.03159547, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 3.139744980954251, + "language_loss": 0.81223202, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.83415067, + "num_input_tokens_seen": 54059960, + "step": 2490, + "time_per_iteration": 2.484414577484131 + }, + { + "auxiliary_loss_clip": 0.01124624, + "auxiliary_loss_mlp": 0.01051896, + "balance_loss_clip": 1.04924703, + "balance_loss_mlp": 1.03261971, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.9478298227885227, + "language_loss": 0.80275071, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82451594, + "num_input_tokens_seen": 54079330, + "step": 2491, + "time_per_iteration": 2.685479164123535 + }, + { + "auxiliary_loss_clip": 0.01144156, + "auxiliary_loss_mlp": 0.01049497, + "balance_loss_clip": 1.04627657, + "balance_loss_mlp": 1.02793217, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 2.325686650306096, + "language_loss": 0.90240514, + "learning_rate": 3.851260581551727e-06, + "loss": 0.92434168, + "num_input_tokens_seen": 54097555, + "step": 2492, + "time_per_iteration": 2.5082008838653564 + }, + { + "auxiliary_loss_clip": 0.01152829, + "auxiliary_loss_mlp": 0.01059053, + "balance_loss_clip": 1.05178308, + "balance_loss_mlp": 1.03907382, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 2.991017279878022, + "language_loss": 0.78928614, + "learning_rate": 3.851113162828802e-06, + "loss": 0.81140494, + "num_input_tokens_seen": 54115600, + "step": 2493, + "time_per_iteration": 2.482034921646118 + }, + { + "auxiliary_loss_clip": 0.01150997, + "auxiliary_loss_mlp": 0.01046726, + "balance_loss_clip": 1.04957843, + "balance_loss_mlp": 1.02615082, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 1.8464800910080081, + "language_loss": 0.80332708, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82530439, + "num_input_tokens_seen": 54135220, + "step": 2494, + "time_per_iteration": 2.512192964553833 + }, + { + "auxiliary_loss_clip": 0.01138505, + "auxiliary_loss_mlp": 0.01050134, + "balance_loss_clip": 1.04941607, + "balance_loss_mlp": 1.02968931, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.896183156807243, + "language_loss": 0.66374421, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68563056, + "num_input_tokens_seen": 54161065, + "step": 2495, + "time_per_iteration": 2.920058012008667 + }, + { + "auxiliary_loss_clip": 0.01058274, + "auxiliary_loss_mlp": 0.01013205, + "balance_loss_clip": 1.02660596, + "balance_loss_mlp": 1.00986671, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 0.8995239784274077, + "language_loss": 0.59449536, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61521012, + "num_input_tokens_seen": 54225095, + "step": 2496, + "time_per_iteration": 3.082857370376587 + }, + { + "auxiliary_loss_clip": 0.01165822, + "auxiliary_loss_mlp": 0.01054591, + "balance_loss_clip": 1.05156207, + "balance_loss_mlp": 1.03384852, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 1.845420870793934, + "language_loss": 0.65648293, + "learning_rate": 3.850522786049075e-06, + "loss": 0.6786871, + "num_input_tokens_seen": 54243750, + "step": 2497, + "time_per_iteration": 2.4477992057800293 + }, + { + "auxiliary_loss_clip": 0.01130024, + "auxiliary_loss_mlp": 0.01055981, + "balance_loss_clip": 1.04924262, + "balance_loss_mlp": 1.03453565, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4990828543511512, + "language_loss": 0.75175315, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77361321, + "num_input_tokens_seen": 54266185, + "step": 2498, + "time_per_iteration": 2.591810703277588 + }, + { + "auxiliary_loss_clip": 0.01134279, + "auxiliary_loss_mlp": 0.01047758, + "balance_loss_clip": 1.05543876, + "balance_loss_mlp": 1.02745652, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.7500467018434858, + "language_loss": 0.71854258, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74036294, + "num_input_tokens_seen": 54283940, + "step": 2499, + "time_per_iteration": 2.5690770149230957 + }, + { + "auxiliary_loss_clip": 0.01137802, + "auxiliary_loss_mlp": 0.01058478, + "balance_loss_clip": 1.04986238, + "balance_loss_mlp": 1.03804553, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 5.592322214143914, + "language_loss": 0.71885538, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74081814, + "num_input_tokens_seen": 54304830, + "step": 2500, + "time_per_iteration": 2.60164213180542 + }, + { + "auxiliary_loss_clip": 0.01136015, + "auxiliary_loss_mlp": 0.01062999, + "balance_loss_clip": 1.05283713, + "balance_loss_mlp": 1.04250741, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 2.04416148437353, + "language_loss": 0.65616035, + "learning_rate": 3.849931286517249e-06, + "loss": 0.67815042, + "num_input_tokens_seen": 54325595, + "step": 2501, + "time_per_iteration": 2.7028465270996094 + }, + { + "auxiliary_loss_clip": 0.01140014, + "auxiliary_loss_mlp": 0.01060515, + "balance_loss_clip": 1.04976439, + "balance_loss_mlp": 1.03972507, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 15.074590949362687, + "language_loss": 0.83256704, + "learning_rate": 3.849783236246318e-06, + "loss": 0.8545723, + "num_input_tokens_seen": 54342180, + "step": 2502, + "time_per_iteration": 2.5021448135375977 + }, + { + "auxiliary_loss_clip": 0.01124312, + "auxiliary_loss_mlp": 0.01055253, + "balance_loss_clip": 1.04939139, + "balance_loss_mlp": 1.03724098, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 1.811117906559974, + "language_loss": 0.77516717, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79696286, + "num_input_tokens_seen": 54360255, + "step": 2503, + "time_per_iteration": 2.563169479370117 + }, + { + "auxiliary_loss_clip": 0.01162402, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.05174458, + "balance_loss_mlp": 1.02890956, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 1.9063880542949736, + "language_loss": 0.85246003, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87456262, + "num_input_tokens_seen": 54378260, + "step": 2504, + "time_per_iteration": 2.5120792388916016 + }, + { + "auxiliary_loss_clip": 0.0114883, + "auxiliary_loss_mlp": 0.01048373, + "balance_loss_clip": 1.05071187, + "balance_loss_mlp": 1.02994347, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.598508570354026, + "language_loss": 0.83348823, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85546029, + "num_input_tokens_seen": 54399745, + "step": 2505, + "time_per_iteration": 2.5425455570220947 + }, + { + "auxiliary_loss_clip": 0.01125832, + "auxiliary_loss_mlp": 0.01061495, + "balance_loss_clip": 1.0470233, + "balance_loss_mlp": 1.04225492, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 1.8974994201760302, + "language_loss": 0.76010734, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78198063, + "num_input_tokens_seen": 54417105, + "step": 2506, + "time_per_iteration": 2.555166721343994 + }, + { + "auxiliary_loss_clip": 0.01167909, + "auxiliary_loss_mlp": 0.01055389, + "balance_loss_clip": 1.05314863, + "balance_loss_mlp": 1.03596973, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 2.643562275023957, + "language_loss": 0.76194012, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78417313, + "num_input_tokens_seen": 54433920, + "step": 2507, + "time_per_iteration": 2.4298441410064697 + }, + { + "auxiliary_loss_clip": 0.01146379, + "auxiliary_loss_mlp": 0.01046803, + "balance_loss_clip": 1.04824376, + "balance_loss_mlp": 1.02854013, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 2.0776658185029717, + "language_loss": 0.68730241, + "learning_rate": 3.848893461794131e-06, + "loss": 0.70923424, + "num_input_tokens_seen": 54451540, + "step": 2508, + "time_per_iteration": 2.4990594387054443 + }, + { + "auxiliary_loss_clip": 0.01133748, + "auxiliary_loss_mlp": 0.01058819, + "balance_loss_clip": 1.05087698, + "balance_loss_mlp": 1.03926873, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 1.7434362892600754, + "language_loss": 0.77488178, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79680747, + "num_input_tokens_seen": 54470800, + "step": 2509, + "time_per_iteration": 2.528008460998535 + }, + { + "auxiliary_loss_clip": 0.01148305, + "auxiliary_loss_mlp": 0.0078767, + "balance_loss_clip": 1.05096555, + "balance_loss_mlp": 1.00035501, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 3.3247650194142295, + "language_loss": 0.79897833, + "learning_rate": 3.848596309368246e-06, + "loss": 0.8183381, + "num_input_tokens_seen": 54486525, + "step": 2510, + "time_per_iteration": 2.5096352100372314 + }, + { + "auxiliary_loss_clip": 0.01157189, + "auxiliary_loss_mlp": 0.01061641, + "balance_loss_clip": 1.05494642, + "balance_loss_mlp": 1.04081511, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 1.8249851624323752, + "language_loss": 0.73704851, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.75923681, + "num_input_tokens_seen": 54503795, + "step": 2511, + "time_per_iteration": 4.028635263442993 + }, + { + "auxiliary_loss_clip": 0.01098637, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_clip": 1.0441494, + "balance_loss_mlp": 1.02829766, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.037957197540134, + "language_loss": 0.68877983, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71023738, + "num_input_tokens_seen": 54523025, + "step": 2512, + "time_per_iteration": 2.6856071949005127 + }, + { + "auxiliary_loss_clip": 0.01156148, + "auxiliary_loss_mlp": 0.01053885, + "balance_loss_clip": 1.05387962, + "balance_loss_mlp": 1.03482294, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 8.928439655905818, + "language_loss": 0.73844552, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76054579, + "num_input_tokens_seen": 54545025, + "step": 2513, + "time_per_iteration": 2.5802505016326904 + }, + { + "auxiliary_loss_clip": 0.0102853, + "auxiliary_loss_mlp": 0.01043481, + "balance_loss_clip": 1.03879762, + "balance_loss_mlp": 1.03978527, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8843858298081847, + "language_loss": 0.64808935, + "learning_rate": 3.84800116337411e-06, + "loss": 0.66880947, + "num_input_tokens_seen": 54604545, + "step": 2514, + "time_per_iteration": 3.2318265438079834 + }, + { + "auxiliary_loss_clip": 0.01148278, + "auxiliary_loss_mlp": 0.01044287, + "balance_loss_clip": 1.05198014, + "balance_loss_mlp": 1.02611959, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 2.1109909731331125, + "language_loss": 0.73067838, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75260401, + "num_input_tokens_seen": 54620590, + "step": 2515, + "time_per_iteration": 3.9795327186584473 + }, + { + "auxiliary_loss_clip": 0.01132981, + "auxiliary_loss_mlp": 0.01045156, + "balance_loss_clip": 1.04820156, + "balance_loss_mlp": 1.0251174, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 2.133750123677633, + "language_loss": 0.77257288, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79435426, + "num_input_tokens_seen": 54640410, + "step": 2516, + "time_per_iteration": 2.5522236824035645 + }, + { + "auxiliary_loss_clip": 0.01055756, + "auxiliary_loss_mlp": 0.01024894, + "balance_loss_clip": 1.02357388, + "balance_loss_mlp": 1.02191401, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.7445382973677832, + "language_loss": 0.54687989, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56768632, + "num_input_tokens_seen": 54701430, + "step": 2517, + "time_per_iteration": 3.0757334232330322 + }, + { + "auxiliary_loss_clip": 0.01118732, + "auxiliary_loss_mlp": 0.01048725, + "balance_loss_clip": 1.04152215, + "balance_loss_mlp": 1.02826893, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 3.135950120624262, + "language_loss": 0.78519976, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.80687439, + "num_input_tokens_seen": 54720845, + "step": 2518, + "time_per_iteration": 2.5580954551696777 + }, + { + "auxiliary_loss_clip": 0.01155224, + "auxiliary_loss_mlp": 0.0104814, + "balance_loss_clip": 1.05167913, + "balance_loss_mlp": 1.02836335, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.4842269620515625, + "language_loss": 0.704144, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72617763, + "num_input_tokens_seen": 54740495, + "step": 2519, + "time_per_iteration": 2.5494384765625 + }, + { + "auxiliary_loss_clip": 0.01151076, + "auxiliary_loss_mlp": 0.01046349, + "balance_loss_clip": 1.05034685, + "balance_loss_mlp": 1.02741909, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 2.0732666350072537, + "language_loss": 0.7925899, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81456417, + "num_input_tokens_seen": 54758415, + "step": 2520, + "time_per_iteration": 3.902132987976074 + }, + { + "auxiliary_loss_clip": 0.01146881, + "auxiliary_loss_mlp": 0.01065583, + "balance_loss_clip": 1.05219495, + "balance_loss_mlp": 1.04516268, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 2.62278742936466, + "language_loss": 0.74792624, + "learning_rate": 3.846956960161114e-06, + "loss": 0.77005088, + "num_input_tokens_seen": 54779355, + "step": 2521, + "time_per_iteration": 2.5988030433654785 + }, + { + "auxiliary_loss_clip": 0.01136064, + "auxiliary_loss_mlp": 0.01052547, + "balance_loss_clip": 1.0486474, + "balance_loss_mlp": 1.03248429, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.088474127832624, + "language_loss": 0.8183825, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84026861, + "num_input_tokens_seen": 54799465, + "step": 2522, + "time_per_iteration": 2.5706069469451904 + }, + { + "auxiliary_loss_clip": 0.01026271, + "auxiliary_loss_mlp": 0.01018295, + "balance_loss_clip": 1.03091025, + "balance_loss_mlp": 1.01455152, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.8357704417976192, + "language_loss": 0.57912189, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59956753, + "num_input_tokens_seen": 54857665, + "step": 2523, + "time_per_iteration": 3.1622226238250732 + }, + { + "auxiliary_loss_clip": 0.01143862, + "auxiliary_loss_mlp": 0.01059808, + "balance_loss_clip": 1.04821646, + "balance_loss_mlp": 1.03769445, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.9088979602017626, + "language_loss": 0.74769777, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.7697345, + "num_input_tokens_seen": 54879895, + "step": 2524, + "time_per_iteration": 2.5656349658966064 + }, + { + "auxiliary_loss_clip": 0.01143399, + "auxiliary_loss_mlp": 0.01041857, + "balance_loss_clip": 1.05152333, + "balance_loss_mlp": 1.02227139, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.7688811766587713, + "language_loss": 0.74766672, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76951927, + "num_input_tokens_seen": 54898245, + "step": 2525, + "time_per_iteration": 2.5155892372131348 + }, + { + "auxiliary_loss_clip": 0.01146776, + "auxiliary_loss_mlp": 0.01050251, + "balance_loss_clip": 1.04917598, + "balance_loss_mlp": 1.03060484, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.8747366612817626, + "language_loss": 0.79552865, + "learning_rate": 3.846208999506402e-06, + "loss": 0.81749898, + "num_input_tokens_seen": 54917060, + "step": 2526, + "time_per_iteration": 2.494734525680542 + }, + { + "auxiliary_loss_clip": 0.01139729, + "auxiliary_loss_mlp": 0.01047096, + "balance_loss_clip": 1.05316746, + "balance_loss_mlp": 1.02938151, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.8305416770498057, + "language_loss": 0.85015595, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87202412, + "num_input_tokens_seen": 54936365, + "step": 2527, + "time_per_iteration": 2.5262930393218994 + }, + { + "auxiliary_loss_clip": 0.01129477, + "auxiliary_loss_mlp": 0.01047923, + "balance_loss_clip": 1.04832554, + "balance_loss_mlp": 1.02793205, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 1.9753888007905682, + "language_loss": 0.6917187, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71349269, + "num_input_tokens_seen": 54961365, + "step": 2528, + "time_per_iteration": 4.245699644088745 + }, + { + "auxiliary_loss_clip": 0.01136256, + "auxiliary_loss_mlp": 0.0104693, + "balance_loss_clip": 1.05154192, + "balance_loss_mlp": 1.02857196, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 1.7227285525987517, + "language_loss": 0.86778545, + "learning_rate": 3.845759382967026e-06, + "loss": 0.88961732, + "num_input_tokens_seen": 54980750, + "step": 2529, + "time_per_iteration": 2.5202653408050537 + }, + { + "auxiliary_loss_clip": 0.0112928, + "auxiliary_loss_mlp": 0.0104351, + "balance_loss_clip": 1.0494473, + "balance_loss_mlp": 1.02484167, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 1.9711850171926737, + "language_loss": 0.83466387, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85639179, + "num_input_tokens_seen": 54999675, + "step": 2530, + "time_per_iteration": 2.523061513900757 + }, + { + "auxiliary_loss_clip": 0.01128503, + "auxiliary_loss_mlp": 0.01048408, + "balance_loss_clip": 1.04643333, + "balance_loss_mlp": 1.02913213, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 1.9225831719509037, + "language_loss": 0.80120111, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82297015, + "num_input_tokens_seen": 55018295, + "step": 2531, + "time_per_iteration": 2.502504348754883 + }, + { + "auxiliary_loss_clip": 0.0114732, + "auxiliary_loss_mlp": 0.01046819, + "balance_loss_clip": 1.04850936, + "balance_loss_mlp": 1.02867556, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 2.0200346063066967, + "language_loss": 0.78663623, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.80857766, + "num_input_tokens_seen": 55037975, + "step": 2532, + "time_per_iteration": 2.5171539783477783 + }, + { + "auxiliary_loss_clip": 0.01149724, + "auxiliary_loss_mlp": 0.01050254, + "balance_loss_clip": 1.05113733, + "balance_loss_mlp": 1.03116894, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 12.457450551570743, + "language_loss": 0.87797838, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89997822, + "num_input_tokens_seen": 55057135, + "step": 2533, + "time_per_iteration": 2.589427947998047 + }, + { + "auxiliary_loss_clip": 0.01117459, + "auxiliary_loss_mlp": 0.01048717, + "balance_loss_clip": 1.044783, + "balance_loss_mlp": 1.02979851, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.5377079390987194, + "language_loss": 0.78710431, + "learning_rate": 3.84500862231636e-06, + "loss": 0.80876613, + "num_input_tokens_seen": 55075525, + "step": 2534, + "time_per_iteration": 2.5679986476898193 + }, + { + "auxiliary_loss_clip": 0.01165673, + "auxiliary_loss_mlp": 0.01048545, + "balance_loss_clip": 1.05107856, + "balance_loss_mlp": 1.02832747, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.831656137411806, + "language_loss": 0.76635289, + "learning_rate": 3.844858260274702e-06, + "loss": 0.78849506, + "num_input_tokens_seen": 55090845, + "step": 2535, + "time_per_iteration": 2.433056116104126 + }, + { + "auxiliary_loss_clip": 0.01145289, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_clip": 1.04930985, + "balance_loss_mlp": 1.03172636, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.4671477306997476, + "language_loss": 0.7868523, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80880636, + "num_input_tokens_seen": 55108750, + "step": 2536, + "time_per_iteration": 2.5242366790771484 + }, + { + "auxiliary_loss_clip": 0.01127059, + "auxiliary_loss_mlp": 0.01055192, + "balance_loss_clip": 1.04857671, + "balance_loss_mlp": 1.03725111, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.633299915533139, + "language_loss": 0.7579425, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77976501, + "num_input_tokens_seen": 55126750, + "step": 2537, + "time_per_iteration": 2.58121395111084 + }, + { + "auxiliary_loss_clip": 0.01150896, + "auxiliary_loss_mlp": 0.01056572, + "balance_loss_clip": 1.05383563, + "balance_loss_mlp": 1.03836894, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.040766679594079, + "language_loss": 0.77612692, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79820168, + "num_input_tokens_seen": 55144690, + "step": 2538, + "time_per_iteration": 2.529289484024048 + }, + { + "auxiliary_loss_clip": 0.01108126, + "auxiliary_loss_mlp": 0.01045685, + "balance_loss_clip": 1.0468049, + "balance_loss_mlp": 1.02861392, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.9524994018378592, + "language_loss": 0.89909136, + "learning_rate": 3.844256112593029e-06, + "loss": 0.9206295, + "num_input_tokens_seen": 55166055, + "step": 2539, + "time_per_iteration": 2.6245710849761963 + }, + { + "auxiliary_loss_clip": 0.0114566, + "auxiliary_loss_mlp": 0.01057831, + "balance_loss_clip": 1.05151153, + "balance_loss_mlp": 1.03885341, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 2.14481596299968, + "language_loss": 0.93330967, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95534462, + "num_input_tokens_seen": 55186285, + "step": 2540, + "time_per_iteration": 2.539761543273926 + }, + { + "auxiliary_loss_clip": 0.01130474, + "auxiliary_loss_mlp": 0.01053682, + "balance_loss_clip": 1.04464173, + "balance_loss_mlp": 1.03628969, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.904644186158126, + "language_loss": 0.75398499, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77582657, + "num_input_tokens_seen": 55207915, + "step": 2541, + "time_per_iteration": 2.5944013595581055 + }, + { + "auxiliary_loss_clip": 0.01117344, + "auxiliary_loss_mlp": 0.0105624, + "balance_loss_clip": 1.04604173, + "balance_loss_mlp": 1.03760803, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.6750540818751818, + "language_loss": 0.81585574, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83759159, + "num_input_tokens_seen": 55227860, + "step": 2542, + "time_per_iteration": 2.565232038497925 + }, + { + "auxiliary_loss_clip": 0.01163036, + "auxiliary_loss_mlp": 0.0105904, + "balance_loss_clip": 1.05287766, + "balance_loss_mlp": 1.04034853, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.4030447994953574, + "language_loss": 0.77423394, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79645473, + "num_input_tokens_seen": 55247330, + "step": 2543, + "time_per_iteration": 2.484835386276245 + }, + { + "auxiliary_loss_clip": 0.01146657, + "auxiliary_loss_mlp": 0.01055182, + "balance_loss_clip": 1.04919624, + "balance_loss_mlp": 1.03721714, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.0119040358063724, + "language_loss": 0.86445659, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88647497, + "num_input_tokens_seen": 55266195, + "step": 2544, + "time_per_iteration": 2.536827802658081 + }, + { + "auxiliary_loss_clip": 0.01150862, + "auxiliary_loss_mlp": 0.01059145, + "balance_loss_clip": 1.04912841, + "balance_loss_mlp": 1.03879595, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.405789701071272, + "language_loss": 0.82760882, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84970891, + "num_input_tokens_seen": 55283305, + "step": 2545, + "time_per_iteration": 2.4712166786193848 + }, + { + "auxiliary_loss_clip": 0.01161917, + "auxiliary_loss_mlp": 0.01048595, + "balance_loss_clip": 1.05312967, + "balance_loss_mlp": 1.02960479, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.3954620353309575, + "language_loss": 0.70777881, + "learning_rate": 3.843199661896884e-06, + "loss": 0.72988391, + "num_input_tokens_seen": 55303035, + "step": 2546, + "time_per_iteration": 2.485450267791748 + }, + { + "auxiliary_loss_clip": 0.0112968, + "auxiliary_loss_mlp": 0.01047469, + "balance_loss_clip": 1.0462904, + "balance_loss_mlp": 1.02810931, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.5477261340428226, + "language_loss": 0.77481455, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79658604, + "num_input_tokens_seen": 55327570, + "step": 2547, + "time_per_iteration": 2.7255096435546875 + }, + { + "auxiliary_loss_clip": 0.01113547, + "auxiliary_loss_mlp": 0.01054193, + "balance_loss_clip": 1.04621112, + "balance_loss_mlp": 1.0346427, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.1643903841732257, + "language_loss": 0.74376583, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76544321, + "num_input_tokens_seen": 55351090, + "step": 2548, + "time_per_iteration": 2.7025341987609863 + }, + { + "auxiliary_loss_clip": 0.01138638, + "auxiliary_loss_mlp": 0.01048509, + "balance_loss_clip": 1.04901958, + "balance_loss_mlp": 1.02937579, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.7399928630348651, + "language_loss": 0.80730951, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82918102, + "num_input_tokens_seen": 55371050, + "step": 2549, + "time_per_iteration": 2.55544376373291 + }, + { + "auxiliary_loss_clip": 0.01148153, + "auxiliary_loss_mlp": 0.01046631, + "balance_loss_clip": 1.04862964, + "balance_loss_mlp": 1.02764106, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.6979354617227207, + "language_loss": 0.74540114, + "learning_rate": 3.842594437983917e-06, + "loss": 0.76734889, + "num_input_tokens_seen": 55390375, + "step": 2550, + "time_per_iteration": 3.986729383468628 + }, + { + "auxiliary_loss_clip": 0.01154335, + "auxiliary_loss_mlp": 0.01045354, + "balance_loss_clip": 1.04995394, + "balance_loss_mlp": 1.02580428, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 3.042022732907165, + "language_loss": 0.77006847, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79206538, + "num_input_tokens_seen": 55408890, + "step": 2551, + "time_per_iteration": 2.497774362564087 + }, + { + "auxiliary_loss_clip": 0.01050345, + "auxiliary_loss_mlp": 0.01009731, + "balance_loss_clip": 1.01918292, + "balance_loss_mlp": 1.00636911, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9561975553240505, + "language_loss": 0.56747419, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58807492, + "num_input_tokens_seen": 55463815, + "step": 2552, + "time_per_iteration": 3.0008530616760254 + }, + { + "auxiliary_loss_clip": 0.01119621, + "auxiliary_loss_mlp": 0.01044866, + "balance_loss_clip": 1.05166483, + "balance_loss_mlp": 1.02555442, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 2.2720945229748444, + "language_loss": 0.89093029, + "learning_rate": 3.84213978637978e-06, + "loss": 0.91257513, + "num_input_tokens_seen": 55481050, + "step": 2553, + "time_per_iteration": 2.5830512046813965 + }, + { + "auxiliary_loss_clip": 0.01152921, + "auxiliary_loss_mlp": 0.01047198, + "balance_loss_clip": 1.05379951, + "balance_loss_mlp": 1.02683663, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.9821824986566219, + "language_loss": 0.78414941, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80615056, + "num_input_tokens_seen": 55500050, + "step": 2554, + "time_per_iteration": 4.064753293991089 + }, + { + "auxiliary_loss_clip": 0.01093831, + "auxiliary_loss_mlp": 0.01053583, + "balance_loss_clip": 1.04411125, + "balance_loss_mlp": 1.03291225, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.9803244897098056, + "language_loss": 0.7872985, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80877268, + "num_input_tokens_seen": 55518125, + "step": 2555, + "time_per_iteration": 2.630971670150757 + }, + { + "auxiliary_loss_clip": 0.01130167, + "auxiliary_loss_mlp": 0.010444, + "balance_loss_clip": 1.04945004, + "balance_loss_mlp": 1.02642357, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.7694613184763506, + "language_loss": 0.77390879, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.79565442, + "num_input_tokens_seen": 55540960, + "step": 2556, + "time_per_iteration": 2.664419174194336 + }, + { + "auxiliary_loss_clip": 0.01140931, + "auxiliary_loss_mlp": 0.00784109, + "balance_loss_clip": 1.05022252, + "balance_loss_mlp": 1.00060296, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 2.21861825565204, + "language_loss": 0.89931059, + "learning_rate": 3.84153260631005e-06, + "loss": 0.91856098, + "num_input_tokens_seen": 55559210, + "step": 2557, + "time_per_iteration": 2.51943302154541 + }, + { + "auxiliary_loss_clip": 0.01140376, + "auxiliary_loss_mlp": 0.01049457, + "balance_loss_clip": 1.04845881, + "balance_loss_mlp": 1.02921557, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 2.045096382283073, + "language_loss": 0.70575243, + "learning_rate": 3.841380636700468e-06, + "loss": 0.72765076, + "num_input_tokens_seen": 55578925, + "step": 2558, + "time_per_iteration": 2.554431200027466 + }, + { + "auxiliary_loss_clip": 0.01136747, + "auxiliary_loss_mlp": 0.01048557, + "balance_loss_clip": 1.04847741, + "balance_loss_mlp": 1.0292567, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 2.2079333852131637, + "language_loss": 0.92499256, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94684559, + "num_input_tokens_seen": 55597255, + "step": 2559, + "time_per_iteration": 2.544220447540283 + }, + { + "auxiliary_loss_clip": 0.01138063, + "auxiliary_loss_mlp": 0.01052786, + "balance_loss_clip": 1.05269825, + "balance_loss_mlp": 1.03321195, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 2.7337775173800414, + "language_loss": 0.63787347, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65978199, + "num_input_tokens_seen": 55619515, + "step": 2560, + "time_per_iteration": 3.9313948154449463 + }, + { + "auxiliary_loss_clip": 0.0114016, + "auxiliary_loss_mlp": 0.01045308, + "balance_loss_clip": 1.05083692, + "balance_loss_mlp": 1.02503014, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 1.6640908119550661, + "language_loss": 0.87956417, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90141892, + "num_input_tokens_seen": 55640050, + "step": 2561, + "time_per_iteration": 2.579725503921509 + }, + { + "auxiliary_loss_clip": 0.01146267, + "auxiliary_loss_mlp": 0.01043126, + "balance_loss_clip": 1.0506736, + "balance_loss_mlp": 1.02444553, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 2.025140768765658, + "language_loss": 0.83016682, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85206079, + "num_input_tokens_seen": 55658695, + "step": 2562, + "time_per_iteration": 2.485596179962158 + }, + { + "auxiliary_loss_clip": 0.0113725, + "auxiliary_loss_mlp": 0.00788569, + "balance_loss_clip": 1.05032289, + "balance_loss_mlp": 1.00063992, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 2.0959712791536984, + "language_loss": 0.7481097, + "learning_rate": 3.840619741387832e-06, + "loss": 0.76736784, + "num_input_tokens_seen": 55676340, + "step": 2563, + "time_per_iteration": 2.545466423034668 + }, + { + "auxiliary_loss_clip": 0.01121206, + "auxiliary_loss_mlp": 0.01045379, + "balance_loss_clip": 1.04965973, + "balance_loss_mlp": 1.02580476, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 6.218007237705584, + "language_loss": 0.75943983, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.7811057, + "num_input_tokens_seen": 55698890, + "step": 2564, + "time_per_iteration": 2.67018461227417 + }, + { + "auxiliary_loss_clip": 0.01135917, + "auxiliary_loss_mlp": 0.01055967, + "balance_loss_clip": 1.04877925, + "balance_loss_mlp": 1.03684628, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 1.990608999659258, + "language_loss": 0.70712042, + "learning_rate": 3.840314894646969e-06, + "loss": 0.72903931, + "num_input_tokens_seen": 55718535, + "step": 2565, + "time_per_iteration": 2.567512273788452 + }, + { + "auxiliary_loss_clip": 0.01144317, + "auxiliary_loss_mlp": 0.01055767, + "balance_loss_clip": 1.04607177, + "balance_loss_mlp": 1.03618121, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.0110322339002247, + "language_loss": 0.71796411, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73996496, + "num_input_tokens_seen": 55738970, + "step": 2566, + "time_per_iteration": 2.6032025814056396 + }, + { + "auxiliary_loss_clip": 0.0115468, + "auxiliary_loss_mlp": 0.01045017, + "balance_loss_clip": 1.04748774, + "balance_loss_mlp": 1.02639687, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.60178270640841, + "language_loss": 0.84968174, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87167871, + "num_input_tokens_seen": 55759585, + "step": 2567, + "time_per_iteration": 4.052991151809692 + }, + { + "auxiliary_loss_clip": 0.01112354, + "auxiliary_loss_mlp": 0.01048318, + "balance_loss_clip": 1.04372001, + "balance_loss_mlp": 1.02941108, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.0739009769135768, + "language_loss": 0.78002024, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80162692, + "num_input_tokens_seen": 55779250, + "step": 2568, + "time_per_iteration": 2.6050031185150146 + }, + { + "auxiliary_loss_clip": 0.0112763, + "auxiliary_loss_mlp": 0.01041409, + "balance_loss_clip": 1.0461992, + "balance_loss_mlp": 1.02058315, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 10.121279666694173, + "language_loss": 0.70220858, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72389895, + "num_input_tokens_seen": 55800470, + "step": 2569, + "time_per_iteration": 2.5414164066314697 + }, + { + "auxiliary_loss_clip": 0.01130654, + "auxiliary_loss_mlp": 0.01050467, + "balance_loss_clip": 1.0473299, + "balance_loss_mlp": 1.0323714, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 2.27194029311023, + "language_loss": 0.76774281, + "learning_rate": 3.839551556659884e-06, + "loss": 0.789554, + "num_input_tokens_seen": 55817795, + "step": 2570, + "time_per_iteration": 2.5253636837005615 + }, + { + "auxiliary_loss_clip": 0.01143117, + "auxiliary_loss_mlp": 0.01044296, + "balance_loss_clip": 1.05139923, + "balance_loss_mlp": 1.02515137, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 2.364629703209558, + "language_loss": 0.77737957, + "learning_rate": 3.839398679771359e-06, + "loss": 0.7992537, + "num_input_tokens_seen": 55836125, + "step": 2571, + "time_per_iteration": 2.480435609817505 + }, + { + "auxiliary_loss_clip": 0.01136238, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.04692471, + "balance_loss_mlp": 1.03156257, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 2.2189919871266053, + "language_loss": 0.82533908, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84721464, + "num_input_tokens_seen": 55855280, + "step": 2572, + "time_per_iteration": 2.539874315261841 + }, + { + "auxiliary_loss_clip": 0.01162735, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.05136299, + "balance_loss_mlp": 1.02733338, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.6688175504759648, + "language_loss": 0.90298986, + "learning_rate": 3.839092716749563e-06, + "loss": 0.9250803, + "num_input_tokens_seen": 55875695, + "step": 2573, + "time_per_iteration": 2.4830617904663086 + }, + { + "auxiliary_loss_clip": 0.01093026, + "auxiliary_loss_mlp": 0.01052247, + "balance_loss_clip": 1.04082203, + "balance_loss_mlp": 1.03202903, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.6867279565296136, + "language_loss": 0.70447522, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72592795, + "num_input_tokens_seen": 55894575, + "step": 2574, + "time_per_iteration": 2.572718620300293 + }, + { + "auxiliary_loss_clip": 0.01132728, + "auxiliary_loss_mlp": 0.0105096, + "balance_loss_clip": 1.04590058, + "balance_loss_mlp": 1.03015757, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.8019489116073366, + "language_loss": 0.82432783, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84616476, + "num_input_tokens_seen": 55912855, + "step": 2575, + "time_per_iteration": 2.5234875679016113 + }, + { + "auxiliary_loss_clip": 0.01136741, + "auxiliary_loss_mlp": 0.01047798, + "balance_loss_clip": 1.0490315, + "balance_loss_mlp": 1.02886784, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 1.8586066157447239, + "language_loss": 0.84955311, + "learning_rate": 3.838633249192036e-06, + "loss": 0.87139851, + "num_input_tokens_seen": 55932375, + "step": 2576, + "time_per_iteration": 2.54650616645813 + }, + { + "auxiliary_loss_clip": 0.01157646, + "auxiliary_loss_mlp": 0.01046815, + "balance_loss_clip": 1.04714692, + "balance_loss_mlp": 1.0275985, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.6645910357511415, + "language_loss": 0.82337451, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84541911, + "num_input_tokens_seen": 55953970, + "step": 2577, + "time_per_iteration": 2.52412748336792 + }, + { + "auxiliary_loss_clip": 0.01128756, + "auxiliary_loss_mlp": 0.0105143, + "balance_loss_clip": 1.04884624, + "balance_loss_mlp": 1.0320704, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.237931125314515, + "language_loss": 0.7637099, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78551173, + "num_input_tokens_seen": 55973120, + "step": 2578, + "time_per_iteration": 2.6238555908203125 + }, + { + "auxiliary_loss_clip": 0.01127636, + "auxiliary_loss_mlp": 0.01048794, + "balance_loss_clip": 1.04703391, + "balance_loss_mlp": 1.02848077, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 2.1014497503729683, + "language_loss": 0.82648051, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84824479, + "num_input_tokens_seen": 55993260, + "step": 2579, + "time_per_iteration": 2.5666966438293457 + }, + { + "auxiliary_loss_clip": 0.01146431, + "auxiliary_loss_mlp": 0.01050189, + "balance_loss_clip": 1.05532908, + "balance_loss_mlp": 1.03077006, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.5181717041563, + "language_loss": 0.80629921, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82826543, + "num_input_tokens_seen": 56012130, + "step": 2580, + "time_per_iteration": 2.535661458969116 + }, + { + "auxiliary_loss_clip": 0.01052084, + "auxiliary_loss_mlp": 0.0100973, + "balance_loss_clip": 1.02226019, + "balance_loss_mlp": 1.00670195, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.841172112589919, + "language_loss": 0.58913291, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60975111, + "num_input_tokens_seen": 56079045, + "step": 2581, + "time_per_iteration": 3.173752784729004 + }, + { + "auxiliary_loss_clip": 0.01114375, + "auxiliary_loss_mlp": 0.01056997, + "balance_loss_clip": 1.04240978, + "balance_loss_mlp": 1.03482378, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 2.119444392866207, + "language_loss": 0.84926844, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87098217, + "num_input_tokens_seen": 56098745, + "step": 2582, + "time_per_iteration": 2.5998871326446533 + }, + { + "auxiliary_loss_clip": 0.01150429, + "auxiliary_loss_mlp": 0.01058311, + "balance_loss_clip": 1.05160129, + "balance_loss_mlp": 1.03750896, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.1923514384015728, + "language_loss": 0.7872448, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.80933219, + "num_input_tokens_seen": 56117655, + "step": 2583, + "time_per_iteration": 2.566084623336792 + }, + { + "auxiliary_loss_clip": 0.01144706, + "auxiliary_loss_mlp": 0.01058186, + "balance_loss_clip": 1.04853249, + "balance_loss_mlp": 1.03547716, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.7838569471248724, + "language_loss": 0.75904989, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78107882, + "num_input_tokens_seen": 56141960, + "step": 2584, + "time_per_iteration": 2.614967107772827 + }, + { + "auxiliary_loss_clip": 0.01142995, + "auxiliary_loss_mlp": 0.01042814, + "balance_loss_clip": 1.04673779, + "balance_loss_mlp": 1.02265549, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 3.3248275800356093, + "language_loss": 0.75676239, + "learning_rate": 3.837251082205368e-06, + "loss": 0.77862048, + "num_input_tokens_seen": 56161430, + "step": 2585, + "time_per_iteration": 2.5070695877075195 + }, + { + "auxiliary_loss_clip": 0.01121286, + "auxiliary_loss_mlp": 0.01045422, + "balance_loss_clip": 1.04903114, + "balance_loss_mlp": 1.02565694, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 2.091083379789917, + "language_loss": 0.61056769, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63223481, + "num_input_tokens_seen": 56179390, + "step": 2586, + "time_per_iteration": 2.573927164077759 + }, + { + "auxiliary_loss_clip": 0.01135146, + "auxiliary_loss_mlp": 0.01044791, + "balance_loss_clip": 1.04540753, + "balance_loss_mlp": 1.0246563, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.6359867229652785, + "language_loss": 0.81158817, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83338755, + "num_input_tokens_seen": 56198020, + "step": 2587, + "time_per_iteration": 2.530538320541382 + }, + { + "auxiliary_loss_clip": 0.0116745, + "auxiliary_loss_mlp": 0.01059218, + "balance_loss_clip": 1.05376172, + "balance_loss_mlp": 1.03578186, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 6.065689435117433, + "language_loss": 0.89004838, + "learning_rate": 3.836789105629236e-06, + "loss": 0.91231513, + "num_input_tokens_seen": 56218165, + "step": 2588, + "time_per_iteration": 2.50475811958313 + }, + { + "auxiliary_loss_clip": 0.0110519, + "auxiliary_loss_mlp": 0.01062623, + "balance_loss_clip": 1.04751492, + "balance_loss_mlp": 1.03930604, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.316578955610548, + "language_loss": 0.64683211, + "learning_rate": 3.83663497412695e-06, + "loss": 0.6685102, + "num_input_tokens_seen": 56237160, + "step": 2589, + "time_per_iteration": 4.05492091178894 + }, + { + "auxiliary_loss_clip": 0.01109605, + "auxiliary_loss_mlp": 0.01050366, + "balance_loss_clip": 1.04127336, + "balance_loss_mlp": 1.02828836, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 3.6668267956586136, + "language_loss": 0.83202642, + "learning_rate": 3.836480772979281e-06, + "loss": 0.85362613, + "num_input_tokens_seen": 56257610, + "step": 2590, + "time_per_iteration": 2.6055643558502197 + }, + { + "auxiliary_loss_clip": 0.01130174, + "auxiliary_loss_mlp": 0.0104374, + "balance_loss_clip": 1.05068302, + "balance_loss_mlp": 1.02347469, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.5688254212009674, + "language_loss": 0.79899001, + "learning_rate": 3.836326502192077e-06, + "loss": 0.82072914, + "num_input_tokens_seen": 56275215, + "step": 2591, + "time_per_iteration": 2.5618042945861816 + }, + { + "auxiliary_loss_clip": 0.01151459, + "auxiliary_loss_mlp": 0.01054873, + "balance_loss_clip": 1.05199265, + "balance_loss_mlp": 1.03547764, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 4.90554308866878, + "language_loss": 0.64567119, + "learning_rate": 3.836172161771189e-06, + "loss": 0.6677345, + "num_input_tokens_seen": 56297130, + "step": 2592, + "time_per_iteration": 2.6415486335754395 + }, + { + "auxiliary_loss_clip": 0.01141746, + "auxiliary_loss_mlp": 0.01055696, + "balance_loss_clip": 1.05384588, + "balance_loss_mlp": 1.03420281, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.1627067973178677, + "language_loss": 0.82104117, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84301555, + "num_input_tokens_seen": 56314995, + "step": 2593, + "time_per_iteration": 4.032254934310913 + }, + { + "auxiliary_loss_clip": 0.01148929, + "auxiliary_loss_mlp": 0.01049067, + "balance_loss_clip": 1.05223846, + "balance_loss_mlp": 1.02890825, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.1374014678489712, + "language_loss": 0.73130381, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.75328374, + "num_input_tokens_seen": 56334005, + "step": 2594, + "time_per_iteration": 2.4981863498687744 + }, + { + "auxiliary_loss_clip": 0.01122971, + "auxiliary_loss_mlp": 0.01043408, + "balance_loss_clip": 1.04236364, + "balance_loss_mlp": 1.02236795, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 2.5427015905190036, + "language_loss": 0.81675863, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83842242, + "num_input_tokens_seen": 56353795, + "step": 2595, + "time_per_iteration": 2.6075046062469482 + }, + { + "auxiliary_loss_clip": 0.01160673, + "auxiliary_loss_mlp": 0.0104435, + "balance_loss_clip": 1.04972482, + "balance_loss_mlp": 1.02496624, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 2.2943589350173443, + "language_loss": 0.87105167, + "learning_rate": 3.835554103867876e-06, + "loss": 0.89310187, + "num_input_tokens_seen": 56373195, + "step": 2596, + "time_per_iteration": 2.464132070541382 + }, + { + "auxiliary_loss_clip": 0.01152707, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_clip": 1.05516195, + "balance_loss_mlp": 1.027511, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.8205873646108484, + "language_loss": 0.68564034, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70763493, + "num_input_tokens_seen": 56391525, + "step": 2597, + "time_per_iteration": 2.5034661293029785 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01045849, + "balance_loss_clip": 1.05160105, + "balance_loss_mlp": 1.02739584, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.9321815384756298, + "language_loss": 0.79951817, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82127213, + "num_input_tokens_seen": 56410715, + "step": 2598, + "time_per_iteration": 2.545665740966797 + }, + { + "auxiliary_loss_clip": 0.01137112, + "auxiliary_loss_mlp": 0.00785871, + "balance_loss_clip": 1.05020332, + "balance_loss_mlp": 1.00056791, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 2.9339874518787066, + "language_loss": 0.82935357, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.84858334, + "num_input_tokens_seen": 56429170, + "step": 2599, + "time_per_iteration": 3.9827001094818115 + }, + { + "auxiliary_loss_clip": 0.01168337, + "auxiliary_loss_mlp": 0.01060726, + "balance_loss_clip": 1.05407, + "balance_loss_mlp": 1.03888714, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 2.247414575178162, + "language_loss": 0.81721371, + "learning_rate": 3.834934932294287e-06, + "loss": 0.83950442, + "num_input_tokens_seen": 56445685, + "step": 2600, + "time_per_iteration": 2.4527125358581543 + }, + { + "auxiliary_loss_clip": 0.01163899, + "auxiliary_loss_mlp": 0.00786025, + "balance_loss_clip": 1.05324364, + "balance_loss_mlp": 1.00059736, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.8392765358826455, + "language_loss": 0.88643116, + "learning_rate": 3.834779965433917e-06, + "loss": 0.9059304, + "num_input_tokens_seen": 56465900, + "step": 2601, + "time_per_iteration": 2.4898130893707275 + }, + { + "auxiliary_loss_clip": 0.01168194, + "auxiliary_loss_mlp": 0.01067572, + "balance_loss_clip": 1.05568635, + "balance_loss_mlp": 1.04451728, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.6564033425344953, + "language_loss": 0.78498924, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80734694, + "num_input_tokens_seen": 56485020, + "step": 2602, + "time_per_iteration": 2.4734065532684326 + }, + { + "auxiliary_loss_clip": 0.01127185, + "auxiliary_loss_mlp": 0.01043489, + "balance_loss_clip": 1.04652023, + "balance_loss_mlp": 1.0229013, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.8837399816371294, + "language_loss": 0.73893404, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.76064074, + "num_input_tokens_seen": 56505205, + "step": 2603, + "time_per_iteration": 2.5863637924194336 + }, + { + "auxiliary_loss_clip": 0.01147784, + "auxiliary_loss_mlp": 0.01048975, + "balance_loss_clip": 1.04697418, + "balance_loss_mlp": 1.0289005, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 2.4181361214842996, + "language_loss": 0.87562358, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.89759111, + "num_input_tokens_seen": 56521495, + "step": 2604, + "time_per_iteration": 2.501185655593872 + }, + { + "auxiliary_loss_clip": 0.01151027, + "auxiliary_loss_mlp": 0.01045613, + "balance_loss_clip": 1.05033064, + "balance_loss_mlp": 1.02590728, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 2.1817016674846723, + "language_loss": 0.85554022, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87750661, + "num_input_tokens_seen": 56540665, + "step": 2605, + "time_per_iteration": 2.5465657711029053 + }, + { + "auxiliary_loss_clip": 0.01155899, + "auxiliary_loss_mlp": 0.01055353, + "balance_loss_clip": 1.04996133, + "balance_loss_mlp": 1.03455079, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 2.120921123664328, + "language_loss": 0.73019278, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75230527, + "num_input_tokens_seen": 56560805, + "step": 2606, + "time_per_iteration": 2.550485372543335 + }, + { + "auxiliary_loss_clip": 0.01165009, + "auxiliary_loss_mlp": 0.01051813, + "balance_loss_clip": 1.05554986, + "balance_loss_mlp": 1.03273988, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 2.091857980437494, + "language_loss": 0.76427829, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78644651, + "num_input_tokens_seen": 56576335, + "step": 2607, + "time_per_iteration": 3.8190836906433105 + }, + { + "auxiliary_loss_clip": 0.01120796, + "auxiliary_loss_mlp": 0.0104762, + "balance_loss_clip": 1.0474596, + "balance_loss_mlp": 1.0293448, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.8432004318262378, + "language_loss": 0.81708723, + "learning_rate": 3.833693249639615e-06, + "loss": 0.83877134, + "num_input_tokens_seen": 56595880, + "step": 2608, + "time_per_iteration": 2.5315499305725098 + }, + { + "auxiliary_loss_clip": 0.01133109, + "auxiliary_loss_mlp": 0.01052972, + "balance_loss_clip": 1.04710782, + "balance_loss_mlp": 1.03102517, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 2.338006296346189, + "language_loss": 0.72555798, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74741876, + "num_input_tokens_seen": 56615130, + "step": 2609, + "time_per_iteration": 2.5237131118774414 + }, + { + "auxiliary_loss_clip": 0.01150386, + "auxiliary_loss_mlp": 0.01046201, + "balance_loss_clip": 1.04874504, + "balance_loss_mlp": 1.02605486, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 1.7057451148535785, + "language_loss": 0.71928906, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74125493, + "num_input_tokens_seen": 56634005, + "step": 2610, + "time_per_iteration": 2.4969494342803955 + }, + { + "auxiliary_loss_clip": 0.01164346, + "auxiliary_loss_mlp": 0.01058396, + "balance_loss_clip": 1.05108345, + "balance_loss_mlp": 1.03635418, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 1.9209064744670348, + "language_loss": 0.72637439, + "learning_rate": 3.833226471173919e-06, + "loss": 0.74860179, + "num_input_tokens_seen": 56653480, + "step": 2611, + "time_per_iteration": 2.457216739654541 + }, + { + "auxiliary_loss_clip": 0.01146054, + "auxiliary_loss_mlp": 0.01048857, + "balance_loss_clip": 1.04730844, + "balance_loss_mlp": 1.02880573, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.0681052386147702, + "language_loss": 0.70673072, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72867978, + "num_input_tokens_seen": 56672270, + "step": 2612, + "time_per_iteration": 2.4787309169769287 + }, + { + "auxiliary_loss_clip": 0.01120913, + "auxiliary_loss_mlp": 0.01060977, + "balance_loss_clip": 1.04738092, + "balance_loss_mlp": 1.04024673, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 3.021047701674438, + "language_loss": 0.75857943, + "learning_rate": 3.83291493793963e-06, + "loss": 0.78039831, + "num_input_tokens_seen": 56691510, + "step": 2613, + "time_per_iteration": 2.5512354373931885 + }, + { + "auxiliary_loss_clip": 0.01117196, + "auxiliary_loss_mlp": 0.01054501, + "balance_loss_clip": 1.04321706, + "balance_loss_mlp": 1.03468823, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.6490353987489095, + "language_loss": 0.6586746, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68039155, + "num_input_tokens_seen": 56712230, + "step": 2614, + "time_per_iteration": 2.58443284034729 + }, + { + "auxiliary_loss_clip": 0.01155198, + "auxiliary_loss_mlp": 0.01048613, + "balance_loss_clip": 1.05253983, + "balance_loss_mlp": 1.02766824, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.3548020549759956, + "language_loss": 0.75148499, + "learning_rate": 3.832603126688072e-06, + "loss": 0.77352309, + "num_input_tokens_seen": 56727490, + "step": 2615, + "time_per_iteration": 2.466231346130371 + }, + { + "auxiliary_loss_clip": 0.01142262, + "auxiliary_loss_mlp": 0.0106145, + "balance_loss_clip": 1.04963708, + "balance_loss_mlp": 1.04131544, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.6050739555555182, + "language_loss": 0.72850358, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75054073, + "num_input_tokens_seen": 56747385, + "step": 2616, + "time_per_iteration": 2.49364972114563 + }, + { + "auxiliary_loss_clip": 0.01137596, + "auxiliary_loss_mlp": 0.01056071, + "balance_loss_clip": 1.05237424, + "balance_loss_mlp": 1.03530467, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 2.205184767249811, + "language_loss": 0.72599745, + "learning_rate": 3.832291037466539e-06, + "loss": 0.7479341, + "num_input_tokens_seen": 56768055, + "step": 2617, + "time_per_iteration": 2.523362636566162 + }, + { + "auxiliary_loss_clip": 0.01144664, + "auxiliary_loss_mlp": 0.01056046, + "balance_loss_clip": 1.05056, + "balance_loss_mlp": 1.0353874, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.1167240226289237, + "language_loss": 0.74537301, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76738012, + "num_input_tokens_seen": 56785110, + "step": 2618, + "time_per_iteration": 2.4769880771636963 + }, + { + "auxiliary_loss_clip": 0.01166065, + "auxiliary_loss_mlp": 0.01053041, + "balance_loss_clip": 1.05261266, + "balance_loss_mlp": 1.03099942, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.072433066138511, + "language_loss": 0.78823733, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.81042838, + "num_input_tokens_seen": 56804975, + "step": 2619, + "time_per_iteration": 2.4477298259735107 + }, + { + "auxiliary_loss_clip": 0.01130418, + "auxiliary_loss_mlp": 0.0105841, + "balance_loss_clip": 1.04889667, + "balance_loss_mlp": 1.03833485, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.7939648973868314, + "language_loss": 0.77054125, + "learning_rate": 3.831822382544101e-06, + "loss": 0.79242945, + "num_input_tokens_seen": 56822470, + "step": 2620, + "time_per_iteration": 2.4980037212371826 + }, + { + "auxiliary_loss_clip": 0.01143442, + "auxiliary_loss_mlp": 0.01054709, + "balance_loss_clip": 1.05175817, + "balance_loss_mlp": 1.03230965, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.852959419180445, + "language_loss": 0.71171588, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73369741, + "num_input_tokens_seen": 56842100, + "step": 2621, + "time_per_iteration": 2.5859289169311523 + }, + { + "auxiliary_loss_clip": 0.01105654, + "auxiliary_loss_mlp": 0.01057133, + "balance_loss_clip": 1.04562616, + "balance_loss_mlp": 1.03434062, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 6.775293353189825, + "language_loss": 0.71750391, + "learning_rate": 3.831509598604828e-06, + "loss": 0.73913181, + "num_input_tokens_seen": 56865920, + "step": 2622, + "time_per_iteration": 2.863943576812744 + }, + { + "auxiliary_loss_clip": 0.01097997, + "auxiliary_loss_mlp": 0.01052415, + "balance_loss_clip": 1.04045606, + "balance_loss_mlp": 1.03296041, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.9816158294562438, + "language_loss": 0.8823874, + "learning_rate": 3.831353102455684e-06, + "loss": 0.90389156, + "num_input_tokens_seen": 56885265, + "step": 2623, + "time_per_iteration": 2.6047353744506836 + }, + { + "auxiliary_loss_clip": 0.01161179, + "auxiliary_loss_mlp": 0.01048529, + "balance_loss_clip": 1.05181193, + "balance_loss_mlp": 1.02958691, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.9070662271157859, + "language_loss": 0.81869638, + "learning_rate": 3.831196536861448e-06, + "loss": 0.84079349, + "num_input_tokens_seen": 56906710, + "step": 2624, + "time_per_iteration": 2.48673939704895 + }, + { + "auxiliary_loss_clip": 0.01126917, + "auxiliary_loss_mlp": 0.01050276, + "balance_loss_clip": 1.04732311, + "balance_loss_mlp": 1.02943838, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.2445324150063537, + "language_loss": 0.7993902, + "learning_rate": 3.831039901828054e-06, + "loss": 0.8211621, + "num_input_tokens_seen": 56924275, + "step": 2625, + "time_per_iteration": 2.5567164421081543 + }, + { + "auxiliary_loss_clip": 0.01162511, + "auxiliary_loss_mlp": 0.0104535, + "balance_loss_clip": 1.05369544, + "balance_loss_mlp": 1.02708697, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.0609787052063573, + "language_loss": 0.80647683, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82855546, + "num_input_tokens_seen": 56941525, + "step": 2626, + "time_per_iteration": 2.5042970180511475 + }, + { + "auxiliary_loss_clip": 0.01105401, + "auxiliary_loss_mlp": 0.01053782, + "balance_loss_clip": 1.04948783, + "balance_loss_mlp": 1.03058434, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 2.615940685302325, + "language_loss": 0.73686206, + "learning_rate": 3.830726423467561e-06, + "loss": 0.75845391, + "num_input_tokens_seen": 56962145, + "step": 2627, + "time_per_iteration": 2.6323068141937256 + }, + { + "auxiliary_loss_clip": 0.01118319, + "auxiliary_loss_mlp": 0.01054679, + "balance_loss_clip": 1.04667377, + "balance_loss_mlp": 1.03434193, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.1778641247335346, + "language_loss": 0.85023069, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87196064, + "num_input_tokens_seen": 56977505, + "step": 2628, + "time_per_iteration": 2.5012354850769043 + }, + { + "auxiliary_loss_clip": 0.01132529, + "auxiliary_loss_mlp": 0.01044082, + "balance_loss_clip": 1.04924238, + "balance_loss_mlp": 1.026057, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.8929646914277798, + "language_loss": 0.76789689, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78966296, + "num_input_tokens_seen": 56996770, + "step": 2629, + "time_per_iteration": 2.5212292671203613 + }, + { + "auxiliary_loss_clip": 0.01146233, + "auxiliary_loss_mlp": 0.01055061, + "balance_loss_clip": 1.05120015, + "balance_loss_mlp": 1.03431845, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.674747369267729, + "language_loss": 0.73848546, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.7604984, + "num_input_tokens_seen": 57014970, + "step": 2630, + "time_per_iteration": 4.0979437828063965 + }, + { + "auxiliary_loss_clip": 0.01153546, + "auxiliary_loss_mlp": 0.01050712, + "balance_loss_clip": 1.04901099, + "balance_loss_mlp": 1.03019595, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 2.1017599826788578, + "language_loss": 0.83657098, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85861355, + "num_input_tokens_seen": 57034045, + "step": 2631, + "time_per_iteration": 2.508497476577759 + }, + { + "auxiliary_loss_clip": 0.01160843, + "auxiliary_loss_mlp": 0.01046615, + "balance_loss_clip": 1.05123377, + "balance_loss_mlp": 1.0272671, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.7784360764454776, + "language_loss": 0.78415853, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80623311, + "num_input_tokens_seen": 57053695, + "step": 2632, + "time_per_iteration": 2.4553747177124023 + }, + { + "auxiliary_loss_clip": 0.01152591, + "auxiliary_loss_mlp": 0.01056582, + "balance_loss_clip": 1.05393374, + "balance_loss_mlp": 1.03701949, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 2.0063477467690785, + "language_loss": 0.83121568, + "learning_rate": 3.829784322464594e-06, + "loss": 0.85330743, + "num_input_tokens_seen": 57071290, + "step": 2633, + "time_per_iteration": 3.9546473026275635 + }, + { + "auxiliary_loss_clip": 0.01166444, + "auxiliary_loss_mlp": 0.010479, + "balance_loss_clip": 1.0543648, + "balance_loss_mlp": 1.02827835, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 2.1657553337168123, + "language_loss": 0.77365422, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79579771, + "num_input_tokens_seen": 57091465, + "step": 2634, + "time_per_iteration": 2.483314275741577 + }, + { + "auxiliary_loss_clip": 0.01129109, + "auxiliary_loss_mlp": 0.00788458, + "balance_loss_clip": 1.05181098, + "balance_loss_mlp": 1.00055742, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.3467249145047906, + "language_loss": 0.8924287, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91160434, + "num_input_tokens_seen": 57110075, + "step": 2635, + "time_per_iteration": 2.559206962585449 + }, + { + "auxiliary_loss_clip": 0.01094524, + "auxiliary_loss_mlp": 0.01059921, + "balance_loss_clip": 1.04100621, + "balance_loss_mlp": 1.03811765, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 3.2961334177670785, + "language_loss": 0.75879234, + "learning_rate": 3.829312335177034e-06, + "loss": 0.78033674, + "num_input_tokens_seen": 57128945, + "step": 2636, + "time_per_iteration": 2.614999771118164 + }, + { + "auxiliary_loss_clip": 0.01125972, + "auxiliary_loss_mlp": 0.01044487, + "balance_loss_clip": 1.05161762, + "balance_loss_mlp": 1.0232085, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.2497337874661607, + "language_loss": 0.72483873, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74654335, + "num_input_tokens_seen": 57152385, + "step": 2637, + "time_per_iteration": 2.719088077545166 + }, + { + "auxiliary_loss_clip": 0.01149829, + "auxiliary_loss_mlp": 0.01044937, + "balance_loss_clip": 1.0516727, + "balance_loss_mlp": 1.02564931, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 2.173582303762717, + "language_loss": 0.77695203, + "learning_rate": 3.82899733013685e-06, + "loss": 0.79889965, + "num_input_tokens_seen": 57172620, + "step": 2638, + "time_per_iteration": 2.519160032272339 + }, + { + "auxiliary_loss_clip": 0.01128547, + "auxiliary_loss_mlp": 0.010601, + "balance_loss_clip": 1.04749978, + "balance_loss_mlp": 1.03884494, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 2.528568353757563, + "language_loss": 0.76020455, + "learning_rate": 3.828839723580128e-06, + "loss": 0.78209102, + "num_input_tokens_seen": 57194680, + "step": 2639, + "time_per_iteration": 3.98391056060791 + }, + { + "auxiliary_loss_clip": 0.01104082, + "auxiliary_loss_mlp": 0.01055075, + "balance_loss_clip": 1.0470897, + "balance_loss_mlp": 1.0348928, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 2.1670895216841366, + "language_loss": 0.81760621, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83919775, + "num_input_tokens_seen": 57214675, + "step": 2640, + "time_per_iteration": 2.6155378818511963 + }, + { + "auxiliary_loss_clip": 0.01131476, + "auxiliary_loss_mlp": 0.01058296, + "balance_loss_clip": 1.04528725, + "balance_loss_mlp": 1.0364573, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.5124997270393006, + "language_loss": 0.66769493, + "learning_rate": 3.828524302423306e-06, + "loss": 0.68959266, + "num_input_tokens_seen": 57235830, + "step": 2641, + "time_per_iteration": 2.5905163288116455 + }, + { + "auxiliary_loss_clip": 0.01143836, + "auxiliary_loss_mlp": 0.01055447, + "balance_loss_clip": 1.04736757, + "balance_loss_mlp": 1.03483534, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.3203217798276343, + "language_loss": 0.75130957, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77330238, + "num_input_tokens_seen": 57255970, + "step": 2642, + "time_per_iteration": 2.589731454849243 + }, + { + "auxiliary_loss_clip": 0.01155431, + "auxiliary_loss_mlp": 0.01052721, + "balance_loss_clip": 1.05701256, + "balance_loss_mlp": 1.03292012, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 2.2735442684985534, + "language_loss": 0.70751911, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72960061, + "num_input_tokens_seen": 57274435, + "step": 2643, + "time_per_iteration": 2.529658317565918 + }, + { + "auxiliary_loss_clip": 0.01161543, + "auxiliary_loss_mlp": 0.01049437, + "balance_loss_clip": 1.05374789, + "balance_loss_mlp": 1.03111458, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 2.3646214127492255, + "language_loss": 0.78372306, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80583286, + "num_input_tokens_seen": 57293115, + "step": 2644, + "time_per_iteration": 2.4817922115325928 + }, + { + "auxiliary_loss_clip": 0.01150037, + "auxiliary_loss_mlp": 0.01048862, + "balance_loss_clip": 1.05185866, + "balance_loss_mlp": 1.02953792, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 2.0322903793368403, + "language_loss": 0.81660545, + "learning_rate": 3.827892628103657e-06, + "loss": 0.83859444, + "num_input_tokens_seen": 57312565, + "step": 2645, + "time_per_iteration": 2.5192997455596924 + }, + { + "auxiliary_loss_clip": 0.01164301, + "auxiliary_loss_mlp": 0.01052445, + "balance_loss_clip": 1.05177248, + "balance_loss_mlp": 1.03101122, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 2.415666047620892, + "language_loss": 0.70015728, + "learning_rate": 3.827734536224087e-06, + "loss": 0.72232473, + "num_input_tokens_seen": 57333360, + "step": 2646, + "time_per_iteration": 4.029776334762573 + }, + { + "auxiliary_loss_clip": 0.01137886, + "auxiliary_loss_mlp": 0.0104269, + "balance_loss_clip": 1.05312037, + "balance_loss_mlp": 1.02440369, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 4.254974319011207, + "language_loss": 0.62381446, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64562023, + "num_input_tokens_seen": 57350575, + "step": 2647, + "time_per_iteration": 2.5020899772644043 + }, + { + "auxiliary_loss_clip": 0.01165312, + "auxiliary_loss_mlp": 0.01049104, + "balance_loss_clip": 1.05576897, + "balance_loss_mlp": 1.02931535, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.00835257242776, + "language_loss": 0.90138721, + "learning_rate": 3.827418144547318e-06, + "loss": 0.92353141, + "num_input_tokens_seen": 57367570, + "step": 2648, + "time_per_iteration": 2.4153597354888916 + }, + { + "auxiliary_loss_clip": 0.0116129, + "auxiliary_loss_mlp": 0.01046663, + "balance_loss_clip": 1.05520594, + "balance_loss_mlp": 1.02848363, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 2.0147203804228773, + "language_loss": 0.91942906, + "learning_rate": 3.827259844762114e-06, + "loss": 0.94150853, + "num_input_tokens_seen": 57383980, + "step": 2649, + "time_per_iteration": 2.4270730018615723 + }, + { + "auxiliary_loss_clip": 0.01100976, + "auxiliary_loss_mlp": 0.01046396, + "balance_loss_clip": 1.05137289, + "balance_loss_mlp": 1.02520108, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 4.676329097542947, + "language_loss": 0.71198028, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73345399, + "num_input_tokens_seen": 57400840, + "step": 2650, + "time_per_iteration": 2.670433521270752 + }, + { + "auxiliary_loss_clip": 0.01147647, + "auxiliary_loss_mlp": 0.01042161, + "balance_loss_clip": 1.05031121, + "balance_loss_mlp": 1.02362382, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 2.5082419568955054, + "language_loss": 0.71247792, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73437601, + "num_input_tokens_seen": 57419230, + "step": 2651, + "time_per_iteration": 2.8024981021881104 + }, + { + "auxiliary_loss_clip": 0.01118173, + "auxiliary_loss_mlp": 0.00787831, + "balance_loss_clip": 1.04591799, + "balance_loss_mlp": 1.00047541, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 2.1196864883237003, + "language_loss": 0.79819798, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.817258, + "num_input_tokens_seen": 57439315, + "step": 2652, + "time_per_iteration": 2.5778188705444336 + }, + { + "auxiliary_loss_clip": 0.01140128, + "auxiliary_loss_mlp": 0.00784538, + "balance_loss_clip": 1.0532347, + "balance_loss_mlp": 1.00044298, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.7593687434345733, + "language_loss": 0.70234001, + "learning_rate": 3.826625952782601e-06, + "loss": 0.7215867, + "num_input_tokens_seen": 57454635, + "step": 2653, + "time_per_iteration": 2.520185947418213 + }, + { + "auxiliary_loss_clip": 0.01150136, + "auxiliary_loss_mlp": 0.01043355, + "balance_loss_clip": 1.05361176, + "balance_loss_mlp": 1.02314901, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.857780783498898, + "language_loss": 0.7693947, + "learning_rate": 3.826467306608095e-06, + "loss": 0.79132962, + "num_input_tokens_seen": 57476805, + "step": 2654, + "time_per_iteration": 2.557934284210205 + }, + { + "auxiliary_loss_clip": 0.01121928, + "auxiliary_loss_mlp": 0.01045037, + "balance_loss_clip": 1.04919302, + "balance_loss_mlp": 1.02582061, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 2.0262418766221266, + "language_loss": 0.82130933, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84297895, + "num_input_tokens_seen": 57496400, + "step": 2655, + "time_per_iteration": 2.551630735397339 + }, + { + "auxiliary_loss_clip": 0.01121941, + "auxiliary_loss_mlp": 0.01050819, + "balance_loss_clip": 1.05012202, + "balance_loss_mlp": 1.03163862, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.0209120395019107, + "language_loss": 0.73181403, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75354159, + "num_input_tokens_seen": 57513700, + "step": 2656, + "time_per_iteration": 2.547652244567871 + }, + { + "auxiliary_loss_clip": 0.01120021, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.05230033, + "balance_loss_mlp": 1.02441263, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 2.426312985084106, + "language_loss": 0.77900612, + "learning_rate": 3.825990952549713e-06, + "loss": 0.80063438, + "num_input_tokens_seen": 57536180, + "step": 2657, + "time_per_iteration": 2.8289990425109863 + }, + { + "auxiliary_loss_clip": 0.01148552, + "auxiliary_loss_mlp": 0.01050111, + "balance_loss_clip": 1.05320764, + "balance_loss_mlp": 1.03032231, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.9517512737318885, + "language_loss": 0.74895155, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77093816, + "num_input_tokens_seen": 57555025, + "step": 2658, + "time_per_iteration": 2.488779067993164 + }, + { + "auxiliary_loss_clip": 0.01143405, + "auxiliary_loss_mlp": 0.01052874, + "balance_loss_clip": 1.0617193, + "balance_loss_mlp": 1.0314399, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.7615545638927725, + "language_loss": 0.75242084, + "learning_rate": 3.825673036958624e-06, + "loss": 0.7743836, + "num_input_tokens_seen": 57577660, + "step": 2659, + "time_per_iteration": 2.694458246231079 + }, + { + "auxiliary_loss_clip": 0.01128546, + "auxiliary_loss_mlp": 0.01050836, + "balance_loss_clip": 1.0545373, + "balance_loss_mlp": 1.03164291, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.3981130269317465, + "language_loss": 0.91005933, + "learning_rate": 3.825513975315508e-06, + "loss": 0.93185318, + "num_input_tokens_seen": 57596335, + "step": 2660, + "time_per_iteration": 2.568033218383789 + }, + { + "auxiliary_loss_clip": 0.01117988, + "auxiliary_loss_mlp": 0.01064474, + "balance_loss_clip": 1.05440092, + "balance_loss_mlp": 1.04278946, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 1.688011952826918, + "language_loss": 0.77774799, + "learning_rate": 3.82535484444872e-06, + "loss": 0.79957265, + "num_input_tokens_seen": 57616830, + "step": 2661, + "time_per_iteration": 2.6939265727996826 + }, + { + "auxiliary_loss_clip": 0.01141731, + "auxiliary_loss_mlp": 0.00786231, + "balance_loss_clip": 1.0525403, + "balance_loss_mlp": 1.00041378, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.704532379581962, + "language_loss": 0.74176538, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76104504, + "num_input_tokens_seen": 57635515, + "step": 2662, + "time_per_iteration": 2.6014232635498047 + }, + { + "auxiliary_loss_clip": 0.0113637, + "auxiliary_loss_mlp": 0.00791022, + "balance_loss_clip": 1.05110669, + "balance_loss_mlp": 1.00044632, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 2.4563561402357132, + "language_loss": 0.81948286, + "learning_rate": 3.825036375068263e-06, + "loss": 0.8387568, + "num_input_tokens_seen": 57654250, + "step": 2663, + "time_per_iteration": 2.5667309761047363 + }, + { + "auxiliary_loss_clip": 0.01120246, + "auxiliary_loss_mlp": 0.01051034, + "balance_loss_clip": 1.05418491, + "balance_loss_mlp": 1.03095913, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.1912258948920513, + "language_loss": 0.7955519, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81726468, + "num_input_tokens_seen": 57672645, + "step": 2664, + "time_per_iteration": 2.6041300296783447 + }, + { + "auxiliary_loss_clip": 0.01156638, + "auxiliary_loss_mlp": 0.01052508, + "balance_loss_clip": 1.05486524, + "balance_loss_mlp": 1.03318429, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 2.508513901765524, + "language_loss": 0.94087237, + "learning_rate": 3.824717628865561e-06, + "loss": 0.96296382, + "num_input_tokens_seen": 57691055, + "step": 2665, + "time_per_iteration": 2.510291337966919 + }, + { + "auxiliary_loss_clip": 0.0113214, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_clip": 1.05260849, + "balance_loss_mlp": 1.026057, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 1.8929608146339632, + "language_loss": 0.85258573, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87436867, + "num_input_tokens_seen": 57707235, + "step": 2666, + "time_per_iteration": 2.509577989578247 + }, + { + "auxiliary_loss_clip": 0.01134478, + "auxiliary_loss_mlp": 0.00786277, + "balance_loss_clip": 1.05162406, + "balance_loss_mlp": 1.00049686, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 2.0183162410541664, + "language_loss": 0.81403744, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83324492, + "num_input_tokens_seen": 57724190, + "step": 2667, + "time_per_iteration": 2.543764352798462 + }, + { + "auxiliary_loss_clip": 0.01166877, + "auxiliary_loss_mlp": 0.01051585, + "balance_loss_clip": 1.05835176, + "balance_loss_mlp": 1.0316174, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 1.9987899619469789, + "language_loss": 0.74033225, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76251686, + "num_input_tokens_seen": 57743620, + "step": 2668, + "time_per_iteration": 2.4692344665527344 + }, + { + "auxiliary_loss_clip": 0.01151104, + "auxiliary_loss_mlp": 0.01052601, + "balance_loss_clip": 1.05546117, + "balance_loss_mlp": 1.03318191, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.647910426500593, + "language_loss": 0.77325726, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79529428, + "num_input_tokens_seen": 57764810, + "step": 2669, + "time_per_iteration": 3.995464563369751 + }, + { + "auxiliary_loss_clip": 0.01068086, + "auxiliary_loss_mlp": 0.01004665, + "balance_loss_clip": 1.03510952, + "balance_loss_mlp": 1.00139904, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.7949332024064886, + "language_loss": 0.55555522, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57628274, + "num_input_tokens_seen": 57824390, + "step": 2670, + "time_per_iteration": 2.956777572631836 + }, + { + "auxiliary_loss_clip": 0.0115081, + "auxiliary_loss_mlp": 0.01049228, + "balance_loss_clip": 1.05300367, + "balance_loss_mlp": 1.03051186, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 2.6564178583489104, + "language_loss": 0.77514851, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79714888, + "num_input_tokens_seen": 57843665, + "step": 2671, + "time_per_iteration": 3.866701126098633 + }, + { + "auxiliary_loss_clip": 0.01154438, + "auxiliary_loss_mlp": 0.01049037, + "balance_loss_clip": 1.05453157, + "balance_loss_mlp": 1.03040433, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 3.6843731543741725, + "language_loss": 0.65318274, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.67521751, + "num_input_tokens_seen": 57863305, + "step": 2672, + "time_per_iteration": 2.5707786083221436 + }, + { + "auxiliary_loss_clip": 0.01149755, + "auxiliary_loss_mlp": 0.01043344, + "balance_loss_clip": 1.0550344, + "balance_loss_mlp": 1.02263749, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 2.2749842649618466, + "language_loss": 0.85658705, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.8785181, + "num_input_tokens_seen": 57883025, + "step": 2673, + "time_per_iteration": 2.4723825454711914 + }, + { + "auxiliary_loss_clip": 0.01127806, + "auxiliary_loss_mlp": 0.01055181, + "balance_loss_clip": 1.05263638, + "balance_loss_mlp": 1.03670394, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.2048768647501356, + "language_loss": 0.72656059, + "learning_rate": 3.823279846575403e-06, + "loss": 0.7483905, + "num_input_tokens_seen": 57901430, + "step": 2674, + "time_per_iteration": 2.539383888244629 + }, + { + "auxiliary_loss_clip": 0.01152193, + "auxiliary_loss_mlp": 0.01048595, + "balance_loss_clip": 1.05405402, + "balance_loss_mlp": 1.02863967, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 1.577882015736375, + "language_loss": 0.84510797, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86711586, + "num_input_tokens_seen": 57919550, + "step": 2675, + "time_per_iteration": 2.4861068725585938 + }, + { + "auxiliary_loss_clip": 0.01117127, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_clip": 1.04819441, + "balance_loss_mlp": 1.03436553, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 2.064692864665217, + "language_loss": 0.82719058, + "learning_rate": 3.822959578715685e-06, + "loss": 0.8489145, + "num_input_tokens_seen": 57939890, + "step": 2676, + "time_per_iteration": 2.6624512672424316 + }, + { + "auxiliary_loss_clip": 0.0115047, + "auxiliary_loss_mlp": 0.01051135, + "balance_loss_clip": 1.05610466, + "balance_loss_mlp": 1.03363478, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1.7643720664717957, + "language_loss": 0.73118114, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75319713, + "num_input_tokens_seen": 57957410, + "step": 2677, + "time_per_iteration": 2.4731099605560303 + }, + { + "auxiliary_loss_clip": 0.01136767, + "auxiliary_loss_mlp": 0.01045797, + "balance_loss_clip": 1.05196166, + "balance_loss_mlp": 1.02745056, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 2.135421112663134, + "language_loss": 0.76273215, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78455782, + "num_input_tokens_seen": 57977900, + "step": 2678, + "time_per_iteration": 4.038446664810181 + }, + { + "auxiliary_loss_clip": 0.01150033, + "auxiliary_loss_mlp": 0.01046066, + "balance_loss_clip": 1.05335617, + "balance_loss_mlp": 1.0257175, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 2.3440087687636346, + "language_loss": 0.70331067, + "learning_rate": 3.822478658490228e-06, + "loss": 0.7252717, + "num_input_tokens_seen": 57998210, + "step": 2679, + "time_per_iteration": 2.6071395874023438 + }, + { + "auxiliary_loss_clip": 0.01043056, + "auxiliary_loss_mlp": 0.00759149, + "balance_loss_clip": 1.03125942, + "balance_loss_mlp": 1.0001328, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.7882593619024254, + "language_loss": 0.51806766, + "learning_rate": 3.822318213523154e-06, + "loss": 0.53608972, + "num_input_tokens_seen": 58059420, + "step": 2680, + "time_per_iteration": 3.1672844886779785 + }, + { + "auxiliary_loss_clip": 0.01142334, + "auxiliary_loss_mlp": 0.01045971, + "balance_loss_clip": 1.05131292, + "balance_loss_mlp": 1.02586007, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.6467363862711888, + "language_loss": 0.80966187, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.83154488, + "num_input_tokens_seen": 58078370, + "step": 2681, + "time_per_iteration": 2.5604074001312256 + }, + { + "auxiliary_loss_clip": 0.01138197, + "auxiliary_loss_mlp": 0.01057649, + "balance_loss_clip": 1.05365467, + "balance_loss_mlp": 1.03943336, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.931193360617953, + "language_loss": 0.69315946, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71511793, + "num_input_tokens_seen": 58097395, + "step": 2682, + "time_per_iteration": 2.599832773208618 + }, + { + "auxiliary_loss_clip": 0.01140351, + "auxiliary_loss_mlp": 0.01049497, + "balance_loss_clip": 1.05639613, + "balance_loss_mlp": 1.03013706, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 2.287053488149796, + "language_loss": 0.87356853, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89546704, + "num_input_tokens_seen": 58115630, + "step": 2683, + "time_per_iteration": 2.5210609436035156 + }, + { + "auxiliary_loss_clip": 0.01163799, + "auxiliary_loss_mlp": 0.01058172, + "balance_loss_clip": 1.05564356, + "balance_loss_mlp": 1.03893149, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 1.8001821252756918, + "language_loss": 0.74665797, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76887763, + "num_input_tokens_seen": 58138655, + "step": 2684, + "time_per_iteration": 2.6038663387298584 + }, + { + "auxiliary_loss_clip": 0.01138076, + "auxiliary_loss_mlp": 0.00788149, + "balance_loss_clip": 1.0533042, + "balance_loss_mlp": 1.00054002, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 3.0918747736366785, + "language_loss": 0.70641291, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72567523, + "num_input_tokens_seen": 58157440, + "step": 2685, + "time_per_iteration": 2.624856948852539 + }, + { + "auxiliary_loss_clip": 0.01117967, + "auxiliary_loss_mlp": 0.01059028, + "balance_loss_clip": 1.05368972, + "balance_loss_mlp": 1.03835666, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 1.865346523604106, + "language_loss": 0.71760952, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73937947, + "num_input_tokens_seen": 58176660, + "step": 2686, + "time_per_iteration": 4.072430372238159 + }, + { + "auxiliary_loss_clip": 0.01154112, + "auxiliary_loss_mlp": 0.01054153, + "balance_loss_clip": 1.054896, + "balance_loss_mlp": 1.03475821, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 2.389289670797744, + "language_loss": 0.82201546, + "learning_rate": 3.821193164224981e-06, + "loss": 0.84409809, + "num_input_tokens_seen": 58195085, + "step": 2687, + "time_per_iteration": 2.4797565937042236 + }, + { + "auxiliary_loss_clip": 0.01154021, + "auxiliary_loss_mlp": 0.01048397, + "balance_loss_clip": 1.05200112, + "balance_loss_mlp": 1.02759445, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.9333377131109037, + "language_loss": 0.71884286, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74086702, + "num_input_tokens_seen": 58213540, + "step": 2688, + "time_per_iteration": 2.5029640197753906 + }, + { + "auxiliary_loss_clip": 0.01123952, + "auxiliary_loss_mlp": 0.01050805, + "balance_loss_clip": 1.05098486, + "balance_loss_mlp": 1.03193378, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 2.0161373928962627, + "language_loss": 0.75836653, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78011405, + "num_input_tokens_seen": 58236995, + "step": 2689, + "time_per_iteration": 2.6512997150421143 + }, + { + "auxiliary_loss_clip": 0.01166202, + "auxiliary_loss_mlp": 0.01052919, + "balance_loss_clip": 1.05916691, + "balance_loss_mlp": 1.03272533, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 9.962543450491534, + "language_loss": 0.87493849, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89712965, + "num_input_tokens_seen": 58257230, + "step": 2690, + "time_per_iteration": 2.4821231365203857 + }, + { + "auxiliary_loss_clip": 0.01145835, + "auxiliary_loss_mlp": 0.01047665, + "balance_loss_clip": 1.05541432, + "balance_loss_mlp": 1.03009319, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 4.652240908346697, + "language_loss": 0.88065046, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.90258539, + "num_input_tokens_seen": 58277080, + "step": 2691, + "time_per_iteration": 2.4977903366088867 + }, + { + "auxiliary_loss_clip": 0.01151924, + "auxiliary_loss_mlp": 0.0104797, + "balance_loss_clip": 1.05327582, + "balance_loss_mlp": 1.02677417, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 3.563074606577422, + "language_loss": 0.8208878, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84288681, + "num_input_tokens_seen": 58294815, + "step": 2692, + "time_per_iteration": 2.5193214416503906 + }, + { + "auxiliary_loss_clip": 0.0116913, + "auxiliary_loss_mlp": 0.01052356, + "balance_loss_clip": 1.05581379, + "balance_loss_mlp": 1.03172064, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 2.1254803722340285, + "language_loss": 0.81259167, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83480656, + "num_input_tokens_seen": 58313215, + "step": 2693, + "time_per_iteration": 2.4872186183929443 + }, + { + "auxiliary_loss_clip": 0.01162191, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_clip": 1.05728757, + "balance_loss_mlp": 1.03546476, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.4756560174846212, + "language_loss": 0.83731508, + "learning_rate": 3.820064730995783e-06, + "loss": 0.85947335, + "num_input_tokens_seen": 58333215, + "step": 2694, + "time_per_iteration": 2.497377872467041 + }, + { + "auxiliary_loss_clip": 0.01118177, + "auxiliary_loss_mlp": 0.01054534, + "balance_loss_clip": 1.04958034, + "balance_loss_mlp": 1.03356552, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 2.888686836217608, + "language_loss": 0.69236887, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71409595, + "num_input_tokens_seen": 58351160, + "step": 2695, + "time_per_iteration": 2.5842649936676025 + }, + { + "auxiliary_loss_clip": 0.01157138, + "auxiliary_loss_mlp": 0.01053284, + "balance_loss_clip": 1.05839193, + "balance_loss_mlp": 1.0329349, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 4.143169268090086, + "language_loss": 0.83123446, + "learning_rate": 3.819741700256637e-06, + "loss": 0.8533386, + "num_input_tokens_seen": 58368505, + "step": 2696, + "time_per_iteration": 2.5079638957977295 + }, + { + "auxiliary_loss_clip": 0.0117291, + "auxiliary_loss_mlp": 0.01058529, + "balance_loss_clip": 1.05848503, + "balance_loss_mlp": 1.03697562, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 1.8850482912297923, + "language_loss": 0.8830508, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90536517, + "num_input_tokens_seen": 58385085, + "step": 2697, + "time_per_iteration": 2.4563567638397217 + }, + { + "auxiliary_loss_clip": 0.01157376, + "auxiliary_loss_mlp": 0.01048256, + "balance_loss_clip": 1.05352616, + "balance_loss_mlp": 1.03044641, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.6977680618710216, + "language_loss": 0.80792534, + "learning_rate": 3.819418393498343e-06, + "loss": 0.82998168, + "num_input_tokens_seen": 58406985, + "step": 2698, + "time_per_iteration": 2.5822253227233887 + }, + { + "auxiliary_loss_clip": 0.01146664, + "auxiliary_loss_mlp": 0.01045681, + "balance_loss_clip": 1.05503535, + "balance_loss_mlp": 1.02672637, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.6298332153643422, + "language_loss": 0.77661252, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79853594, + "num_input_tokens_seen": 58426205, + "step": 2699, + "time_per_iteration": 2.538442611694336 + }, + { + "auxiliary_loss_clip": 0.01136106, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_clip": 1.05297089, + "balance_loss_mlp": 1.02742505, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.64729584153639, + "language_loss": 0.86471277, + "learning_rate": 3.81909481076994e-06, + "loss": 0.88652563, + "num_input_tokens_seen": 58443830, + "step": 2700, + "time_per_iteration": 2.535418748855591 + }, + { + "auxiliary_loss_clip": 0.01149536, + "auxiliary_loss_mlp": 0.00785175, + "balance_loss_clip": 1.05274463, + "balance_loss_mlp": 1.00063348, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.5553110747203687, + "language_loss": 0.80515307, + "learning_rate": 3.818932915932284e-06, + "loss": 0.8245002, + "num_input_tokens_seen": 58464405, + "step": 2701, + "time_per_iteration": 2.6368064880371094 + }, + { + "auxiliary_loss_clip": 0.01140819, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_clip": 1.05455518, + "balance_loss_mlp": 1.02674091, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.6635590869569314, + "language_loss": 0.73165464, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75351644, + "num_input_tokens_seen": 58483295, + "step": 2702, + "time_per_iteration": 2.545250415802002 + }, + { + "auxiliary_loss_clip": 0.01152942, + "auxiliary_loss_mlp": 0.01051558, + "balance_loss_clip": 1.05535746, + "balance_loss_mlp": 1.03064823, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 1.9233091413056524, + "language_loss": 0.72855121, + "learning_rate": 3.81860891934076e-06, + "loss": 0.75059617, + "num_input_tokens_seen": 58501205, + "step": 2703, + "time_per_iteration": 2.4772982597351074 + }, + { + "auxiliary_loss_clip": 0.01162475, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_clip": 1.05348563, + "balance_loss_mlp": 1.02607751, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.864825124140232, + "language_loss": 0.70369947, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72579134, + "num_input_tokens_seen": 58522315, + "step": 2704, + "time_per_iteration": 2.554871082305908 + }, + { + "auxiliary_loss_clip": 0.01024837, + "auxiliary_loss_mlp": 0.01004013, + "balance_loss_clip": 1.02486277, + "balance_loss_mlp": 1.00086617, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7795654176099741, + "language_loss": 0.53373861, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55402714, + "num_input_tokens_seen": 58586695, + "step": 2705, + "time_per_iteration": 3.1392605304718018 + }, + { + "auxiliary_loss_clip": 0.01136723, + "auxiliary_loss_mlp": 0.00787997, + "balance_loss_clip": 1.05115056, + "balance_loss_mlp": 1.00070155, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.989524951512854, + "language_loss": 0.75437057, + "learning_rate": 3.818122407255102e-06, + "loss": 0.7736178, + "num_input_tokens_seen": 58602435, + "step": 2706, + "time_per_iteration": 2.512484550476074 + }, + { + "auxiliary_loss_clip": 0.01129098, + "auxiliary_loss_mlp": 0.01050997, + "balance_loss_clip": 1.04944241, + "balance_loss_mlp": 1.03244758, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.8623444390806345, + "language_loss": 0.72339386, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74519479, + "num_input_tokens_seen": 58621275, + "step": 2707, + "time_per_iteration": 2.645549774169922 + }, + { + "auxiliary_loss_clip": 0.01142791, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.05464578, + "balance_loss_mlp": 1.03021312, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 3.2523189626453606, + "language_loss": 0.84199697, + "learning_rate": 3.817797721137495e-06, + "loss": 0.86391377, + "num_input_tokens_seen": 58637550, + "step": 2708, + "time_per_iteration": 2.514319896697998 + }, + { + "auxiliary_loss_clip": 0.01101383, + "auxiliary_loss_mlp": 0.00790377, + "balance_loss_clip": 1.04675698, + "balance_loss_mlp": 1.00066268, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.0280074680858564, + "language_loss": 0.86186308, + "learning_rate": 3.817635274679006e-06, + "loss": 0.8807807, + "num_input_tokens_seen": 58654135, + "step": 2709, + "time_per_iteration": 4.047273874282837 + }, + { + "auxiliary_loss_clip": 0.0113997, + "auxiliary_loss_mlp": 0.00784407, + "balance_loss_clip": 1.0521698, + "balance_loss_mlp": 1.00055039, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.69653965586132, + "language_loss": 0.91559517, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93483895, + "num_input_tokens_seen": 58674320, + "step": 2710, + "time_per_iteration": 2.5401206016540527 + }, + { + "auxiliary_loss_clip": 0.01119608, + "auxiliary_loss_mlp": 0.01057079, + "balance_loss_clip": 1.05359304, + "balance_loss_mlp": 1.03837478, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 2.032864857664557, + "language_loss": 0.8152312, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83699811, + "num_input_tokens_seen": 58691000, + "step": 2711, + "time_per_iteration": 4.010707139968872 + }, + { + "auxiliary_loss_clip": 0.01152224, + "auxiliary_loss_mlp": 0.01043578, + "balance_loss_clip": 1.05579996, + "balance_loss_mlp": 1.02446902, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.1940200110075154, + "language_loss": 0.81094396, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83290195, + "num_input_tokens_seen": 58710230, + "step": 2712, + "time_per_iteration": 2.545306444168091 + }, + { + "auxiliary_loss_clip": 0.01169398, + "auxiliary_loss_mlp": 0.01056362, + "balance_loss_clip": 1.05762315, + "balance_loss_mlp": 1.03678775, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 1.952303230877819, + "language_loss": 0.76990086, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79215848, + "num_input_tokens_seen": 58728610, + "step": 2713, + "time_per_iteration": 2.5270884037017822 + }, + { + "auxiliary_loss_clip": 0.01153964, + "auxiliary_loss_mlp": 0.01061303, + "balance_loss_clip": 1.05896401, + "balance_loss_mlp": 1.04118013, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.0684299230150938, + "language_loss": 0.7943126, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.81646526, + "num_input_tokens_seen": 58744385, + "step": 2714, + "time_per_iteration": 2.5026698112487793 + }, + { + "auxiliary_loss_clip": 0.01152787, + "auxiliary_loss_mlp": 0.01052634, + "balance_loss_clip": 1.05626607, + "balance_loss_mlp": 1.03415704, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 2.4657695969693627, + "language_loss": 0.78107989, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80313408, + "num_input_tokens_seen": 58763905, + "step": 2715, + "time_per_iteration": 2.556461811065674 + }, + { + "auxiliary_loss_clip": 0.01129942, + "auxiliary_loss_mlp": 0.01042709, + "balance_loss_clip": 1.04918742, + "balance_loss_mlp": 1.02500653, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.3244167220959575, + "language_loss": 0.8151983, + "learning_rate": 3.816496219917336e-06, + "loss": 0.83692479, + "num_input_tokens_seen": 58785580, + "step": 2716, + "time_per_iteration": 2.5626611709594727 + }, + { + "auxiliary_loss_clip": 0.0114475, + "auxiliary_loss_mlp": 0.0105022, + "balance_loss_clip": 1.05819178, + "balance_loss_mlp": 1.03210032, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 2.8993091816121845, + "language_loss": 0.86256611, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88451576, + "num_input_tokens_seen": 58806075, + "step": 2717, + "time_per_iteration": 3.957880735397339 + }, + { + "auxiliary_loss_clip": 0.01137871, + "auxiliary_loss_mlp": 0.01046073, + "balance_loss_clip": 1.05621016, + "balance_loss_mlp": 1.02842975, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.818192221144235, + "language_loss": 0.76853979, + "learning_rate": 3.816170155671629e-06, + "loss": 0.79037929, + "num_input_tokens_seen": 58827405, + "step": 2718, + "time_per_iteration": 2.6229310035705566 + }, + { + "auxiliary_loss_clip": 0.01147155, + "auxiliary_loss_mlp": 0.01043373, + "balance_loss_clip": 1.0593555, + "balance_loss_mlp": 1.0258131, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 1.7234516733020546, + "language_loss": 0.73853439, + "learning_rate": 3.816007020241652e-06, + "loss": 0.76043963, + "num_input_tokens_seen": 58847205, + "step": 2719, + "time_per_iteration": 2.5608363151550293 + }, + { + "auxiliary_loss_clip": 0.01129974, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.05037129, + "balance_loss_mlp": 1.02309728, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.7970970756784421, + "language_loss": 0.72383845, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74554658, + "num_input_tokens_seen": 58866865, + "step": 2720, + "time_per_iteration": 2.5683867931365967 + }, + { + "auxiliary_loss_clip": 0.01110091, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_clip": 1.04936934, + "balance_loss_mlp": 1.02554989, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.517084425286009, + "language_loss": 0.75139147, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.77294827, + "num_input_tokens_seen": 58885200, + "step": 2721, + "time_per_iteration": 2.600870132446289 + }, + { + "auxiliary_loss_clip": 0.01120236, + "auxiliary_loss_mlp": 0.01061311, + "balance_loss_clip": 1.05225408, + "balance_loss_mlp": 1.03910232, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 5.054571519862783, + "language_loss": 0.78799009, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.80980551, + "num_input_tokens_seen": 58906385, + "step": 2722, + "time_per_iteration": 2.6399338245391846 + }, + { + "auxiliary_loss_clip": 0.01150663, + "auxiliary_loss_mlp": 0.00787036, + "balance_loss_clip": 1.052742, + "balance_loss_mlp": 1.00065243, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.1430331293189493, + "language_loss": 0.84952128, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86889827, + "num_input_tokens_seen": 58925040, + "step": 2723, + "time_per_iteration": 2.524824619293213 + }, + { + "auxiliary_loss_clip": 0.01113929, + "auxiliary_loss_mlp": 0.01045218, + "balance_loss_clip": 1.05139685, + "balance_loss_mlp": 1.02573967, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 1.9361125153500012, + "language_loss": 0.71140862, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73300004, + "num_input_tokens_seen": 58944790, + "step": 2724, + "time_per_iteration": 2.631398916244507 + }, + { + "auxiliary_loss_clip": 0.01119023, + "auxiliary_loss_mlp": 0.01048004, + "balance_loss_clip": 1.05131352, + "balance_loss_mlp": 1.02997971, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.6686968431243807, + "language_loss": 0.7059809, + "learning_rate": 3.815026761751955e-06, + "loss": 0.72765112, + "num_input_tokens_seen": 58962500, + "step": 2725, + "time_per_iteration": 3.9025368690490723 + }, + { + "auxiliary_loss_clip": 0.01116103, + "auxiliary_loss_mlp": 0.01043755, + "balance_loss_clip": 1.05220509, + "balance_loss_mlp": 1.02583802, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.961942516009715, + "language_loss": 0.88616526, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90776384, + "num_input_tokens_seen": 58980355, + "step": 2726, + "time_per_iteration": 2.574215888977051 + }, + { + "auxiliary_loss_clip": 0.01159084, + "auxiliary_loss_mlp": 0.01051994, + "balance_loss_clip": 1.06176949, + "balance_loss_mlp": 1.03351617, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 1.7737101622491194, + "language_loss": 0.7404967, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76260746, + "num_input_tokens_seen": 58999505, + "step": 2727, + "time_per_iteration": 2.522914409637451 + }, + { + "auxiliary_loss_clip": 0.01150359, + "auxiliary_loss_mlp": 0.01050675, + "balance_loss_clip": 1.0573951, + "balance_loss_mlp": 1.03365171, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 2.297656105822463, + "language_loss": 0.82590455, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84791487, + "num_input_tokens_seen": 59017930, + "step": 2728, + "time_per_iteration": 2.51259183883667 + }, + { + "auxiliary_loss_clip": 0.01155388, + "auxiliary_loss_mlp": 0.01047399, + "balance_loss_clip": 1.05643809, + "balance_loss_mlp": 1.02842116, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.357589390221824, + "language_loss": 0.85559303, + "learning_rate": 3.814371879489633e-06, + "loss": 0.87762094, + "num_input_tokens_seen": 59035130, + "step": 2729, + "time_per_iteration": 2.4639530181884766 + }, + { + "auxiliary_loss_clip": 0.01166206, + "auxiliary_loss_mlp": 0.01046805, + "balance_loss_clip": 1.05843139, + "balance_loss_mlp": 1.02888775, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.9918218665573768, + "language_loss": 0.7244584, + "learning_rate": 3.814207986905616e-06, + "loss": 0.74658853, + "num_input_tokens_seen": 59053080, + "step": 2730, + "time_per_iteration": 2.470609664916992 + }, + { + "auxiliary_loss_clip": 0.01142365, + "auxiliary_loss_mlp": 0.01050827, + "balance_loss_clip": 1.05228043, + "balance_loss_mlp": 1.0302875, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 2.219825715819825, + "language_loss": 0.74596924, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76790112, + "num_input_tokens_seen": 59075610, + "step": 2731, + "time_per_iteration": 2.7362213134765625 + }, + { + "auxiliary_loss_clip": 0.01125694, + "auxiliary_loss_mlp": 0.01053997, + "balance_loss_clip": 1.05050814, + "balance_loss_mlp": 1.03401721, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.26888438941647, + "language_loss": 0.79204023, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.81383711, + "num_input_tokens_seen": 59094555, + "step": 2732, + "time_per_iteration": 2.5587191581726074 + }, + { + "auxiliary_loss_clip": 0.01141273, + "auxiliary_loss_mlp": 0.01048965, + "balance_loss_clip": 1.05237043, + "balance_loss_mlp": 1.03022528, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 2.2101763750756622, + "language_loss": 0.70183498, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.72373736, + "num_input_tokens_seen": 59113515, + "step": 2733, + "time_per_iteration": 2.560082197189331 + }, + { + "auxiliary_loss_clip": 0.01143858, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_clip": 1.05613065, + "balance_loss_mlp": 1.02881038, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 1.8446205059421055, + "language_loss": 0.80777657, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.82970297, + "num_input_tokens_seen": 59133275, + "step": 2734, + "time_per_iteration": 2.5785255432128906 + }, + { + "auxiliary_loss_clip": 0.01132381, + "auxiliary_loss_mlp": 0.0105999, + "balance_loss_clip": 1.05025113, + "balance_loss_mlp": 1.03875911, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 2.04854644191817, + "language_loss": 0.82493544, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84685916, + "num_input_tokens_seen": 59154095, + "step": 2735, + "time_per_iteration": 2.6321845054626465 + }, + { + "auxiliary_loss_clip": 0.01075139, + "auxiliary_loss_mlp": 0.01045536, + "balance_loss_clip": 1.04443347, + "balance_loss_mlp": 1.0265696, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 3.3079069577709923, + "language_loss": 0.78959572, + "learning_rate": 3.813223186925296e-06, + "loss": 0.81080246, + "num_input_tokens_seen": 59173795, + "step": 2736, + "time_per_iteration": 2.666109800338745 + }, + { + "auxiliary_loss_clip": 0.01146333, + "auxiliary_loss_mlp": 0.01053692, + "balance_loss_clip": 1.0581758, + "balance_loss_mlp": 1.03490436, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.85890936152898, + "language_loss": 0.81312346, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83512372, + "num_input_tokens_seen": 59191610, + "step": 2737, + "time_per_iteration": 2.5788562297821045 + }, + { + "auxiliary_loss_clip": 0.01147796, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_clip": 1.05268145, + "balance_loss_mlp": 1.03494287, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8801845084293758, + "language_loss": 0.87719882, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89922959, + "num_input_tokens_seen": 59213000, + "step": 2738, + "time_per_iteration": 2.6002237796783447 + }, + { + "auxiliary_loss_clip": 0.01143019, + "auxiliary_loss_mlp": 0.01060261, + "balance_loss_clip": 1.05650723, + "balance_loss_mlp": 1.04144955, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 1.6941541534369542, + "language_loss": 0.71986222, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74189508, + "num_input_tokens_seen": 59232340, + "step": 2739, + "time_per_iteration": 2.563429117202759 + }, + { + "auxiliary_loss_clip": 0.01155567, + "auxiliary_loss_mlp": 0.01044905, + "balance_loss_clip": 1.05777693, + "balance_loss_mlp": 1.02564096, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.715903110548655, + "language_loss": 0.81281888, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.83482361, + "num_input_tokens_seen": 59253950, + "step": 2740, + "time_per_iteration": 2.5925660133361816 + }, + { + "auxiliary_loss_clip": 0.01114191, + "auxiliary_loss_mlp": 0.01066396, + "balance_loss_clip": 1.04721498, + "balance_loss_mlp": 1.04114795, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 1.9411744903767691, + "language_loss": 0.69240528, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71421111, + "num_input_tokens_seen": 59275545, + "step": 2741, + "time_per_iteration": 2.732539176940918 + }, + { + "auxiliary_loss_clip": 0.01164507, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_clip": 1.05665922, + "balance_loss_mlp": 1.02732134, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 1.9740701102442006, + "language_loss": 0.79860914, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82072341, + "num_input_tokens_seen": 59293480, + "step": 2742, + "time_per_iteration": 2.5522634983062744 + }, + { + "auxiliary_loss_clip": 0.01139336, + "auxiliary_loss_mlp": 0.01055445, + "balance_loss_clip": 1.05528867, + "balance_loss_mlp": 1.03510761, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 2.052032856738034, + "language_loss": 0.84615767, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86810547, + "num_input_tokens_seen": 59313435, + "step": 2743, + "time_per_iteration": 2.5742082595825195 + }, + { + "auxiliary_loss_clip": 0.0116062, + "auxiliary_loss_mlp": 0.01052657, + "balance_loss_clip": 1.05471051, + "balance_loss_mlp": 1.03277278, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.63750668690879, + "language_loss": 0.85694599, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87907881, + "num_input_tokens_seen": 59331535, + "step": 2744, + "time_per_iteration": 2.507441282272339 + }, + { + "auxiliary_loss_clip": 0.01129953, + "auxiliary_loss_mlp": 0.01047169, + "balance_loss_clip": 1.05009949, + "balance_loss_mlp": 1.02832246, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.667113328037982, + "language_loss": 0.83108711, + "learning_rate": 3.811741346238036e-06, + "loss": 0.8528583, + "num_input_tokens_seen": 59350680, + "step": 2745, + "time_per_iteration": 2.5860557556152344 + }, + { + "auxiliary_loss_clip": 0.01132664, + "auxiliary_loss_mlp": 0.01049777, + "balance_loss_clip": 1.05825305, + "balance_loss_mlp": 1.03082228, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 2.0468262900796366, + "language_loss": 0.76756144, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.78938591, + "num_input_tokens_seen": 59367020, + "step": 2746, + "time_per_iteration": 2.5577733516693115 + }, + { + "auxiliary_loss_clip": 0.01166341, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_clip": 1.05811691, + "balance_loss_mlp": 1.03064954, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.714157369952264, + "language_loss": 0.80577755, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82794011, + "num_input_tokens_seen": 59386075, + "step": 2747, + "time_per_iteration": 2.541909694671631 + }, + { + "auxiliary_loss_clip": 0.01157379, + "auxiliary_loss_mlp": 0.01048856, + "balance_loss_clip": 1.05941093, + "balance_loss_mlp": 1.02895951, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 1.8403577005197853, + "language_loss": 0.69365001, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71571231, + "num_input_tokens_seen": 59402690, + "step": 2748, + "time_per_iteration": 3.9805846214294434 + }, + { + "auxiliary_loss_clip": 0.01167867, + "auxiliary_loss_mlp": 0.00786206, + "balance_loss_clip": 1.05896997, + "balance_loss_mlp": 1.00069082, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.3925925494256726, + "language_loss": 0.88435197, + "learning_rate": 3.811080963869561e-06, + "loss": 0.90389276, + "num_input_tokens_seen": 59421130, + "step": 2749, + "time_per_iteration": 2.5208284854888916 + }, + { + "auxiliary_loss_clip": 0.0115142, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_clip": 1.05334496, + "balance_loss_mlp": 1.02704966, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.434887656145232, + "language_loss": 0.78739643, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.80938327, + "num_input_tokens_seen": 59438970, + "step": 2750, + "time_per_iteration": 4.815222263336182 + }, + { + "auxiliary_loss_clip": 0.01155092, + "auxiliary_loss_mlp": 0.01045672, + "balance_loss_clip": 1.0572443, + "balance_loss_mlp": 1.025967, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.6784381516267857, + "language_loss": 0.94741809, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.9694258, + "num_input_tokens_seen": 59458510, + "step": 2751, + "time_per_iteration": 2.5223746299743652 + }, + { + "auxiliary_loss_clip": 0.01079167, + "auxiliary_loss_mlp": 0.01056179, + "balance_loss_clip": 1.05289865, + "balance_loss_mlp": 1.03569841, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 2.0251946816004045, + "language_loss": 0.71058124, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73193479, + "num_input_tokens_seen": 59477110, + "step": 2752, + "time_per_iteration": 2.691768169403076 + }, + { + "auxiliary_loss_clip": 0.01073709, + "auxiliary_loss_mlp": 0.01005959, + "balance_loss_clip": 1.04019332, + "balance_loss_mlp": 1.00299072, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.8140083391583405, + "language_loss": 0.54045105, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56124771, + "num_input_tokens_seen": 59541155, + "step": 2753, + "time_per_iteration": 3.1919400691986084 + }, + { + "auxiliary_loss_clip": 0.01163994, + "auxiliary_loss_mlp": 0.00785933, + "balance_loss_clip": 1.05479932, + "balance_loss_mlp": 1.00085318, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.875640850577083, + "language_loss": 0.75412548, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77362478, + "num_input_tokens_seen": 59561155, + "step": 2754, + "time_per_iteration": 2.559525966644287 + }, + { + "auxiliary_loss_clip": 0.01140726, + "auxiliary_loss_mlp": 0.0105973, + "balance_loss_clip": 1.05291271, + "balance_loss_mlp": 1.03617394, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 2.142376173940083, + "language_loss": 0.8667835, + "learning_rate": 3.810088330151188e-06, + "loss": 0.88878798, + "num_input_tokens_seen": 59580460, + "step": 2755, + "time_per_iteration": 2.5793206691741943 + }, + { + "auxiliary_loss_clip": 0.01125744, + "auxiliary_loss_mlp": 0.01049531, + "balance_loss_clip": 1.04931426, + "balance_loss_mlp": 1.0299809, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.8869332141661435, + "language_loss": 0.73359799, + "learning_rate": 3.80992265092595e-06, + "loss": 0.75535083, + "num_input_tokens_seen": 59600025, + "step": 2756, + "time_per_iteration": 2.620352268218994 + }, + { + "auxiliary_loss_clip": 0.01131593, + "auxiliary_loss_mlp": 0.01045581, + "balance_loss_clip": 1.0553813, + "balance_loss_mlp": 1.02640009, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.6408683633486443, + "language_loss": 0.75039113, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77216291, + "num_input_tokens_seen": 59620600, + "step": 2757, + "time_per_iteration": 4.015758037567139 + }, + { + "auxiliary_loss_clip": 0.01143188, + "auxiliary_loss_mlp": 0.01045237, + "balance_loss_clip": 1.05648541, + "balance_loss_mlp": 1.02641392, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 1.66657079375883, + "language_loss": 0.84481812, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86670232, + "num_input_tokens_seen": 59641385, + "step": 2758, + "time_per_iteration": 2.6116244792938232 + }, + { + "auxiliary_loss_clip": 0.01167828, + "auxiliary_loss_mlp": 0.0105479, + "balance_loss_clip": 1.06013799, + "balance_loss_mlp": 1.03638411, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 1.8210927068299887, + "language_loss": 0.78811574, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81034195, + "num_input_tokens_seen": 59659865, + "step": 2759, + "time_per_iteration": 2.5521509647369385 + }, + { + "auxiliary_loss_clip": 0.01103058, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.05056429, + "balance_loss_mlp": 1.02607298, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 2.3796760516434086, + "language_loss": 0.74994254, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77143508, + "num_input_tokens_seen": 59678780, + "step": 2760, + "time_per_iteration": 2.6604702472686768 + }, + { + "auxiliary_loss_clip": 0.01121293, + "auxiliary_loss_mlp": 0.01041408, + "balance_loss_clip": 1.05338919, + "balance_loss_mlp": 1.0223943, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 1.723254298883707, + "language_loss": 0.7295506, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75117767, + "num_input_tokens_seen": 59698795, + "step": 2761, + "time_per_iteration": 2.6779139041900635 + }, + { + "auxiliary_loss_clip": 0.01138762, + "auxiliary_loss_mlp": 0.01042642, + "balance_loss_clip": 1.05343235, + "balance_loss_mlp": 1.02352095, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 1.8825077187734651, + "language_loss": 0.88868964, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91050375, + "num_input_tokens_seen": 59718795, + "step": 2762, + "time_per_iteration": 2.601358413696289 + }, + { + "auxiliary_loss_clip": 0.01120505, + "auxiliary_loss_mlp": 0.01047688, + "balance_loss_clip": 1.0546515, + "balance_loss_mlp": 1.02807808, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.9283672182308094, + "language_loss": 0.88102239, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90270436, + "num_input_tokens_seen": 59737555, + "step": 2763, + "time_per_iteration": 2.6453099250793457 + }, + { + "auxiliary_loss_clip": 0.01073653, + "auxiliary_loss_mlp": 0.01013228, + "balance_loss_clip": 1.03048515, + "balance_loss_mlp": 1.01056957, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7785765022079777, + "language_loss": 0.59816754, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61903632, + "num_input_tokens_seen": 59800915, + "step": 2764, + "time_per_iteration": 4.535740613937378 + }, + { + "auxiliary_loss_clip": 0.01156178, + "auxiliary_loss_mlp": 0.01053866, + "balance_loss_clip": 1.05589199, + "balance_loss_mlp": 1.03271794, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 2.663130581104088, + "language_loss": 0.82052785, + "learning_rate": 3.808428450193401e-06, + "loss": 0.8426283, + "num_input_tokens_seen": 59822910, + "step": 2765, + "time_per_iteration": 2.5537514686584473 + }, + { + "auxiliary_loss_clip": 0.01175027, + "auxiliary_loss_mlp": 0.01053253, + "balance_loss_clip": 1.05962229, + "balance_loss_mlp": 1.03209352, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 3.060991636599477, + "language_loss": 0.6986323, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72091514, + "num_input_tokens_seen": 59838805, + "step": 2766, + "time_per_iteration": 2.4221320152282715 + }, + { + "auxiliary_loss_clip": 0.01153456, + "auxiliary_loss_mlp": 0.01047127, + "balance_loss_clip": 1.05883968, + "balance_loss_mlp": 1.02836299, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.2943008388297783, + "language_loss": 0.88370991, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90571576, + "num_input_tokens_seen": 59855345, + "step": 2767, + "time_per_iteration": 2.474994421005249 + }, + { + "auxiliary_loss_clip": 0.01063847, + "auxiliary_loss_mlp": 0.01006748, + "balance_loss_clip": 1.03043592, + "balance_loss_mlp": 1.00405335, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.6406954907279198, + "language_loss": 0.52880502, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54951096, + "num_input_tokens_seen": 59917710, + "step": 2768, + "time_per_iteration": 3.153320550918579 + }, + { + "auxiliary_loss_clip": 0.01143196, + "auxiliary_loss_mlp": 0.01055453, + "balance_loss_clip": 1.05712843, + "balance_loss_mlp": 1.03457975, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 3.6003736127547783, + "language_loss": 0.84665596, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.86864245, + "num_input_tokens_seen": 59935105, + "step": 2769, + "time_per_iteration": 2.5073888301849365 + }, + { + "auxiliary_loss_clip": 0.01055872, + "auxiliary_loss_mlp": 0.01005857, + "balance_loss_clip": 1.03075612, + "balance_loss_mlp": 1.00281668, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.812245295180394, + "language_loss": 0.57486331, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59548056, + "num_input_tokens_seen": 59984085, + "step": 2770, + "time_per_iteration": 2.9298553466796875 + }, + { + "auxiliary_loss_clip": 0.0104484, + "auxiliary_loss_mlp": 0.01005614, + "balance_loss_clip": 1.03311896, + "balance_loss_mlp": 1.00270581, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8684897829210715, + "language_loss": 0.56310642, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58361095, + "num_input_tokens_seen": 60043470, + "step": 2771, + "time_per_iteration": 2.951857089996338 + }, + { + "auxiliary_loss_clip": 0.01118761, + "auxiliary_loss_mlp": 0.01058042, + "balance_loss_clip": 1.05275369, + "balance_loss_mlp": 1.037359, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.654277329924346, + "language_loss": 0.70534045, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72710848, + "num_input_tokens_seen": 60063045, + "step": 2772, + "time_per_iteration": 2.6017050743103027 + }, + { + "auxiliary_loss_clip": 0.01154182, + "auxiliary_loss_mlp": 0.01046529, + "balance_loss_clip": 1.05561864, + "balance_loss_mlp": 1.02703834, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 1.9551537723857224, + "language_loss": 0.86279356, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88480067, + "num_input_tokens_seen": 60081945, + "step": 2773, + "time_per_iteration": 2.5572943687438965 + }, + { + "auxiliary_loss_clip": 0.01104301, + "auxiliary_loss_mlp": 0.0104317, + "balance_loss_clip": 1.04703927, + "balance_loss_mlp": 1.02407241, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.1179712912423536, + "language_loss": 0.82294071, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84441543, + "num_input_tokens_seen": 60096820, + "step": 2774, + "time_per_iteration": 2.5798511505126953 + }, + { + "auxiliary_loss_clip": 0.01127816, + "auxiliary_loss_mlp": 0.01045767, + "balance_loss_clip": 1.05226922, + "balance_loss_mlp": 1.0256927, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.138884182019628, + "language_loss": 0.83339667, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85513246, + "num_input_tokens_seen": 60116140, + "step": 2775, + "time_per_iteration": 2.5732991695404053 + }, + { + "auxiliary_loss_clip": 0.01155963, + "auxiliary_loss_mlp": 0.01055156, + "balance_loss_clip": 1.06037271, + "balance_loss_mlp": 1.03691649, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.7703078631805187, + "language_loss": 0.80568671, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82779801, + "num_input_tokens_seen": 60134235, + "step": 2776, + "time_per_iteration": 2.498051881790161 + }, + { + "auxiliary_loss_clip": 0.01148498, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_clip": 1.06242549, + "balance_loss_mlp": 1.02914143, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 1.8642164362798508, + "language_loss": 0.79845858, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82042962, + "num_input_tokens_seen": 60153275, + "step": 2777, + "time_per_iteration": 2.520864486694336 + }, + { + "auxiliary_loss_clip": 0.01155919, + "auxiliary_loss_mlp": 0.01050263, + "balance_loss_clip": 1.05615127, + "balance_loss_mlp": 1.0311774, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.6369088482224081, + "language_loss": 0.85604537, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87810719, + "num_input_tokens_seen": 60173215, + "step": 2778, + "time_per_iteration": 2.5423483848571777 + }, + { + "auxiliary_loss_clip": 0.01141987, + "auxiliary_loss_mlp": 0.01046887, + "balance_loss_clip": 1.05681539, + "balance_loss_mlp": 1.02789664, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 2.221144833744401, + "language_loss": 0.74690402, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76879269, + "num_input_tokens_seen": 60190515, + "step": 2779, + "time_per_iteration": 2.546565294265747 + }, + { + "auxiliary_loss_clip": 0.0111784, + "auxiliary_loss_mlp": 0.00785766, + "balance_loss_clip": 1.04971087, + "balance_loss_mlp": 1.00037551, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.0302322491034186, + "language_loss": 0.65453106, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67356718, + "num_input_tokens_seen": 60211655, + "step": 2780, + "time_per_iteration": 2.6242618560791016 + }, + { + "auxiliary_loss_clip": 0.01126534, + "auxiliary_loss_mlp": 0.01051858, + "balance_loss_clip": 1.04841805, + "balance_loss_mlp": 1.03088903, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.6876469182832707, + "language_loss": 0.78212237, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80390632, + "num_input_tokens_seen": 60230860, + "step": 2781, + "time_per_iteration": 2.5051653385162354 + }, + { + "auxiliary_loss_clip": 0.01105912, + "auxiliary_loss_mlp": 0.01048007, + "balance_loss_clip": 1.04753923, + "balance_loss_mlp": 1.02850461, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.57775273009111, + "language_loss": 0.75377864, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77531785, + "num_input_tokens_seen": 60250535, + "step": 2782, + "time_per_iteration": 2.5993874073028564 + }, + { + "auxiliary_loss_clip": 0.01136818, + "auxiliary_loss_mlp": 0.01055096, + "balance_loss_clip": 1.05334473, + "balance_loss_mlp": 1.03458047, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 2.0281694411159767, + "language_loss": 0.67757988, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.69949901, + "num_input_tokens_seen": 60269530, + "step": 2783, + "time_per_iteration": 2.6057674884796143 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.01052463, + "balance_loss_clip": 1.05810809, + "balance_loss_mlp": 1.03294849, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 1.6396359623402017, + "language_loss": 0.70315927, + "learning_rate": 3.805255790873081e-06, + "loss": 0.725357, + "num_input_tokens_seen": 60289900, + "step": 2784, + "time_per_iteration": 2.455103635787964 + }, + { + "auxiliary_loss_clip": 0.01145005, + "auxiliary_loss_mlp": 0.01054503, + "balance_loss_clip": 1.05461049, + "balance_loss_mlp": 1.03333116, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.7402169676472032, + "language_loss": 0.60403997, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62603503, + "num_input_tokens_seen": 60310025, + "step": 2785, + "time_per_iteration": 2.5668647289276123 + }, + { + "auxiliary_loss_clip": 0.01052882, + "auxiliary_loss_mlp": 0.01008453, + "balance_loss_clip": 1.02894771, + "balance_loss_mlp": 1.00535345, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.8086121336687637, + "language_loss": 0.58810329, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60871661, + "num_input_tokens_seen": 60377800, + "step": 2786, + "time_per_iteration": 3.117018461227417 + }, + { + "auxiliary_loss_clip": 0.01148031, + "auxiliary_loss_mlp": 0.0104996, + "balance_loss_clip": 1.05192399, + "balance_loss_mlp": 1.0294565, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 2.3929785098968144, + "language_loss": 0.76359856, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78557849, + "num_input_tokens_seen": 60398215, + "step": 2787, + "time_per_iteration": 3.9660799503326416 + }, + { + "auxiliary_loss_clip": 0.01154415, + "auxiliary_loss_mlp": 0.01046567, + "balance_loss_clip": 1.05653262, + "balance_loss_mlp": 1.02710032, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 1.907790314205626, + "language_loss": 0.77492529, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79693514, + "num_input_tokens_seen": 60416910, + "step": 2788, + "time_per_iteration": 2.5101263523101807 + }, + { + "auxiliary_loss_clip": 0.01048454, + "auxiliary_loss_mlp": 0.01003543, + "balance_loss_clip": 1.02347159, + "balance_loss_mlp": 1.00056303, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.8661928208317132, + "language_loss": 0.59409058, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61461055, + "num_input_tokens_seen": 60468660, + "step": 2789, + "time_per_iteration": 2.93503737449646 + }, + { + "auxiliary_loss_clip": 0.01152939, + "auxiliary_loss_mlp": 0.01060329, + "balance_loss_clip": 1.05445683, + "balance_loss_mlp": 1.03934813, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.6958584653750899, + "language_loss": 0.70074075, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72287345, + "num_input_tokens_seen": 60492370, + "step": 2790, + "time_per_iteration": 4.070323944091797 + }, + { + "auxiliary_loss_clip": 0.01131579, + "auxiliary_loss_mlp": 0.01057195, + "balance_loss_clip": 1.05111384, + "balance_loss_mlp": 1.0380981, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.858489612464086, + "language_loss": 0.79331446, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81520224, + "num_input_tokens_seen": 60512655, + "step": 2791, + "time_per_iteration": 2.5566282272338867 + }, + { + "auxiliary_loss_clip": 0.01128151, + "auxiliary_loss_mlp": 0.01051478, + "balance_loss_clip": 1.04995418, + "balance_loss_mlp": 1.03081942, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.9059602887328937, + "language_loss": 0.71219027, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.7339865, + "num_input_tokens_seen": 60533090, + "step": 2792, + "time_per_iteration": 2.6051533222198486 + }, + { + "auxiliary_loss_clip": 0.01134373, + "auxiliary_loss_mlp": 0.01049557, + "balance_loss_clip": 1.0524838, + "balance_loss_mlp": 1.02998281, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 1.9158918896654171, + "language_loss": 0.71684951, + "learning_rate": 3.803744324194691e-06, + "loss": 0.73868883, + "num_input_tokens_seen": 60553190, + "step": 2793, + "time_per_iteration": 2.5817506313323975 + }, + { + "auxiliary_loss_clip": 0.01149038, + "auxiliary_loss_mlp": 0.01061621, + "balance_loss_clip": 1.05350661, + "balance_loss_mlp": 1.04207087, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 1.8939627867476259, + "language_loss": 0.77255297, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79465961, + "num_input_tokens_seen": 60571995, + "step": 2794, + "time_per_iteration": 2.486802101135254 + }, + { + "auxiliary_loss_clip": 0.01142528, + "auxiliary_loss_mlp": 0.01053191, + "balance_loss_clip": 1.05323577, + "balance_loss_mlp": 1.0337007, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.60787962299442, + "language_loss": 0.71866518, + "learning_rate": 3.803407690167187e-06, + "loss": 0.7406224, + "num_input_tokens_seen": 60591275, + "step": 2795, + "time_per_iteration": 2.553788661956787 + }, + { + "auxiliary_loss_clip": 0.01137389, + "auxiliary_loss_mlp": 0.01049238, + "balance_loss_clip": 1.04985571, + "balance_loss_mlp": 1.02983034, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.7619044942479758, + "language_loss": 0.84198129, + "learning_rate": 3.803239270572142e-06, + "loss": 0.86384755, + "num_input_tokens_seen": 60609235, + "step": 2796, + "time_per_iteration": 3.8613481521606445 + }, + { + "auxiliary_loss_clip": 0.01108685, + "auxiliary_loss_mlp": 0.01051971, + "balance_loss_clip": 1.04992592, + "balance_loss_mlp": 1.03122866, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.8575021619077918, + "language_loss": 0.81471485, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83632141, + "num_input_tokens_seen": 60629880, + "step": 2797, + "time_per_iteration": 2.637455463409424 + }, + { + "auxiliary_loss_clip": 0.01146939, + "auxiliary_loss_mlp": 0.01044885, + "balance_loss_clip": 1.05209768, + "balance_loss_mlp": 1.02790999, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.348357048258489, + "language_loss": 0.74920839, + "learning_rate": 3.802902226251401e-06, + "loss": 0.77112669, + "num_input_tokens_seen": 60651175, + "step": 2798, + "time_per_iteration": 2.5033626556396484 + }, + { + "auxiliary_loss_clip": 0.0116657, + "auxiliary_loss_mlp": 0.01052056, + "balance_loss_clip": 1.05831242, + "balance_loss_mlp": 1.03466332, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 2.362474929804795, + "language_loss": 0.79607677, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81826305, + "num_input_tokens_seen": 60670210, + "step": 2799, + "time_per_iteration": 2.452054738998413 + }, + { + "auxiliary_loss_clip": 0.01084223, + "auxiliary_loss_mlp": 0.01052059, + "balance_loss_clip": 1.04567504, + "balance_loss_mlp": 1.02970695, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.152845350533457, + "language_loss": 0.71516114, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.73652399, + "num_input_tokens_seen": 60690895, + "step": 2800, + "time_per_iteration": 2.7833034992218018 + }, + { + "auxiliary_loss_clip": 0.01117145, + "auxiliary_loss_mlp": 0.00786275, + "balance_loss_clip": 1.04895556, + "balance_loss_mlp": 1.0003463, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.9044333122498138, + "language_loss": 0.83504748, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85408163, + "num_input_tokens_seen": 60708280, + "step": 2801, + "time_per_iteration": 2.6533682346343994 + }, + { + "auxiliary_loss_clip": 0.01134207, + "auxiliary_loss_mlp": 0.01055382, + "balance_loss_clip": 1.05008841, + "balance_loss_mlp": 1.03579617, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 4.014205835819352, + "language_loss": 0.82957882, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.8514747, + "num_input_tokens_seen": 60724150, + "step": 2802, + "time_per_iteration": 2.4887161254882812 + }, + { + "auxiliary_loss_clip": 0.01154653, + "auxiliary_loss_mlp": 0.01048004, + "balance_loss_clip": 1.0549022, + "balance_loss_mlp": 1.02773833, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.602821897443084, + "language_loss": 0.81135863, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83338523, + "num_input_tokens_seen": 60746485, + "step": 2803, + "time_per_iteration": 2.556628465652466 + }, + { + "auxiliary_loss_clip": 0.01150805, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_clip": 1.0538739, + "balance_loss_mlp": 1.02960825, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.522067464956671, + "language_loss": 0.76174623, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78374159, + "num_input_tokens_seen": 60762875, + "step": 2804, + "time_per_iteration": 3.9243600368499756 + }, + { + "auxiliary_loss_clip": 0.01043667, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.02651858, + "balance_loss_mlp": 1.02445412, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8557006391081285, + "language_loss": 0.55472445, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.5754348, + "num_input_tokens_seen": 60825510, + "step": 2805, + "time_per_iteration": 3.072341203689575 + }, + { + "auxiliary_loss_clip": 0.01141693, + "auxiliary_loss_mlp": 0.01044726, + "balance_loss_clip": 1.04797077, + "balance_loss_mlp": 1.02707076, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 2.2839079999829934, + "language_loss": 0.72812134, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.74998558, + "num_input_tokens_seen": 60844440, + "step": 2806, + "time_per_iteration": 2.491755962371826 + }, + { + "auxiliary_loss_clip": 0.01117252, + "auxiliary_loss_mlp": 0.01050369, + "balance_loss_clip": 1.05067968, + "balance_loss_mlp": 1.03171241, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.7617783928948405, + "language_loss": 0.69980013, + "learning_rate": 3.80138214341862e-06, + "loss": 0.72147632, + "num_input_tokens_seen": 60863210, + "step": 2807, + "time_per_iteration": 2.5397188663482666 + }, + { + "auxiliary_loss_clip": 0.01137384, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.04786062, + "balance_loss_mlp": 1.02564573, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 4.284821825760336, + "language_loss": 0.70455003, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72638416, + "num_input_tokens_seen": 60882510, + "step": 2808, + "time_per_iteration": 2.4946506023406982 + }, + { + "auxiliary_loss_clip": 0.01126102, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_clip": 1.053056, + "balance_loss_mlp": 1.02330017, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.2519732035336757, + "language_loss": 0.80112958, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.82283038, + "num_input_tokens_seen": 60901105, + "step": 2809, + "time_per_iteration": 2.5505402088165283 + }, + { + "auxiliary_loss_clip": 0.01154614, + "auxiliary_loss_mlp": 0.01052125, + "balance_loss_clip": 1.05169749, + "balance_loss_mlp": 1.03168058, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.1893869061232714, + "language_loss": 0.88613832, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.90820563, + "num_input_tokens_seen": 60915340, + "step": 2810, + "time_per_iteration": 2.4215309619903564 + }, + { + "auxiliary_loss_clip": 0.0115385, + "auxiliary_loss_mlp": 0.01051219, + "balance_loss_clip": 1.05472839, + "balance_loss_mlp": 1.03145409, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.53281969986707, + "language_loss": 0.92586672, + "learning_rate": 3.800704774747416e-06, + "loss": 0.9479174, + "num_input_tokens_seen": 60933735, + "step": 2811, + "time_per_iteration": 2.4709627628326416 + }, + { + "auxiliary_loss_clip": 0.01146718, + "auxiliary_loss_mlp": 0.01048696, + "balance_loss_clip": 1.05432415, + "balance_loss_mlp": 1.02970588, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 1.9170492523119471, + "language_loss": 0.78734398, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80929816, + "num_input_tokens_seen": 60953105, + "step": 2812, + "time_per_iteration": 2.4876792430877686 + }, + { + "auxiliary_loss_clip": 0.01151836, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_clip": 1.05754912, + "balance_loss_mlp": 1.03142571, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.901273308169579, + "language_loss": 0.74915868, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.7711705, + "num_input_tokens_seen": 60969150, + "step": 2813, + "time_per_iteration": 2.4411323070526123 + }, + { + "auxiliary_loss_clip": 0.01137507, + "auxiliary_loss_mlp": 0.01043778, + "balance_loss_clip": 1.05107343, + "balance_loss_mlp": 1.02420354, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.845519469332777, + "language_loss": 0.6943872, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71620005, + "num_input_tokens_seen": 60982825, + "step": 2814, + "time_per_iteration": 2.4923009872436523 + }, + { + "auxiliary_loss_clip": 0.01165232, + "auxiliary_loss_mlp": 0.01044441, + "balance_loss_clip": 1.05712914, + "balance_loss_mlp": 1.02536774, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 1.792424353225706, + "language_loss": 0.61638075, + "learning_rate": 3.800026313549776e-06, + "loss": 0.6384775, + "num_input_tokens_seen": 61000875, + "step": 2815, + "time_per_iteration": 2.491389751434326 + }, + { + "auxiliary_loss_clip": 0.01131105, + "auxiliary_loss_mlp": 0.01044584, + "balance_loss_clip": 1.0477078, + "balance_loss_mlp": 1.02647638, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.7509681178245855, + "language_loss": 0.82257879, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84433568, + "num_input_tokens_seen": 61021940, + "step": 2816, + "time_per_iteration": 2.5710690021514893 + }, + { + "auxiliary_loss_clip": 0.01138161, + "auxiliary_loss_mlp": 0.01047671, + "balance_loss_clip": 1.05445671, + "balance_loss_mlp": 1.0284425, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.2934462114765735, + "language_loss": 0.8728348, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89469314, + "num_input_tokens_seen": 61040285, + "step": 2817, + "time_per_iteration": 2.514408588409424 + }, + { + "auxiliary_loss_clip": 0.0114293, + "auxiliary_loss_mlp": 0.01046643, + "balance_loss_clip": 1.05519021, + "balance_loss_mlp": 1.02715278, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.6757898102156783, + "language_loss": 0.81674147, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83863723, + "num_input_tokens_seen": 61059020, + "step": 2818, + "time_per_iteration": 2.4951512813568115 + }, + { + "auxiliary_loss_clip": 0.01161927, + "auxiliary_loss_mlp": 0.01048676, + "balance_loss_clip": 1.0554235, + "balance_loss_mlp": 1.02949524, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 2.5955796608510613, + "language_loss": 0.81004769, + "learning_rate": 3.799346760237336e-06, + "loss": 0.83215368, + "num_input_tokens_seen": 61074245, + "step": 2819, + "time_per_iteration": 2.4053192138671875 + }, + { + "auxiliary_loss_clip": 0.01053306, + "auxiliary_loss_mlp": 0.01005, + "balance_loss_clip": 1.02634573, + "balance_loss_mlp": 1.0024246, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 1.1077265038961586, + "language_loss": 0.61114168, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63172472, + "num_input_tokens_seen": 61127080, + "step": 2820, + "time_per_iteration": 2.9864771366119385 + }, + { + "auxiliary_loss_clip": 0.01131103, + "auxiliary_loss_mlp": 0.0105354, + "balance_loss_clip": 1.05148709, + "balance_loss_mlp": 1.03531253, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 1.9705923561278595, + "language_loss": 0.78754646, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.80939293, + "num_input_tokens_seen": 61146955, + "step": 2821, + "time_per_iteration": 2.6065144538879395 + }, + { + "auxiliary_loss_clip": 0.01143518, + "auxiliary_loss_mlp": 0.01055768, + "balance_loss_clip": 1.05288315, + "balance_loss_mlp": 1.03555, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 1.8045873664050596, + "language_loss": 0.78273165, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80472457, + "num_input_tokens_seen": 61166605, + "step": 2822, + "time_per_iteration": 2.5127456188201904 + }, + { + "auxiliary_loss_clip": 0.01148403, + "auxiliary_loss_mlp": 0.00785642, + "balance_loss_clip": 1.05540252, + "balance_loss_mlp": 1.0002116, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.858904405262454, + "language_loss": 0.74784821, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.76718867, + "num_input_tokens_seen": 61186535, + "step": 2823, + "time_per_iteration": 2.5132718086242676 + }, + { + "auxiliary_loss_clip": 0.01135746, + "auxiliary_loss_mlp": 0.01059135, + "balance_loss_clip": 1.05451894, + "balance_loss_mlp": 1.04003811, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 1.77091980563443, + "language_loss": 0.60589075, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62783962, + "num_input_tokens_seen": 61208965, + "step": 2824, + "time_per_iteration": 2.650712013244629 + }, + { + "auxiliary_loss_clip": 0.01142366, + "auxiliary_loss_mlp": 0.01048482, + "balance_loss_clip": 1.05561304, + "balance_loss_mlp": 1.02900338, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 1.749649394534509, + "language_loss": 0.73197556, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75388408, + "num_input_tokens_seen": 61230670, + "step": 2825, + "time_per_iteration": 2.6050915718078613 + }, + { + "auxiliary_loss_clip": 0.01166795, + "auxiliary_loss_mlp": 0.0105548, + "balance_loss_clip": 1.05532765, + "balance_loss_mlp": 1.03476143, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 1.9504041520336652, + "language_loss": 0.8564052, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87862802, + "num_input_tokens_seen": 61249510, + "step": 2826, + "time_per_iteration": 3.955172061920166 + }, + { + "auxiliary_loss_clip": 0.0114416, + "auxiliary_loss_mlp": 0.01055021, + "balance_loss_clip": 1.05136442, + "balance_loss_mlp": 1.03533959, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 1.6728657433610803, + "language_loss": 0.82689118, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84888303, + "num_input_tokens_seen": 61269440, + "step": 2827, + "time_per_iteration": 2.5247809886932373 + }, + { + "auxiliary_loss_clip": 0.01136757, + "auxiliary_loss_mlp": 0.01049506, + "balance_loss_clip": 1.05169845, + "balance_loss_mlp": 1.02899027, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 1.6854968067511351, + "language_loss": 0.74072051, + "learning_rate": 3.797813774376267e-06, + "loss": 0.76258314, + "num_input_tokens_seen": 61288195, + "step": 2828, + "time_per_iteration": 2.5277769565582275 + }, + { + "auxiliary_loss_clip": 0.01056907, + "auxiliary_loss_mlp": 0.01012345, + "balance_loss_clip": 1.0409739, + "balance_loss_mlp": 1.00948417, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.7617687316087005, + "language_loss": 0.56407386, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58476639, + "num_input_tokens_seen": 61350850, + "step": 2829, + "time_per_iteration": 4.861382961273193 + }, + { + "auxiliary_loss_clip": 0.01119147, + "auxiliary_loss_mlp": 0.01056691, + "balance_loss_clip": 1.04565704, + "balance_loss_mlp": 1.03587699, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 2.000323809169149, + "language_loss": 0.83289814, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85465658, + "num_input_tokens_seen": 61370765, + "step": 2830, + "time_per_iteration": 2.599285125732422 + }, + { + "auxiliary_loss_clip": 0.01129729, + "auxiliary_loss_mlp": 0.01050079, + "balance_loss_clip": 1.05196404, + "balance_loss_mlp": 1.03007555, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 1.905470618638539, + "language_loss": 0.78634477, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80814278, + "num_input_tokens_seen": 61388935, + "step": 2831, + "time_per_iteration": 2.6019628047943115 + }, + { + "auxiliary_loss_clip": 0.01128842, + "auxiliary_loss_mlp": 0.01053093, + "balance_loss_clip": 1.05290794, + "balance_loss_mlp": 1.03173065, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.1675107403177325, + "language_loss": 0.80140638, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.82322574, + "num_input_tokens_seen": 61407350, + "step": 2832, + "time_per_iteration": 2.5294313430786133 + }, + { + "auxiliary_loss_clip": 0.01134419, + "auxiliary_loss_mlp": 0.01050207, + "balance_loss_clip": 1.05235505, + "balance_loss_mlp": 1.03151512, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.682428799541844, + "language_loss": 0.88766843, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.90951467, + "num_input_tokens_seen": 61429010, + "step": 2833, + "time_per_iteration": 2.532953977584839 + }, + { + "auxiliary_loss_clip": 0.01163619, + "auxiliary_loss_mlp": 0.01046475, + "balance_loss_clip": 1.05607367, + "balance_loss_mlp": 1.02831912, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.129829981887259, + "language_loss": 0.72511882, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74721974, + "num_input_tokens_seen": 61450040, + "step": 2834, + "time_per_iteration": 2.605407476425171 + }, + { + "auxiliary_loss_clip": 0.01127893, + "auxiliary_loss_mlp": 0.01054082, + "balance_loss_clip": 1.05582428, + "balance_loss_mlp": 1.03549778, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 2.1904487326812325, + "language_loss": 0.8637054, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88552511, + "num_input_tokens_seen": 61468585, + "step": 2835, + "time_per_iteration": 2.5481903553009033 + }, + { + "auxiliary_loss_clip": 0.01155951, + "auxiliary_loss_mlp": 0.01052814, + "balance_loss_clip": 1.0545491, + "balance_loss_mlp": 1.03141618, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 2.315968901750431, + "language_loss": 0.74103498, + "learning_rate": 3.796446484348989e-06, + "loss": 0.76312256, + "num_input_tokens_seen": 61486330, + "step": 2836, + "time_per_iteration": 3.8260421752929688 + }, + { + "auxiliary_loss_clip": 0.01106667, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.04641879, + "balance_loss_mlp": 1.02384031, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.7132150093734273, + "language_loss": 0.80094779, + "learning_rate": 3.796275266481036e-06, + "loss": 0.82247537, + "num_input_tokens_seen": 61503950, + "step": 2837, + "time_per_iteration": 2.539482831954956 + }, + { + "auxiliary_loss_clip": 0.01150007, + "auxiliary_loss_mlp": 0.01047662, + "balance_loss_clip": 1.05455661, + "balance_loss_mlp": 1.02842116, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 2.609961347021542, + "language_loss": 0.8329401, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85491675, + "num_input_tokens_seen": 61523550, + "step": 2838, + "time_per_iteration": 2.5104899406433105 + }, + { + "auxiliary_loss_clip": 0.0111369, + "auxiliary_loss_mlp": 0.01046735, + "balance_loss_clip": 1.05557823, + "balance_loss_mlp": 1.0279119, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.7803753886998277, + "language_loss": 0.9375912, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95919544, + "num_input_tokens_seen": 61542720, + "step": 2839, + "time_per_iteration": 2.619856595993042 + }, + { + "auxiliary_loss_clip": 0.01132051, + "auxiliary_loss_mlp": 0.01050425, + "balance_loss_clip": 1.05242217, + "balance_loss_mlp": 1.02961135, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.0678249573476943, + "language_loss": 0.84017611, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86200082, + "num_input_tokens_seen": 61563040, + "step": 2840, + "time_per_iteration": 2.5643913745880127 + }, + { + "auxiliary_loss_clip": 0.01156003, + "auxiliary_loss_mlp": 0.01048507, + "balance_loss_clip": 1.05477536, + "balance_loss_mlp": 1.02735937, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 1.857238655099095, + "language_loss": 0.7640785, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78612351, + "num_input_tokens_seen": 61581890, + "step": 2841, + "time_per_iteration": 2.4800398349761963 + }, + { + "auxiliary_loss_clip": 0.01140897, + "auxiliary_loss_mlp": 0.01051946, + "balance_loss_clip": 1.05265927, + "balance_loss_mlp": 1.03249097, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.9806357728991106, + "language_loss": 0.76798201, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.78991044, + "num_input_tokens_seen": 61602095, + "step": 2842, + "time_per_iteration": 2.546588659286499 + }, + { + "auxiliary_loss_clip": 0.01161, + "auxiliary_loss_mlp": 0.01044465, + "balance_loss_clip": 1.05683994, + "balance_loss_mlp": 1.02571321, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 2.0228793470060378, + "language_loss": 0.85625279, + "learning_rate": 3.795246529087043e-06, + "loss": 0.87830746, + "num_input_tokens_seen": 61620400, + "step": 2843, + "time_per_iteration": 2.437666654586792 + }, + { + "auxiliary_loss_clip": 0.01162547, + "auxiliary_loss_mlp": 0.010451, + "balance_loss_clip": 1.05755162, + "balance_loss_mlp": 1.02606273, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 1.7278102236973083, + "language_loss": 0.6841473, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70622385, + "num_input_tokens_seen": 61637680, + "step": 2844, + "time_per_iteration": 3.8721015453338623 + }, + { + "auxiliary_loss_clip": 0.01136824, + "auxiliary_loss_mlp": 0.00786796, + "balance_loss_clip": 1.05005932, + "balance_loss_mlp": 1.00028467, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.897671538397617, + "language_loss": 0.78347015, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80270636, + "num_input_tokens_seen": 61655630, + "step": 2845, + "time_per_iteration": 2.4948995113372803 + }, + { + "auxiliary_loss_clip": 0.01148312, + "auxiliary_loss_mlp": 0.01046623, + "balance_loss_clip": 1.05302763, + "balance_loss_mlp": 1.02871799, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.3813999179057763, + "language_loss": 0.77863836, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.80058777, + "num_input_tokens_seen": 61673475, + "step": 2846, + "time_per_iteration": 2.4358339309692383 + }, + { + "auxiliary_loss_clip": 0.01149121, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.05568445, + "balance_loss_mlp": 1.02514005, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.7151408834800181, + "language_loss": 0.79692793, + "learning_rate": 3.794559342552472e-06, + "loss": 0.81885678, + "num_input_tokens_seen": 61693370, + "step": 2847, + "time_per_iteration": 2.5108540058135986 + }, + { + "auxiliary_loss_clip": 0.01148368, + "auxiliary_loss_mlp": 0.01047088, + "balance_loss_clip": 1.05113852, + "balance_loss_mlp": 1.0282408, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.4039874630443947, + "language_loss": 0.86750948, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.88946402, + "num_input_tokens_seen": 61710820, + "step": 2848, + "time_per_iteration": 2.445556879043579 + }, + { + "auxiliary_loss_clip": 0.01119961, + "auxiliary_loss_mlp": 0.01045595, + "balance_loss_clip": 1.05012834, + "balance_loss_mlp": 1.02598488, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.8433931889759037, + "language_loss": 0.75280273, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77445829, + "num_input_tokens_seen": 61729855, + "step": 2849, + "time_per_iteration": 2.567160129547119 + }, + { + "auxiliary_loss_clip": 0.01032268, + "auxiliary_loss_mlp": 0.01016037, + "balance_loss_clip": 1.02221966, + "balance_loss_mlp": 1.01348639, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.8098918775882901, + "language_loss": 0.57549351, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59597659, + "num_input_tokens_seen": 61790290, + "step": 2850, + "time_per_iteration": 3.0988709926605225 + }, + { + "auxiliary_loss_clip": 0.01119045, + "auxiliary_loss_mlp": 0.01045675, + "balance_loss_clip": 1.04980588, + "balance_loss_mlp": 1.02735209, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.2149118931423564, + "language_loss": 0.81244361, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83409089, + "num_input_tokens_seen": 61809265, + "step": 2851, + "time_per_iteration": 2.5468063354492188 + }, + { + "auxiliary_loss_clip": 0.01121202, + "auxiliary_loss_mlp": 0.0104649, + "balance_loss_clip": 1.05462921, + "balance_loss_mlp": 1.02895403, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 2.000057601420184, + "language_loss": 0.9355098, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.9571867, + "num_input_tokens_seen": 61828980, + "step": 2852, + "time_per_iteration": 2.575247049331665 + }, + { + "auxiliary_loss_clip": 0.01125104, + "auxiliary_loss_mlp": 0.01048883, + "balance_loss_clip": 1.048437, + "balance_loss_mlp": 1.03054905, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 2.457486666120473, + "language_loss": 0.69348812, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71522796, + "num_input_tokens_seen": 61847915, + "step": 2853, + "time_per_iteration": 2.533026933670044 + }, + { + "auxiliary_loss_clip": 0.01127674, + "auxiliary_loss_mlp": 0.01048821, + "balance_loss_clip": 1.05815935, + "balance_loss_mlp": 1.03111899, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 1.970961275303657, + "language_loss": 0.66553092, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68729591, + "num_input_tokens_seen": 61865570, + "step": 2854, + "time_per_iteration": 2.556549549102783 + }, + { + "auxiliary_loss_clip": 0.01125939, + "auxiliary_loss_mlp": 0.0105226, + "balance_loss_clip": 1.04784179, + "balance_loss_mlp": 1.03433132, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.5771918624586296, + "language_loss": 0.89367318, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91545522, + "num_input_tokens_seen": 61883340, + "step": 2855, + "time_per_iteration": 2.4986190795898438 + }, + { + "auxiliary_loss_clip": 0.01163536, + "auxiliary_loss_mlp": 0.01048463, + "balance_loss_clip": 1.05645442, + "balance_loss_mlp": 1.03076077, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.055903400663358, + "language_loss": 0.83018136, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.85230136, + "num_input_tokens_seen": 61900610, + "step": 2856, + "time_per_iteration": 2.478058338165283 + }, + { + "auxiliary_loss_clip": 0.01151224, + "auxiliary_loss_mlp": 0.01047651, + "balance_loss_clip": 1.05541444, + "balance_loss_mlp": 1.0294838, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 2.0979322387797716, + "language_loss": 0.86803102, + "learning_rate": 3.792836613639026e-06, + "loss": 0.89001977, + "num_input_tokens_seen": 61916795, + "step": 2857, + "time_per_iteration": 2.470764398574829 + }, + { + "auxiliary_loss_clip": 0.01149243, + "auxiliary_loss_mlp": 0.0105742, + "balance_loss_clip": 1.05446434, + "balance_loss_mlp": 1.03838241, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 1.9960496627850677, + "language_loss": 0.78240782, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80447435, + "num_input_tokens_seen": 61936665, + "step": 2858, + "time_per_iteration": 2.4792768955230713 + }, + { + "auxiliary_loss_clip": 0.01148033, + "auxiliary_loss_mlp": 0.01055709, + "balance_loss_clip": 1.0519191, + "balance_loss_mlp": 1.0349431, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 2.241605190438837, + "language_loss": 0.77385402, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79589146, + "num_input_tokens_seen": 61954415, + "step": 2859, + "time_per_iteration": 2.481640577316284 + }, + { + "auxiliary_loss_clip": 0.01114549, + "auxiliary_loss_mlp": 0.0104659, + "balance_loss_clip": 1.05453348, + "balance_loss_mlp": 1.02817178, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 2.546563897990275, + "language_loss": 0.76900816, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79061955, + "num_input_tokens_seen": 61973940, + "step": 2860, + "time_per_iteration": 2.5761256217956543 + }, + { + "auxiliary_loss_clip": 0.01150192, + "auxiliary_loss_mlp": 0.01048321, + "balance_loss_clip": 1.05224526, + "balance_loss_mlp": 1.02961743, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 2.394343679645515, + "language_loss": 0.81933075, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84131587, + "num_input_tokens_seen": 61991845, + "step": 2861, + "time_per_iteration": 2.4689254760742188 + }, + { + "auxiliary_loss_clip": 0.01132574, + "auxiliary_loss_mlp": 0.01049713, + "balance_loss_clip": 1.04955482, + "balance_loss_mlp": 1.03249323, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 2.043361026834935, + "language_loss": 0.85858077, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.8804037, + "num_input_tokens_seen": 62009395, + "step": 2862, + "time_per_iteration": 2.4998836517333984 + }, + { + "auxiliary_loss_clip": 0.01118321, + "auxiliary_loss_mlp": 0.01047991, + "balance_loss_clip": 1.04780006, + "balance_loss_mlp": 1.030586, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 3.9077352709297775, + "language_loss": 0.77896643, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80062956, + "num_input_tokens_seen": 62029005, + "step": 2863, + "time_per_iteration": 2.573181390762329 + }, + { + "auxiliary_loss_clip": 0.01120737, + "auxiliary_loss_mlp": 0.00783101, + "balance_loss_clip": 1.0471561, + "balance_loss_mlp": 1.00025225, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.728692710803037, + "language_loss": 0.72656685, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74560523, + "num_input_tokens_seen": 62048730, + "step": 2864, + "time_per_iteration": 2.6093876361846924 + }, + { + "auxiliary_loss_clip": 0.01121411, + "auxiliary_loss_mlp": 0.01050147, + "balance_loss_clip": 1.04771698, + "balance_loss_mlp": 1.03242016, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.7030100921743947, + "language_loss": 0.72837383, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75008941, + "num_input_tokens_seen": 62069000, + "step": 2865, + "time_per_iteration": 2.569495677947998 + }, + { + "auxiliary_loss_clip": 0.01145021, + "auxiliary_loss_mlp": 0.00784159, + "balance_loss_clip": 1.05192029, + "balance_loss_mlp": 1.00032735, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 2.4888489693233358, + "language_loss": 0.78755552, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.80684727, + "num_input_tokens_seen": 62086750, + "step": 2866, + "time_per_iteration": 3.992867946624756 + }, + { + "auxiliary_loss_clip": 0.01159149, + "auxiliary_loss_mlp": 0.01044288, + "balance_loss_clip": 1.0527432, + "balance_loss_mlp": 1.02526188, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.867847645624895, + "language_loss": 0.79891574, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82095003, + "num_input_tokens_seen": 62106240, + "step": 2867, + "time_per_iteration": 2.450268268585205 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.01044418, + "balance_loss_clip": 1.05197012, + "balance_loss_mlp": 1.02584565, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.6377974357558311, + "language_loss": 0.79844069, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.82023984, + "num_input_tokens_seen": 62124895, + "step": 2868, + "time_per_iteration": 4.0524256229400635 + }, + { + "auxiliary_loss_clip": 0.01118449, + "auxiliary_loss_mlp": 0.01045417, + "balance_loss_clip": 1.05599797, + "balance_loss_mlp": 1.02817941, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.7206118968495927, + "language_loss": 0.83300292, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.85464156, + "num_input_tokens_seen": 62143510, + "step": 2869, + "time_per_iteration": 2.5433850288391113 + }, + { + "auxiliary_loss_clip": 0.01136094, + "auxiliary_loss_mlp": 0.01047762, + "balance_loss_clip": 1.05232096, + "balance_loss_mlp": 1.02872443, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 1.8183787616501628, + "language_loss": 0.7723341, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79417259, + "num_input_tokens_seen": 62162285, + "step": 2870, + "time_per_iteration": 2.5009543895721436 + }, + { + "auxiliary_loss_clip": 0.01153108, + "auxiliary_loss_mlp": 0.01043556, + "balance_loss_clip": 1.05257964, + "balance_loss_mlp": 1.02705717, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 4.129692290406625, + "language_loss": 0.77232027, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.79428685, + "num_input_tokens_seen": 62180970, + "step": 2871, + "time_per_iteration": 2.47479248046875 + }, + { + "auxiliary_loss_clip": 0.0113311, + "auxiliary_loss_mlp": 0.01046167, + "balance_loss_clip": 1.0505085, + "balance_loss_mlp": 1.02715313, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.8398483187414385, + "language_loss": 0.74759215, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76938492, + "num_input_tokens_seen": 62198965, + "step": 2872, + "time_per_iteration": 2.561004161834717 + }, + { + "auxiliary_loss_clip": 0.0115343, + "auxiliary_loss_mlp": 0.01041606, + "balance_loss_clip": 1.05034864, + "balance_loss_mlp": 1.0235455, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 7.6435406148338885, + "language_loss": 0.82904518, + "learning_rate": 3.790066109323988e-06, + "loss": 0.85099554, + "num_input_tokens_seen": 62219890, + "step": 2873, + "time_per_iteration": 2.4778220653533936 + }, + { + "auxiliary_loss_clip": 0.0111245, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_clip": 1.04550481, + "balance_loss_mlp": 1.02497625, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 2.08126789208604, + "language_loss": 0.74910575, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77067137, + "num_input_tokens_seen": 62237140, + "step": 2874, + "time_per_iteration": 3.9611220359802246 + }, + { + "auxiliary_loss_clip": 0.01159132, + "auxiliary_loss_mlp": 0.01045863, + "balance_loss_clip": 1.05234575, + "balance_loss_mlp": 1.02644408, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 2.5526850434985917, + "language_loss": 0.80886757, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.83091748, + "num_input_tokens_seen": 62255405, + "step": 2875, + "time_per_iteration": 2.43361759185791 + }, + { + "auxiliary_loss_clip": 0.01138876, + "auxiliary_loss_mlp": 0.01049239, + "balance_loss_clip": 1.05296922, + "balance_loss_mlp": 1.03006983, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.894704509576806, + "language_loss": 0.88123161, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.90311277, + "num_input_tokens_seen": 62271280, + "step": 2876, + "time_per_iteration": 2.4772520065307617 + }, + { + "auxiliary_loss_clip": 0.01136955, + "auxiliary_loss_mlp": 0.01046204, + "balance_loss_clip": 1.05433774, + "balance_loss_mlp": 1.02814388, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 4.164283096823979, + "language_loss": 0.84436798, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86619955, + "num_input_tokens_seen": 62289140, + "step": 2877, + "time_per_iteration": 2.490253448486328 + }, + { + "auxiliary_loss_clip": 0.01126123, + "auxiliary_loss_mlp": 0.01049741, + "balance_loss_clip": 1.05298948, + "balance_loss_mlp": 1.03143024, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 1.9238499549384314, + "language_loss": 0.79315901, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81491768, + "num_input_tokens_seen": 62307490, + "step": 2878, + "time_per_iteration": 2.574507474899292 + }, + { + "auxiliary_loss_clip": 0.01136185, + "auxiliary_loss_mlp": 0.0104543, + "balance_loss_clip": 1.05154967, + "balance_loss_mlp": 1.02776337, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.794924546886212, + "language_loss": 0.70546973, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72728586, + "num_input_tokens_seen": 62328570, + "step": 2879, + "time_per_iteration": 2.5837454795837402 + }, + { + "auxiliary_loss_clip": 0.01134749, + "auxiliary_loss_mlp": 0.01052483, + "balance_loss_clip": 1.05196095, + "balance_loss_mlp": 1.0351975, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.581556139293537, + "language_loss": 0.83273512, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85460746, + "num_input_tokens_seen": 62345735, + "step": 2880, + "time_per_iteration": 2.481459379196167 + }, + { + "auxiliary_loss_clip": 0.01112128, + "auxiliary_loss_mlp": 0.01050847, + "balance_loss_clip": 1.04922414, + "balance_loss_mlp": 1.03031945, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 3.432373740863527, + "language_loss": 0.80673325, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.82836294, + "num_input_tokens_seen": 62365525, + "step": 2881, + "time_per_iteration": 2.593801736831665 + }, + { + "auxiliary_loss_clip": 0.01137414, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_clip": 1.05279171, + "balance_loss_mlp": 1.03108585, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 2.008091955904663, + "language_loss": 0.77090144, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79275715, + "num_input_tokens_seen": 62385160, + "step": 2882, + "time_per_iteration": 2.536558151245117 + }, + { + "auxiliary_loss_clip": 0.01122656, + "auxiliary_loss_mlp": 0.0104815, + "balance_loss_clip": 1.05753732, + "balance_loss_mlp": 1.03137696, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 1.8184374922447537, + "language_loss": 0.76466608, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78637409, + "num_input_tokens_seen": 62405280, + "step": 2883, + "time_per_iteration": 4.1939404010772705 + }, + { + "auxiliary_loss_clip": 0.01115679, + "auxiliary_loss_mlp": 0.01043192, + "balance_loss_clip": 1.05037975, + "balance_loss_mlp": 1.02553725, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 2.1174169749489544, + "language_loss": 0.85635287, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87794161, + "num_input_tokens_seen": 62423665, + "step": 2884, + "time_per_iteration": 2.5752639770507812 + }, + { + "auxiliary_loss_clip": 0.0113924, + "auxiliary_loss_mlp": 0.00783963, + "balance_loss_clip": 1.0556252, + "balance_loss_mlp": 1.00039208, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 2.70133764173643, + "language_loss": 0.74391204, + "learning_rate": 3.787976825866055e-06, + "loss": 0.76314408, + "num_input_tokens_seen": 62445170, + "step": 2885, + "time_per_iteration": 2.593539237976074 + }, + { + "auxiliary_loss_clip": 0.01130557, + "auxiliary_loss_mlp": 0.01044258, + "balance_loss_clip": 1.05080056, + "balance_loss_mlp": 1.02729464, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.4728888623683116, + "language_loss": 0.70686692, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72861505, + "num_input_tokens_seen": 62466135, + "step": 2886, + "time_per_iteration": 2.5493369102478027 + }, + { + "auxiliary_loss_clip": 0.01148518, + "auxiliary_loss_mlp": 0.01045814, + "balance_loss_clip": 1.05353665, + "balance_loss_mlp": 1.02722955, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 2.0411023147772283, + "language_loss": 0.69796944, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.71991289, + "num_input_tokens_seen": 62483910, + "step": 2887, + "time_per_iteration": 2.4918766021728516 + }, + { + "auxiliary_loss_clip": 0.01119157, + "auxiliary_loss_mlp": 0.01047432, + "balance_loss_clip": 1.05009389, + "balance_loss_mlp": 1.02940726, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.8117526933439345, + "language_loss": 0.85071176, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87237763, + "num_input_tokens_seen": 62501530, + "step": 2888, + "time_per_iteration": 2.5178356170654297 + }, + { + "auxiliary_loss_clip": 0.0109997, + "auxiliary_loss_mlp": 0.01048872, + "balance_loss_clip": 1.05040431, + "balance_loss_mlp": 1.02785563, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 3.2072781399578365, + "language_loss": 0.79063404, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.81212246, + "num_input_tokens_seen": 62521295, + "step": 2889, + "time_per_iteration": 2.638042688369751 + }, + { + "auxiliary_loss_clip": 0.01117617, + "auxiliary_loss_mlp": 0.00783737, + "balance_loss_clip": 1.05424547, + "balance_loss_mlp": 1.00040746, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.4139700819385475, + "language_loss": 0.84469861, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86371219, + "num_input_tokens_seen": 62539615, + "step": 2890, + "time_per_iteration": 2.55908465385437 + }, + { + "auxiliary_loss_clip": 0.01147139, + "auxiliary_loss_mlp": 0.01049502, + "balance_loss_clip": 1.05608702, + "balance_loss_mlp": 1.0318948, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 1.9097151942557924, + "language_loss": 0.82103139, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84299779, + "num_input_tokens_seen": 62556820, + "step": 2891, + "time_per_iteration": 2.4964206218719482 + }, + { + "auxiliary_loss_clip": 0.01105624, + "auxiliary_loss_mlp": 0.01050934, + "balance_loss_clip": 1.04158926, + "balance_loss_mlp": 1.02940464, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.1294497437999724, + "language_loss": 0.81674755, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.83831316, + "num_input_tokens_seen": 62572450, + "step": 2892, + "time_per_iteration": 2.508665084838867 + }, + { + "auxiliary_loss_clip": 0.01151676, + "auxiliary_loss_mlp": 0.01056266, + "balance_loss_clip": 1.05613708, + "balance_loss_mlp": 1.03729951, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 3.0228550922763864, + "language_loss": 0.74975443, + "learning_rate": 3.786578545502627e-06, + "loss": 0.77183384, + "num_input_tokens_seen": 62592580, + "step": 2893, + "time_per_iteration": 2.569622755050659 + }, + { + "auxiliary_loss_clip": 0.01135809, + "auxiliary_loss_mlp": 0.01044568, + "balance_loss_clip": 1.05054712, + "balance_loss_mlp": 1.02555406, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 2.4946362979870305, + "language_loss": 0.82923913, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85104293, + "num_input_tokens_seen": 62611220, + "step": 2894, + "time_per_iteration": 2.5221736431121826 + }, + { + "auxiliary_loss_clip": 0.01119658, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_clip": 1.05124402, + "balance_loss_mlp": 1.02686822, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 1.8507429580287056, + "language_loss": 0.74075794, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76243627, + "num_input_tokens_seen": 62629185, + "step": 2895, + "time_per_iteration": 2.538069486618042 + }, + { + "auxiliary_loss_clip": 0.01024074, + "auxiliary_loss_mlp": 0.0101705, + "balance_loss_clip": 1.02565849, + "balance_loss_mlp": 1.01480854, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8820173746162311, + "language_loss": 0.62759733, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64800858, + "num_input_tokens_seen": 62691895, + "step": 2896, + "time_per_iteration": 3.1989846229553223 + }, + { + "auxiliary_loss_clip": 0.01133153, + "auxiliary_loss_mlp": 0.00786017, + "balance_loss_clip": 1.05109572, + "balance_loss_mlp": 1.00044596, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 1.7843924758014842, + "language_loss": 0.76272309, + "learning_rate": 3.785877779175034e-06, + "loss": 0.78191477, + "num_input_tokens_seen": 62713790, + "step": 2897, + "time_per_iteration": 2.576796293258667 + }, + { + "auxiliary_loss_clip": 0.01145728, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.05520439, + "balance_loss_mlp": 1.02362728, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 2.1317542224024275, + "language_loss": 0.69086164, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71273333, + "num_input_tokens_seen": 62736285, + "step": 2898, + "time_per_iteration": 2.589921474456787 + }, + { + "auxiliary_loss_clip": 0.01132845, + "auxiliary_loss_mlp": 0.0104396, + "balance_loss_clip": 1.05149281, + "balance_loss_mlp": 1.02567315, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 1.9766016697541673, + "language_loss": 0.7610594, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.7828275, + "num_input_tokens_seen": 62756240, + "step": 2899, + "time_per_iteration": 2.589794874191284 + }, + { + "auxiliary_loss_clip": 0.01101538, + "auxiliary_loss_mlp": 0.01042559, + "balance_loss_clip": 1.04886746, + "balance_loss_mlp": 1.02352107, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.9404654441110933, + "language_loss": 0.72588193, + "learning_rate": 3.785351493339121e-06, + "loss": 0.74732292, + "num_input_tokens_seen": 62775910, + "step": 2900, + "time_per_iteration": 2.574349880218506 + }, + { + "auxiliary_loss_clip": 0.01112259, + "auxiliary_loss_mlp": 0.00785162, + "balance_loss_clip": 1.04829621, + "balance_loss_mlp": 1.00049961, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.5523579229467275, + "language_loss": 0.69968045, + "learning_rate": 3.785175929316863e-06, + "loss": 0.71865469, + "num_input_tokens_seen": 62799385, + "step": 2901, + "time_per_iteration": 2.7228000164031982 + }, + { + "auxiliary_loss_clip": 0.01133855, + "auxiliary_loss_mlp": 0.01050141, + "balance_loss_clip": 1.05144727, + "balance_loss_mlp": 1.03194928, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.8130644014592845, + "language_loss": 0.76241505, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78425509, + "num_input_tokens_seen": 62819380, + "step": 2902, + "time_per_iteration": 2.5376365184783936 + }, + { + "auxiliary_loss_clip": 0.01151416, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.05426812, + "balance_loss_mlp": 1.03316939, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 3.7978297556389125, + "language_loss": 0.81524223, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.83727062, + "num_input_tokens_seen": 62836205, + "step": 2903, + "time_per_iteration": 2.449570655822754 + }, + { + "auxiliary_loss_clip": 0.01130215, + "auxiliary_loss_mlp": 0.01043211, + "balance_loss_clip": 1.05322671, + "balance_loss_mlp": 1.02507901, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 1.91345014926277, + "language_loss": 0.73306721, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75480151, + "num_input_tokens_seen": 62854045, + "step": 2904, + "time_per_iteration": 2.4826860427856445 + }, + { + "auxiliary_loss_clip": 0.01106241, + "auxiliary_loss_mlp": 0.01045852, + "balance_loss_clip": 1.04637694, + "balance_loss_mlp": 1.02766085, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.9661316020770296, + "language_loss": 0.6455496, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.66707051, + "num_input_tokens_seen": 62873075, + "step": 2905, + "time_per_iteration": 4.0915913581848145 + }, + { + "auxiliary_loss_clip": 0.01135398, + "auxiliary_loss_mlp": 0.01046426, + "balance_loss_clip": 1.05423856, + "balance_loss_mlp": 1.02748358, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 2.0075165941621287, + "language_loss": 0.79520357, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81702185, + "num_input_tokens_seen": 62892675, + "step": 2906, + "time_per_iteration": 2.563476085662842 + }, + { + "auxiliary_loss_clip": 0.01150778, + "auxiliary_loss_mlp": 0.01053752, + "balance_loss_clip": 1.0562737, + "balance_loss_mlp": 1.03552508, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 2.2606841315778015, + "language_loss": 0.81047457, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83251989, + "num_input_tokens_seen": 62910675, + "step": 2907, + "time_per_iteration": 2.4893436431884766 + }, + { + "auxiliary_loss_clip": 0.01153509, + "auxiliary_loss_mlp": 0.01048721, + "balance_loss_clip": 1.05867076, + "balance_loss_mlp": 1.03089952, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.64611591918161, + "language_loss": 0.81090176, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83292413, + "num_input_tokens_seen": 62928130, + "step": 2908, + "time_per_iteration": 3.998579978942871 + }, + { + "auxiliary_loss_clip": 0.011308, + "auxiliary_loss_mlp": 0.01055664, + "balance_loss_clip": 1.05146706, + "balance_loss_mlp": 1.03613746, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 3.5355694246061584, + "language_loss": 0.80316949, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82503414, + "num_input_tokens_seen": 62944290, + "step": 2909, + "time_per_iteration": 2.4741060733795166 + }, + { + "auxiliary_loss_clip": 0.01095862, + "auxiliary_loss_mlp": 0.0105727, + "balance_loss_clip": 1.04943132, + "balance_loss_mlp": 1.03535914, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 5.28739841634414, + "language_loss": 0.76687908, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78841031, + "num_input_tokens_seen": 62963505, + "step": 2910, + "time_per_iteration": 2.544053077697754 + }, + { + "auxiliary_loss_clip": 0.01166314, + "auxiliary_loss_mlp": 0.01052905, + "balance_loss_clip": 1.05819631, + "balance_loss_mlp": 1.0331285, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.5762096032206576, + "language_loss": 0.871925, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89411724, + "num_input_tokens_seen": 62985020, + "step": 2911, + "time_per_iteration": 2.496882200241089 + }, + { + "auxiliary_loss_clip": 0.01160708, + "auxiliary_loss_mlp": 0.00786342, + "balance_loss_clip": 1.05438662, + "balance_loss_mlp": 1.00048184, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.1442580441425263, + "language_loss": 0.89502466, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91449511, + "num_input_tokens_seen": 63001745, + "step": 2912, + "time_per_iteration": 2.4224140644073486 + }, + { + "auxiliary_loss_clip": 0.01150887, + "auxiliary_loss_mlp": 0.01044564, + "balance_loss_clip": 1.05164063, + "balance_loss_mlp": 1.02518058, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 1.7289024756889275, + "language_loss": 0.72448903, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74644351, + "num_input_tokens_seen": 63019750, + "step": 2913, + "time_per_iteration": 2.460867404937744 + }, + { + "auxiliary_loss_clip": 0.01142811, + "auxiliary_loss_mlp": 0.01045728, + "balance_loss_clip": 1.05505943, + "balance_loss_mlp": 1.02741778, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 1.930857465690327, + "language_loss": 0.68926835, + "learning_rate": 3.782887439295741e-06, + "loss": 0.71115375, + "num_input_tokens_seen": 63039500, + "step": 2914, + "time_per_iteration": 3.8874576091766357 + }, + { + "auxiliary_loss_clip": 0.01146961, + "auxiliary_loss_mlp": 0.01047654, + "balance_loss_clip": 1.05573094, + "balance_loss_mlp": 1.02915263, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 2.037274962104809, + "language_loss": 0.93769574, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95964193, + "num_input_tokens_seen": 63059785, + "step": 2915, + "time_per_iteration": 2.479597806930542 + }, + { + "auxiliary_loss_clip": 0.01119923, + "auxiliary_loss_mlp": 0.0104468, + "balance_loss_clip": 1.05053711, + "balance_loss_mlp": 1.02578497, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.7643854119053004, + "language_loss": 0.80846632, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83011228, + "num_input_tokens_seen": 63079385, + "step": 2916, + "time_per_iteration": 2.567068099975586 + }, + { + "auxiliary_loss_clip": 0.01150351, + "auxiliary_loss_mlp": 0.01058714, + "balance_loss_clip": 1.05410123, + "balance_loss_mlp": 1.03983152, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 3.2428573059245496, + "language_loss": 0.73738354, + "learning_rate": 3.782357703104799e-06, + "loss": 0.75947422, + "num_input_tokens_seen": 63098970, + "step": 2917, + "time_per_iteration": 2.4962751865386963 + }, + { + "auxiliary_loss_clip": 0.01141177, + "auxiliary_loss_mlp": 0.01050505, + "balance_loss_clip": 1.05549836, + "balance_loss_mlp": 1.03057325, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 2.2058351742110616, + "language_loss": 0.7710613, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79297817, + "num_input_tokens_seen": 63118750, + "step": 2918, + "time_per_iteration": 2.5335793495178223 + }, + { + "auxiliary_loss_clip": 0.01101929, + "auxiliary_loss_mlp": 0.01050057, + "balance_loss_clip": 1.05167556, + "balance_loss_mlp": 1.028862, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 2.2040727984066653, + "language_loss": 0.74261892, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76413882, + "num_input_tokens_seen": 63136865, + "step": 2919, + "time_per_iteration": 2.673889398574829 + }, + { + "auxiliary_loss_clip": 0.01131492, + "auxiliary_loss_mlp": 0.01049275, + "balance_loss_clip": 1.05098271, + "balance_loss_mlp": 1.03039205, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 2.1286373224189306, + "language_loss": 0.74640524, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76821291, + "num_input_tokens_seen": 63158325, + "step": 2920, + "time_per_iteration": 2.5817387104034424 + }, + { + "auxiliary_loss_clip": 0.01124837, + "auxiliary_loss_mlp": 0.01047498, + "balance_loss_clip": 1.04597747, + "balance_loss_mlp": 1.02896082, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.547781575693447, + "language_loss": 0.79217792, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81390131, + "num_input_tokens_seen": 63173115, + "step": 2921, + "time_per_iteration": 2.48964524269104 + }, + { + "auxiliary_loss_clip": 0.01126044, + "auxiliary_loss_mlp": 0.01046254, + "balance_loss_clip": 1.05030537, + "balance_loss_mlp": 1.02684629, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.5642176476549137, + "language_loss": 0.87698841, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89871132, + "num_input_tokens_seen": 63192880, + "step": 2922, + "time_per_iteration": 3.933818817138672 + }, + { + "auxiliary_loss_clip": 0.01151413, + "auxiliary_loss_mlp": 0.01052887, + "balance_loss_clip": 1.05503082, + "balance_loss_mlp": 1.0346719, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 2.5496998103026587, + "language_loss": 0.62241316, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64445615, + "num_input_tokens_seen": 63214395, + "step": 2923, + "time_per_iteration": 2.515540361404419 + }, + { + "auxiliary_loss_clip": 0.01135375, + "auxiliary_loss_mlp": 0.01047518, + "balance_loss_clip": 1.05375695, + "balance_loss_mlp": 1.02801585, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.126977386288804, + "language_loss": 0.80032122, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82215023, + "num_input_tokens_seen": 63231020, + "step": 2924, + "time_per_iteration": 2.521221160888672 + }, + { + "auxiliary_loss_clip": 0.01139143, + "auxiliary_loss_mlp": 0.01061147, + "balance_loss_clip": 1.05515206, + "balance_loss_mlp": 1.04077411, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 2.6196094374524965, + "language_loss": 0.7141481, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73615104, + "num_input_tokens_seen": 63246245, + "step": 2925, + "time_per_iteration": 2.4640119075775146 + }, + { + "auxiliary_loss_clip": 0.01123939, + "auxiliary_loss_mlp": 0.01042253, + "balance_loss_clip": 1.05416548, + "balance_loss_mlp": 1.02460957, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.7256150374017654, + "language_loss": 0.72025168, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.74191356, + "num_input_tokens_seen": 63267790, + "step": 2926, + "time_per_iteration": 2.56894588470459 + }, + { + "auxiliary_loss_clip": 0.01109017, + "auxiliary_loss_mlp": 0.0105292, + "balance_loss_clip": 1.0467453, + "balance_loss_mlp": 1.03065169, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 1.9125637543781417, + "language_loss": 0.84883922, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.8704586, + "num_input_tokens_seen": 63286830, + "step": 2927, + "time_per_iteration": 2.5407865047454834 + }, + { + "auxiliary_loss_clip": 0.01108435, + "auxiliary_loss_mlp": 0.01044433, + "balance_loss_clip": 1.05497599, + "balance_loss_mlp": 1.02695644, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 10.323732892295261, + "language_loss": 0.72206783, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74359649, + "num_input_tokens_seen": 63308870, + "step": 2928, + "time_per_iteration": 2.6952602863311768 + }, + { + "auxiliary_loss_clip": 0.01125014, + "auxiliary_loss_mlp": 0.01045679, + "balance_loss_clip": 1.05034423, + "balance_loss_mlp": 1.02721393, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 2.0169035058981337, + "language_loss": 0.83013237, + "learning_rate": 3.780232677305744e-06, + "loss": 0.8518393, + "num_input_tokens_seen": 63329005, + "step": 2929, + "time_per_iteration": 2.5864181518554688 + }, + { + "auxiliary_loss_clip": 0.01131847, + "auxiliary_loss_mlp": 0.01038213, + "balance_loss_clip": 1.05081415, + "balance_loss_mlp": 1.02052271, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.581549077399104, + "language_loss": 0.79117012, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81287074, + "num_input_tokens_seen": 63349390, + "step": 2930, + "time_per_iteration": 2.6303927898406982 + }, + { + "auxiliary_loss_clip": 0.01164486, + "auxiliary_loss_mlp": 0.01047313, + "balance_loss_clip": 1.05849671, + "balance_loss_mlp": 1.02801299, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.073039427359738, + "language_loss": 0.77049077, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.7926088, + "num_input_tokens_seen": 63368835, + "step": 2931, + "time_per_iteration": 2.4929544925689697 + }, + { + "auxiliary_loss_clip": 0.01079973, + "auxiliary_loss_mlp": 0.01043614, + "balance_loss_clip": 1.04279041, + "balance_loss_mlp": 1.02496946, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.583629840522386, + "language_loss": 0.74945009, + "learning_rate": 3.779699901503696e-06, + "loss": 0.77068591, + "num_input_tokens_seen": 63385220, + "step": 2932, + "time_per_iteration": 2.6309995651245117 + }, + { + "auxiliary_loss_clip": 0.0115558, + "auxiliary_loss_mlp": 0.01046406, + "balance_loss_clip": 1.05338717, + "balance_loss_mlp": 1.0262959, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.45977703477994, + "language_loss": 0.89708066, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.91910052, + "num_input_tokens_seen": 63400865, + "step": 2933, + "time_per_iteration": 2.455672025680542 + }, + { + "auxiliary_loss_clip": 0.01161472, + "auxiliary_loss_mlp": 0.01049533, + "balance_loss_clip": 1.05912209, + "balance_loss_mlp": 1.03248572, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.6221589185064333, + "language_loss": 0.88590288, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90801293, + "num_input_tokens_seen": 63421390, + "step": 2934, + "time_per_iteration": 2.490057945251465 + }, + { + "auxiliary_loss_clip": 0.01133336, + "auxiliary_loss_mlp": 0.01046846, + "balance_loss_clip": 1.05665445, + "balance_loss_mlp": 1.02921462, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.5276876515308404, + "language_loss": 0.70554078, + "learning_rate": 3.779166518324077e-06, + "loss": 0.72734261, + "num_input_tokens_seen": 63444715, + "step": 2935, + "time_per_iteration": 2.8173294067382812 + }, + { + "auxiliary_loss_clip": 0.01131149, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_clip": 1.05387068, + "balance_loss_mlp": 1.02289093, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 1.949096035694976, + "language_loss": 0.70248419, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.72421461, + "num_input_tokens_seen": 63465525, + "step": 2936, + "time_per_iteration": 2.5764317512512207 + }, + { + "auxiliary_loss_clip": 0.01114978, + "auxiliary_loss_mlp": 0.01047801, + "balance_loss_clip": 1.05322528, + "balance_loss_mlp": 1.0296452, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 2.083411347937605, + "language_loss": 0.71347213, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73509991, + "num_input_tokens_seen": 63485815, + "step": 2937, + "time_per_iteration": 2.6485977172851562 + }, + { + "auxiliary_loss_clip": 0.01142151, + "auxiliary_loss_mlp": 0.01047971, + "balance_loss_clip": 1.0585804, + "balance_loss_mlp": 1.02870655, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.436171003446247, + "language_loss": 0.7548089, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.77671015, + "num_input_tokens_seen": 63503905, + "step": 2938, + "time_per_iteration": 2.5865838527679443 + }, + { + "auxiliary_loss_clip": 0.01155121, + "auxiliary_loss_mlp": 0.01041203, + "balance_loss_clip": 1.05930531, + "balance_loss_mlp": 1.02315474, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 1.939036891976631, + "language_loss": 0.7078234, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.72978657, + "num_input_tokens_seen": 63521985, + "step": 2939, + "time_per_iteration": 2.5048131942749023 + }, + { + "auxiliary_loss_clip": 0.01167151, + "auxiliary_loss_mlp": 0.0104378, + "balance_loss_clip": 1.06215882, + "balance_loss_mlp": 1.02538586, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 2.6702410702323647, + "language_loss": 0.74444938, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.76655865, + "num_input_tokens_seen": 63539830, + "step": 2940, + "time_per_iteration": 2.464853048324585 + }, + { + "auxiliary_loss_clip": 0.01125879, + "auxiliary_loss_mlp": 0.01044743, + "balance_loss_clip": 1.05474925, + "balance_loss_mlp": 1.02454853, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.233920629483509, + "language_loss": 0.85808021, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87978649, + "num_input_tokens_seen": 63555495, + "step": 2941, + "time_per_iteration": 2.5042381286621094 + }, + { + "auxiliary_loss_clip": 0.01167326, + "auxiliary_loss_mlp": 0.01040724, + "balance_loss_clip": 1.05984485, + "balance_loss_mlp": 1.02178144, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 1.935671469006392, + "language_loss": 0.76969457, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.79177511, + "num_input_tokens_seen": 63575290, + "step": 2942, + "time_per_iteration": 2.5166146755218506 + }, + { + "auxiliary_loss_clip": 0.01112362, + "auxiliary_loss_mlp": 0.00790145, + "balance_loss_clip": 1.05183399, + "balance_loss_mlp": 1.00080657, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 2.915930389586309, + "language_loss": 0.80513191, + "learning_rate": 3.77774119516197e-06, + "loss": 0.824157, + "num_input_tokens_seen": 63594670, + "step": 2943, + "time_per_iteration": 2.5652670860290527 + }, + { + "auxiliary_loss_clip": 0.01135104, + "auxiliary_loss_mlp": 0.01051114, + "balance_loss_clip": 1.05083346, + "balance_loss_mlp": 1.02952564, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 1.7428143675839032, + "language_loss": 0.80889249, + "learning_rate": 3.777562726341155e-06, + "loss": 0.83075464, + "num_input_tokens_seen": 63614780, + "step": 2944, + "time_per_iteration": 4.000368356704712 + }, + { + "auxiliary_loss_clip": 0.01169804, + "auxiliary_loss_mlp": 0.01058811, + "balance_loss_clip": 1.0610745, + "balance_loss_mlp": 1.04079843, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 1.9758986918485084, + "language_loss": 0.73791194, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.76019812, + "num_input_tokens_seen": 63637190, + "step": 2945, + "time_per_iteration": 2.6353142261505127 + }, + { + "auxiliary_loss_clip": 0.01155275, + "auxiliary_loss_mlp": 0.01046951, + "balance_loss_clip": 1.06095243, + "balance_loss_mlp": 1.02909374, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 3.6861770580490467, + "language_loss": 0.77897573, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.80099797, + "num_input_tokens_seen": 63652140, + "step": 2946, + "time_per_iteration": 2.4230356216430664 + }, + { + "auxiliary_loss_clip": 0.01113052, + "auxiliary_loss_mlp": 0.01051719, + "balance_loss_clip": 1.04788995, + "balance_loss_mlp": 1.03225243, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.9769510413940725, + "language_loss": 0.76764059, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78928828, + "num_input_tokens_seen": 63671700, + "step": 2947, + "time_per_iteration": 4.086950302124023 + }, + { + "auxiliary_loss_clip": 0.01152961, + "auxiliary_loss_mlp": 0.01045914, + "balance_loss_clip": 1.05523479, + "balance_loss_mlp": 1.02722192, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.2243232203099783, + "language_loss": 0.72915626, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.75114501, + "num_input_tokens_seen": 63691685, + "step": 2948, + "time_per_iteration": 2.6007795333862305 + }, + { + "auxiliary_loss_clip": 0.01159798, + "auxiliary_loss_mlp": 0.01050753, + "balance_loss_clip": 1.06162107, + "balance_loss_mlp": 1.03214478, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.9038847094451226, + "language_loss": 0.81889629, + "learning_rate": 3.776669371292171e-06, + "loss": 0.84100187, + "num_input_tokens_seen": 63711720, + "step": 2949, + "time_per_iteration": 2.525191068649292 + }, + { + "auxiliary_loss_clip": 0.01085858, + "auxiliary_loss_mlp": 0.01014703, + "balance_loss_clip": 1.05391645, + "balance_loss_mlp": 1.01162744, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.756110727369116, + "language_loss": 0.64967579, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67068142, + "num_input_tokens_seen": 63776280, + "step": 2950, + "time_per_iteration": 3.1370508670806885 + }, + { + "auxiliary_loss_clip": 0.01126755, + "auxiliary_loss_mlp": 0.01048042, + "balance_loss_clip": 1.05577648, + "balance_loss_mlp": 1.02908814, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 1.8925690918478262, + "language_loss": 0.84350502, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.86525309, + "num_input_tokens_seen": 63797535, + "step": 2951, + "time_per_iteration": 2.5926740169525146 + }, + { + "auxiliary_loss_clip": 0.01131939, + "auxiliary_loss_mlp": 0.01055545, + "balance_loss_clip": 1.05294871, + "balance_loss_mlp": 1.03630459, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 2.4934988765792347, + "language_loss": 0.80677176, + "learning_rate": 3.776132549750806e-06, + "loss": 0.8286466, + "num_input_tokens_seen": 63817045, + "step": 2952, + "time_per_iteration": 2.557086229324341 + }, + { + "auxiliary_loss_clip": 0.01170922, + "auxiliary_loss_mlp": 0.01053114, + "balance_loss_clip": 1.06299901, + "balance_loss_mlp": 1.03353965, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.1486354600950675, + "language_loss": 0.79704541, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.81928575, + "num_input_tokens_seen": 63837665, + "step": 2953, + "time_per_iteration": 3.851168394088745 + }, + { + "auxiliary_loss_clip": 0.01129321, + "auxiliary_loss_mlp": 0.01052057, + "balance_loss_clip": 1.05337381, + "balance_loss_mlp": 1.03348422, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.915376403984561, + "language_loss": 0.88054448, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90235829, + "num_input_tokens_seen": 63858455, + "step": 2954, + "time_per_iteration": 2.6493117809295654 + }, + { + "auxiliary_loss_clip": 0.01146748, + "auxiliary_loss_mlp": 0.01052328, + "balance_loss_clip": 1.06023371, + "balance_loss_mlp": 1.03275418, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 1.7894595625694751, + "language_loss": 0.85158181, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.87357259, + "num_input_tokens_seen": 63876935, + "step": 2955, + "time_per_iteration": 2.5351712703704834 + }, + { + "auxiliary_loss_clip": 0.01136245, + "auxiliary_loss_mlp": 0.01055196, + "balance_loss_clip": 1.05405092, + "balance_loss_mlp": 1.03471565, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 1.8163530856363483, + "language_loss": 0.7130388, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73495317, + "num_input_tokens_seen": 63896815, + "step": 2956, + "time_per_iteration": 2.522132396697998 + }, + { + "auxiliary_loss_clip": 0.01153628, + "auxiliary_loss_mlp": 0.01051491, + "balance_loss_clip": 1.05859709, + "balance_loss_mlp": 1.03232253, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 1.8162987000038446, + "language_loss": 0.83193529, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.8539865, + "num_input_tokens_seen": 63916140, + "step": 2957, + "time_per_iteration": 2.5264017581939697 + }, + { + "auxiliary_loss_clip": 0.01107498, + "auxiliary_loss_mlp": 0.01047629, + "balance_loss_clip": 1.04927003, + "balance_loss_mlp": 1.02855515, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 2.353307824244183, + "language_loss": 0.75510848, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.77665973, + "num_input_tokens_seen": 63935220, + "step": 2958, + "time_per_iteration": 2.6452972888946533 + }, + { + "auxiliary_loss_clip": 0.0115299, + "auxiliary_loss_mlp": 0.01050241, + "balance_loss_clip": 1.06315839, + "balance_loss_mlp": 1.03175128, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 1.914645534944627, + "language_loss": 0.80670822, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.82874054, + "num_input_tokens_seen": 63954550, + "step": 2959, + "time_per_iteration": 2.5315101146698 + }, + { + "auxiliary_loss_clip": 0.01174973, + "auxiliary_loss_mlp": 0.01054979, + "balance_loss_clip": 1.0628686, + "balance_loss_mlp": 1.03392673, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 1.8675831748867302, + "language_loss": 0.52235061, + "learning_rate": 3.774698062689362e-06, + "loss": 0.5446502, + "num_input_tokens_seen": 63972425, + "step": 2960, + "time_per_iteration": 2.431777238845825 + }, + { + "auxiliary_loss_clip": 0.01119287, + "auxiliary_loss_mlp": 0.01057652, + "balance_loss_clip": 1.05407536, + "balance_loss_mlp": 1.03702891, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.7976030441622135, + "language_loss": 0.89226723, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.91403663, + "num_input_tokens_seen": 63992165, + "step": 2961, + "time_per_iteration": 2.5996460914611816 + }, + { + "auxiliary_loss_clip": 0.01126992, + "auxiliary_loss_mlp": 0.01058242, + "balance_loss_clip": 1.05722094, + "balance_loss_mlp": 1.03710639, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 1.628850938584766, + "language_loss": 0.79121548, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81306785, + "num_input_tokens_seen": 64013470, + "step": 2962, + "time_per_iteration": 4.007189750671387 + }, + { + "auxiliary_loss_clip": 0.01153137, + "auxiliary_loss_mlp": 0.01059604, + "balance_loss_clip": 1.05853534, + "balance_loss_mlp": 1.03743076, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.66632258499089, + "language_loss": 0.74984658, + "learning_rate": 3.774159019458203e-06, + "loss": 0.77197403, + "num_input_tokens_seen": 64030975, + "step": 2963, + "time_per_iteration": 2.494187116622925 + }, + { + "auxiliary_loss_clip": 0.01146454, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_clip": 1.05878949, + "balance_loss_mlp": 1.02519655, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 2.1098937133256976, + "language_loss": 0.7859571, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80788267, + "num_input_tokens_seen": 64050075, + "step": 2964, + "time_per_iteration": 2.5258588790893555 + }, + { + "auxiliary_loss_clip": 0.01157824, + "auxiliary_loss_mlp": 0.00786275, + "balance_loss_clip": 1.06037617, + "balance_loss_mlp": 1.00121284, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 2.916965986033709, + "language_loss": 0.81057513, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83001614, + "num_input_tokens_seen": 64071920, + "step": 2965, + "time_per_iteration": 2.5206258296966553 + }, + { + "auxiliary_loss_clip": 0.01153345, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.05692053, + "balance_loss_mlp": 1.03347039, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.5906318453789154, + "language_loss": 0.94692433, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.96896923, + "num_input_tokens_seen": 64086835, + "step": 2966, + "time_per_iteration": 2.435987949371338 + }, + { + "auxiliary_loss_clip": 0.01121582, + "auxiliary_loss_mlp": 0.00786811, + "balance_loss_clip": 1.05491948, + "balance_loss_mlp": 1.00119102, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 3.3229217425454807, + "language_loss": 0.73216546, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.75124937, + "num_input_tokens_seen": 64107360, + "step": 2967, + "time_per_iteration": 2.7106869220733643 + }, + { + "auxiliary_loss_clip": 0.01135498, + "auxiliary_loss_mlp": 0.01053206, + "balance_loss_clip": 1.05543506, + "balance_loss_mlp": 1.03310776, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 1.9628517927222633, + "language_loss": 0.77088261, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79276967, + "num_input_tokens_seen": 64124690, + "step": 2968, + "time_per_iteration": 2.504789352416992 + }, + { + "auxiliary_loss_clip": 0.01089918, + "auxiliary_loss_mlp": 0.01047581, + "balance_loss_clip": 1.0439167, + "balance_loss_mlp": 1.02787614, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 1.6354328901869764, + "language_loss": 0.75647116, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.77784616, + "num_input_tokens_seen": 64146315, + "step": 2969, + "time_per_iteration": 2.6763696670532227 + }, + { + "auxiliary_loss_clip": 0.01069563, + "auxiliary_loss_mlp": 0.01028997, + "balance_loss_clip": 1.05476975, + "balance_loss_mlp": 1.02657735, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8600522287442537, + "language_loss": 0.69024456, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71123016, + "num_input_tokens_seen": 64210875, + "step": 2970, + "time_per_iteration": 3.196012020111084 + }, + { + "auxiliary_loss_clip": 0.01133799, + "auxiliary_loss_mlp": 0.01048029, + "balance_loss_clip": 1.0533998, + "balance_loss_mlp": 1.02788281, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 1.771747044674582, + "language_loss": 0.67706007, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69887835, + "num_input_tokens_seen": 64230740, + "step": 2971, + "time_per_iteration": 2.664808988571167 + }, + { + "auxiliary_loss_clip": 0.0111637, + "auxiliary_loss_mlp": 0.01048246, + "balance_loss_clip": 1.0532577, + "balance_loss_mlp": 1.02714634, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 5.919625558309414, + "language_loss": 0.89576793, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91741419, + "num_input_tokens_seen": 64252300, + "step": 2972, + "time_per_iteration": 2.630100965499878 + }, + { + "auxiliary_loss_clip": 0.01126789, + "auxiliary_loss_mlp": 0.01058211, + "balance_loss_clip": 1.05228174, + "balance_loss_mlp": 1.03718269, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.2021474142363004, + "language_loss": 0.8802641, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90211415, + "num_input_tokens_seen": 64270105, + "step": 2973, + "time_per_iteration": 2.552565813064575 + }, + { + "auxiliary_loss_clip": 0.01165241, + "auxiliary_loss_mlp": 0.01054247, + "balance_loss_clip": 1.05723286, + "balance_loss_mlp": 1.03442287, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.6612252909965686, + "language_loss": 0.76321197, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.78540683, + "num_input_tokens_seen": 64287250, + "step": 2974, + "time_per_iteration": 2.4611690044403076 + }, + { + "auxiliary_loss_clip": 0.01142768, + "auxiliary_loss_mlp": 0.01061129, + "balance_loss_clip": 1.05626202, + "balance_loss_mlp": 1.04092312, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.1444015210983873, + "language_loss": 0.74542105, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76745999, + "num_input_tokens_seen": 64307140, + "step": 2975, + "time_per_iteration": 2.617753267288208 + }, + { + "auxiliary_loss_clip": 0.01152076, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.0543884, + "balance_loss_mlp": 1.03193688, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 4.07762193237743, + "language_loss": 0.73424065, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.75627553, + "num_input_tokens_seen": 64328760, + "step": 2976, + "time_per_iteration": 2.53472638130188 + }, + { + "auxiliary_loss_clip": 0.01151125, + "auxiliary_loss_mlp": 0.01044437, + "balance_loss_clip": 1.05742955, + "balance_loss_mlp": 1.02845657, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.734358935013401, + "language_loss": 0.77412641, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79608202, + "num_input_tokens_seen": 64348800, + "step": 2977, + "time_per_iteration": 2.54030704498291 + }, + { + "auxiliary_loss_clip": 0.01128412, + "auxiliary_loss_mlp": 0.01054725, + "balance_loss_clip": 1.05797887, + "balance_loss_mlp": 1.03661704, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.0240172648491, + "language_loss": 0.79725569, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.81908709, + "num_input_tokens_seen": 64367955, + "step": 2978, + "time_per_iteration": 2.5944840908050537 + }, + { + "auxiliary_loss_clip": 0.01146909, + "auxiliary_loss_mlp": 0.01054479, + "balance_loss_clip": 1.055269, + "balance_loss_mlp": 1.03503621, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.4752821555455629, + "language_loss": 0.76351273, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78552663, + "num_input_tokens_seen": 64389805, + "step": 2979, + "time_per_iteration": 2.588465690612793 + }, + { + "auxiliary_loss_clip": 0.01124812, + "auxiliary_loss_mlp": 0.01050392, + "balance_loss_clip": 1.05439913, + "balance_loss_mlp": 1.03171206, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 1.7108090724767868, + "language_loss": 0.69486272, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.71661478, + "num_input_tokens_seen": 64408220, + "step": 2980, + "time_per_iteration": 2.5314948558807373 + }, + { + "auxiliary_loss_clip": 0.01154646, + "auxiliary_loss_mlp": 0.0104805, + "balance_loss_clip": 1.05555701, + "balance_loss_mlp": 1.02616346, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.8440621762822005, + "language_loss": 0.70766962, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.72969657, + "num_input_tokens_seen": 64426380, + "step": 2981, + "time_per_iteration": 2.4575467109680176 + }, + { + "auxiliary_loss_clip": 0.01138796, + "auxiliary_loss_mlp": 0.01068581, + "balance_loss_clip": 1.05744493, + "balance_loss_mlp": 1.04876888, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.470069827481919, + "language_loss": 0.82109922, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84317297, + "num_input_tokens_seen": 64444355, + "step": 2982, + "time_per_iteration": 2.487299680709839 + }, + { + "auxiliary_loss_clip": 0.01163723, + "auxiliary_loss_mlp": 0.01049858, + "balance_loss_clip": 1.05768228, + "balance_loss_mlp": 1.03188109, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 1.4810122993011205, + "language_loss": 0.83037055, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.8525064, + "num_input_tokens_seen": 64467800, + "step": 2983, + "time_per_iteration": 2.5374302864074707 + }, + { + "auxiliary_loss_clip": 0.01156759, + "auxiliary_loss_mlp": 0.01055288, + "balance_loss_clip": 1.05399156, + "balance_loss_mlp": 1.03584492, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 2.801533894676885, + "language_loss": 0.85199904, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87411952, + "num_input_tokens_seen": 64487230, + "step": 2984, + "time_per_iteration": 3.9911653995513916 + }, + { + "auxiliary_loss_clip": 0.01128077, + "auxiliary_loss_mlp": 0.01049166, + "balance_loss_clip": 1.05160451, + "balance_loss_mlp": 1.02954388, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 1.4320550840203468, + "language_loss": 0.89272207, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91449451, + "num_input_tokens_seen": 64509165, + "step": 2985, + "time_per_iteration": 2.6053783893585205 + }, + { + "auxiliary_loss_clip": 0.01161307, + "auxiliary_loss_mlp": 0.0105097, + "balance_loss_clip": 1.05789351, + "balance_loss_mlp": 1.03380394, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 1.822506944071193, + "language_loss": 0.69394487, + "learning_rate": 3.770006252694922e-06, + "loss": 0.71606767, + "num_input_tokens_seen": 64527940, + "step": 2986, + "time_per_iteration": 2.453700542449951 + }, + { + "auxiliary_loss_clip": 0.01162926, + "auxiliary_loss_mlp": 0.00785818, + "balance_loss_clip": 1.05768907, + "balance_loss_mlp": 1.00137269, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.3905704507068193, + "language_loss": 0.77638805, + "learning_rate": 3.769824891588688e-06, + "loss": 0.79587543, + "num_input_tokens_seen": 64545230, + "step": 2987, + "time_per_iteration": 3.967909336090088 + }, + { + "auxiliary_loss_clip": 0.01167191, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_clip": 1.05678546, + "balance_loss_mlp": 1.0280515, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 1.6954611067276908, + "language_loss": 0.78437138, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.80652452, + "num_input_tokens_seen": 64563820, + "step": 2988, + "time_per_iteration": 2.4257025718688965 + }, + { + "auxiliary_loss_clip": 0.01025191, + "auxiliary_loss_mlp": 0.00760189, + "balance_loss_clip": 1.03698444, + "balance_loss_mlp": 1.00121295, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7655762039733003, + "language_loss": 0.62713373, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64498746, + "num_input_tokens_seen": 64621315, + "step": 2989, + "time_per_iteration": 3.077286958694458 + }, + { + "auxiliary_loss_clip": 0.01144671, + "auxiliary_loss_mlp": 0.01043251, + "balance_loss_clip": 1.05684388, + "balance_loss_mlp": 1.02494025, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 4.0879015233851, + "language_loss": 0.70206732, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72394657, + "num_input_tokens_seen": 64639885, + "step": 2990, + "time_per_iteration": 2.5302746295928955 + }, + { + "auxiliary_loss_clip": 0.01142114, + "auxiliary_loss_mlp": 0.01050094, + "balance_loss_clip": 1.05449986, + "balance_loss_mlp": 1.0314374, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.8351127096529147, + "language_loss": 0.69013613, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.71205819, + "num_input_tokens_seen": 64661220, + "step": 2991, + "time_per_iteration": 2.682215929031372 + }, + { + "auxiliary_loss_clip": 0.0110901, + "auxiliary_loss_mlp": 0.01044532, + "balance_loss_clip": 1.05198038, + "balance_loss_mlp": 1.02502906, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.8095153480053259, + "language_loss": 0.82585418, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84738958, + "num_input_tokens_seen": 64682530, + "step": 2992, + "time_per_iteration": 3.996781349182129 + }, + { + "auxiliary_loss_clip": 0.01146666, + "auxiliary_loss_mlp": 0.01048615, + "balance_loss_clip": 1.05323911, + "balance_loss_mlp": 1.03032827, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 2.062772499295929, + "language_loss": 0.81758451, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.83953738, + "num_input_tokens_seen": 64701025, + "step": 2993, + "time_per_iteration": 2.4828991889953613 + }, + { + "auxiliary_loss_clip": 0.01139925, + "auxiliary_loss_mlp": 0.01044504, + "balance_loss_clip": 1.04945755, + "balance_loss_mlp": 1.02605069, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.9253124632456367, + "language_loss": 0.78794539, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80978966, + "num_input_tokens_seen": 64719570, + "step": 2994, + "time_per_iteration": 2.4946980476379395 + }, + { + "auxiliary_loss_clip": 0.01165128, + "auxiliary_loss_mlp": 0.01044237, + "balance_loss_clip": 1.05606449, + "balance_loss_mlp": 1.02641535, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 1.9460787238676078, + "language_loss": 0.79942703, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82152069, + "num_input_tokens_seen": 64738110, + "step": 2995, + "time_per_iteration": 2.442544460296631 + }, + { + "auxiliary_loss_clip": 0.01149727, + "auxiliary_loss_mlp": 0.01048287, + "balance_loss_clip": 1.05429578, + "balance_loss_mlp": 1.03124046, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.7110251483864094, + "language_loss": 0.8441776, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86615777, + "num_input_tokens_seen": 64756345, + "step": 2996, + "time_per_iteration": 2.4778199195861816 + }, + { + "auxiliary_loss_clip": 0.01130033, + "auxiliary_loss_mlp": 0.01041366, + "balance_loss_clip": 1.0591526, + "balance_loss_mlp": 1.02332985, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 1.864163069720382, + "language_loss": 0.87959886, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90131283, + "num_input_tokens_seen": 64776375, + "step": 2997, + "time_per_iteration": 2.6019089221954346 + }, + { + "auxiliary_loss_clip": 0.01139711, + "auxiliary_loss_mlp": 0.01048368, + "balance_loss_clip": 1.04872477, + "balance_loss_mlp": 1.02834105, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 1.7830515361984236, + "language_loss": 0.85525769, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.8771385, + "num_input_tokens_seen": 64796210, + "step": 2998, + "time_per_iteration": 2.6194963455200195 + }, + { + "auxiliary_loss_clip": 0.0116196, + "auxiliary_loss_mlp": 0.01046184, + "balance_loss_clip": 1.0573653, + "balance_loss_mlp": 1.02819562, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.7788682069407997, + "language_loss": 0.84346104, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86554247, + "num_input_tokens_seen": 64818590, + "step": 2999, + "time_per_iteration": 2.5219738483428955 + }, + { + "auxiliary_loss_clip": 0.01148263, + "auxiliary_loss_mlp": 0.01051202, + "balance_loss_clip": 1.05200279, + "balance_loss_mlp": 1.03167558, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.6104554282563925, + "language_loss": 0.75087696, + "learning_rate": 3.76746109252814e-06, + "loss": 0.77287161, + "num_input_tokens_seen": 64838350, + "step": 3000, + "time_per_iteration": 3.8568098545074463 + }, + { + "auxiliary_loss_clip": 0.01138896, + "auxiliary_loss_mlp": 0.00786131, + "balance_loss_clip": 1.05506158, + "balance_loss_mlp": 1.0014441, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 1.6727932764502844, + "language_loss": 0.71257597, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73182631, + "num_input_tokens_seen": 64858065, + "step": 3001, + "time_per_iteration": 2.5498766899108887 + }, + { + "auxiliary_loss_clip": 0.01154167, + "auxiliary_loss_mlp": 0.01049843, + "balance_loss_clip": 1.05584979, + "balance_loss_mlp": 1.03059101, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.3343833017662976, + "language_loss": 0.88203299, + "learning_rate": 3.767096425420011e-06, + "loss": 0.90407306, + "num_input_tokens_seen": 64877305, + "step": 3002, + "time_per_iteration": 2.499195098876953 + }, + { + "auxiliary_loss_clip": 0.0116222, + "auxiliary_loss_mlp": 0.01047497, + "balance_loss_clip": 1.05642605, + "balance_loss_mlp": 1.0293299, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.588961737395044, + "language_loss": 0.80568016, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.82777733, + "num_input_tokens_seen": 64896955, + "step": 3003, + "time_per_iteration": 2.4511771202087402 + }, + { + "auxiliary_loss_clip": 0.01165149, + "auxiliary_loss_mlp": 0.01042716, + "balance_loss_clip": 1.05699897, + "balance_loss_mlp": 1.02457285, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 1.9792295894603311, + "language_loss": 0.6690619, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69114053, + "num_input_tokens_seen": 64917080, + "step": 3004, + "time_per_iteration": 2.4962804317474365 + }, + { + "auxiliary_loss_clip": 0.01152267, + "auxiliary_loss_mlp": 0.01051077, + "balance_loss_clip": 1.05591571, + "balance_loss_mlp": 1.03222942, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 1.71662054520213, + "language_loss": 0.85073709, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87277061, + "num_input_tokens_seen": 64935215, + "step": 3005, + "time_per_iteration": 2.4673571586608887 + }, + { + "auxiliary_loss_clip": 0.01147098, + "auxiliary_loss_mlp": 0.01042949, + "balance_loss_clip": 1.05435693, + "balance_loss_mlp": 1.02610469, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.510850323512729, + "language_loss": 0.8341592, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85605967, + "num_input_tokens_seen": 64956275, + "step": 3006, + "time_per_iteration": 2.54426646232605 + }, + { + "auxiliary_loss_clip": 0.01131079, + "auxiliary_loss_mlp": 0.0105525, + "balance_loss_clip": 1.04745972, + "balance_loss_mlp": 1.03611732, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 2.2930513531679977, + "language_loss": 0.77175653, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79361975, + "num_input_tokens_seen": 64979390, + "step": 3007, + "time_per_iteration": 2.569795846939087 + }, + { + "auxiliary_loss_clip": 0.01075827, + "auxiliary_loss_mlp": 0.01017539, + "balance_loss_clip": 1.05220079, + "balance_loss_mlp": 1.01490402, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8094731154282079, + "language_loss": 0.57019937, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59113312, + "num_input_tokens_seen": 65043135, + "step": 3008, + "time_per_iteration": 3.299694538116455 + }, + { + "auxiliary_loss_clip": 0.01136761, + "auxiliary_loss_mlp": 0.01050027, + "balance_loss_clip": 1.05346954, + "balance_loss_mlp": 1.03134704, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 1.709225879442944, + "language_loss": 0.67349505, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69536293, + "num_input_tokens_seen": 65062845, + "step": 3009, + "time_per_iteration": 2.5758776664733887 + }, + { + "auxiliary_loss_clip": 0.01163427, + "auxiliary_loss_mlp": 0.01044258, + "balance_loss_clip": 1.05653572, + "balance_loss_mlp": 1.02719891, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.8167538131015664, + "language_loss": 0.76172167, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.78379858, + "num_input_tokens_seen": 65082110, + "step": 3010, + "time_per_iteration": 2.4724392890930176 + }, + { + "auxiliary_loss_clip": 0.01126936, + "auxiliary_loss_mlp": 0.01041669, + "balance_loss_clip": 1.05247056, + "balance_loss_mlp": 1.02517033, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.5446017421421063, + "language_loss": 0.67284298, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.694529, + "num_input_tokens_seen": 65101985, + "step": 3011, + "time_per_iteration": 2.5321953296661377 + }, + { + "auxiliary_loss_clip": 0.01112281, + "auxiliary_loss_mlp": 0.00786772, + "balance_loss_clip": 1.04583681, + "balance_loss_mlp": 1.00129247, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 1.5263726458674818, + "language_loss": 0.7152226, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73421311, + "num_input_tokens_seen": 65129295, + "step": 3012, + "time_per_iteration": 2.8373873233795166 + }, + { + "auxiliary_loss_clip": 0.01136582, + "auxiliary_loss_mlp": 0.01048739, + "balance_loss_clip": 1.05580282, + "balance_loss_mlp": 1.0310483, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 3.1032329903800715, + "language_loss": 0.63, + "learning_rate": 3.765085966704609e-06, + "loss": 0.6518532, + "num_input_tokens_seen": 65150625, + "step": 3013, + "time_per_iteration": 2.636774778366089 + }, + { + "auxiliary_loss_clip": 0.01136421, + "auxiliary_loss_mlp": 0.01050921, + "balance_loss_clip": 1.05206418, + "balance_loss_mlp": 1.03346848, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.8089241268688392, + "language_loss": 0.76065922, + "learning_rate": 3.764902795998309e-06, + "loss": 0.78253269, + "num_input_tokens_seen": 65170880, + "step": 3014, + "time_per_iteration": 2.5432851314544678 + }, + { + "auxiliary_loss_clip": 0.01167244, + "auxiliary_loss_mlp": 0.0104733, + "balance_loss_clip": 1.05783248, + "balance_loss_mlp": 1.02686191, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.703293870535627, + "language_loss": 0.66130465, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.6834504, + "num_input_tokens_seen": 65192530, + "step": 3015, + "time_per_iteration": 2.511240243911743 + }, + { + "auxiliary_loss_clip": 0.01135202, + "auxiliary_loss_mlp": 0.00785725, + "balance_loss_clip": 1.05529392, + "balance_loss_mlp": 1.00129128, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.6677800838024046, + "language_loss": 0.77743876, + "learning_rate": 3.764536253816785e-06, + "loss": 0.79664809, + "num_input_tokens_seen": 65211675, + "step": 3016, + "time_per_iteration": 2.5224952697753906 + }, + { + "auxiliary_loss_clip": 0.01151622, + "auxiliary_loss_mlp": 0.01050338, + "balance_loss_clip": 1.05657005, + "balance_loss_mlp": 1.03104997, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 2.216007152022403, + "language_loss": 0.83600843, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85802799, + "num_input_tokens_seen": 65231185, + "step": 3017, + "time_per_iteration": 2.4952008724212646 + }, + { + "auxiliary_loss_clip": 0.01146389, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.05387688, + "balance_loss_mlp": 1.02181602, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 2.6088323003380003, + "language_loss": 0.67673695, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69858694, + "num_input_tokens_seen": 65251645, + "step": 3018, + "time_per_iteration": 2.65901780128479 + }, + { + "auxiliary_loss_clip": 0.01151848, + "auxiliary_loss_mlp": 0.00785036, + "balance_loss_clip": 1.05673039, + "balance_loss_mlp": 1.0013175, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 1.8795538455610943, + "language_loss": 0.75997829, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77934712, + "num_input_tokens_seen": 65271125, + "step": 3019, + "time_per_iteration": 2.544480085372925 + }, + { + "auxiliary_loss_clip": 0.01134105, + "auxiliary_loss_mlp": 0.01045697, + "balance_loss_clip": 1.06291354, + "balance_loss_mlp": 1.02589631, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.1139965101577136, + "language_loss": 0.81460118, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.8363992, + "num_input_tokens_seen": 65290600, + "step": 3020, + "time_per_iteration": 2.612592935562134 + }, + { + "auxiliary_loss_clip": 0.01140939, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_clip": 1.05775452, + "balance_loss_mlp": 1.02350843, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 1.895533027262837, + "language_loss": 0.77061796, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79246032, + "num_input_tokens_seen": 65311040, + "step": 3021, + "time_per_iteration": 2.561718225479126 + }, + { + "auxiliary_loss_clip": 0.01146038, + "auxiliary_loss_mlp": 0.01042284, + "balance_loss_clip": 1.05316782, + "balance_loss_mlp": 1.02360368, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.6556612202475962, + "language_loss": 0.84974492, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87162811, + "num_input_tokens_seen": 65332115, + "step": 3022, + "time_per_iteration": 2.521127223968506 + }, + { + "auxiliary_loss_clip": 0.01145874, + "auxiliary_loss_mlp": 0.01044657, + "balance_loss_clip": 1.06458211, + "balance_loss_mlp": 1.02503562, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 2.0539308982985665, + "language_loss": 0.69803333, + "learning_rate": 3.763251248837859e-06, + "loss": 0.71993864, + "num_input_tokens_seen": 65352210, + "step": 3023, + "time_per_iteration": 2.604294538497925 + }, + { + "auxiliary_loss_clip": 0.01136521, + "auxiliary_loss_mlp": 0.01047868, + "balance_loss_clip": 1.05105424, + "balance_loss_mlp": 1.02978396, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 2.5553796965789393, + "language_loss": 0.73948503, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76132894, + "num_input_tokens_seen": 65370600, + "step": 3024, + "time_per_iteration": 3.938422918319702 + }, + { + "auxiliary_loss_clip": 0.01149588, + "auxiliary_loss_mlp": 0.01045938, + "balance_loss_clip": 1.05713212, + "balance_loss_mlp": 1.0275563, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 1.8503503617768682, + "language_loss": 0.88321251, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90516776, + "num_input_tokens_seen": 65387270, + "step": 3025, + "time_per_iteration": 2.4392800331115723 + }, + { + "auxiliary_loss_clip": 0.01142121, + "auxiliary_loss_mlp": 0.0104984, + "balance_loss_clip": 1.05816746, + "balance_loss_mlp": 1.03093314, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 1.7331232504378402, + "language_loss": 0.7920804, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.81400001, + "num_input_tokens_seen": 65406550, + "step": 3026, + "time_per_iteration": 4.014906167984009 + }, + { + "auxiliary_loss_clip": 0.0113571, + "auxiliary_loss_mlp": 0.0105494, + "balance_loss_clip": 1.05680037, + "balance_loss_mlp": 1.03659344, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6619868920085883, + "language_loss": 0.76057386, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78248036, + "num_input_tokens_seen": 65425955, + "step": 3027, + "time_per_iteration": 2.544583559036255 + }, + { + "auxiliary_loss_clip": 0.0116624, + "auxiliary_loss_mlp": 0.01053082, + "balance_loss_clip": 1.05670667, + "balance_loss_mlp": 1.03456855, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.8369981900248458, + "language_loss": 0.85637558, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87856877, + "num_input_tokens_seen": 65442820, + "step": 3028, + "time_per_iteration": 2.423845052719116 + }, + { + "auxiliary_loss_clip": 0.01160891, + "auxiliary_loss_mlp": 0.01045429, + "balance_loss_clip": 1.05774164, + "balance_loss_mlp": 1.02719021, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.6076861130872202, + "language_loss": 0.83287573, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.85493892, + "num_input_tokens_seen": 65461825, + "step": 3029, + "time_per_iteration": 2.4839119911193848 + }, + { + "auxiliary_loss_clip": 0.0112144, + "auxiliary_loss_mlp": 0.01049332, + "balance_loss_clip": 1.05123258, + "balance_loss_mlp": 1.02938819, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.1019092893336984, + "language_loss": 0.77929711, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80100489, + "num_input_tokens_seen": 65479480, + "step": 3030, + "time_per_iteration": 2.5218441486358643 + }, + { + "auxiliary_loss_clip": 0.01141657, + "auxiliary_loss_mlp": 0.01043284, + "balance_loss_clip": 1.05089211, + "balance_loss_mlp": 1.02509308, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 2.1216345875839475, + "language_loss": 0.8470602, + "learning_rate": 3.761778660099352e-06, + "loss": 0.8689096, + "num_input_tokens_seen": 65497775, + "step": 3031, + "time_per_iteration": 3.974623680114746 + }, + { + "auxiliary_loss_clip": 0.01124738, + "auxiliary_loss_mlp": 0.00784233, + "balance_loss_clip": 1.05185366, + "balance_loss_mlp": 1.00111723, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.6381492955000325, + "language_loss": 0.79808712, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81717682, + "num_input_tokens_seen": 65516505, + "step": 3032, + "time_per_iteration": 2.6092779636383057 + }, + { + "auxiliary_loss_clip": 0.01168466, + "auxiliary_loss_mlp": 0.01045612, + "balance_loss_clip": 1.05930781, + "balance_loss_mlp": 1.02702773, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 1.87834896305161, + "language_loss": 0.81325263, + "learning_rate": 3.761409844706795e-06, + "loss": 0.83539343, + "num_input_tokens_seen": 65536160, + "step": 3033, + "time_per_iteration": 2.5213212966918945 + }, + { + "auxiliary_loss_clip": 0.01062128, + "auxiliary_loss_mlp": 0.01003779, + "balance_loss_clip": 1.06391048, + "balance_loss_mlp": 1.00121641, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8783741922512457, + "language_loss": 0.63558871, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.65624779, + "num_input_tokens_seen": 65589375, + "step": 3034, + "time_per_iteration": 3.1136560440063477 + }, + { + "auxiliary_loss_clip": 0.01134979, + "auxiliary_loss_mlp": 0.01041024, + "balance_loss_clip": 1.05703783, + "balance_loss_mlp": 1.023453, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 1.7475098670533307, + "language_loss": 0.79566836, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81742841, + "num_input_tokens_seen": 65606720, + "step": 3035, + "time_per_iteration": 2.5507514476776123 + }, + { + "auxiliary_loss_clip": 0.01143152, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_clip": 1.0614624, + "balance_loss_mlp": 1.02801752, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 1.8072019761388936, + "language_loss": 0.8495841, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87146717, + "num_input_tokens_seen": 65625495, + "step": 3036, + "time_per_iteration": 2.570413112640381 + }, + { + "auxiliary_loss_clip": 0.01143705, + "auxiliary_loss_mlp": 0.01041758, + "balance_loss_clip": 1.05504084, + "balance_loss_mlp": 1.02419806, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 1.897767907356192, + "language_loss": 0.80268949, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82454413, + "num_input_tokens_seen": 65643515, + "step": 3037, + "time_per_iteration": 2.490386486053467 + }, + { + "auxiliary_loss_clip": 0.01144323, + "auxiliary_loss_mlp": 0.0078605, + "balance_loss_clip": 1.06004643, + "balance_loss_mlp": 1.00101185, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.5913884911606524, + "language_loss": 0.8018049, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.8211087, + "num_input_tokens_seen": 65658155, + "step": 3038, + "time_per_iteration": 2.5013515949249268 + }, + { + "auxiliary_loss_clip": 0.01133253, + "auxiliary_loss_mlp": 0.01047374, + "balance_loss_clip": 1.05322516, + "balance_loss_mlp": 1.02844393, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 2.7664423458001237, + "language_loss": 0.67486441, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69667065, + "num_input_tokens_seen": 65679310, + "step": 3039, + "time_per_iteration": 4.097077131271362 + }, + { + "auxiliary_loss_clip": 0.01136166, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_clip": 1.05154514, + "balance_loss_mlp": 1.02861845, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.8407451669371966, + "language_loss": 0.73915112, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.7609843, + "num_input_tokens_seen": 65705235, + "step": 3040, + "time_per_iteration": 2.8139915466308594 + }, + { + "auxiliary_loss_clip": 0.01145943, + "auxiliary_loss_mlp": 0.01042655, + "balance_loss_clip": 1.05458617, + "balance_loss_mlp": 1.02440405, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 2.040106613527033, + "language_loss": 0.60605037, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62793636, + "num_input_tokens_seen": 65727575, + "step": 3041, + "time_per_iteration": 2.568145751953125 + }, + { + "auxiliary_loss_clip": 0.01122544, + "auxiliary_loss_mlp": 0.01054781, + "balance_loss_clip": 1.05073524, + "balance_loss_mlp": 1.0360769, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.589912967502473, + "language_loss": 0.60295355, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.62472677, + "num_input_tokens_seen": 65751370, + "step": 3042, + "time_per_iteration": 2.840999126434326 + }, + { + "auxiliary_loss_clip": 0.01128842, + "auxiliary_loss_mlp": 0.01048478, + "balance_loss_clip": 1.05195069, + "balance_loss_mlp": 1.03014338, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 2.1171701021506695, + "language_loss": 0.87479311, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.89656627, + "num_input_tokens_seen": 65771040, + "step": 3043, + "time_per_iteration": 2.606431245803833 + }, + { + "auxiliary_loss_clip": 0.0106132, + "auxiliary_loss_mlp": 0.01048939, + "balance_loss_clip": 1.04267454, + "balance_loss_mlp": 1.02881598, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 2.0772750795654162, + "language_loss": 0.71167302, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.73277557, + "num_input_tokens_seen": 65789345, + "step": 3044, + "time_per_iteration": 2.7720158100128174 + }, + { + "auxiliary_loss_clip": 0.0111232, + "auxiliary_loss_mlp": 0.01052941, + "balance_loss_clip": 1.05129731, + "balance_loss_mlp": 1.03240097, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 2.4235508324357107, + "language_loss": 0.64272511, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66437775, + "num_input_tokens_seen": 65810990, + "step": 3045, + "time_per_iteration": 2.8866751194000244 + }, + { + "auxiliary_loss_clip": 0.01162578, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.05876136, + "balance_loss_mlp": 1.03007483, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 2.6225798829944775, + "language_loss": 0.78849894, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.8106063, + "num_input_tokens_seen": 65827230, + "step": 3046, + "time_per_iteration": 2.445133924484253 + }, + { + "auxiliary_loss_clip": 0.01127344, + "auxiliary_loss_mlp": 0.01044623, + "balance_loss_clip": 1.05057788, + "balance_loss_mlp": 1.0254426, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 2.5465524678534925, + "language_loss": 0.79224783, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81396747, + "num_input_tokens_seen": 65845900, + "step": 3047, + "time_per_iteration": 2.5563502311706543 + }, + { + "auxiliary_loss_clip": 0.01154342, + "auxiliary_loss_mlp": 0.01042946, + "balance_loss_clip": 1.06135702, + "balance_loss_mlp": 1.02532709, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.5602550780336089, + "language_loss": 0.80803674, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.8300097, + "num_input_tokens_seen": 65868730, + "step": 3048, + "time_per_iteration": 2.610478639602661 + }, + { + "auxiliary_loss_clip": 0.01147943, + "auxiliary_loss_mlp": 0.0104997, + "balance_loss_clip": 1.05463552, + "balance_loss_mlp": 1.02990675, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 1.8812854360072255, + "language_loss": 0.86593556, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88791466, + "num_input_tokens_seen": 65888420, + "step": 3049, + "time_per_iteration": 2.4678525924682617 + }, + { + "auxiliary_loss_clip": 0.01152603, + "auxiliary_loss_mlp": 0.01055055, + "balance_loss_clip": 1.05516815, + "balance_loss_mlp": 1.03450286, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.0714799671223316, + "language_loss": 0.76738232, + "learning_rate": 3.75826413248424e-06, + "loss": 0.78945893, + "num_input_tokens_seen": 65905840, + "step": 3050, + "time_per_iteration": 2.4683995246887207 + }, + { + "auxiliary_loss_clip": 0.01136336, + "auxiliary_loss_mlp": 0.01041662, + "balance_loss_clip": 1.05102932, + "balance_loss_mlp": 1.02330363, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 1.997016892300619, + "language_loss": 0.99472082, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.01650083, + "num_input_tokens_seen": 65922845, + "step": 3051, + "time_per_iteration": 2.5279154777526855 + }, + { + "auxiliary_loss_clip": 0.01132705, + "auxiliary_loss_mlp": 0.01045959, + "balance_loss_clip": 1.052284, + "balance_loss_mlp": 1.02607465, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 2.0876119427417916, + "language_loss": 0.86418253, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.88596922, + "num_input_tokens_seen": 65945555, + "step": 3052, + "time_per_iteration": 2.5974671840667725 + }, + { + "auxiliary_loss_clip": 0.0116072, + "auxiliary_loss_mlp": 0.01048593, + "balance_loss_clip": 1.05620742, + "balance_loss_mlp": 1.03007936, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.701264761884445, + "language_loss": 0.72927189, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75136495, + "num_input_tokens_seen": 65963965, + "step": 3053, + "time_per_iteration": 2.4842774868011475 + }, + { + "auxiliary_loss_clip": 0.01167739, + "auxiliary_loss_mlp": 0.01051548, + "balance_loss_clip": 1.06071377, + "balance_loss_mlp": 1.03283191, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.6472674554723865, + "language_loss": 0.62269491, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64488775, + "num_input_tokens_seen": 65985965, + "step": 3054, + "time_per_iteration": 2.545727014541626 + }, + { + "auxiliary_loss_clip": 0.01115344, + "auxiliary_loss_mlp": 0.01048092, + "balance_loss_clip": 1.05701041, + "balance_loss_mlp": 1.02999592, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.046046473237514, + "language_loss": 0.7832123, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80484664, + "num_input_tokens_seen": 66005645, + "step": 3055, + "time_per_iteration": 2.6483616828918457 + }, + { + "auxiliary_loss_clip": 0.01104544, + "auxiliary_loss_mlp": 0.01066013, + "balance_loss_clip": 1.05181003, + "balance_loss_mlp": 1.0464505, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.6129885202858252, + "language_loss": 0.70225996, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72396553, + "num_input_tokens_seen": 66025675, + "step": 3056, + "time_per_iteration": 2.6513235569000244 + }, + { + "auxiliary_loss_clip": 0.01152291, + "auxiliary_loss_mlp": 0.01042173, + "balance_loss_clip": 1.0566709, + "balance_loss_mlp": 1.02407682, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.4897279124398413, + "language_loss": 0.80295354, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82489812, + "num_input_tokens_seen": 66046125, + "step": 3057, + "time_per_iteration": 2.517028570175171 + }, + { + "auxiliary_loss_clip": 0.01157562, + "auxiliary_loss_mlp": 0.01051172, + "balance_loss_clip": 1.05609894, + "balance_loss_mlp": 1.03041792, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.2275845222279904, + "language_loss": 0.82978064, + "learning_rate": 3.756777127858533e-06, + "loss": 0.85186797, + "num_input_tokens_seen": 66064375, + "step": 3058, + "time_per_iteration": 2.490913152694702 + }, + { + "auxiliary_loss_clip": 0.01127457, + "auxiliary_loss_mlp": 0.00786159, + "balance_loss_clip": 1.04948211, + "balance_loss_mlp": 1.00085664, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.154547645732033, + "language_loss": 0.86070633, + "learning_rate": 3.756590952429017e-06, + "loss": 0.87984252, + "num_input_tokens_seen": 66084590, + "step": 3059, + "time_per_iteration": 2.6200368404388428 + }, + { + "auxiliary_loss_clip": 0.01163087, + "auxiliary_loss_mlp": 0.00784797, + "balance_loss_clip": 1.05728817, + "balance_loss_mlp": 1.00088763, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 2.198942828573308, + "language_loss": 0.73125339, + "learning_rate": 3.756404710389396e-06, + "loss": 0.75073218, + "num_input_tokens_seen": 66107105, + "step": 3060, + "time_per_iteration": 2.5343968868255615 + }, + { + "auxiliary_loss_clip": 0.01158266, + "auxiliary_loss_mlp": 0.01048192, + "balance_loss_clip": 1.05873036, + "balance_loss_mlp": 1.027843, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.5372143824318105, + "language_loss": 0.73076528, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.75282985, + "num_input_tokens_seen": 66129295, + "step": 3061, + "time_per_iteration": 2.5371179580688477 + }, + { + "auxiliary_loss_clip": 0.01146244, + "auxiliary_loss_mlp": 0.01055948, + "balance_loss_clip": 1.05491865, + "balance_loss_mlp": 1.03592086, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.5965088231040334, + "language_loss": 0.81962734, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.84164929, + "num_input_tokens_seen": 66146910, + "step": 3062, + "time_per_iteration": 2.4818952083587646 + }, + { + "auxiliary_loss_clip": 0.01152738, + "auxiliary_loss_mlp": 0.01050573, + "balance_loss_clip": 1.0570662, + "balance_loss_mlp": 1.03141606, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 1.8834745216927287, + "language_loss": 0.73099595, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.75302905, + "num_input_tokens_seen": 66165370, + "step": 3063, + "time_per_iteration": 3.92618727684021 + }, + { + "auxiliary_loss_clip": 0.01145233, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.05627251, + "balance_loss_mlp": 1.03328085, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 1.9505913054359716, + "language_loss": 0.65688884, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.67884213, + "num_input_tokens_seen": 66186210, + "step": 3064, + "time_per_iteration": 2.521439552307129 + }, + { + "auxiliary_loss_clip": 0.0115219, + "auxiliary_loss_mlp": 0.01048615, + "balance_loss_clip": 1.05872798, + "balance_loss_mlp": 1.03050733, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 1.7724227957778234, + "language_loss": 0.68827361, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.71028167, + "num_input_tokens_seen": 66204800, + "step": 3065, + "time_per_iteration": 4.071812868118286 + }, + { + "auxiliary_loss_clip": 0.01144779, + "auxiliary_loss_mlp": 0.01045299, + "balance_loss_clip": 1.05832958, + "balance_loss_mlp": 1.02614188, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 2.030877282755504, + "language_loss": 0.7259481, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.74784887, + "num_input_tokens_seen": 66222195, + "step": 3066, + "time_per_iteration": 2.5877187252044678 + }, + { + "auxiliary_loss_clip": 0.01133517, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_clip": 1.04900265, + "balance_loss_mlp": 1.02646899, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 2.258634243621725, + "language_loss": 0.82244086, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84422195, + "num_input_tokens_seen": 66239505, + "step": 3067, + "time_per_iteration": 2.500649929046631 + }, + { + "auxiliary_loss_clip": 0.01082054, + "auxiliary_loss_mlp": 0.00759272, + "balance_loss_clip": 1.04224515, + "balance_loss_mlp": 1.00079846, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7825205829339031, + "language_loss": 0.5969522, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61536551, + "num_input_tokens_seen": 66295695, + "step": 3068, + "time_per_iteration": 2.9079513549804688 + }, + { + "auxiliary_loss_clip": 0.01135697, + "auxiliary_loss_mlp": 0.01048306, + "balance_loss_clip": 1.05849957, + "balance_loss_mlp": 1.02990031, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.7240889472016359, + "language_loss": 0.76655197, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78839201, + "num_input_tokens_seen": 66315315, + "step": 3069, + "time_per_iteration": 2.5329926013946533 + }, + { + "auxiliary_loss_clip": 0.0115118, + "auxiliary_loss_mlp": 0.01045166, + "balance_loss_clip": 1.05360818, + "balance_loss_mlp": 1.02603316, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 2.2185581778510852, + "language_loss": 0.85236764, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.87433112, + "num_input_tokens_seen": 66333675, + "step": 3070, + "time_per_iteration": 3.944803237915039 + }, + { + "auxiliary_loss_clip": 0.01132848, + "auxiliary_loss_mlp": 0.01042324, + "balance_loss_clip": 1.05267799, + "balance_loss_mlp": 1.02300024, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 2.5710394328197452, + "language_loss": 0.77548122, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79723299, + "num_input_tokens_seen": 66354075, + "step": 3071, + "time_per_iteration": 2.587465763092041 + }, + { + "auxiliary_loss_clip": 0.01120649, + "auxiliary_loss_mlp": 0.01051392, + "balance_loss_clip": 1.05423069, + "balance_loss_mlp": 1.03175855, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 3.05460887615776, + "language_loss": 0.77290404, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79462445, + "num_input_tokens_seen": 66372520, + "step": 3072, + "time_per_iteration": 2.519465923309326 + }, + { + "auxiliary_loss_clip": 0.01144367, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.04915166, + "balance_loss_mlp": 1.02857327, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 1.6692103067694333, + "language_loss": 0.86177546, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88370985, + "num_input_tokens_seen": 66390745, + "step": 3073, + "time_per_iteration": 2.4929165840148926 + }, + { + "auxiliary_loss_clip": 0.01165465, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_clip": 1.05927634, + "balance_loss_mlp": 1.0262779, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.890809991795765, + "language_loss": 0.91949713, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.94158524, + "num_input_tokens_seen": 66410525, + "step": 3074, + "time_per_iteration": 2.4676880836486816 + }, + { + "auxiliary_loss_clip": 0.0111168, + "auxiliary_loss_mlp": 0.01048184, + "balance_loss_clip": 1.04516983, + "balance_loss_mlp": 1.02764416, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.6713785519979898, + "language_loss": 0.64732218, + "learning_rate": 3.75360309139087e-06, + "loss": 0.66892081, + "num_input_tokens_seen": 66432535, + "step": 3075, + "time_per_iteration": 2.617276191711426 + }, + { + "auxiliary_loss_clip": 0.01140675, + "auxiliary_loss_mlp": 0.01044585, + "balance_loss_clip": 1.05730391, + "balance_loss_mlp": 1.02692974, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.934323690088919, + "language_loss": 0.72397101, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74582362, + "num_input_tokens_seen": 66450620, + "step": 3076, + "time_per_iteration": 2.567758083343506 + }, + { + "auxiliary_loss_clip": 0.01128807, + "auxiliary_loss_mlp": 0.0104736, + "balance_loss_clip": 1.05798149, + "balance_loss_mlp": 1.02947891, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.463642782293301, + "language_loss": 0.80990517, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83166683, + "num_input_tokens_seen": 66467865, + "step": 3077, + "time_per_iteration": 2.651914358139038 + }, + { + "auxiliary_loss_clip": 0.01134746, + "auxiliary_loss_mlp": 0.01045228, + "balance_loss_clip": 1.05589557, + "balance_loss_mlp": 1.02757263, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.7533370540071145, + "language_loss": 0.78725755, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.8090573, + "num_input_tokens_seen": 66486245, + "step": 3078, + "time_per_iteration": 2.58017635345459 + }, + { + "auxiliary_loss_clip": 0.01164382, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_clip": 1.0599556, + "balance_loss_mlp": 1.02544224, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 1.8009145524271113, + "language_loss": 0.7782135, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.80028248, + "num_input_tokens_seen": 66506510, + "step": 3079, + "time_per_iteration": 3.9789888858795166 + }, + { + "auxiliary_loss_clip": 0.01127065, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.05035043, + "balance_loss_mlp": 1.02198672, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 2.2578271340081923, + "language_loss": 0.81947047, + "learning_rate": 3.752665892369369e-06, + "loss": 0.8411411, + "num_input_tokens_seen": 66530960, + "step": 3080, + "time_per_iteration": 2.722257375717163 + }, + { + "auxiliary_loss_clip": 0.01126725, + "auxiliary_loss_mlp": 0.01042416, + "balance_loss_clip": 1.05663919, + "balance_loss_mlp": 1.02420092, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 2.316449395105544, + "language_loss": 0.74156189, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.76325333, + "num_input_tokens_seen": 66550275, + "step": 3081, + "time_per_iteration": 2.6188549995422363 + }, + { + "auxiliary_loss_clip": 0.01134744, + "auxiliary_loss_mlp": 0.01051026, + "balance_loss_clip": 1.05786085, + "balance_loss_mlp": 1.03201222, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.2167547271877024, + "language_loss": 0.71838647, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.74024415, + "num_input_tokens_seen": 66569040, + "step": 3082, + "time_per_iteration": 2.560525417327881 + }, + { + "auxiliary_loss_clip": 0.01133609, + "auxiliary_loss_mlp": 0.01046955, + "balance_loss_clip": 1.06262529, + "balance_loss_mlp": 1.02738059, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 2.1820509548147884, + "language_loss": 0.69475842, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71656406, + "num_input_tokens_seen": 66587775, + "step": 3083, + "time_per_iteration": 2.54663348197937 + }, + { + "auxiliary_loss_clip": 0.01132357, + "auxiliary_loss_mlp": 0.01047149, + "balance_loss_clip": 1.05456066, + "balance_loss_mlp": 1.02944601, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 2.2825010655915814, + "language_loss": 0.68485707, + "learning_rate": 3.751914936806767e-06, + "loss": 0.70665205, + "num_input_tokens_seen": 66610800, + "step": 3084, + "time_per_iteration": 2.6985554695129395 + }, + { + "auxiliary_loss_clip": 0.01161762, + "auxiliary_loss_mlp": 0.01037067, + "balance_loss_clip": 1.05847764, + "balance_loss_mlp": 1.02009153, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.6627613242000543, + "language_loss": 0.77983034, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80181855, + "num_input_tokens_seen": 66630960, + "step": 3085, + "time_per_iteration": 2.504641056060791 + }, + { + "auxiliary_loss_clip": 0.0116016, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_clip": 1.0564009, + "balance_loss_mlp": 1.02980638, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.7433177628305416, + "language_loss": 0.7322185, + "learning_rate": 3.751539060400244e-06, + "loss": 0.75429654, + "num_input_tokens_seen": 66650585, + "step": 3086, + "time_per_iteration": 2.5069494247436523 + }, + { + "auxiliary_loss_clip": 0.0114881, + "auxiliary_loss_mlp": 0.01048049, + "balance_loss_clip": 1.05637407, + "balance_loss_mlp": 1.02967882, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 3.0159949732555495, + "language_loss": 0.6968261, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.7187947, + "num_input_tokens_seen": 66670045, + "step": 3087, + "time_per_iteration": 2.486834764480591 + }, + { + "auxiliary_loss_clip": 0.0112814, + "auxiliary_loss_mlp": 0.0105086, + "balance_loss_clip": 1.05824518, + "balance_loss_mlp": 1.03175104, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 1.860773956594353, + "language_loss": 0.72576785, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74755788, + "num_input_tokens_seen": 66688790, + "step": 3088, + "time_per_iteration": 2.5483486652374268 + }, + { + "auxiliary_loss_clip": 0.01136357, + "auxiliary_loss_mlp": 0.01043776, + "balance_loss_clip": 1.05410695, + "balance_loss_mlp": 1.02625179, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.259657390281655, + "language_loss": 0.91955858, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94135988, + "num_input_tokens_seen": 66708090, + "step": 3089, + "time_per_iteration": 2.579549551010132 + }, + { + "auxiliary_loss_clip": 0.01118292, + "auxiliary_loss_mlp": 0.01049185, + "balance_loss_clip": 1.05576336, + "balance_loss_mlp": 1.03106546, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 3.5659684215627316, + "language_loss": 0.57769203, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.59936678, + "num_input_tokens_seen": 66727320, + "step": 3090, + "time_per_iteration": 2.651841640472412 + }, + { + "auxiliary_loss_clip": 0.01132359, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.05047131, + "balance_loss_mlp": 1.02590537, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 13.143105957120355, + "language_loss": 0.82039112, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84215373, + "num_input_tokens_seen": 66747505, + "step": 3091, + "time_per_iteration": 2.591108560562134 + }, + { + "auxiliary_loss_clip": 0.01117251, + "auxiliary_loss_mlp": 0.01051558, + "balance_loss_clip": 1.05914116, + "balance_loss_mlp": 1.03305674, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.308303105806354, + "language_loss": 0.84335613, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.86504424, + "num_input_tokens_seen": 66766425, + "step": 3092, + "time_per_iteration": 2.6618518829345703 + }, + { + "auxiliary_loss_clip": 0.01141347, + "auxiliary_loss_mlp": 0.010451, + "balance_loss_clip": 1.05440915, + "balance_loss_mlp": 1.0265038, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 2.726681073838218, + "language_loss": 0.93139875, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95326328, + "num_input_tokens_seen": 66781130, + "step": 3093, + "time_per_iteration": 2.4926674365997314 + }, + { + "auxiliary_loss_clip": 0.01137491, + "auxiliary_loss_mlp": 0.01043443, + "balance_loss_clip": 1.05726612, + "balance_loss_mlp": 1.02597857, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 1.7276533459462111, + "language_loss": 0.77343243, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79524183, + "num_input_tokens_seen": 66797535, + "step": 3094, + "time_per_iteration": 2.5516483783721924 + }, + { + "auxiliary_loss_clip": 0.01103486, + "auxiliary_loss_mlp": 0.01043698, + "balance_loss_clip": 1.05630493, + "balance_loss_mlp": 1.0271399, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.692354418636488, + "language_loss": 0.69889677, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72036856, + "num_input_tokens_seen": 66821720, + "step": 3095, + "time_per_iteration": 2.9044642448425293 + }, + { + "auxiliary_loss_clip": 0.0111753, + "auxiliary_loss_mlp": 0.01050502, + "balance_loss_clip": 1.05247164, + "balance_loss_mlp": 1.0304513, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 1.8529231338060175, + "language_loss": 0.8080048, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82968509, + "num_input_tokens_seen": 66839060, + "step": 3096, + "time_per_iteration": 2.568908929824829 + }, + { + "auxiliary_loss_clip": 0.01152264, + "auxiliary_loss_mlp": 0.01044041, + "balance_loss_clip": 1.05767655, + "balance_loss_mlp": 1.02567077, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.224620578586941, + "language_loss": 0.75048494, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.772448, + "num_input_tokens_seen": 66857760, + "step": 3097, + "time_per_iteration": 2.5035433769226074 + }, + { + "auxiliary_loss_clip": 0.01142462, + "auxiliary_loss_mlp": 0.01045108, + "balance_loss_clip": 1.06204212, + "balance_loss_mlp": 1.02773905, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 4.8692587925173445, + "language_loss": 0.66041577, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68229151, + "num_input_tokens_seen": 66876460, + "step": 3098, + "time_per_iteration": 2.5436930656433105 + }, + { + "auxiliary_loss_clip": 0.01166671, + "auxiliary_loss_mlp": 0.01047558, + "balance_loss_clip": 1.05963361, + "balance_loss_mlp": 1.02759027, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.561847880967411, + "language_loss": 0.69563454, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.71777678, + "num_input_tokens_seen": 66897960, + "step": 3099, + "time_per_iteration": 2.4865307807922363 + }, + { + "auxiliary_loss_clip": 0.01151688, + "auxiliary_loss_mlp": 0.01046809, + "balance_loss_clip": 1.05767226, + "balance_loss_mlp": 1.02821231, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.8159704183886631, + "language_loss": 0.71580499, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.73778993, + "num_input_tokens_seen": 66917675, + "step": 3100, + "time_per_iteration": 2.5692760944366455 + }, + { + "auxiliary_loss_clip": 0.01142784, + "auxiliary_loss_mlp": 0.01052219, + "balance_loss_clip": 1.05717218, + "balance_loss_mlp": 1.03272843, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 1.8577504479842262, + "language_loss": 0.80337429, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.8253243, + "num_input_tokens_seen": 66936000, + "step": 3101, + "time_per_iteration": 2.5980541706085205 + }, + { + "auxiliary_loss_clip": 0.01118142, + "auxiliary_loss_mlp": 0.01045462, + "balance_loss_clip": 1.05501318, + "balance_loss_mlp": 1.02818847, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 1.9097898282397432, + "language_loss": 0.76868606, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.79032207, + "num_input_tokens_seen": 66955700, + "step": 3102, + "time_per_iteration": 4.016865253448486 + }, + { + "auxiliary_loss_clip": 0.01155996, + "auxiliary_loss_mlp": 0.0103982, + "balance_loss_clip": 1.05942953, + "balance_loss_mlp": 1.02261829, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.1108297704744587, + "language_loss": 0.7672267, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.78918481, + "num_input_tokens_seen": 66972815, + "step": 3103, + "time_per_iteration": 2.489668846130371 + }, + { + "auxiliary_loss_clip": 0.0113728, + "auxiliary_loss_mlp": 0.01044203, + "balance_loss_clip": 1.05641842, + "balance_loss_mlp": 1.02672696, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 3.8795413105466396, + "language_loss": 0.78764737, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.80946219, + "num_input_tokens_seen": 66992280, + "step": 3104, + "time_per_iteration": 4.027761459350586 + }, + { + "auxiliary_loss_clip": 0.01111971, + "auxiliary_loss_mlp": 0.01058666, + "balance_loss_clip": 1.05168819, + "balance_loss_mlp": 1.0397718, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 2.234426982675982, + "language_loss": 0.85112756, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87283397, + "num_input_tokens_seen": 67012220, + "step": 3105, + "time_per_iteration": 2.5609161853790283 + }, + { + "auxiliary_loss_clip": 0.01129854, + "auxiliary_loss_mlp": 0.01048759, + "balance_loss_clip": 1.05420899, + "balance_loss_mlp": 1.02914953, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.9876300976783123, + "language_loss": 0.86975473, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89154088, + "num_input_tokens_seen": 67032030, + "step": 3106, + "time_per_iteration": 2.5870847702026367 + }, + { + "auxiliary_loss_clip": 0.01151768, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.05471396, + "balance_loss_mlp": 1.02787197, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 1.9789900715180406, + "language_loss": 0.77917826, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80115581, + "num_input_tokens_seen": 67048920, + "step": 3107, + "time_per_iteration": 2.4752280712127686 + }, + { + "auxiliary_loss_clip": 0.01156899, + "auxiliary_loss_mlp": 0.01053868, + "balance_loss_clip": 1.05920231, + "balance_loss_mlp": 1.03504515, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 2.710344534744489, + "language_loss": 0.74687457, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76898223, + "num_input_tokens_seen": 67068645, + "step": 3108, + "time_per_iteration": 2.5407447814941406 + }, + { + "auxiliary_loss_clip": 0.01112169, + "auxiliary_loss_mlp": 0.01043267, + "balance_loss_clip": 1.05226135, + "balance_loss_mlp": 1.02507567, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.7247392146474487, + "language_loss": 0.74172825, + "learning_rate": 3.747197400772658e-06, + "loss": 0.7632826, + "num_input_tokens_seen": 67087075, + "step": 3109, + "time_per_iteration": 3.9262351989746094 + }, + { + "auxiliary_loss_clip": 0.01148888, + "auxiliary_loss_mlp": 0.01047976, + "balance_loss_clip": 1.05684483, + "balance_loss_mlp": 1.03024924, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.5232645420105335, + "language_loss": 0.84258878, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86455739, + "num_input_tokens_seen": 67108040, + "step": 3110, + "time_per_iteration": 2.5449464321136475 + }, + { + "auxiliary_loss_clip": 0.0115183, + "auxiliary_loss_mlp": 0.01042768, + "balance_loss_clip": 1.06182945, + "balance_loss_mlp": 1.02456474, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.8466613637083125, + "language_loss": 0.84740591, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86935186, + "num_input_tokens_seen": 67127605, + "step": 3111, + "time_per_iteration": 2.560140609741211 + }, + { + "auxiliary_loss_clip": 0.01132423, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.05480146, + "balance_loss_mlp": 1.02105403, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.8144827569757265, + "language_loss": 0.76968586, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.79139441, + "num_input_tokens_seen": 67145785, + "step": 3112, + "time_per_iteration": 2.5411484241485596 + }, + { + "auxiliary_loss_clip": 0.01153972, + "auxiliary_loss_mlp": 0.01042589, + "balance_loss_clip": 1.0586729, + "balance_loss_mlp": 1.0255897, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 2.0640475190297676, + "language_loss": 0.64486086, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66682649, + "num_input_tokens_seen": 67165930, + "step": 3113, + "time_per_iteration": 2.547093152999878 + }, + { + "auxiliary_loss_clip": 0.01160593, + "auxiliary_loss_mlp": 0.01045306, + "balance_loss_clip": 1.06038511, + "balance_loss_mlp": 1.02735269, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 2.361017793986343, + "language_loss": 0.81755185, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83961082, + "num_input_tokens_seen": 67185830, + "step": 3114, + "time_per_iteration": 2.621215581893921 + }, + { + "auxiliary_loss_clip": 0.01114188, + "auxiliary_loss_mlp": 0.01053929, + "balance_loss_clip": 1.05522001, + "balance_loss_mlp": 1.03393734, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 3.797803000661534, + "language_loss": 0.57193279, + "learning_rate": 3.74605902628851e-06, + "loss": 0.59361398, + "num_input_tokens_seen": 67206930, + "step": 3115, + "time_per_iteration": 2.680150270462036 + }, + { + "auxiliary_loss_clip": 0.01131773, + "auxiliary_loss_mlp": 0.01062019, + "balance_loss_clip": 1.05973959, + "balance_loss_mlp": 1.04338646, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.6470763552601622, + "language_loss": 0.71183985, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73377776, + "num_input_tokens_seen": 67226290, + "step": 3116, + "time_per_iteration": 2.591029167175293 + }, + { + "auxiliary_loss_clip": 0.01156649, + "auxiliary_loss_mlp": 0.0103589, + "balance_loss_clip": 1.05667067, + "balance_loss_mlp": 1.01932025, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 2.1926224113515502, + "language_loss": 0.78879446, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.81071985, + "num_input_tokens_seen": 67244410, + "step": 3117, + "time_per_iteration": 2.486436605453491 + }, + { + "auxiliary_loss_clip": 0.01141388, + "auxiliary_loss_mlp": 0.01049532, + "balance_loss_clip": 1.05904341, + "balance_loss_mlp": 1.03188884, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.7075082264448218, + "language_loss": 0.8432129, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86512208, + "num_input_tokens_seen": 67264470, + "step": 3118, + "time_per_iteration": 4.072889089584351 + }, + { + "auxiliary_loss_clip": 0.01152829, + "auxiliary_loss_mlp": 0.01045627, + "balance_loss_clip": 1.05967569, + "balance_loss_mlp": 1.02921176, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.9077085816398114, + "language_loss": 0.76388907, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78587365, + "num_input_tokens_seen": 67284315, + "step": 3119, + "time_per_iteration": 2.502357244491577 + }, + { + "auxiliary_loss_clip": 0.01163631, + "auxiliary_loss_mlp": 0.01046218, + "balance_loss_clip": 1.0589416, + "balance_loss_mlp": 1.02932632, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.9045964373095536, + "language_loss": 0.81841117, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84050965, + "num_input_tokens_seen": 67302780, + "step": 3120, + "time_per_iteration": 2.462062358856201 + }, + { + "auxiliary_loss_clip": 0.0113555, + "auxiliary_loss_mlp": 0.01041009, + "balance_loss_clip": 1.05376387, + "balance_loss_mlp": 1.02453399, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.8387998198064885, + "language_loss": 0.85161114, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.87337673, + "num_input_tokens_seen": 67323405, + "step": 3121, + "time_per_iteration": 2.5743305683135986 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01047911, + "balance_loss_clip": 1.04578924, + "balance_loss_mlp": 1.03054261, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 2.5342061302969117, + "language_loss": 0.70357192, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72507358, + "num_input_tokens_seen": 67345800, + "step": 3122, + "time_per_iteration": 2.706294298171997 + }, + { + "auxiliary_loss_clip": 0.01161028, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.05982029, + "balance_loss_mlp": 1.02272105, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 1.901218125201645, + "language_loss": 0.71088284, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.73290449, + "num_input_tokens_seen": 67363575, + "step": 3123, + "time_per_iteration": 2.432047128677368 + }, + { + "auxiliary_loss_clip": 0.0114917, + "auxiliary_loss_mlp": 0.01043901, + "balance_loss_clip": 1.05668652, + "balance_loss_mlp": 1.02759337, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 1.9349160282187352, + "language_loss": 0.7437017, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76563239, + "num_input_tokens_seen": 67381765, + "step": 3124, + "time_per_iteration": 2.50592041015625 + }, + { + "auxiliary_loss_clip": 0.01163096, + "auxiliary_loss_mlp": 0.01046826, + "balance_loss_clip": 1.05904126, + "balance_loss_mlp": 1.02847958, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.618368460066606, + "language_loss": 0.81472701, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.83682626, + "num_input_tokens_seen": 67405000, + "step": 3125, + "time_per_iteration": 2.621434450149536 + }, + { + "auxiliary_loss_clip": 0.01040826, + "auxiliary_loss_mlp": 0.01013638, + "balance_loss_clip": 1.03641284, + "balance_loss_mlp": 1.01113415, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9393526207330799, + "language_loss": 0.63550973, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65605438, + "num_input_tokens_seen": 67467140, + "step": 3126, + "time_per_iteration": 3.207414150238037 + }, + { + "auxiliary_loss_clip": 0.01132446, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.05729938, + "balance_loss_mlp": 1.02175844, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.6601490666250158, + "language_loss": 0.81640196, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.8381151, + "num_input_tokens_seen": 67487980, + "step": 3127, + "time_per_iteration": 2.5790748596191406 + }, + { + "auxiliary_loss_clip": 0.01080448, + "auxiliary_loss_mlp": 0.01005145, + "balance_loss_clip": 1.04097164, + "balance_loss_mlp": 1.00262928, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7656664855099027, + "language_loss": 0.61929131, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.64014721, + "num_input_tokens_seen": 67552500, + "step": 3128, + "time_per_iteration": 3.102757692337036 + }, + { + "auxiliary_loss_clip": 0.01113486, + "auxiliary_loss_mlp": 0.01049177, + "balance_loss_clip": 1.04913652, + "balance_loss_mlp": 1.03007913, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.0065649347988077, + "language_loss": 0.71261954, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73424613, + "num_input_tokens_seen": 67573295, + "step": 3129, + "time_per_iteration": 2.7129061222076416 + }, + { + "auxiliary_loss_clip": 0.01158798, + "auxiliary_loss_mlp": 0.01047003, + "balance_loss_clip": 1.05660677, + "balance_loss_mlp": 1.02920473, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 1.894554463104732, + "language_loss": 0.85244298, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87450093, + "num_input_tokens_seen": 67590010, + "step": 3130, + "time_per_iteration": 2.5374836921691895 + }, + { + "auxiliary_loss_clip": 0.01113074, + "auxiliary_loss_mlp": 0.01050355, + "balance_loss_clip": 1.05006146, + "balance_loss_mlp": 1.03267598, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 2.3009904047933865, + "language_loss": 0.76412928, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.7857635, + "num_input_tokens_seen": 67611110, + "step": 3131, + "time_per_iteration": 2.6663007736206055 + }, + { + "auxiliary_loss_clip": 0.01129499, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_clip": 1.0570147, + "balance_loss_mlp": 1.02770567, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.8283654665462803, + "language_loss": 0.81305993, + "learning_rate": 3.74282069289017e-06, + "loss": 0.83481908, + "num_input_tokens_seen": 67631990, + "step": 3132, + "time_per_iteration": 2.6490843296051025 + }, + { + "auxiliary_loss_clip": 0.0109621, + "auxiliary_loss_mlp": 0.00786609, + "balance_loss_clip": 1.04536521, + "balance_loss_mlp": 1.00073075, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 1.723674637952778, + "language_loss": 0.79840672, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81723493, + "num_input_tokens_seen": 67650490, + "step": 3133, + "time_per_iteration": 2.69357967376709 + }, + { + "auxiliary_loss_clip": 0.01128135, + "auxiliary_loss_mlp": 0.01059207, + "balance_loss_clip": 1.0579282, + "balance_loss_mlp": 1.04031265, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 1.8121967219351782, + "language_loss": 0.82618308, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.8480565, + "num_input_tokens_seen": 67668860, + "step": 3134, + "time_per_iteration": 2.6039175987243652 + }, + { + "auxiliary_loss_clip": 0.0113449, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_clip": 1.05175078, + "balance_loss_mlp": 1.02833295, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.4948727068978873, + "language_loss": 0.82523155, + "learning_rate": 3.742247238639684e-06, + "loss": 0.8470335, + "num_input_tokens_seen": 67690220, + "step": 3135, + "time_per_iteration": 2.613617181777954 + }, + { + "auxiliary_loss_clip": 0.01143908, + "auxiliary_loss_mlp": 0.01053719, + "balance_loss_clip": 1.05353558, + "balance_loss_mlp": 1.03657675, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.8699186994165136, + "language_loss": 0.78517532, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.80715156, + "num_input_tokens_seen": 67709820, + "step": 3136, + "time_per_iteration": 2.5851686000823975 + }, + { + "auxiliary_loss_clip": 0.01137995, + "auxiliary_loss_mlp": 0.01047074, + "balance_loss_clip": 1.05418074, + "balance_loss_mlp": 1.02908516, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 2.5532083947245927, + "language_loss": 0.81059957, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83245027, + "num_input_tokens_seen": 67729490, + "step": 3137, + "time_per_iteration": 2.58066463470459 + }, + { + "auxiliary_loss_clip": 0.01158571, + "auxiliary_loss_mlp": 0.01047215, + "balance_loss_clip": 1.05721354, + "balance_loss_mlp": 1.03108621, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.935283935947669, + "language_loss": 0.80843961, + "learning_rate": 3.741673189793504e-06, + "loss": 0.8304975, + "num_input_tokens_seen": 67749665, + "step": 3138, + "time_per_iteration": 2.4588799476623535 + }, + { + "auxiliary_loss_clip": 0.01151077, + "auxiliary_loss_mlp": 0.01054486, + "balance_loss_clip": 1.05620301, + "balance_loss_mlp": 1.03647351, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 2.075182291043008, + "language_loss": 0.63880283, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.66085845, + "num_input_tokens_seen": 67776230, + "step": 3139, + "time_per_iteration": 2.658787250518799 + }, + { + "auxiliary_loss_clip": 0.01154454, + "auxiliary_loss_mlp": 0.01048528, + "balance_loss_clip": 1.0529263, + "balance_loss_mlp": 1.03018188, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 2.1121738723158736, + "language_loss": 0.71523416, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73726392, + "num_input_tokens_seen": 67795080, + "step": 3140, + "time_per_iteration": 2.4431865215301514 + }, + { + "auxiliary_loss_clip": 0.01158978, + "auxiliary_loss_mlp": 0.01043657, + "balance_loss_clip": 1.05516148, + "balance_loss_mlp": 1.02517939, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 2.7931667235774174, + "language_loss": 0.86708343, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.88910985, + "num_input_tokens_seen": 67813110, + "step": 3141, + "time_per_iteration": 3.848741292953491 + }, + { + "auxiliary_loss_clip": 0.01134201, + "auxiliary_loss_mlp": 0.01043776, + "balance_loss_clip": 1.05052912, + "balance_loss_mlp": 1.02495325, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 4.033562457369615, + "language_loss": 0.7692762, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79105598, + "num_input_tokens_seen": 67831070, + "step": 3142, + "time_per_iteration": 2.4805943965911865 + }, + { + "auxiliary_loss_clip": 0.01131848, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.05499792, + "balance_loss_mlp": 1.02236617, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.7550281474768268, + "language_loss": 0.78815818, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80985558, + "num_input_tokens_seen": 67852170, + "step": 3143, + "time_per_iteration": 2.5636720657348633 + }, + { + "auxiliary_loss_clip": 0.01120324, + "auxiliary_loss_mlp": 0.01044589, + "balance_loss_clip": 1.04930782, + "balance_loss_mlp": 1.02751839, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 2.0170672263412457, + "language_loss": 0.71600538, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73765457, + "num_input_tokens_seen": 67869945, + "step": 3144, + "time_per_iteration": 4.025663614273071 + }, + { + "auxiliary_loss_clip": 0.01128444, + "auxiliary_loss_mlp": 0.0104642, + "balance_loss_clip": 1.05028391, + "balance_loss_mlp": 1.02721572, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.3787110530444693, + "language_loss": 0.73885441, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.76060307, + "num_input_tokens_seen": 67890240, + "step": 3145, + "time_per_iteration": 2.5979366302490234 + }, + { + "auxiliary_loss_clip": 0.01115096, + "auxiliary_loss_mlp": 0.0104571, + "balance_loss_clip": 1.0462606, + "balance_loss_mlp": 1.02935469, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.528716028083863, + "language_loss": 0.76155591, + "learning_rate": 3.740139487448616e-06, + "loss": 0.78316396, + "num_input_tokens_seen": 67907825, + "step": 3146, + "time_per_iteration": 2.5589609146118164 + }, + { + "auxiliary_loss_clip": 0.01100042, + "auxiliary_loss_mlp": 0.01054392, + "balance_loss_clip": 1.04491544, + "balance_loss_mlp": 1.03484154, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 3.1893385881839738, + "language_loss": 0.78719103, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80873537, + "num_input_tokens_seen": 67926670, + "step": 3147, + "time_per_iteration": 2.5666258335113525 + }, + { + "auxiliary_loss_clip": 0.01147883, + "auxiliary_loss_mlp": 0.01045255, + "balance_loss_clip": 1.055462, + "balance_loss_mlp": 1.02813673, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 2.4931918772804043, + "language_loss": 0.67111105, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69304252, + "num_input_tokens_seen": 67943645, + "step": 3148, + "time_per_iteration": 2.510281801223755 + }, + { + "auxiliary_loss_clip": 0.01118641, + "auxiliary_loss_mlp": 0.01039881, + "balance_loss_clip": 1.0469265, + "balance_loss_mlp": 1.02232111, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 2.0912839979481364, + "language_loss": 0.75651658, + "learning_rate": 3.739563260095902e-06, + "loss": 0.7781018, + "num_input_tokens_seen": 67962345, + "step": 3149, + "time_per_iteration": 3.9453587532043457 + }, + { + "auxiliary_loss_clip": 0.0113361, + "auxiliary_loss_mlp": 0.01043556, + "balance_loss_clip": 1.05341578, + "balance_loss_mlp": 1.02712905, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.214243977565069, + "language_loss": 0.80685228, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.82862401, + "num_input_tokens_seen": 67979760, + "step": 3150, + "time_per_iteration": 2.4920387268066406 + }, + { + "auxiliary_loss_clip": 0.0114021, + "auxiliary_loss_mlp": 0.01048758, + "balance_loss_clip": 1.05282569, + "balance_loss_mlp": 1.03155589, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.2357745020929074, + "language_loss": 0.85028797, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87217766, + "num_input_tokens_seen": 67996895, + "step": 3151, + "time_per_iteration": 2.488189220428467 + }, + { + "auxiliary_loss_clip": 0.01124084, + "auxiliary_loss_mlp": 0.01046107, + "balance_loss_clip": 1.05402374, + "balance_loss_mlp": 1.02913129, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.602799366490877, + "language_loss": 0.74127042, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76297235, + "num_input_tokens_seen": 68018365, + "step": 3152, + "time_per_iteration": 2.605278730392456 + }, + { + "auxiliary_loss_clip": 0.0112304, + "auxiliary_loss_mlp": 0.01045182, + "balance_loss_clip": 1.05062389, + "balance_loss_mlp": 1.02713394, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 19.57695249640169, + "language_loss": 0.76031548, + "learning_rate": 3.738794033491209e-06, + "loss": 0.78199768, + "num_input_tokens_seen": 68037985, + "step": 3153, + "time_per_iteration": 2.5906550884246826 + }, + { + "auxiliary_loss_clip": 0.01161607, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_clip": 1.05779588, + "balance_loss_mlp": 1.02715206, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 1.945105064863819, + "language_loss": 0.79157996, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81364304, + "num_input_tokens_seen": 68057975, + "step": 3154, + "time_per_iteration": 2.4695885181427 + }, + { + "auxiliary_loss_clip": 0.01119962, + "auxiliary_loss_mlp": 0.01049888, + "balance_loss_clip": 1.04751801, + "balance_loss_mlp": 1.03069568, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 4.279092247750554, + "language_loss": 0.7225858, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74428433, + "num_input_tokens_seen": 68074175, + "step": 3155, + "time_per_iteration": 2.5121445655822754 + }, + { + "auxiliary_loss_clip": 0.01128034, + "auxiliary_loss_mlp": 0.01042891, + "balance_loss_clip": 1.05164981, + "balance_loss_mlp": 1.02586818, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 2.34584515821292, + "language_loss": 0.74044979, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76215899, + "num_input_tokens_seen": 68095230, + "step": 3156, + "time_per_iteration": 2.5735361576080322 + }, + { + "auxiliary_loss_clip": 0.01160319, + "auxiliary_loss_mlp": 0.01040757, + "balance_loss_clip": 1.0561347, + "balance_loss_mlp": 1.02399611, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.8684983126332297, + "language_loss": 0.68193781, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70394856, + "num_input_tokens_seen": 68113805, + "step": 3157, + "time_per_iteration": 4.040220499038696 + }, + { + "auxiliary_loss_clip": 0.01118272, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.0491457, + "balance_loss_mlp": 1.02400291, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 6.139099621449056, + "language_loss": 0.80015486, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82174987, + "num_input_tokens_seen": 68133190, + "step": 3158, + "time_per_iteration": 2.597975015640259 + }, + { + "auxiliary_loss_clip": 0.01162927, + "auxiliary_loss_mlp": 0.0104334, + "balance_loss_clip": 1.05676568, + "balance_loss_mlp": 1.0255537, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 2.0360453085342063, + "language_loss": 0.72542453, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74748719, + "num_input_tokens_seen": 68152330, + "step": 3159, + "time_per_iteration": 2.4936511516571045 + }, + { + "auxiliary_loss_clip": 0.0114857, + "auxiliary_loss_mlp": 0.01051808, + "balance_loss_clip": 1.05663311, + "balance_loss_mlp": 1.03325927, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 1.8431753272157807, + "language_loss": 0.85278207, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.8747859, + "num_input_tokens_seen": 68170185, + "step": 3160, + "time_per_iteration": 2.487858295440674 + }, + { + "auxiliary_loss_clip": 0.01133217, + "auxiliary_loss_mlp": 0.01047541, + "balance_loss_clip": 1.05368125, + "balance_loss_mlp": 1.03148341, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 2.198289497047286, + "language_loss": 0.73479611, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.7566036, + "num_input_tokens_seen": 68191665, + "step": 3161, + "time_per_iteration": 2.575932264328003 + }, + { + "auxiliary_loss_clip": 0.01141592, + "auxiliary_loss_mlp": 0.01045554, + "balance_loss_clip": 1.0535717, + "balance_loss_mlp": 1.02667141, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.702168730384022, + "language_loss": 0.80961806, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83148956, + "num_input_tokens_seen": 68214635, + "step": 3162, + "time_per_iteration": 2.6249494552612305 + }, + { + "auxiliary_loss_clip": 0.01161366, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.05804253, + "balance_loss_mlp": 1.02509224, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 2.162634683784222, + "language_loss": 0.75790972, + "learning_rate": 3.73686635253511e-06, + "loss": 0.7799511, + "num_input_tokens_seen": 68232150, + "step": 3163, + "time_per_iteration": 2.4520204067230225 + }, + { + "auxiliary_loss_clip": 0.01104099, + "auxiliary_loss_mlp": 0.01043622, + "balance_loss_clip": 1.05161655, + "balance_loss_mlp": 1.02500176, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.6543507900808252, + "language_loss": 0.74574822, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76722538, + "num_input_tokens_seen": 68253370, + "step": 3164, + "time_per_iteration": 2.7064530849456787 + }, + { + "auxiliary_loss_clip": 0.01145647, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.05594945, + "balance_loss_mlp": 1.01886165, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.489896198052542, + "language_loss": 0.66784924, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.6896677, + "num_input_tokens_seen": 68278895, + "step": 3165, + "time_per_iteration": 2.848742961883545 + }, + { + "auxiliary_loss_clip": 0.01149596, + "auxiliary_loss_mlp": 0.01048767, + "balance_loss_clip": 1.05640626, + "balance_loss_mlp": 1.02991962, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.1468390911991944, + "language_loss": 0.74184036, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76382399, + "num_input_tokens_seen": 68294880, + "step": 3166, + "time_per_iteration": 2.453961133956909 + }, + { + "auxiliary_loss_clip": 0.01043633, + "auxiliary_loss_mlp": 0.01020005, + "balance_loss_clip": 1.03661418, + "balance_loss_mlp": 1.01715553, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 1.1752256712118379, + "language_loss": 0.50376403, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52440035, + "num_input_tokens_seen": 68359665, + "step": 3167, + "time_per_iteration": 3.140029191970825 + }, + { + "auxiliary_loss_clip": 0.01143647, + "auxiliary_loss_mlp": 0.01050696, + "balance_loss_clip": 1.05700517, + "balance_loss_mlp": 1.03286219, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.7914241504612927, + "language_loss": 0.74387538, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76581883, + "num_input_tokens_seen": 68378950, + "step": 3168, + "time_per_iteration": 2.511521100997925 + }, + { + "auxiliary_loss_clip": 0.01031037, + "auxiliary_loss_mlp": 0.01014493, + "balance_loss_clip": 1.02595973, + "balance_loss_mlp": 1.01198924, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8619116335180627, + "language_loss": 0.6004405, + "learning_rate": 3.73570658211056e-06, + "loss": 0.6208958, + "num_input_tokens_seen": 68434235, + "step": 3169, + "time_per_iteration": 3.045316219329834 + }, + { + "auxiliary_loss_clip": 0.01109334, + "auxiliary_loss_mlp": 0.01052179, + "balance_loss_clip": 1.05412674, + "balance_loss_mlp": 1.03422594, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 2.249007857337855, + "language_loss": 0.78741103, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80902612, + "num_input_tokens_seen": 68453830, + "step": 3170, + "time_per_iteration": 2.6658122539520264 + }, + { + "auxiliary_loss_clip": 0.01146334, + "auxiliary_loss_mlp": 0.01042385, + "balance_loss_clip": 1.0557313, + "balance_loss_mlp": 1.02459896, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.8687933696797563, + "language_loss": 0.78509825, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80698544, + "num_input_tokens_seen": 68473005, + "step": 3171, + "time_per_iteration": 2.545017957687378 + }, + { + "auxiliary_loss_clip": 0.01163418, + "auxiliary_loss_mlp": 0.01044086, + "balance_loss_clip": 1.05687475, + "balance_loss_mlp": 1.02515519, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 2.1974290754996435, + "language_loss": 0.78449672, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80657172, + "num_input_tokens_seen": 68493470, + "step": 3172, + "time_per_iteration": 2.5311977863311768 + }, + { + "auxiliary_loss_clip": 0.01149959, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_clip": 1.05645394, + "balance_loss_mlp": 1.03258073, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.7824566618271676, + "language_loss": 0.80152833, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82352877, + "num_input_tokens_seen": 68511290, + "step": 3173, + "time_per_iteration": 2.4690048694610596 + }, + { + "auxiliary_loss_clip": 0.01119355, + "auxiliary_loss_mlp": 0.00785174, + "balance_loss_clip": 1.05059493, + "balance_loss_mlp": 1.00049543, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.481657173777326, + "language_loss": 0.79046309, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.80950844, + "num_input_tokens_seen": 68532575, + "step": 3174, + "time_per_iteration": 2.6118040084838867 + }, + { + "auxiliary_loss_clip": 0.01113102, + "auxiliary_loss_mlp": 0.01043154, + "balance_loss_clip": 1.05136812, + "balance_loss_mlp": 1.02574992, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.760915932650432, + "language_loss": 0.80729401, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.82885659, + "num_input_tokens_seen": 68548760, + "step": 3175, + "time_per_iteration": 2.561415910720825 + }, + { + "auxiliary_loss_clip": 0.01088122, + "auxiliary_loss_mlp": 0.0105928, + "balance_loss_clip": 1.04670632, + "balance_loss_mlp": 1.04008734, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.082937819283617, + "language_loss": 0.85326982, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87474382, + "num_input_tokens_seen": 68563100, + "step": 3176, + "time_per_iteration": 2.6025595664978027 + }, + { + "auxiliary_loss_clip": 0.01139525, + "auxiliary_loss_mlp": 0.01057211, + "balance_loss_clip": 1.05709136, + "balance_loss_mlp": 1.03689826, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 3.5283186498930066, + "language_loss": 0.81225848, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83422589, + "num_input_tokens_seen": 68581650, + "step": 3177, + "time_per_iteration": 2.548567056655884 + }, + { + "auxiliary_loss_clip": 0.01130305, + "auxiliary_loss_mlp": 0.01043695, + "balance_loss_clip": 1.04960966, + "balance_loss_mlp": 1.02531242, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 4.492256755636188, + "language_loss": 0.74382961, + "learning_rate": 3.73396248424356e-06, + "loss": 0.76556963, + "num_input_tokens_seen": 68600360, + "step": 3178, + "time_per_iteration": 2.5070888996124268 + }, + { + "auxiliary_loss_clip": 0.01152206, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.0589273, + "balance_loss_mlp": 1.02009583, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 1.6202150804057356, + "language_loss": 0.814964, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83685654, + "num_input_tokens_seen": 68617885, + "step": 3179, + "time_per_iteration": 2.514094352722168 + }, + { + "auxiliary_loss_clip": 0.01152506, + "auxiliary_loss_mlp": 0.01043574, + "balance_loss_clip": 1.05947733, + "balance_loss_mlp": 1.02560902, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.948978079636597, + "language_loss": 0.79362094, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81558168, + "num_input_tokens_seen": 68634550, + "step": 3180, + "time_per_iteration": 3.8873450756073 + }, + { + "auxiliary_loss_clip": 0.01135681, + "auxiliary_loss_mlp": 0.01050118, + "balance_loss_clip": 1.05600262, + "balance_loss_mlp": 1.03135395, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.4654478238565427, + "language_loss": 0.79388213, + "learning_rate": 3.733379934486615e-06, + "loss": 0.81574011, + "num_input_tokens_seen": 68651895, + "step": 3181, + "time_per_iteration": 2.513180732727051 + }, + { + "auxiliary_loss_clip": 0.01148729, + "auxiliary_loss_mlp": 0.01048165, + "balance_loss_clip": 1.05583668, + "balance_loss_mlp": 1.03054547, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.9581730281983658, + "language_loss": 0.74081254, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.7627815, + "num_input_tokens_seen": 68671500, + "step": 3182, + "time_per_iteration": 2.4903922080993652 + }, + { + "auxiliary_loss_clip": 0.01137075, + "auxiliary_loss_mlp": 0.01046132, + "balance_loss_clip": 1.05994689, + "balance_loss_mlp": 1.02804828, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.8353295410363895, + "language_loss": 0.65016556, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67199767, + "num_input_tokens_seen": 68690570, + "step": 3183, + "time_per_iteration": 4.051580429077148 + }, + { + "auxiliary_loss_clip": 0.01137445, + "auxiliary_loss_mlp": 0.01049203, + "balance_loss_clip": 1.05236614, + "balance_loss_mlp": 1.03015304, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.6590925147602933, + "language_loss": 0.73729312, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.75915962, + "num_input_tokens_seen": 68709735, + "step": 3184, + "time_per_iteration": 2.583106279373169 + }, + { + "auxiliary_loss_clip": 0.01130757, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.05276942, + "balance_loss_mlp": 1.02868712, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 2.0373517387272293, + "language_loss": 0.88233304, + "learning_rate": 3.732602281292598e-06, + "loss": 0.90413022, + "num_input_tokens_seen": 68727565, + "step": 3185, + "time_per_iteration": 2.509129524230957 + }, + { + "auxiliary_loss_clip": 0.01162136, + "auxiliary_loss_mlp": 0.01044198, + "balance_loss_clip": 1.05766988, + "balance_loss_mlp": 1.02539897, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 1.9594583870414393, + "language_loss": 0.72965574, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.751719, + "num_input_tokens_seen": 68748110, + "step": 3186, + "time_per_iteration": 2.454505681991577 + }, + { + "auxiliary_loss_clip": 0.01141248, + "auxiliary_loss_mlp": 0.01045993, + "balance_loss_clip": 1.05995178, + "balance_loss_mlp": 1.02532232, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 2.017236803988477, + "language_loss": 0.83590788, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.85778034, + "num_input_tokens_seen": 68769765, + "step": 3187, + "time_per_iteration": 2.5577707290649414 + }, + { + "auxiliary_loss_clip": 0.01062958, + "auxiliary_loss_mlp": 0.01016645, + "balance_loss_clip": 1.03217959, + "balance_loss_mlp": 1.01411748, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8660081434360289, + "language_loss": 0.55895948, + "learning_rate": 3.732018351516544e-06, + "loss": 0.57975554, + "num_input_tokens_seen": 68826815, + "step": 3188, + "time_per_iteration": 4.4651055335998535 + }, + { + "auxiliary_loss_clip": 0.01146102, + "auxiliary_loss_mlp": 0.01051732, + "balance_loss_clip": 1.05721223, + "balance_loss_mlp": 1.03351641, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 2.21036743578956, + "language_loss": 0.70348108, + "learning_rate": 3.731823576891397e-06, + "loss": 0.7254594, + "num_input_tokens_seen": 68847585, + "step": 3189, + "time_per_iteration": 2.5650739669799805 + }, + { + "auxiliary_loss_clip": 0.0112191, + "auxiliary_loss_mlp": 0.01041397, + "balance_loss_clip": 1.05006218, + "balance_loss_mlp": 1.02363443, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 2.49259853552287, + "language_loss": 0.74776006, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76939315, + "num_input_tokens_seen": 68866620, + "step": 3190, + "time_per_iteration": 2.578070640563965 + }, + { + "auxiliary_loss_clip": 0.01117086, + "auxiliary_loss_mlp": 0.01063719, + "balance_loss_clip": 1.05011392, + "balance_loss_mlp": 1.04394174, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 1.94159608655251, + "language_loss": 0.8428219, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86462992, + "num_input_tokens_seen": 68885515, + "step": 3191, + "time_per_iteration": 2.527376651763916 + }, + { + "auxiliary_loss_clip": 0.01132039, + "auxiliary_loss_mlp": 0.01038925, + "balance_loss_clip": 1.05422127, + "balance_loss_mlp": 1.02211642, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.864142672000888, + "language_loss": 0.89838296, + "learning_rate": 3.73123885901997e-06, + "loss": 0.92009264, + "num_input_tokens_seen": 68903225, + "step": 3192, + "time_per_iteration": 2.5185163021087646 + }, + { + "auxiliary_loss_clip": 0.01137061, + "auxiliary_loss_mlp": 0.01054875, + "balance_loss_clip": 1.06004643, + "balance_loss_mlp": 1.0348357, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 1.8553207428783955, + "language_loss": 0.7477479, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.76966727, + "num_input_tokens_seen": 68922860, + "step": 3193, + "time_per_iteration": 2.5756874084472656 + }, + { + "auxiliary_loss_clip": 0.01133953, + "auxiliary_loss_mlp": 0.00785465, + "balance_loss_clip": 1.05355585, + "balance_loss_mlp": 1.0004971, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.6747803435013804, + "language_loss": 0.74642515, + "learning_rate": 3.730848718849612e-06, + "loss": 0.76561934, + "num_input_tokens_seen": 68943000, + "step": 3194, + "time_per_iteration": 2.6163289546966553 + }, + { + "auxiliary_loss_clip": 0.01061661, + "auxiliary_loss_mlp": 0.01008864, + "balance_loss_clip": 1.03287554, + "balance_loss_mlp": 1.0062418, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.8176202868340727, + "language_loss": 0.68517631, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70588154, + "num_input_tokens_seen": 69000255, + "step": 3195, + "time_per_iteration": 3.048797845840454 + }, + { + "auxiliary_loss_clip": 0.01123566, + "auxiliary_loss_mlp": 0.01053196, + "balance_loss_clip": 1.05361152, + "balance_loss_mlp": 1.03270411, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 2.794000528430297, + "language_loss": 0.73222369, + "learning_rate": 3.730458316143429e-06, + "loss": 0.75399131, + "num_input_tokens_seen": 69019665, + "step": 3196, + "time_per_iteration": 2.6068122386932373 + }, + { + "auxiliary_loss_clip": 0.01140029, + "auxiliary_loss_mlp": 0.01052905, + "balance_loss_clip": 1.05905473, + "balance_loss_mlp": 1.03443956, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 1.8465629648995958, + "language_loss": 0.83554566, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85747504, + "num_input_tokens_seen": 69039055, + "step": 3197, + "time_per_iteration": 4.124760866165161 + }, + { + "auxiliary_loss_clip": 0.01093518, + "auxiliary_loss_mlp": 0.01052327, + "balance_loss_clip": 1.05000412, + "balance_loss_mlp": 1.03173971, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 1.911027903362071, + "language_loss": 0.80058372, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82204223, + "num_input_tokens_seen": 69056370, + "step": 3198, + "time_per_iteration": 2.6214041709899902 + }, + { + "auxiliary_loss_clip": 0.01138246, + "auxiliary_loss_mlp": 0.01053695, + "balance_loss_clip": 1.05481803, + "balance_loss_mlp": 1.03450203, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 2.2508989386129152, + "language_loss": 0.7846871, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80660653, + "num_input_tokens_seen": 69075915, + "step": 3199, + "time_per_iteration": 2.5544748306274414 + }, + { + "auxiliary_loss_clip": 0.01122456, + "auxiliary_loss_mlp": 0.01055276, + "balance_loss_clip": 1.05090642, + "balance_loss_mlp": 1.03651237, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 7.087227787210921, + "language_loss": 0.83646083, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.8582381, + "num_input_tokens_seen": 69094145, + "step": 3200, + "time_per_iteration": 2.591787099838257 + }, + { + "auxiliary_loss_clip": 0.01163346, + "auxiliary_loss_mlp": 0.0105177, + "balance_loss_clip": 1.06063652, + "balance_loss_mlp": 1.03479457, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 1.7188966626134186, + "language_loss": 0.78982747, + "learning_rate": 3.729481161172443e-06, + "loss": 0.8119787, + "num_input_tokens_seen": 69111110, + "step": 3201, + "time_per_iteration": 2.4624722003936768 + }, + { + "auxiliary_loss_clip": 0.01106848, + "auxiliary_loss_mlp": 0.0105275, + "balance_loss_clip": 1.0477283, + "balance_loss_mlp": 1.03496432, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.061833613979033, + "language_loss": 0.69176495, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71336097, + "num_input_tokens_seen": 69130280, + "step": 3202, + "time_per_iteration": 2.577688694000244 + }, + { + "auxiliary_loss_clip": 0.01135394, + "auxiliary_loss_mlp": 0.010437, + "balance_loss_clip": 1.05357385, + "balance_loss_mlp": 1.02602088, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.798963659298295, + "language_loss": 0.91632622, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93811715, + "num_input_tokens_seen": 69149570, + "step": 3203, + "time_per_iteration": 2.5176937580108643 + }, + { + "auxiliary_loss_clip": 0.01150447, + "auxiliary_loss_mlp": 0.01054131, + "balance_loss_clip": 1.05449593, + "balance_loss_mlp": 1.03478312, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.526217150146675, + "language_loss": 0.81894362, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.84098947, + "num_input_tokens_seen": 69168190, + "step": 3204, + "time_per_iteration": 2.463327169418335 + }, + { + "auxiliary_loss_clip": 0.01118779, + "auxiliary_loss_mlp": 0.01047126, + "balance_loss_clip": 1.05011296, + "balance_loss_mlp": 1.02944756, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 2.1044348674450806, + "language_loss": 0.75977039, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.78142941, + "num_input_tokens_seen": 69186950, + "step": 3205, + "time_per_iteration": 2.567457675933838 + }, + { + "auxiliary_loss_clip": 0.01144903, + "auxiliary_loss_mlp": 0.01049649, + "balance_loss_clip": 1.06088817, + "balance_loss_mlp": 1.03125453, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 4.428952598707574, + "language_loss": 0.83334327, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85528874, + "num_input_tokens_seen": 69204850, + "step": 3206, + "time_per_iteration": 2.5227179527282715 + }, + { + "auxiliary_loss_clip": 0.01063139, + "auxiliary_loss_mlp": 0.0100488, + "balance_loss_clip": 1.04234886, + "balance_loss_mlp": 1.00262666, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8409554597898836, + "language_loss": 0.60571367, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62639385, + "num_input_tokens_seen": 69259200, + "step": 3207, + "time_per_iteration": 2.926900863647461 + }, + { + "auxiliary_loss_clip": 0.01122054, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_clip": 1.05127501, + "balance_loss_mlp": 1.02943301, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.3893450077901974, + "language_loss": 0.75061649, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77230901, + "num_input_tokens_seen": 69275835, + "step": 3208, + "time_per_iteration": 2.5354740619659424 + }, + { + "auxiliary_loss_clip": 0.01150579, + "auxiliary_loss_mlp": 0.00785125, + "balance_loss_clip": 1.05498052, + "balance_loss_mlp": 1.00048184, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 3.257525349971296, + "language_loss": 0.60949451, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62885153, + "num_input_tokens_seen": 69294810, + "step": 3209, + "time_per_iteration": 2.51189923286438 + }, + { + "auxiliary_loss_clip": 0.01166022, + "auxiliary_loss_mlp": 0.01052105, + "balance_loss_clip": 1.05962539, + "balance_loss_mlp": 1.03204226, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 2.0277955795982137, + "language_loss": 0.80171168, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82389295, + "num_input_tokens_seen": 69316065, + "step": 3210, + "time_per_iteration": 2.636220693588257 + }, + { + "auxiliary_loss_clip": 0.01119054, + "auxiliary_loss_mlp": 0.01044763, + "balance_loss_clip": 1.04725194, + "balance_loss_mlp": 1.02709663, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.02034845646272, + "language_loss": 0.82577342, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.84741157, + "num_input_tokens_seen": 69332900, + "step": 3211, + "time_per_iteration": 2.545767068862915 + }, + { + "auxiliary_loss_clip": 0.01076706, + "auxiliary_loss_mlp": 0.01003849, + "balance_loss_clip": 1.03689361, + "balance_loss_mlp": 1.00145304, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.969177838843368, + "language_loss": 0.63703513, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65784073, + "num_input_tokens_seen": 69382535, + "step": 3212, + "time_per_iteration": 2.8731744289398193 + }, + { + "auxiliary_loss_clip": 0.01136579, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.05736971, + "balance_loss_mlp": 1.02568781, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 5.4808075120563355, + "language_loss": 0.7614032, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.7832067, + "num_input_tokens_seen": 69400600, + "step": 3213, + "time_per_iteration": 2.5966804027557373 + }, + { + "auxiliary_loss_clip": 0.01124876, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.05140078, + "balance_loss_mlp": 1.03020477, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 4.078150004738743, + "language_loss": 0.71184146, + "learning_rate": 3.726932887459503e-06, + "loss": 0.73358661, + "num_input_tokens_seen": 69417350, + "step": 3214, + "time_per_iteration": 2.5226950645446777 + }, + { + "auxiliary_loss_clip": 0.01159553, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.05606258, + "balance_loss_mlp": 1.03003383, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.206728639566865, + "language_loss": 0.75367868, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77576387, + "num_input_tokens_seen": 69431845, + "step": 3215, + "time_per_iteration": 2.4514119625091553 + }, + { + "auxiliary_loss_clip": 0.01118829, + "auxiliary_loss_mlp": 0.01049794, + "balance_loss_clip": 1.05246758, + "balance_loss_mlp": 1.03256774, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 1.9044726098316112, + "language_loss": 0.88299036, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90467662, + "num_input_tokens_seen": 69453275, + "step": 3216, + "time_per_iteration": 2.6272542476654053 + }, + { + "auxiliary_loss_clip": 0.01161001, + "auxiliary_loss_mlp": 0.01050565, + "balance_loss_clip": 1.05866337, + "balance_loss_mlp": 1.03321981, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.6811943410397867, + "language_loss": 0.79947174, + "learning_rate": 3.726343252048485e-06, + "loss": 0.82158732, + "num_input_tokens_seen": 69471830, + "step": 3217, + "time_per_iteration": 2.470104694366455 + }, + { + "auxiliary_loss_clip": 0.01146389, + "auxiliary_loss_mlp": 0.01050945, + "balance_loss_clip": 1.05730271, + "balance_loss_mlp": 1.03075147, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.1992923342985766, + "language_loss": 0.61501133, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.63698465, + "num_input_tokens_seen": 69489320, + "step": 3218, + "time_per_iteration": 2.4938268661499023 + }, + { + "auxiliary_loss_clip": 0.01163375, + "auxiliary_loss_mlp": 0.01044046, + "balance_loss_clip": 1.06003463, + "balance_loss_mlp": 1.02612901, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.680130008441449, + "language_loss": 0.80458868, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.8266629, + "num_input_tokens_seen": 69506665, + "step": 3219, + "time_per_iteration": 2.4315032958984375 + }, + { + "auxiliary_loss_clip": 0.01109567, + "auxiliary_loss_mlp": 0.01047881, + "balance_loss_clip": 1.05115414, + "balance_loss_mlp": 1.02877188, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.4476552520233685, + "language_loss": 0.85563552, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.87720996, + "num_input_tokens_seen": 69523835, + "step": 3220, + "time_per_iteration": 4.03101921081543 + }, + { + "auxiliary_loss_clip": 0.01155376, + "auxiliary_loss_mlp": 0.01039115, + "balance_loss_clip": 1.05672264, + "balance_loss_mlp": 1.02180552, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.5836298046044077, + "language_loss": 0.83947599, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86142087, + "num_input_tokens_seen": 69542620, + "step": 3221, + "time_per_iteration": 2.4734630584716797 + }, + { + "auxiliary_loss_clip": 0.01148068, + "auxiliary_loss_mlp": 0.01048672, + "balance_loss_clip": 1.05654538, + "balance_loss_mlp": 1.03198314, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.221017130980693, + "language_loss": 0.86701298, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88898033, + "num_input_tokens_seen": 69561130, + "step": 3222, + "time_per_iteration": 4.033426523208618 + }, + { + "auxiliary_loss_clip": 0.010771, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.04129672, + "balance_loss_mlp": 1.0251739, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 1.6136277407286561, + "language_loss": 0.78376091, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80497897, + "num_input_tokens_seen": 69580425, + "step": 3223, + "time_per_iteration": 2.639556646347046 + }, + { + "auxiliary_loss_clip": 0.01149161, + "auxiliary_loss_mlp": 0.01051356, + "balance_loss_clip": 1.05746102, + "balance_loss_mlp": 1.03320074, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.988880407521168, + "language_loss": 0.75498986, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77699494, + "num_input_tokens_seen": 69597085, + "step": 3224, + "time_per_iteration": 2.4755585193634033 + }, + { + "auxiliary_loss_clip": 0.01107995, + "auxiliary_loss_mlp": 0.01058035, + "balance_loss_clip": 1.04727352, + "balance_loss_mlp": 1.03722107, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.4328258795319666, + "language_loss": 0.71170396, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73336422, + "num_input_tokens_seen": 69618885, + "step": 3225, + "time_per_iteration": 2.7684080600738525 + }, + { + "auxiliary_loss_clip": 0.01125643, + "auxiliary_loss_mlp": 0.01050226, + "balance_loss_clip": 1.05309987, + "balance_loss_mlp": 1.03215396, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.1698118357309397, + "language_loss": 0.68617415, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.70793289, + "num_input_tokens_seen": 69638200, + "step": 3226, + "time_per_iteration": 2.600513219833374 + }, + { + "auxiliary_loss_clip": 0.01129736, + "auxiliary_loss_mlp": 0.0104136, + "balance_loss_clip": 1.05515981, + "balance_loss_mlp": 1.02168989, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 2.2465953647794326, + "language_loss": 0.76178539, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78349632, + "num_input_tokens_seen": 69657550, + "step": 3227, + "time_per_iteration": 4.001897096633911 + }, + { + "auxiliary_loss_clip": 0.01120696, + "auxiliary_loss_mlp": 0.01042377, + "balance_loss_clip": 1.05235338, + "balance_loss_mlp": 1.02461517, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 1.937250360269014, + "language_loss": 0.69897532, + "learning_rate": 3.724176216414662e-06, + "loss": 0.72060609, + "num_input_tokens_seen": 69675005, + "step": 3228, + "time_per_iteration": 2.5211312770843506 + }, + { + "auxiliary_loss_clip": 0.01152174, + "auxiliary_loss_mlp": 0.01047694, + "balance_loss_clip": 1.05805194, + "balance_loss_mlp": 1.03015828, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 1.9380497291499075, + "language_loss": 0.74633372, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76833242, + "num_input_tokens_seen": 69696455, + "step": 3229, + "time_per_iteration": 2.557715654373169 + }, + { + "auxiliary_loss_clip": 0.01122561, + "auxiliary_loss_mlp": 0.01044933, + "balance_loss_clip": 1.05351448, + "balance_loss_mlp": 1.0269444, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 2.503194798055427, + "language_loss": 0.65702194, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67869687, + "num_input_tokens_seen": 69714245, + "step": 3230, + "time_per_iteration": 2.5233895778656006 + }, + { + "auxiliary_loss_clip": 0.01126323, + "auxiliary_loss_mlp": 0.0078479, + "balance_loss_clip": 1.05143118, + "balance_loss_mlp": 1.00057483, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 2.0554812533579785, + "language_loss": 0.82086599, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.83997709, + "num_input_tokens_seen": 69731515, + "step": 3231, + "time_per_iteration": 2.4949145317077637 + }, + { + "auxiliary_loss_clip": 0.0113489, + "auxiliary_loss_mlp": 0.01043557, + "balance_loss_clip": 1.05281544, + "balance_loss_mlp": 1.02388763, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 2.411421291723992, + "language_loss": 0.86902726, + "learning_rate": 3.72338624150555e-06, + "loss": 0.89081168, + "num_input_tokens_seen": 69748885, + "step": 3232, + "time_per_iteration": 2.5183582305908203 + }, + { + "auxiliary_loss_clip": 0.01104892, + "auxiliary_loss_mlp": 0.01055541, + "balance_loss_clip": 1.04985797, + "balance_loss_mlp": 1.0352037, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.8104782764272147, + "language_loss": 0.85388386, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87548828, + "num_input_tokens_seen": 69767540, + "step": 3233, + "time_per_iteration": 2.6163856983184814 + }, + { + "auxiliary_loss_clip": 0.01153127, + "auxiliary_loss_mlp": 0.01058385, + "balance_loss_clip": 1.05656123, + "balance_loss_mlp": 1.04027736, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 2.581649125876555, + "language_loss": 0.89275783, + "learning_rate": 3.722990861915158e-06, + "loss": 0.914873, + "num_input_tokens_seen": 69789340, + "step": 3234, + "time_per_iteration": 2.550199270248413 + }, + { + "auxiliary_loss_clip": 0.01141919, + "auxiliary_loss_mlp": 0.01053553, + "balance_loss_clip": 1.05242968, + "balance_loss_mlp": 1.03395545, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 6.412950499602741, + "language_loss": 0.78605181, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80800658, + "num_input_tokens_seen": 69806470, + "step": 3235, + "time_per_iteration": 2.552858829498291 + }, + { + "auxiliary_loss_clip": 0.0113977, + "auxiliary_loss_mlp": 0.01050782, + "balance_loss_clip": 1.05813336, + "balance_loss_mlp": 1.03234053, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 1.8785651351112553, + "language_loss": 0.79077232, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81267786, + "num_input_tokens_seen": 69822655, + "step": 3236, + "time_per_iteration": 3.9742817878723145 + }, + { + "auxiliary_loss_clip": 0.01161086, + "auxiliary_loss_mlp": 0.01048188, + "balance_loss_clip": 1.05963862, + "balance_loss_mlp": 1.02954364, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 1.646409941717702, + "language_loss": 0.75622821, + "learning_rate": 3.72239730252843e-06, + "loss": 0.77832103, + "num_input_tokens_seen": 69841895, + "step": 3237, + "time_per_iteration": 2.477328300476074 + }, + { + "auxiliary_loss_clip": 0.01163449, + "auxiliary_loss_mlp": 0.01053615, + "balance_loss_clip": 1.05749178, + "balance_loss_mlp": 1.03528118, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 1.9770864549181608, + "language_loss": 0.74879479, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.7709654, + "num_input_tokens_seen": 69862220, + "step": 3238, + "time_per_iteration": 2.4946513175964355 + }, + { + "auxiliary_loss_clip": 0.01113297, + "auxiliary_loss_mlp": 0.0104934, + "balance_loss_clip": 1.04954946, + "balance_loss_mlp": 1.03006411, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 2.047074309828953, + "language_loss": 0.73203605, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75366247, + "num_input_tokens_seen": 69881830, + "step": 3239, + "time_per_iteration": 2.565473794937134 + }, + { + "auxiliary_loss_clip": 0.01134119, + "auxiliary_loss_mlp": 0.01048467, + "balance_loss_clip": 1.05039608, + "balance_loss_mlp": 1.02982235, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 2.029154359488959, + "language_loss": 0.73485398, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75667977, + "num_input_tokens_seen": 69900515, + "step": 3240, + "time_per_iteration": 2.527031660079956 + }, + { + "auxiliary_loss_clip": 0.01134102, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.05813217, + "balance_loss_mlp": 1.02797985, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 1.8942084538074533, + "language_loss": 0.66188318, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.6836859, + "num_input_tokens_seen": 69920060, + "step": 3241, + "time_per_iteration": 2.5369644165039062 + }, + { + "auxiliary_loss_clip": 0.01134478, + "auxiliary_loss_mlp": 0.01051974, + "balance_loss_clip": 1.05219638, + "balance_loss_mlp": 1.03358042, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.4649536123855433, + "language_loss": 0.8275007, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.84936529, + "num_input_tokens_seen": 69939820, + "step": 3242, + "time_per_iteration": 2.5379538536071777 + }, + { + "auxiliary_loss_clip": 0.01069978, + "auxiliary_loss_mlp": 0.01010915, + "balance_loss_clip": 1.0317204, + "balance_loss_mlp": 1.0083636, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8395715271614708, + "language_loss": 0.57504666, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59585559, + "num_input_tokens_seen": 70002145, + "step": 3243, + "time_per_iteration": 3.0350723266601562 + }, + { + "auxiliary_loss_clip": 0.01141537, + "auxiliary_loss_mlp": 0.01057887, + "balance_loss_clip": 1.05067134, + "balance_loss_mlp": 1.03671551, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 3.35384610512994, + "language_loss": 0.83379889, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85579312, + "num_input_tokens_seen": 70020510, + "step": 3244, + "time_per_iteration": 2.464200973510742 + }, + { + "auxiliary_loss_clip": 0.01147937, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_clip": 1.05779028, + "balance_loss_mlp": 1.03554368, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.9210443463987852, + "language_loss": 0.77074438, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79276228, + "num_input_tokens_seen": 70040760, + "step": 3245, + "time_per_iteration": 2.4864490032196045 + }, + { + "auxiliary_loss_clip": 0.01147046, + "auxiliary_loss_mlp": 0.01044578, + "balance_loss_clip": 1.05443037, + "balance_loss_mlp": 1.02514684, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.6327538805520128, + "language_loss": 0.83984196, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86175823, + "num_input_tokens_seen": 70058720, + "step": 3246, + "time_per_iteration": 2.481191873550415 + }, + { + "auxiliary_loss_clip": 0.0114999, + "auxiliary_loss_mlp": 0.00785229, + "balance_loss_clip": 1.05620313, + "balance_loss_mlp": 1.00065947, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.015603713111444, + "language_loss": 0.75559056, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.77494276, + "num_input_tokens_seen": 70076470, + "step": 3247, + "time_per_iteration": 2.5559070110321045 + }, + { + "auxiliary_loss_clip": 0.01122855, + "auxiliary_loss_mlp": 0.01046773, + "balance_loss_clip": 1.05527556, + "balance_loss_mlp": 1.02941585, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.610381421325109, + "language_loss": 0.75472152, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77641773, + "num_input_tokens_seen": 70096220, + "step": 3248, + "time_per_iteration": 2.6147735118865967 + }, + { + "auxiliary_loss_clip": 0.01157354, + "auxiliary_loss_mlp": 0.01048336, + "balance_loss_clip": 1.05349636, + "balance_loss_mlp": 1.03019285, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 1.915475667532781, + "language_loss": 0.78139412, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.803451, + "num_input_tokens_seen": 70114800, + "step": 3249, + "time_per_iteration": 2.471874237060547 + }, + { + "auxiliary_loss_clip": 0.01148348, + "auxiliary_loss_mlp": 0.01047452, + "balance_loss_clip": 1.05345011, + "balance_loss_mlp": 1.02928472, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.6001034088722579, + "language_loss": 0.73360866, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.75556666, + "num_input_tokens_seen": 70134930, + "step": 3250, + "time_per_iteration": 2.4871761798858643 + }, + { + "auxiliary_loss_clip": 0.0110452, + "auxiliary_loss_mlp": 0.01048735, + "balance_loss_clip": 1.04879069, + "balance_loss_mlp": 1.03088951, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 1.9696967502782654, + "language_loss": 0.79218429, + "learning_rate": 3.719619589699017e-06, + "loss": 0.81371689, + "num_input_tokens_seen": 70152045, + "step": 3251, + "time_per_iteration": 2.561272382736206 + }, + { + "auxiliary_loss_clip": 0.01157726, + "auxiliary_loss_mlp": 0.01046265, + "balance_loss_clip": 1.05420828, + "balance_loss_mlp": 1.02800262, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 2.166568577219748, + "language_loss": 0.84202588, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.86406577, + "num_input_tokens_seen": 70169240, + "step": 3252, + "time_per_iteration": 2.4198050498962402 + }, + { + "auxiliary_loss_clip": 0.01144584, + "auxiliary_loss_mlp": 0.01058469, + "balance_loss_clip": 1.05121577, + "balance_loss_mlp": 1.03659415, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.9610794885496625, + "language_loss": 0.7384569, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76048744, + "num_input_tokens_seen": 70192690, + "step": 3253, + "time_per_iteration": 2.571112871170044 + }, + { + "auxiliary_loss_clip": 0.01105879, + "auxiliary_loss_mlp": 0.01050648, + "balance_loss_clip": 1.0451442, + "balance_loss_mlp": 1.03197944, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.6351422949974868, + "language_loss": 0.76653206, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78809732, + "num_input_tokens_seen": 70209685, + "step": 3254, + "time_per_iteration": 2.5818843841552734 + }, + { + "auxiliary_loss_clip": 0.0102917, + "auxiliary_loss_mlp": 0.01012952, + "balance_loss_clip": 1.03208518, + "balance_loss_mlp": 1.01037681, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.766525340522001, + "language_loss": 0.55313432, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57355553, + "num_input_tokens_seen": 70265050, + "step": 3255, + "time_per_iteration": 3.1491339206695557 + }, + { + "auxiliary_loss_clip": 0.01138318, + "auxiliary_loss_mlp": 0.01049746, + "balance_loss_clip": 1.05616975, + "balance_loss_mlp": 1.03049362, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.861705609856213, + "language_loss": 0.70597798, + "learning_rate": 3.718624450942688e-06, + "loss": 0.72785866, + "num_input_tokens_seen": 70281830, + "step": 3256, + "time_per_iteration": 2.515564441680908 + }, + { + "auxiliary_loss_clip": 0.01156396, + "auxiliary_loss_mlp": 0.01047517, + "balance_loss_clip": 1.05391729, + "balance_loss_mlp": 1.02961183, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.2923186126442143, + "language_loss": 0.80489707, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82693624, + "num_input_tokens_seen": 70297420, + "step": 3257, + "time_per_iteration": 2.4438889026641846 + }, + { + "auxiliary_loss_clip": 0.01107056, + "auxiliary_loss_mlp": 0.0104453, + "balance_loss_clip": 1.05173707, + "balance_loss_mlp": 1.02681553, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 2.0754910263951865, + "language_loss": 0.74872446, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77024031, + "num_input_tokens_seen": 70319210, + "step": 3258, + "time_per_iteration": 2.597097158432007 + }, + { + "auxiliary_loss_clip": 0.01085715, + "auxiliary_loss_mlp": 0.01052136, + "balance_loss_clip": 1.04232836, + "balance_loss_mlp": 1.03131008, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.6600885558371754, + "language_loss": 0.73921013, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.76058865, + "num_input_tokens_seen": 70339045, + "step": 3259, + "time_per_iteration": 4.158523321151733 + }, + { + "auxiliary_loss_clip": 0.01136244, + "auxiliary_loss_mlp": 0.01050402, + "balance_loss_clip": 1.05381906, + "balance_loss_mlp": 1.02967167, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.4986011468409357, + "language_loss": 0.77109933, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.79296583, + "num_input_tokens_seen": 70356505, + "step": 3260, + "time_per_iteration": 2.486923933029175 + }, + { + "auxiliary_loss_clip": 0.01149075, + "auxiliary_loss_mlp": 0.0105273, + "balance_loss_clip": 1.0521152, + "balance_loss_mlp": 1.03389466, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.1303134506112276, + "language_loss": 0.82233286, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84435093, + "num_input_tokens_seen": 70375410, + "step": 3261, + "time_per_iteration": 3.9722487926483154 + }, + { + "auxiliary_loss_clip": 0.01121065, + "auxiliary_loss_mlp": 0.01051692, + "balance_loss_clip": 1.05329001, + "balance_loss_mlp": 1.03260696, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 1.892852544612564, + "language_loss": 0.76692092, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78864849, + "num_input_tokens_seen": 70396315, + "step": 3262, + "time_per_iteration": 2.5986897945404053 + }, + { + "auxiliary_loss_clip": 0.01153817, + "auxiliary_loss_mlp": 0.0105574, + "balance_loss_clip": 1.06109309, + "balance_loss_mlp": 1.03783429, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.6090798846376726, + "language_loss": 0.86327481, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88537037, + "num_input_tokens_seen": 70417945, + "step": 3263, + "time_per_iteration": 2.5421719551086426 + }, + { + "auxiliary_loss_clip": 0.01135921, + "auxiliary_loss_mlp": 0.01050351, + "balance_loss_clip": 1.05448031, + "balance_loss_mlp": 1.03207636, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.680609115037549, + "language_loss": 0.74118, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76304269, + "num_input_tokens_seen": 70438690, + "step": 3264, + "time_per_iteration": 2.5407285690307617 + }, + { + "auxiliary_loss_clip": 0.01146175, + "auxiliary_loss_mlp": 0.01052178, + "balance_loss_clip": 1.05697274, + "balance_loss_mlp": 1.03455842, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 2.0820820226018246, + "language_loss": 0.78988457, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.81186807, + "num_input_tokens_seen": 70455385, + "step": 3265, + "time_per_iteration": 2.47965669631958 + }, + { + "auxiliary_loss_clip": 0.01023607, + "auxiliary_loss_mlp": 0.01036511, + "balance_loss_clip": 1.02555823, + "balance_loss_mlp": 1.03427017, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.8060110559093115, + "language_loss": 0.5338192, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55442035, + "num_input_tokens_seen": 70514280, + "step": 3266, + "time_per_iteration": 3.1470189094543457 + }, + { + "auxiliary_loss_clip": 0.01131756, + "auxiliary_loss_mlp": 0.00786389, + "balance_loss_clip": 1.05221403, + "balance_loss_mlp": 1.00067818, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.9500094021754977, + "language_loss": 0.80137789, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82055932, + "num_input_tokens_seen": 70531800, + "step": 3267, + "time_per_iteration": 3.944145441055298 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01046511, + "balance_loss_clip": 1.0501281, + "balance_loss_mlp": 1.02859426, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 1.7998734879540017, + "language_loss": 0.86545646, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.88724828, + "num_input_tokens_seen": 70550615, + "step": 3268, + "time_per_iteration": 2.4890224933624268 + }, + { + "auxiliary_loss_clip": 0.01106785, + "auxiliary_loss_mlp": 0.01043679, + "balance_loss_clip": 1.05353653, + "balance_loss_mlp": 1.02607226, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 1.9702306469378903, + "language_loss": 0.69301784, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71452242, + "num_input_tokens_seen": 70568690, + "step": 3269, + "time_per_iteration": 2.572282314300537 + }, + { + "auxiliary_loss_clip": 0.01119009, + "auxiliary_loss_mlp": 0.01062541, + "balance_loss_clip": 1.04668987, + "balance_loss_mlp": 1.04252577, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 1.9832529152495457, + "language_loss": 0.80683374, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82864916, + "num_input_tokens_seen": 70588665, + "step": 3270, + "time_per_iteration": 2.606862783432007 + }, + { + "auxiliary_loss_clip": 0.01142635, + "auxiliary_loss_mlp": 0.01049546, + "balance_loss_clip": 1.05100942, + "balance_loss_mlp": 1.03199863, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 2.187902119322974, + "language_loss": 0.84511179, + "learning_rate": 3.715629262894028e-06, + "loss": 0.8670336, + "num_input_tokens_seen": 70606900, + "step": 3271, + "time_per_iteration": 2.482891798019409 + }, + { + "auxiliary_loss_clip": 0.01141184, + "auxiliary_loss_mlp": 0.01053796, + "balance_loss_clip": 1.05359221, + "balance_loss_mlp": 1.03498459, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 2.016085059781545, + "language_loss": 0.80273819, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82468808, + "num_input_tokens_seen": 70625955, + "step": 3272, + "time_per_iteration": 2.519352912902832 + }, + { + "auxiliary_loss_clip": 0.01125471, + "auxiliary_loss_mlp": 0.01054293, + "balance_loss_clip": 1.0473268, + "balance_loss_mlp": 1.03196502, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 2.0272927719065206, + "language_loss": 0.80943763, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.83123529, + "num_input_tokens_seen": 70646090, + "step": 3273, + "time_per_iteration": 2.5326547622680664 + }, + { + "auxiliary_loss_clip": 0.01146394, + "auxiliary_loss_mlp": 0.01055154, + "balance_loss_clip": 1.05278563, + "balance_loss_mlp": 1.03680801, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 2.449812154755596, + "language_loss": 0.77966571, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80168116, + "num_input_tokens_seen": 70666065, + "step": 3274, + "time_per_iteration": 2.5220835208892822 + }, + { + "auxiliary_loss_clip": 0.01141526, + "auxiliary_loss_mlp": 0.01050638, + "balance_loss_clip": 1.05308533, + "balance_loss_mlp": 1.03094482, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.7648848475118832, + "language_loss": 0.81342983, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83535147, + "num_input_tokens_seen": 70681580, + "step": 3275, + "time_per_iteration": 2.479365825653076 + }, + { + "auxiliary_loss_clip": 0.01118469, + "auxiliary_loss_mlp": 0.01046482, + "balance_loss_clip": 1.05059659, + "balance_loss_mlp": 1.02684832, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 2.476810449337576, + "language_loss": 0.80354357, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.82519305, + "num_input_tokens_seen": 70697745, + "step": 3276, + "time_per_iteration": 4.035221576690674 + }, + { + "auxiliary_loss_clip": 0.01143457, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.05118573, + "balance_loss_mlp": 1.02114844, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.2174414694225186, + "language_loss": 0.89530361, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91713703, + "num_input_tokens_seen": 70715110, + "step": 3277, + "time_per_iteration": 2.4789516925811768 + }, + { + "auxiliary_loss_clip": 0.01109268, + "auxiliary_loss_mlp": 0.01048924, + "balance_loss_clip": 1.04898858, + "balance_loss_mlp": 1.02787185, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.2863478577688383, + "language_loss": 0.6201936, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64177555, + "num_input_tokens_seen": 70734715, + "step": 3278, + "time_per_iteration": 2.5852859020233154 + }, + { + "auxiliary_loss_clip": 0.01115147, + "auxiliary_loss_mlp": 0.0105362, + "balance_loss_clip": 1.04616618, + "balance_loss_mlp": 1.03427243, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 1.9224568318014985, + "language_loss": 0.73762035, + "learning_rate": 3.714025842413166e-06, + "loss": 0.7593081, + "num_input_tokens_seen": 70752650, + "step": 3279, + "time_per_iteration": 2.5797770023345947 + }, + { + "auxiliary_loss_clip": 0.01147263, + "auxiliary_loss_mlp": 0.01051612, + "balance_loss_clip": 1.05202699, + "balance_loss_mlp": 1.03337336, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.51920354226128, + "language_loss": 0.82596338, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84795213, + "num_input_tokens_seen": 70772365, + "step": 3280, + "time_per_iteration": 2.5031697750091553 + }, + { + "auxiliary_loss_clip": 0.01111181, + "auxiliary_loss_mlp": 0.01051394, + "balance_loss_clip": 1.04982281, + "balance_loss_mlp": 1.03236806, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.7682965486213633, + "language_loss": 0.78154039, + "learning_rate": 3.713624337180536e-06, + "loss": 0.80316615, + "num_input_tokens_seen": 70790340, + "step": 3281, + "time_per_iteration": 2.5174012184143066 + }, + { + "auxiliary_loss_clip": 0.01124586, + "auxiliary_loss_mlp": 0.01043919, + "balance_loss_clip": 1.0503335, + "balance_loss_mlp": 1.02700329, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.9134304574775602, + "language_loss": 0.79590225, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81758738, + "num_input_tokens_seen": 70809295, + "step": 3282, + "time_per_iteration": 2.506035566329956 + }, + { + "auxiliary_loss_clip": 0.01120335, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.06003368, + "balance_loss_mlp": 1.02738953, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 2.088193748240561, + "language_loss": 0.719823, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.7414875, + "num_input_tokens_seen": 70828765, + "step": 3283, + "time_per_iteration": 2.601633071899414 + }, + { + "auxiliary_loss_clip": 0.01138588, + "auxiliary_loss_mlp": 0.01047338, + "balance_loss_clip": 1.05258751, + "balance_loss_mlp": 1.0303148, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.6060120133392766, + "language_loss": 0.79123795, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81309724, + "num_input_tokens_seen": 70846805, + "step": 3284, + "time_per_iteration": 2.443648338317871 + }, + { + "auxiliary_loss_clip": 0.01127498, + "auxiliary_loss_mlp": 0.00786234, + "balance_loss_clip": 1.04746628, + "balance_loss_mlp": 1.00079846, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 1.9018339870370595, + "language_loss": 0.86389732, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88303465, + "num_input_tokens_seen": 70863805, + "step": 3285, + "time_per_iteration": 2.521214485168457 + }, + { + "auxiliary_loss_clip": 0.01120212, + "auxiliary_loss_mlp": 0.01045048, + "balance_loss_clip": 1.05400085, + "balance_loss_mlp": 1.02680874, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.2137412015709956, + "language_loss": 0.88020408, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90185666, + "num_input_tokens_seen": 70882660, + "step": 3286, + "time_per_iteration": 2.537281036376953 + }, + { + "auxiliary_loss_clip": 0.01123534, + "auxiliary_loss_mlp": 0.01053761, + "balance_loss_clip": 1.05402458, + "balance_loss_mlp": 1.03263712, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.7778197420473387, + "language_loss": 0.78575987, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80753291, + "num_input_tokens_seen": 70898765, + "step": 3287, + "time_per_iteration": 2.513157367706299 + }, + { + "auxiliary_loss_clip": 0.01132877, + "auxiliary_loss_mlp": 0.01046054, + "balance_loss_clip": 1.05560422, + "balance_loss_mlp": 1.02686167, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.397774681530781, + "language_loss": 0.8091594, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83094871, + "num_input_tokens_seen": 70916370, + "step": 3288, + "time_per_iteration": 2.4677910804748535 + }, + { + "auxiliary_loss_clip": 0.0113183, + "auxiliary_loss_mlp": 0.01052597, + "balance_loss_clip": 1.04668927, + "balance_loss_mlp": 1.03436995, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.6657958214630484, + "language_loss": 0.73264915, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75449336, + "num_input_tokens_seen": 70934870, + "step": 3289, + "time_per_iteration": 2.469399929046631 + }, + { + "auxiliary_loss_clip": 0.01139734, + "auxiliary_loss_mlp": 0.01048684, + "balance_loss_clip": 1.05794752, + "balance_loss_mlp": 1.03058839, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 1.8635813657360694, + "language_loss": 0.7947219, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81660604, + "num_input_tokens_seen": 70955140, + "step": 3290, + "time_per_iteration": 2.5645971298217773 + }, + { + "auxiliary_loss_clip": 0.01052871, + "auxiliary_loss_mlp": 0.01007632, + "balance_loss_clip": 1.03416872, + "balance_loss_mlp": 1.00562906, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.9096922168269846, + "language_loss": 0.60427654, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62488157, + "num_input_tokens_seen": 71012005, + "step": 3291, + "time_per_iteration": 3.119032382965088 + }, + { + "auxiliary_loss_clip": 0.01164164, + "auxiliary_loss_mlp": 0.01049583, + "balance_loss_clip": 1.05664623, + "balance_loss_mlp": 1.02992511, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 2.4742550817362097, + "language_loss": 0.81130135, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83343887, + "num_input_tokens_seen": 71031140, + "step": 3292, + "time_per_iteration": 2.5053865909576416 + }, + { + "auxiliary_loss_clip": 0.01124482, + "auxiliary_loss_mlp": 0.00788103, + "balance_loss_clip": 1.05438256, + "balance_loss_mlp": 1.00072408, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 1.7120647367812347, + "language_loss": 0.81959915, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.83872497, + "num_input_tokens_seen": 71050250, + "step": 3293, + "time_per_iteration": 2.531575918197632 + }, + { + "auxiliary_loss_clip": 0.0114707, + "auxiliary_loss_mlp": 0.0105871, + "balance_loss_clip": 1.05660331, + "balance_loss_mlp": 1.03870702, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 3.546375060954397, + "language_loss": 0.60965735, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63171518, + "num_input_tokens_seen": 71068665, + "step": 3294, + "time_per_iteration": 2.5232067108154297 + }, + { + "auxiliary_loss_clip": 0.01143343, + "auxiliary_loss_mlp": 0.01062792, + "balance_loss_clip": 1.0606612, + "balance_loss_mlp": 1.04544747, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 1.996436168200344, + "language_loss": 0.87056684, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89262819, + "num_input_tokens_seen": 71085320, + "step": 3295, + "time_per_iteration": 2.5755953788757324 + }, + { + "auxiliary_loss_clip": 0.01116919, + "auxiliary_loss_mlp": 0.01065433, + "balance_loss_clip": 1.05027211, + "balance_loss_mlp": 1.04831505, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 2.3498110184274195, + "language_loss": 0.80703115, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82885474, + "num_input_tokens_seen": 71102020, + "step": 3296, + "time_per_iteration": 2.5051190853118896 + }, + { + "auxiliary_loss_clip": 0.01129842, + "auxiliary_loss_mlp": 0.01054154, + "balance_loss_clip": 1.05220544, + "balance_loss_mlp": 1.03428197, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.1600542338347197, + "language_loss": 0.68751431, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70935428, + "num_input_tokens_seen": 71123390, + "step": 3297, + "time_per_iteration": 2.5731170177459717 + }, + { + "auxiliary_loss_clip": 0.01155957, + "auxiliary_loss_mlp": 0.01052869, + "balance_loss_clip": 1.05678844, + "balance_loss_mlp": 1.0361439, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.6687510065649234, + "language_loss": 0.8122443, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83433259, + "num_input_tokens_seen": 71141800, + "step": 3298, + "time_per_iteration": 3.9075684547424316 + }, + { + "auxiliary_loss_clip": 0.01138185, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_clip": 1.05416179, + "balance_loss_mlp": 1.03330314, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 1.9604025727357222, + "language_loss": 0.8535319, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87545073, + "num_input_tokens_seen": 71159505, + "step": 3299, + "time_per_iteration": 2.4883904457092285 + }, + { + "auxiliary_loss_clip": 0.0103896, + "auxiliary_loss_mlp": 0.01010889, + "balance_loss_clip": 1.02885485, + "balance_loss_mlp": 1.00891042, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.8040922825843156, + "language_loss": 0.5324626, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55296111, + "num_input_tokens_seen": 71223265, + "step": 3300, + "time_per_iteration": 4.541759967803955 + }, + { + "auxiliary_loss_clip": 0.01109147, + "auxiliary_loss_mlp": 0.0108778, + "balance_loss_clip": 1.04596055, + "balance_loss_mlp": 1.06501055, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.5882209717141014, + "language_loss": 0.73091292, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75288218, + "num_input_tokens_seen": 71242385, + "step": 3301, + "time_per_iteration": 2.542071580886841 + }, + { + "auxiliary_loss_clip": 0.0112197, + "auxiliary_loss_mlp": 0.01044389, + "balance_loss_clip": 1.05381227, + "balance_loss_mlp": 1.02701974, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.867830532561094, + "language_loss": 0.88419783, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90586138, + "num_input_tokens_seen": 71258990, + "step": 3302, + "time_per_iteration": 2.502896308898926 + }, + { + "auxiliary_loss_clip": 0.01125479, + "auxiliary_loss_mlp": 0.0105609, + "balance_loss_clip": 1.05695438, + "balance_loss_mlp": 1.03763628, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 1.8597900337168363, + "language_loss": 0.73376679, + "learning_rate": 3.709190638115111e-06, + "loss": 0.75558251, + "num_input_tokens_seen": 71282770, + "step": 3303, + "time_per_iteration": 2.744868516921997 + }, + { + "auxiliary_loss_clip": 0.01145647, + "auxiliary_loss_mlp": 0.01046908, + "balance_loss_clip": 1.05543947, + "balance_loss_mlp": 1.02907455, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 1.8318900078235605, + "language_loss": 0.75035727, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.77228278, + "num_input_tokens_seen": 71301410, + "step": 3304, + "time_per_iteration": 2.5926973819732666 + }, + { + "auxiliary_loss_clip": 0.01139317, + "auxiliary_loss_mlp": 0.010416, + "balance_loss_clip": 1.05717623, + "balance_loss_mlp": 1.02415991, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 1.673022449972856, + "language_loss": 0.85855556, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.88036478, + "num_input_tokens_seen": 71319670, + "step": 3305, + "time_per_iteration": 2.5242536067962646 + }, + { + "auxiliary_loss_clip": 0.01135323, + "auxiliary_loss_mlp": 0.01045783, + "balance_loss_clip": 1.05158997, + "balance_loss_mlp": 1.02842593, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.6564566508508676, + "language_loss": 0.6824677, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70427883, + "num_input_tokens_seen": 71339850, + "step": 3306, + "time_per_iteration": 4.041804313659668 + }, + { + "auxiliary_loss_clip": 0.01122996, + "auxiliary_loss_mlp": 0.01040423, + "balance_loss_clip": 1.04908645, + "balance_loss_mlp": 1.02367425, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.7260943802897515, + "language_loss": 0.76481855, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78645277, + "num_input_tokens_seen": 71359795, + "step": 3307, + "time_per_iteration": 2.554908275604248 + }, + { + "auxiliary_loss_clip": 0.01158659, + "auxiliary_loss_mlp": 0.01043635, + "balance_loss_clip": 1.05791616, + "balance_loss_mlp": 1.02661204, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 1.9190090191657738, + "language_loss": 0.76079625, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78281915, + "num_input_tokens_seen": 71378885, + "step": 3308, + "time_per_iteration": 2.4720876216888428 + }, + { + "auxiliary_loss_clip": 0.01110248, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_clip": 1.05625868, + "balance_loss_mlp": 1.02705443, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.7257434760047867, + "language_loss": 0.75785989, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.77941239, + "num_input_tokens_seen": 71397285, + "step": 3309, + "time_per_iteration": 2.5652074813842773 + }, + { + "auxiliary_loss_clip": 0.01145657, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_clip": 1.05501509, + "balance_loss_mlp": 1.02775085, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.5196600782206806, + "language_loss": 0.87970769, + "learning_rate": 3.707773333313917e-06, + "loss": 0.901631, + "num_input_tokens_seen": 71415775, + "step": 3310, + "time_per_iteration": 2.5225391387939453 + }, + { + "auxiliary_loss_clip": 0.01153772, + "auxiliary_loss_mlp": 0.01039892, + "balance_loss_clip": 1.05305851, + "balance_loss_mlp": 1.0215342, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 2.02096773213656, + "language_loss": 0.64462638, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66656303, + "num_input_tokens_seen": 71437315, + "step": 3311, + "time_per_iteration": 2.5666496753692627 + }, + { + "auxiliary_loss_clip": 0.01115804, + "auxiliary_loss_mlp": 0.01046116, + "balance_loss_clip": 1.04553127, + "balance_loss_mlp": 1.02785289, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.6793373196752612, + "language_loss": 0.74451417, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76613337, + "num_input_tokens_seen": 71456320, + "step": 3312, + "time_per_iteration": 2.5602753162384033 + }, + { + "auxiliary_loss_clip": 0.01147116, + "auxiliary_loss_mlp": 0.01044014, + "balance_loss_clip": 1.05643821, + "balance_loss_mlp": 1.02675223, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.036613592492963, + "language_loss": 0.83166581, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.85357714, + "num_input_tokens_seen": 71475360, + "step": 3313, + "time_per_iteration": 2.465758800506592 + }, + { + "auxiliary_loss_clip": 0.01146948, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.05625129, + "balance_loss_mlp": 1.02581167, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 2.432259209740796, + "language_loss": 0.8072775, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.82917881, + "num_input_tokens_seen": 71496155, + "step": 3314, + "time_per_iteration": 2.5478062629699707 + }, + { + "auxiliary_loss_clip": 0.01118398, + "auxiliary_loss_mlp": 0.0104359, + "balance_loss_clip": 1.04656959, + "balance_loss_mlp": 1.0260663, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.4852239773942937, + "language_loss": 0.87486255, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89648241, + "num_input_tokens_seen": 71517295, + "step": 3315, + "time_per_iteration": 4.026681900024414 + }, + { + "auxiliary_loss_clip": 0.01126938, + "auxiliary_loss_mlp": 0.00784718, + "balance_loss_clip": 1.05733728, + "balance_loss_mlp": 1.00090992, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.5418049767740618, + "language_loss": 0.70799041, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.72710705, + "num_input_tokens_seen": 71540000, + "step": 3316, + "time_per_iteration": 2.613628625869751 + }, + { + "auxiliary_loss_clip": 0.01024371, + "auxiliary_loss_mlp": 0.01012188, + "balance_loss_clip": 1.03304946, + "balance_loss_mlp": 1.00987518, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8361679802883356, + "language_loss": 0.6629054, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68327099, + "num_input_tokens_seen": 71607880, + "step": 3317, + "time_per_iteration": 3.327333450317383 + }, + { + "auxiliary_loss_clip": 0.01151672, + "auxiliary_loss_mlp": 0.01052203, + "balance_loss_clip": 1.05635285, + "balance_loss_mlp": 1.03326046, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.2589896430153, + "language_loss": 0.74319768, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76523638, + "num_input_tokens_seen": 71625695, + "step": 3318, + "time_per_iteration": 2.668901205062866 + }, + { + "auxiliary_loss_clip": 0.01124746, + "auxiliary_loss_mlp": 0.01048127, + "balance_loss_clip": 1.05348611, + "balance_loss_mlp": 1.03124094, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 2.115337325933263, + "language_loss": 0.78779483, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.80952358, + "num_input_tokens_seen": 71648520, + "step": 3319, + "time_per_iteration": 2.686349630355835 + }, + { + "auxiliary_loss_clip": 0.01141584, + "auxiliary_loss_mlp": 0.01046889, + "balance_loss_clip": 1.05807126, + "balance_loss_mlp": 1.02725494, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.257389489289609, + "language_loss": 0.75596833, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77785301, + "num_input_tokens_seen": 71672185, + "step": 3320, + "time_per_iteration": 2.7661802768707275 + }, + { + "auxiliary_loss_clip": 0.01128061, + "auxiliary_loss_mlp": 0.01045684, + "balance_loss_clip": 1.05573881, + "balance_loss_mlp": 1.02774334, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 2.088484477158945, + "language_loss": 0.80014813, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82188559, + "num_input_tokens_seen": 71692890, + "step": 3321, + "time_per_iteration": 2.5589799880981445 + }, + { + "auxiliary_loss_clip": 0.01044333, + "auxiliary_loss_mlp": 0.01023433, + "balance_loss_clip": 1.03352213, + "balance_loss_mlp": 1.02152598, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.8789125713888362, + "language_loss": 0.65162086, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67229855, + "num_input_tokens_seen": 71745815, + "step": 3322, + "time_per_iteration": 2.906531810760498 + }, + { + "auxiliary_loss_clip": 0.01046375, + "auxiliary_loss_mlp": 0.01008657, + "balance_loss_clip": 1.03605485, + "balance_loss_mlp": 1.00680912, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7915999476098778, + "language_loss": 0.57048261, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59103298, + "num_input_tokens_seen": 71806915, + "step": 3323, + "time_per_iteration": 3.178133726119995 + }, + { + "auxiliary_loss_clip": 0.01136979, + "auxiliary_loss_mlp": 0.00784123, + "balance_loss_clip": 1.05538952, + "balance_loss_mlp": 1.00092161, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 2.4094939140063185, + "language_loss": 0.80493927, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82415032, + "num_input_tokens_seen": 71824645, + "step": 3324, + "time_per_iteration": 2.489957571029663 + }, + { + "auxiliary_loss_clip": 0.01137955, + "auxiliary_loss_mlp": 0.01047554, + "balance_loss_clip": 1.04958177, + "balance_loss_mlp": 1.02886176, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.96205867117288, + "language_loss": 0.54120326, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.56305826, + "num_input_tokens_seen": 71845125, + "step": 3325, + "time_per_iteration": 2.5281710624694824 + }, + { + "auxiliary_loss_clip": 0.01133074, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_clip": 1.05352998, + "balance_loss_mlp": 1.02823186, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.279433169796584, + "language_loss": 0.86075729, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88254035, + "num_input_tokens_seen": 71863500, + "step": 3326, + "time_per_iteration": 2.5125668048858643 + }, + { + "auxiliary_loss_clip": 0.01154864, + "auxiliary_loss_mlp": 0.01045124, + "balance_loss_clip": 1.05659854, + "balance_loss_mlp": 1.02842259, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 2.7990572426022644, + "language_loss": 0.72249007, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74448991, + "num_input_tokens_seen": 71881845, + "step": 3327, + "time_per_iteration": 2.4567956924438477 + }, + { + "auxiliary_loss_clip": 0.01135795, + "auxiliary_loss_mlp": 0.01046139, + "balance_loss_clip": 1.05534816, + "balance_loss_mlp": 1.0263381, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.7605117785573245, + "language_loss": 0.76191819, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78373754, + "num_input_tokens_seen": 71900940, + "step": 3328, + "time_per_iteration": 2.5133957862854004 + }, + { + "auxiliary_loss_clip": 0.01120437, + "auxiliary_loss_mlp": 0.01042125, + "balance_loss_clip": 1.0463686, + "balance_loss_mlp": 1.02574587, + "epoch": 0.20015030813166992, + "flos": 28111555440000.0, + "grad_norm": 2.0864297551532793, + "language_loss": 0.6975252, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.71915084, + "num_input_tokens_seen": 71921925, + "step": 3329, + "time_per_iteration": 2.598174810409546 + }, + { + "auxiliary_loss_clip": 0.01111535, + "auxiliary_loss_mlp": 0.01061908, + "balance_loss_clip": 1.04759264, + "balance_loss_mlp": 1.03923416, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.7188802736317994, + "language_loss": 0.81434995, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.83608437, + "num_input_tokens_seen": 71941855, + "step": 3330, + "time_per_iteration": 2.6086742877960205 + }, + { + "auxiliary_loss_clip": 0.01138577, + "auxiliary_loss_mlp": 0.01043789, + "balance_loss_clip": 1.05092525, + "balance_loss_mlp": 1.02547812, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.2748696408938285, + "language_loss": 0.7697767, + "learning_rate": 3.703502390349417e-06, + "loss": 0.79160035, + "num_input_tokens_seen": 71960915, + "step": 3331, + "time_per_iteration": 2.4770820140838623 + }, + { + "auxiliary_loss_clip": 0.01095995, + "auxiliary_loss_mlp": 0.01058476, + "balance_loss_clip": 1.04088068, + "balance_loss_mlp": 1.03724527, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 5.723337687200452, + "language_loss": 0.79181492, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.81335968, + "num_input_tokens_seen": 71979220, + "step": 3332, + "time_per_iteration": 2.570133686065674 + }, + { + "auxiliary_loss_clip": 0.01052761, + "auxiliary_loss_mlp": 0.01023217, + "balance_loss_clip": 1.02626407, + "balance_loss_mlp": 1.02108276, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9342970468904608, + "language_loss": 0.62006462, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64082432, + "num_input_tokens_seen": 72033950, + "step": 3333, + "time_per_iteration": 2.9514670372009277 + }, + { + "auxiliary_loss_clip": 0.01120376, + "auxiliary_loss_mlp": 0.00786657, + "balance_loss_clip": 1.05329084, + "balance_loss_mlp": 1.00093341, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.1539704195878198, + "language_loss": 0.8161363, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.83520663, + "num_input_tokens_seen": 72051395, + "step": 3334, + "time_per_iteration": 2.5737316608428955 + }, + { + "auxiliary_loss_clip": 0.01100178, + "auxiliary_loss_mlp": 0.0104774, + "balance_loss_clip": 1.04951489, + "balance_loss_mlp": 1.02791548, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 2.329221301088921, + "language_loss": 0.74240148, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76388067, + "num_input_tokens_seen": 72071305, + "step": 3335, + "time_per_iteration": 2.6944756507873535 + }, + { + "auxiliary_loss_clip": 0.01155405, + "auxiliary_loss_mlp": 0.0106368, + "balance_loss_clip": 1.06075335, + "balance_loss_mlp": 1.04569113, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.7567296262236123, + "language_loss": 0.80092609, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82311696, + "num_input_tokens_seen": 72090165, + "step": 3336, + "time_per_iteration": 2.5196409225463867 + }, + { + "auxiliary_loss_clip": 0.01121926, + "auxiliary_loss_mlp": 0.01056044, + "balance_loss_clip": 1.0523386, + "balance_loss_mlp": 1.03588545, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 2.1175805956214018, + "language_loss": 0.7763741, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.79815376, + "num_input_tokens_seen": 72107210, + "step": 3337, + "time_per_iteration": 4.20624852180481 + }, + { + "auxiliary_loss_clip": 0.01163233, + "auxiliary_loss_mlp": 0.01054741, + "balance_loss_clip": 1.06048477, + "balance_loss_mlp": 1.03492856, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 1.8479325764826382, + "language_loss": 0.69108939, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.71326905, + "num_input_tokens_seen": 72126315, + "step": 3338, + "time_per_iteration": 2.5125858783721924 + }, + { + "auxiliary_loss_clip": 0.0112372, + "auxiliary_loss_mlp": 0.0105843, + "balance_loss_clip": 1.05204701, + "balance_loss_mlp": 1.04088211, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 1.8188672611894199, + "language_loss": 0.69719762, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71901911, + "num_input_tokens_seen": 72146470, + "step": 3339, + "time_per_iteration": 4.032656669616699 + }, + { + "auxiliary_loss_clip": 0.0113141, + "auxiliary_loss_mlp": 0.01048398, + "balance_loss_clip": 1.06124747, + "balance_loss_mlp": 1.03005171, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.371482583794403, + "language_loss": 0.66161644, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68341446, + "num_input_tokens_seen": 72166600, + "step": 3340, + "time_per_iteration": 2.731203556060791 + }, + { + "auxiliary_loss_clip": 0.01150791, + "auxiliary_loss_mlp": 0.01040649, + "balance_loss_clip": 1.05657709, + "balance_loss_mlp": 1.02091992, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 5.609647127180164, + "language_loss": 0.74357468, + "learning_rate": 3.701458591066019e-06, + "loss": 0.7654891, + "num_input_tokens_seen": 72185160, + "step": 3341, + "time_per_iteration": 2.483081340789795 + }, + { + "auxiliary_loss_clip": 0.01117844, + "auxiliary_loss_mlp": 0.01050129, + "balance_loss_clip": 1.05521369, + "balance_loss_mlp": 1.03141284, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 2.1262855457561036, + "language_loss": 0.7171967, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73887646, + "num_input_tokens_seen": 72205160, + "step": 3342, + "time_per_iteration": 2.5899150371551514 + }, + { + "auxiliary_loss_clip": 0.01121896, + "auxiliary_loss_mlp": 0.01049416, + "balance_loss_clip": 1.05480433, + "balance_loss_mlp": 1.03080726, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 2.457187472928396, + "language_loss": 0.72460645, + "learning_rate": 3.701049056727384e-06, + "loss": 0.74631953, + "num_input_tokens_seen": 72223555, + "step": 3343, + "time_per_iteration": 2.607544183731079 + }, + { + "auxiliary_loss_clip": 0.0112033, + "auxiliary_loss_mlp": 0.01052896, + "balance_loss_clip": 1.04962349, + "balance_loss_mlp": 1.03325069, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 2.0208956448113278, + "language_loss": 0.81148696, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83321917, + "num_input_tokens_seen": 72242465, + "step": 3344, + "time_per_iteration": 2.5815911293029785 + }, + { + "auxiliary_loss_clip": 0.01158379, + "auxiliary_loss_mlp": 0.01046786, + "balance_loss_clip": 1.0558095, + "balance_loss_mlp": 1.02876186, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.1037797165680208, + "language_loss": 0.83665299, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85870463, + "num_input_tokens_seen": 72260655, + "step": 3345, + "time_per_iteration": 3.8268678188323975 + }, + { + "auxiliary_loss_clip": 0.0109848, + "auxiliary_loss_mlp": 0.01039806, + "balance_loss_clip": 1.04686666, + "balance_loss_mlp": 1.02316427, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.6324125123190218, + "language_loss": 0.67737544, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.69875836, + "num_input_tokens_seen": 72279055, + "step": 3346, + "time_per_iteration": 2.582387924194336 + }, + { + "auxiliary_loss_clip": 0.01119114, + "auxiliary_loss_mlp": 0.01049023, + "balance_loss_clip": 1.04859495, + "balance_loss_mlp": 1.03091478, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.5289227958267206, + "language_loss": 0.74105686, + "learning_rate": 3.70022921406487e-06, + "loss": 0.76273823, + "num_input_tokens_seen": 72297895, + "step": 3347, + "time_per_iteration": 2.5697872638702393 + }, + { + "auxiliary_loss_clip": 0.01147583, + "auxiliary_loss_mlp": 0.01050009, + "balance_loss_clip": 1.05729938, + "balance_loss_mlp": 1.03289104, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.7511227845337287, + "language_loss": 0.87036502, + "learning_rate": 3.70002409219765e-06, + "loss": 0.89234096, + "num_input_tokens_seen": 72318385, + "step": 3348, + "time_per_iteration": 2.49985408782959 + }, + { + "auxiliary_loss_clip": 0.01100444, + "auxiliary_loss_mlp": 0.01044558, + "balance_loss_clip": 1.04679275, + "balance_loss_mlp": 1.02528214, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.7346293699420114, + "language_loss": 0.70643944, + "learning_rate": 3.699818905865346e-06, + "loss": 0.72788942, + "num_input_tokens_seen": 72338235, + "step": 3349, + "time_per_iteration": 2.5966804027557373 + }, + { + "auxiliary_loss_clip": 0.01122558, + "auxiliary_loss_mlp": 0.01051325, + "balance_loss_clip": 1.05115271, + "balance_loss_mlp": 1.03153586, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.9898721297492716, + "language_loss": 0.71211541, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73385429, + "num_input_tokens_seen": 72357825, + "step": 3350, + "time_per_iteration": 2.520794153213501 + }, + { + "auxiliary_loss_clip": 0.01128721, + "auxiliary_loss_mlp": 0.01052026, + "balance_loss_clip": 1.04991698, + "balance_loss_mlp": 1.03006816, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 6.5057292185910525, + "language_loss": 0.76126385, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78307128, + "num_input_tokens_seen": 72376335, + "step": 3351, + "time_per_iteration": 2.536867618560791 + }, + { + "auxiliary_loss_clip": 0.01138252, + "auxiliary_loss_mlp": 0.01052071, + "balance_loss_clip": 1.05482936, + "balance_loss_mlp": 1.03196013, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.6761765242214783, + "language_loss": 0.8040086, + "learning_rate": 3.699202960155748e-06, + "loss": 0.82591182, + "num_input_tokens_seen": 72395440, + "step": 3352, + "time_per_iteration": 2.559028148651123 + }, + { + "auxiliary_loss_clip": 0.0114959, + "auxiliary_loss_mlp": 0.01047332, + "balance_loss_clip": 1.05676425, + "balance_loss_mlp": 1.02905726, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 10.935541636411397, + "language_loss": 0.80482399, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82679325, + "num_input_tokens_seen": 72414670, + "step": 3353, + "time_per_iteration": 4.397000789642334 + }, + { + "auxiliary_loss_clip": 0.01129377, + "auxiliary_loss_mlp": 0.01054018, + "balance_loss_clip": 1.05492449, + "balance_loss_mlp": 1.03573132, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.942416031516906, + "language_loss": 0.89926863, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.92110258, + "num_input_tokens_seen": 72432210, + "step": 3354, + "time_per_iteration": 2.6258010864257812 + }, + { + "auxiliary_loss_clip": 0.01046152, + "auxiliary_loss_mlp": 0.00758982, + "balance_loss_clip": 1.02843428, + "balance_loss_mlp": 1.00054157, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.8302479875674377, + "language_loss": 0.55853796, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57658935, + "num_input_tokens_seen": 72489225, + "step": 3355, + "time_per_iteration": 3.0630578994750977 + }, + { + "auxiliary_loss_clip": 0.01131878, + "auxiliary_loss_mlp": 0.00784743, + "balance_loss_clip": 1.05393338, + "balance_loss_mlp": 1.00085664, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.5444102283516594, + "language_loss": 0.84307384, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86224008, + "num_input_tokens_seen": 72508715, + "step": 3356, + "time_per_iteration": 2.5524039268493652 + }, + { + "auxiliary_loss_clip": 0.01132002, + "auxiliary_loss_mlp": 0.01053909, + "balance_loss_clip": 1.05007112, + "balance_loss_mlp": 1.03010321, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 3.495052229081349, + "language_loss": 0.69590831, + "learning_rate": 3.698175095398085e-06, + "loss": 0.71776736, + "num_input_tokens_seen": 72525135, + "step": 3357, + "time_per_iteration": 2.509931802749634 + }, + { + "auxiliary_loss_clip": 0.011416, + "auxiliary_loss_mlp": 0.01048158, + "balance_loss_clip": 1.05623317, + "balance_loss_mlp": 1.02865577, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.7506050181654207, + "language_loss": 0.71791828, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.73981589, + "num_input_tokens_seen": 72543690, + "step": 3358, + "time_per_iteration": 2.5297024250030518 + }, + { + "auxiliary_loss_clip": 0.01139398, + "auxiliary_loss_mlp": 0.0105378, + "balance_loss_clip": 1.05030525, + "balance_loss_mlp": 1.03701925, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 1.7886786013524623, + "language_loss": 0.83744889, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85938072, + "num_input_tokens_seen": 72560725, + "step": 3359, + "time_per_iteration": 2.473055124282837 + }, + { + "auxiliary_loss_clip": 0.0105154, + "auxiliary_loss_mlp": 0.01012951, + "balance_loss_clip": 1.02303076, + "balance_loss_mlp": 1.01073384, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 0.7807635386479491, + "language_loss": 0.58929539, + "learning_rate": 3.697557603741482e-06, + "loss": 0.60994029, + "num_input_tokens_seen": 72621940, + "step": 3360, + "time_per_iteration": 3.0203375816345215 + }, + { + "auxiliary_loss_clip": 0.01100237, + "auxiliary_loss_mlp": 0.01051762, + "balance_loss_clip": 1.05441129, + "balance_loss_mlp": 1.0316751, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 2.6146069509485916, + "language_loss": 0.62847048, + "learning_rate": 3.697351644435763e-06, + "loss": 0.6499905, + "num_input_tokens_seen": 72639135, + "step": 3361, + "time_per_iteration": 2.6235616207122803 + }, + { + "auxiliary_loss_clip": 0.01121993, + "auxiliary_loss_mlp": 0.01061857, + "balance_loss_clip": 1.0513711, + "balance_loss_mlp": 1.04292631, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 2.010743229270784, + "language_loss": 0.75469249, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77653098, + "num_input_tokens_seen": 72658525, + "step": 3362, + "time_per_iteration": 2.5973715782165527 + }, + { + "auxiliary_loss_clip": 0.0114307, + "auxiliary_loss_mlp": 0.0078586, + "balance_loss_clip": 1.05232108, + "balance_loss_mlp": 1.0009129, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.7198947018494386, + "language_loss": 0.76682794, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.7861172, + "num_input_tokens_seen": 72678085, + "step": 3363, + "time_per_iteration": 2.483701705932617 + }, + { + "auxiliary_loss_clip": 0.0114424, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_clip": 1.05316782, + "balance_loss_mlp": 1.02947271, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 1.598024716498664, + "language_loss": 0.75611866, + "learning_rate": 3.696733380367391e-06, + "loss": 0.77802324, + "num_input_tokens_seen": 72698695, + "step": 3364, + "time_per_iteration": 2.551407814025879 + }, + { + "auxiliary_loss_clip": 0.0111153, + "auxiliary_loss_mlp": 0.01048012, + "balance_loss_clip": 1.050475, + "balance_loss_mlp": 1.0284977, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 2.162409518833845, + "language_loss": 0.71331215, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73490757, + "num_input_tokens_seen": 72717880, + "step": 3365, + "time_per_iteration": 2.631963014602661 + }, + { + "auxiliary_loss_clip": 0.01108064, + "auxiliary_loss_mlp": 0.01048352, + "balance_loss_clip": 1.04651523, + "balance_loss_mlp": 1.02955258, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 2.229531152266137, + "language_loss": 0.85531229, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87687647, + "num_input_tokens_seen": 72736410, + "step": 3366, + "time_per_iteration": 2.606130838394165 + }, + { + "auxiliary_loss_clip": 0.01117881, + "auxiliary_loss_mlp": 0.01048723, + "balance_loss_clip": 1.05021381, + "balance_loss_mlp": 1.03019845, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 1.7058004757359657, + "language_loss": 0.69470143, + "learning_rate": 3.696114537236335e-06, + "loss": 0.71636748, + "num_input_tokens_seen": 72758295, + "step": 3367, + "time_per_iteration": 2.6342201232910156 + }, + { + "auxiliary_loss_clip": 0.01146434, + "auxiliary_loss_mlp": 0.01047899, + "balance_loss_clip": 1.05034816, + "balance_loss_mlp": 1.02614331, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 1.8030300374380999, + "language_loss": 0.67849553, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70043886, + "num_input_tokens_seen": 72782495, + "step": 3368, + "time_per_iteration": 2.6366114616394043 + }, + { + "auxiliary_loss_clip": 0.0112447, + "auxiliary_loss_mlp": 0.01055241, + "balance_loss_clip": 1.05516016, + "balance_loss_mlp": 1.03564286, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 1.7591606265097377, + "language_loss": 0.76930165, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79109871, + "num_input_tokens_seen": 72801885, + "step": 3369, + "time_per_iteration": 2.572819471359253 + }, + { + "auxiliary_loss_clip": 0.01138928, + "auxiliary_loss_mlp": 0.01057555, + "balance_loss_clip": 1.05085206, + "balance_loss_mlp": 1.03851724, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 2.628674523593763, + "language_loss": 0.6524623, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67442715, + "num_input_tokens_seen": 72816990, + "step": 3370, + "time_per_iteration": 2.503615379333496 + }, + { + "auxiliary_loss_clip": 0.01049836, + "auxiliary_loss_mlp": 0.01008354, + "balance_loss_clip": 1.02289057, + "balance_loss_mlp": 1.00605333, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6843933005257506, + "language_loss": 0.58125043, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60183239, + "num_input_tokens_seen": 72879240, + "step": 3371, + "time_per_iteration": 3.1250481605529785 + }, + { + "auxiliary_loss_clip": 0.0112161, + "auxiliary_loss_mlp": 0.01044791, + "balance_loss_clip": 1.05079341, + "balance_loss_mlp": 1.02688551, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.7438902761427997, + "language_loss": 0.91797757, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.93964154, + "num_input_tokens_seen": 72899030, + "step": 3372, + "time_per_iteration": 2.6071271896362305 + }, + { + "auxiliary_loss_clip": 0.01138131, + "auxiliary_loss_mlp": 0.01051334, + "balance_loss_clip": 1.04962778, + "balance_loss_mlp": 1.02982891, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.7284363124225683, + "language_loss": 0.78946197, + "learning_rate": 3.694875114631167e-06, + "loss": 0.81135654, + "num_input_tokens_seen": 72919190, + "step": 3373, + "time_per_iteration": 2.554633617401123 + }, + { + "auxiliary_loss_clip": 0.01096292, + "auxiliary_loss_mlp": 0.01050292, + "balance_loss_clip": 1.0457983, + "balance_loss_mlp": 1.0292871, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 2.3588869461192417, + "language_loss": 0.71161544, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73308122, + "num_input_tokens_seen": 72939720, + "step": 3374, + "time_per_iteration": 2.7199203968048096 + }, + { + "auxiliary_loss_clip": 0.01041161, + "auxiliary_loss_mlp": 0.0100713, + "balance_loss_clip": 1.0216862, + "balance_loss_mlp": 1.00502014, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9717719958157529, + "language_loss": 0.62380278, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64428568, + "num_input_tokens_seen": 73000015, + "step": 3375, + "time_per_iteration": 3.058858633041382 + }, + { + "auxiliary_loss_clip": 0.01155021, + "auxiliary_loss_mlp": 0.01047054, + "balance_loss_clip": 1.05473447, + "balance_loss_mlp": 1.02908874, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.5582550057984543, + "language_loss": 0.82225484, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84427559, + "num_input_tokens_seen": 73017675, + "step": 3376, + "time_per_iteration": 3.9141793251037598 + }, + { + "auxiliary_loss_clip": 0.01146382, + "auxiliary_loss_mlp": 0.01042537, + "balance_loss_clip": 1.05230141, + "balance_loss_mlp": 1.02225983, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.0627375580233633, + "language_loss": 0.81436634, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83625549, + "num_input_tokens_seen": 73036135, + "step": 3377, + "time_per_iteration": 2.5199577808380127 + }, + { + "auxiliary_loss_clip": 0.01126641, + "auxiliary_loss_mlp": 0.01051214, + "balance_loss_clip": 1.05332351, + "balance_loss_mlp": 1.03162825, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 1.9814656916153912, + "language_loss": 0.76569152, + "learning_rate": 3.69384049496805e-06, + "loss": 0.7874701, + "num_input_tokens_seen": 73054075, + "step": 3378, + "time_per_iteration": 4.1219940185546875 + }, + { + "auxiliary_loss_clip": 0.01098319, + "auxiliary_loss_mlp": 0.01050499, + "balance_loss_clip": 1.05421603, + "balance_loss_mlp": 1.02862382, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.9176838676177368, + "language_loss": 0.79837245, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.81986064, + "num_input_tokens_seen": 73073530, + "step": 3379, + "time_per_iteration": 2.5908539295196533 + }, + { + "auxiliary_loss_clip": 0.01140041, + "auxiliary_loss_mlp": 0.01039905, + "balance_loss_clip": 1.05381012, + "balance_loss_mlp": 1.02201152, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 2.007732014523263, + "language_loss": 0.86791658, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.88971603, + "num_input_tokens_seen": 73092820, + "step": 3380, + "time_per_iteration": 2.5365967750549316 + }, + { + "auxiliary_loss_clip": 0.01158925, + "auxiliary_loss_mlp": 0.01050337, + "balance_loss_clip": 1.05826437, + "balance_loss_mlp": 1.03177619, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 2.71098932579644, + "language_loss": 0.74771571, + "learning_rate": 3.693218952340186e-06, + "loss": 0.76980841, + "num_input_tokens_seen": 73113385, + "step": 3381, + "time_per_iteration": 2.4848642349243164 + }, + { + "auxiliary_loss_clip": 0.01122859, + "auxiliary_loss_mlp": 0.01046952, + "balance_loss_clip": 1.04645348, + "balance_loss_mlp": 1.02677035, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.9894714928785064, + "language_loss": 0.79391789, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81561601, + "num_input_tokens_seen": 73131195, + "step": 3382, + "time_per_iteration": 2.5343177318573 + }, + { + "auxiliary_loss_clip": 0.01117447, + "auxiliary_loss_mlp": 0.0078871, + "balance_loss_clip": 1.04849744, + "balance_loss_mlp": 1.00092578, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 1.8514814989896515, + "language_loss": 0.80259955, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82166111, + "num_input_tokens_seen": 73148850, + "step": 3383, + "time_per_iteration": 2.535022497177124 + }, + { + "auxiliary_loss_clip": 0.01105428, + "auxiliary_loss_mlp": 0.01047536, + "balance_loss_clip": 1.04552317, + "balance_loss_mlp": 1.02814054, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 1.8499774854215436, + "language_loss": 0.74462008, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.7661497, + "num_input_tokens_seen": 73166775, + "step": 3384, + "time_per_iteration": 3.964491605758667 + }, + { + "auxiliary_loss_clip": 0.01154976, + "auxiliary_loss_mlp": 0.01047626, + "balance_loss_clip": 1.05895424, + "balance_loss_mlp": 1.026896, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 2.4222901155269376, + "language_loss": 0.7625941, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78462005, + "num_input_tokens_seen": 73183215, + "step": 3385, + "time_per_iteration": 2.499932289123535 + }, + { + "auxiliary_loss_clip": 0.01111162, + "auxiliary_loss_mlp": 0.01060658, + "balance_loss_clip": 1.04935336, + "balance_loss_mlp": 1.04023778, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.7498145113851757, + "language_loss": 0.68200934, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70372754, + "num_input_tokens_seen": 73203290, + "step": 3386, + "time_per_iteration": 2.630675792694092 + }, + { + "auxiliary_loss_clip": 0.01107882, + "auxiliary_loss_mlp": 0.01059329, + "balance_loss_clip": 1.04832578, + "balance_loss_mlp": 1.03938508, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.3710353994548081, + "language_loss": 0.81051874, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83219075, + "num_input_tokens_seen": 73226185, + "step": 3387, + "time_per_iteration": 2.685410976409912 + }, + { + "auxiliary_loss_clip": 0.01126849, + "auxiliary_loss_mlp": 0.01048268, + "balance_loss_clip": 1.05385566, + "balance_loss_mlp": 1.02992129, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.338133241189526, + "language_loss": 0.80513161, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82688284, + "num_input_tokens_seen": 73243300, + "step": 3388, + "time_per_iteration": 2.551022529602051 + }, + { + "auxiliary_loss_clip": 0.01159187, + "auxiliary_loss_mlp": 0.01044875, + "balance_loss_clip": 1.05753303, + "balance_loss_mlp": 1.02531314, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 2.059143991037257, + "language_loss": 0.71996343, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.74200404, + "num_input_tokens_seen": 73261490, + "step": 3389, + "time_per_iteration": 2.452219247817993 + }, + { + "auxiliary_loss_clip": 0.01144569, + "auxiliary_loss_mlp": 0.01050511, + "balance_loss_clip": 1.05601406, + "balance_loss_mlp": 1.03210473, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 2.0300854584299537, + "language_loss": 0.8739655, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89591634, + "num_input_tokens_seen": 73280180, + "step": 3390, + "time_per_iteration": 2.477689266204834 + }, + { + "auxiliary_loss_clip": 0.01127981, + "auxiliary_loss_mlp": 0.010563, + "balance_loss_clip": 1.05337906, + "balance_loss_mlp": 1.03597474, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 3.1117586817457275, + "language_loss": 0.70972294, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73156571, + "num_input_tokens_seen": 73300680, + "step": 3391, + "time_per_iteration": 2.5511105060577393 + }, + { + "auxiliary_loss_clip": 0.01123429, + "auxiliary_loss_mlp": 0.01047088, + "balance_loss_clip": 1.0545032, + "balance_loss_mlp": 1.02900362, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.272426045300358, + "language_loss": 0.86539423, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88709939, + "num_input_tokens_seen": 73316760, + "step": 3392, + "time_per_iteration": 2.5354666709899902 + }, + { + "auxiliary_loss_clip": 0.01145624, + "auxiliary_loss_mlp": 0.0105037, + "balance_loss_clip": 1.0517267, + "balance_loss_mlp": 1.03282237, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.4140173091950634, + "language_loss": 0.80827534, + "learning_rate": 3.69072700532013e-06, + "loss": 0.8302353, + "num_input_tokens_seen": 73339385, + "step": 3393, + "time_per_iteration": 4.070557594299316 + }, + { + "auxiliary_loss_clip": 0.01121643, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.04444647, + "balance_loss_mlp": 1.02046645, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.822897589889034, + "language_loss": 0.86710572, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88870728, + "num_input_tokens_seen": 73357235, + "step": 3394, + "time_per_iteration": 2.521463394165039 + }, + { + "auxiliary_loss_clip": 0.01140299, + "auxiliary_loss_mlp": 0.01043284, + "balance_loss_clip": 1.05568767, + "balance_loss_mlp": 1.0265708, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 3.2226885009823856, + "language_loss": 0.8384912, + "learning_rate": 3.69031078287345e-06, + "loss": 0.86032706, + "num_input_tokens_seen": 73374435, + "step": 3395, + "time_per_iteration": 2.46529221534729 + }, + { + "auxiliary_loss_clip": 0.01145509, + "auxiliary_loss_mlp": 0.01040594, + "balance_loss_clip": 1.05413985, + "balance_loss_mlp": 1.02147293, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.245370339128548, + "language_loss": 0.83714294, + "learning_rate": 3.690102575501033e-06, + "loss": 0.8590039, + "num_input_tokens_seen": 73391025, + "step": 3396, + "time_per_iteration": 2.434246778488159 + }, + { + "auxiliary_loss_clip": 0.01115885, + "auxiliary_loss_mlp": 0.01042104, + "balance_loss_clip": 1.04816365, + "balance_loss_mlp": 1.02293539, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.9619394570159407, + "language_loss": 0.77059358, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.7921735, + "num_input_tokens_seen": 73409270, + "step": 3397, + "time_per_iteration": 2.592276096343994 + }, + { + "auxiliary_loss_clip": 0.01129199, + "auxiliary_loss_mlp": 0.01043853, + "balance_loss_clip": 1.05335474, + "balance_loss_mlp": 1.02680588, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 2.6172195143853303, + "language_loss": 0.873698, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89542854, + "num_input_tokens_seen": 73425225, + "step": 3398, + "time_per_iteration": 2.478492021560669 + }, + { + "auxiliary_loss_clip": 0.01123157, + "auxiliary_loss_mlp": 0.01049766, + "balance_loss_clip": 1.05061913, + "balance_loss_mlp": 1.03161025, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.1918674152939324, + "language_loss": 0.77887154, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.80060077, + "num_input_tokens_seen": 73440940, + "step": 3399, + "time_per_iteration": 2.5031492710113525 + }, + { + "auxiliary_loss_clip": 0.01143142, + "auxiliary_loss_mlp": 0.01040471, + "balance_loss_clip": 1.05257034, + "balance_loss_mlp": 1.02264953, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 2.320558358173286, + "language_loss": 0.76441133, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78624749, + "num_input_tokens_seen": 73458805, + "step": 3400, + "time_per_iteration": 2.478548049926758 + }, + { + "auxiliary_loss_clip": 0.01123027, + "auxiliary_loss_mlp": 0.00783234, + "balance_loss_clip": 1.05677915, + "balance_loss_mlp": 1.00074673, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.7076950127945825, + "language_loss": 0.79261678, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81167936, + "num_input_tokens_seen": 73479380, + "step": 3401, + "time_per_iteration": 2.6220240592956543 + }, + { + "auxiliary_loss_clip": 0.01132181, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_clip": 1.05042982, + "balance_loss_mlp": 1.02555799, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.8153424343180506, + "language_loss": 0.69515228, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71691561, + "num_input_tokens_seen": 73505105, + "step": 3402, + "time_per_iteration": 2.676375150680542 + }, + { + "auxiliary_loss_clip": 0.01117673, + "auxiliary_loss_mlp": 0.01042131, + "balance_loss_clip": 1.04819512, + "balance_loss_mlp": 1.02337909, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 2.2815536802913834, + "language_loss": 0.81091464, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83251262, + "num_input_tokens_seen": 73523700, + "step": 3403, + "time_per_iteration": 2.5556719303131104 + }, + { + "auxiliary_loss_clip": 0.01144752, + "auxiliary_loss_mlp": 0.01040449, + "balance_loss_clip": 1.05397165, + "balance_loss_mlp": 1.02294898, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 2.0618761759578015, + "language_loss": 0.8373127, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.85916471, + "num_input_tokens_seen": 73542625, + "step": 3404, + "time_per_iteration": 2.5040242671966553 + }, + { + "auxiliary_loss_clip": 0.01139724, + "auxiliary_loss_mlp": 0.01047522, + "balance_loss_clip": 1.04902828, + "balance_loss_mlp": 1.02892494, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 2.067584859910655, + "language_loss": 0.8576566, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.879529, + "num_input_tokens_seen": 73561450, + "step": 3405, + "time_per_iteration": 2.5099124908447266 + }, + { + "auxiliary_loss_clip": 0.01116522, + "auxiliary_loss_mlp": 0.01042697, + "balance_loss_clip": 1.05267906, + "balance_loss_mlp": 1.02513742, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 3.7349637541285037, + "language_loss": 0.84987944, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.87147158, + "num_input_tokens_seen": 73577155, + "step": 3406, + "time_per_iteration": 2.5842604637145996 + }, + { + "auxiliary_loss_clip": 0.01152719, + "auxiliary_loss_mlp": 0.01038725, + "balance_loss_clip": 1.05546248, + "balance_loss_mlp": 1.02234554, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 1.9547095540121335, + "language_loss": 0.67175198, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.69366646, + "num_input_tokens_seen": 73594900, + "step": 3407, + "time_per_iteration": 2.4324183464050293 + }, + { + "auxiliary_loss_clip": 0.01152646, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_clip": 1.05466926, + "balance_loss_mlp": 1.02684808, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.107515768838125, + "language_loss": 0.84234989, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86432302, + "num_input_tokens_seen": 73613810, + "step": 3408, + "time_per_iteration": 2.477513313293457 + }, + { + "auxiliary_loss_clip": 0.01156516, + "auxiliary_loss_mlp": 0.01044228, + "balance_loss_clip": 1.05672717, + "balance_loss_mlp": 1.0270381, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.412958264690138, + "language_loss": 0.64534843, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66735584, + "num_input_tokens_seen": 73631495, + "step": 3409, + "time_per_iteration": 2.427823066711426 + }, + { + "auxiliary_loss_clip": 0.01138053, + "auxiliary_loss_mlp": 0.01045924, + "balance_loss_clip": 1.05089951, + "balance_loss_mlp": 1.02854371, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.3904494302722334, + "language_loss": 0.80318552, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82502532, + "num_input_tokens_seen": 73652840, + "step": 3410, + "time_per_iteration": 2.505305767059326 + }, + { + "auxiliary_loss_clip": 0.01103171, + "auxiliary_loss_mlp": 0.0104352, + "balance_loss_clip": 1.05586898, + "balance_loss_mlp": 1.02629423, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.6035847778851386, + "language_loss": 0.76127148, + "learning_rate": 3.686971778678803e-06, + "loss": 0.78273839, + "num_input_tokens_seen": 73672150, + "step": 3411, + "time_per_iteration": 2.783710241317749 + }, + { + "auxiliary_loss_clip": 0.01138606, + "auxiliary_loss_mlp": 0.01039773, + "balance_loss_clip": 1.0548991, + "balance_loss_mlp": 1.02263117, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 1.968496158442584, + "language_loss": 0.73225594, + "learning_rate": 3.686762546833722e-06, + "loss": 0.7540397, + "num_input_tokens_seen": 73691940, + "step": 3412, + "time_per_iteration": 2.545180082321167 + }, + { + "auxiliary_loss_clip": 0.01133003, + "auxiliary_loss_mlp": 0.01055037, + "balance_loss_clip": 1.05352807, + "balance_loss_mlp": 1.03527212, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.3549003974088705, + "language_loss": 0.77457446, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.79645485, + "num_input_tokens_seen": 73709080, + "step": 3413, + "time_per_iteration": 2.5434188842773438 + }, + { + "auxiliary_loss_clip": 0.0110969, + "auxiliary_loss_mlp": 0.0105116, + "balance_loss_clip": 1.05020165, + "balance_loss_mlp": 1.03321922, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 2.029445544732773, + "language_loss": 0.84769315, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.86930168, + "num_input_tokens_seen": 73727670, + "step": 3414, + "time_per_iteration": 2.5441858768463135 + }, + { + "auxiliary_loss_clip": 0.01142044, + "auxiliary_loss_mlp": 0.0104064, + "balance_loss_clip": 1.05429196, + "balance_loss_mlp": 1.02360535, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 1.8759777575017074, + "language_loss": 0.80920362, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.83103049, + "num_input_tokens_seen": 73747170, + "step": 3415, + "time_per_iteration": 2.5274040699005127 + }, + { + "auxiliary_loss_clip": 0.01085853, + "auxiliary_loss_mlp": 0.01040771, + "balance_loss_clip": 1.0464685, + "balance_loss_mlp": 1.02468979, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.694615306234993, + "language_loss": 0.72867703, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.74994332, + "num_input_tokens_seen": 73767690, + "step": 3416, + "time_per_iteration": 4.081060171127319 + }, + { + "auxiliary_loss_clip": 0.01148115, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.05781078, + "balance_loss_mlp": 1.02451825, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.57380853291728, + "language_loss": 0.78775579, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80966198, + "num_input_tokens_seen": 73786900, + "step": 3417, + "time_per_iteration": 3.983869791030884 + }, + { + "auxiliary_loss_clip": 0.01146483, + "auxiliary_loss_mlp": 0.01047795, + "balance_loss_clip": 1.05636573, + "balance_loss_mlp": 1.02955592, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.52583763215626, + "language_loss": 0.87558162, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89752442, + "num_input_tokens_seen": 73804515, + "step": 3418, + "time_per_iteration": 2.4968605041503906 + }, + { + "auxiliary_loss_clip": 0.011423, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.05903411, + "balance_loss_mlp": 1.01920986, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.4231341954579007, + "language_loss": 0.62465453, + "learning_rate": 3.685296133421035e-06, + "loss": 0.6464504, + "num_input_tokens_seen": 73822910, + "step": 3419, + "time_per_iteration": 2.548949718475342 + }, + { + "auxiliary_loss_clip": 0.01143199, + "auxiliary_loss_mlp": 0.01056403, + "balance_loss_clip": 1.05985141, + "balance_loss_mlp": 1.03730607, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 1.8579304464733482, + "language_loss": 0.86444712, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88644314, + "num_input_tokens_seen": 73841160, + "step": 3420, + "time_per_iteration": 2.534816026687622 + }, + { + "auxiliary_loss_clip": 0.01105157, + "auxiliary_loss_mlp": 0.00789561, + "balance_loss_clip": 1.04482126, + "balance_loss_mlp": 1.00062752, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 5.056662877253075, + "language_loss": 0.71095961, + "learning_rate": 3.684876582881668e-06, + "loss": 0.7299068, + "num_input_tokens_seen": 73862795, + "step": 3421, + "time_per_iteration": 2.6523783206939697 + }, + { + "auxiliary_loss_clip": 0.0115557, + "auxiliary_loss_mlp": 0.01040305, + "balance_loss_clip": 1.05721378, + "balance_loss_mlp": 1.02234006, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 4.1283893951571935, + "language_loss": 0.71661949, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.73857826, + "num_input_tokens_seen": 73881525, + "step": 3422, + "time_per_iteration": 2.5395073890686035 + }, + { + "auxiliary_loss_clip": 0.01061783, + "auxiliary_loss_mlp": 0.0101739, + "balance_loss_clip": 1.03363442, + "balance_loss_mlp": 1.01499355, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7416815020111842, + "language_loss": 0.55550003, + "learning_rate": 3.684456776779548e-06, + "loss": 0.5762918, + "num_input_tokens_seen": 73937775, + "step": 3423, + "time_per_iteration": 3.1085615158081055 + }, + { + "auxiliary_loss_clip": 0.01108914, + "auxiliary_loss_mlp": 0.01040617, + "balance_loss_clip": 1.05570471, + "balance_loss_mlp": 1.02316499, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.7287615775397547, + "language_loss": 0.71364295, + "learning_rate": 3.684246777912353e-06, + "loss": 0.7351383, + "num_input_tokens_seen": 73958250, + "step": 3424, + "time_per_iteration": 4.144752264022827 + }, + { + "auxiliary_loss_clip": 0.01128944, + "auxiliary_loss_mlp": 0.00784627, + "balance_loss_clip": 1.06048441, + "balance_loss_mlp": 1.0005697, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.5841328381550104, + "language_loss": 0.75441611, + "learning_rate": 3.684036715178351e-06, + "loss": 0.77355176, + "num_input_tokens_seen": 73977775, + "step": 3425, + "time_per_iteration": 2.5962584018707275 + }, + { + "auxiliary_loss_clip": 0.01110643, + "auxiliary_loss_mlp": 0.01059802, + "balance_loss_clip": 1.05605865, + "balance_loss_mlp": 1.04119349, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.7053594970767216, + "language_loss": 0.88086921, + "learning_rate": 3.683826588585508e-06, + "loss": 0.9025737, + "num_input_tokens_seen": 73996590, + "step": 3426, + "time_per_iteration": 2.561000108718872 + }, + { + "auxiliary_loss_clip": 0.01148741, + "auxiliary_loss_mlp": 0.01041802, + "balance_loss_clip": 1.06294584, + "balance_loss_mlp": 1.02451634, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.5200699749326054, + "language_loss": 0.76964474, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.79155016, + "num_input_tokens_seen": 74015935, + "step": 3427, + "time_per_iteration": 2.563655376434326 + }, + { + "auxiliary_loss_clip": 0.01161621, + "auxiliary_loss_mlp": 0.0105092, + "balance_loss_clip": 1.05987906, + "balance_loss_mlp": 1.03264546, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.9725762678686294, + "language_loss": 0.7376374, + "learning_rate": 3.683406143855174e-06, + "loss": 0.75976276, + "num_input_tokens_seen": 74036575, + "step": 3428, + "time_per_iteration": 2.5289814472198486 + }, + { + "auxiliary_loss_clip": 0.01136549, + "auxiliary_loss_mlp": 0.01046039, + "balance_loss_clip": 1.05441213, + "balance_loss_mlp": 1.02763319, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 1.916072863471356, + "language_loss": 0.73429734, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75612319, + "num_input_tokens_seen": 74055365, + "step": 3429, + "time_per_iteration": 2.5409739017486572 + }, + { + "auxiliary_loss_clip": 0.01147148, + "auxiliary_loss_mlp": 0.01048863, + "balance_loss_clip": 1.06099939, + "balance_loss_mlp": 1.02959871, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.750536096578458, + "language_loss": 0.8563509, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87831104, + "num_input_tokens_seen": 74074875, + "step": 3430, + "time_per_iteration": 2.5145492553710938 + }, + { + "auxiliary_loss_clip": 0.01088346, + "auxiliary_loss_mlp": 0.01053017, + "balance_loss_clip": 1.04583108, + "balance_loss_mlp": 1.03445637, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.5250118221110671, + "language_loss": 0.68776441, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.70917803, + "num_input_tokens_seen": 74094505, + "step": 3431, + "time_per_iteration": 2.6087849140167236 + }, + { + "auxiliary_loss_clip": 0.01036855, + "auxiliary_loss_mlp": 0.01004408, + "balance_loss_clip": 1.04072475, + "balance_loss_mlp": 1.00179696, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8073980070624193, + "language_loss": 0.60295069, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62336332, + "num_input_tokens_seen": 74158500, + "step": 3432, + "time_per_iteration": 3.302760124206543 + }, + { + "auxiliary_loss_clip": 0.01147653, + "auxiliary_loss_mlp": 0.01045098, + "balance_loss_clip": 1.05962133, + "balance_loss_mlp": 1.02763343, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.7848252227607262, + "language_loss": 0.72255182, + "learning_rate": 3.682353915057679e-06, + "loss": 0.7444793, + "num_input_tokens_seen": 74176685, + "step": 3433, + "time_per_iteration": 4.030315637588501 + }, + { + "auxiliary_loss_clip": 0.01097927, + "auxiliary_loss_mlp": 0.01055119, + "balance_loss_clip": 1.04576039, + "balance_loss_mlp": 1.03444839, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 2.9982058482006724, + "language_loss": 0.86899018, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.89052063, + "num_input_tokens_seen": 74194935, + "step": 3434, + "time_per_iteration": 2.606926441192627 + }, + { + "auxiliary_loss_clip": 0.01153003, + "auxiliary_loss_mlp": 0.01041891, + "balance_loss_clip": 1.05772114, + "balance_loss_mlp": 1.02439141, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 2.2570999136057757, + "language_loss": 0.69148242, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71343136, + "num_input_tokens_seen": 74215400, + "step": 3435, + "time_per_iteration": 2.549001455307007 + }, + { + "auxiliary_loss_clip": 0.01130479, + "auxiliary_loss_mlp": 0.01042158, + "balance_loss_clip": 1.05758357, + "balance_loss_mlp": 1.02287042, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.6720752921481665, + "language_loss": 0.89307547, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91480184, + "num_input_tokens_seen": 74234090, + "step": 3436, + "time_per_iteration": 2.5785880088806152 + }, + { + "auxiliary_loss_clip": 0.01119381, + "auxiliary_loss_mlp": 0.01042016, + "balance_loss_clip": 1.0513308, + "balance_loss_mlp": 1.02272749, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 4.498150343249336, + "language_loss": 0.76940364, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.79101753, + "num_input_tokens_seen": 74253345, + "step": 3437, + "time_per_iteration": 2.582888603210449 + }, + { + "auxiliary_loss_clip": 0.01146683, + "auxiliary_loss_mlp": 0.01042337, + "balance_loss_clip": 1.05401075, + "balance_loss_mlp": 1.02512288, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 2.763291629383324, + "language_loss": 0.7733134, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.79520357, + "num_input_tokens_seen": 74271615, + "step": 3438, + "time_per_iteration": 2.485445737838745 + }, + { + "auxiliary_loss_clip": 0.01062596, + "auxiliary_loss_mlp": 0.01002613, + "balance_loss_clip": 1.03530312, + "balance_loss_mlp": 0.99951375, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8344669126558971, + "language_loss": 0.67148542, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.6921376, + "num_input_tokens_seen": 74331390, + "step": 3439, + "time_per_iteration": 3.0371649265289307 + }, + { + "auxiliary_loss_clip": 0.01149565, + "auxiliary_loss_mlp": 0.01039473, + "balance_loss_clip": 1.05672467, + "balance_loss_mlp": 1.0209831, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 2.728114412813293, + "language_loss": 0.83967835, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86156875, + "num_input_tokens_seen": 74347335, + "step": 3440, + "time_per_iteration": 2.462216854095459 + }, + { + "auxiliary_loss_clip": 0.01147681, + "auxiliary_loss_mlp": 0.01044139, + "balance_loss_clip": 1.05410886, + "balance_loss_mlp": 1.02683008, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 2.2910212722159233, + "language_loss": 0.84815097, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87006915, + "num_input_tokens_seen": 74366310, + "step": 3441, + "time_per_iteration": 2.481078624725342 + }, + { + "auxiliary_loss_clip": 0.01110239, + "auxiliary_loss_mlp": 0.01048276, + "balance_loss_clip": 1.05584717, + "balance_loss_mlp": 1.02977443, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.6771414582977886, + "language_loss": 0.86061239, + "learning_rate": 3.680455884806959e-06, + "loss": 0.8821975, + "num_input_tokens_seen": 74387100, + "step": 3442, + "time_per_iteration": 2.6615161895751953 + }, + { + "auxiliary_loss_clip": 0.01078094, + "auxiliary_loss_mlp": 0.0105153, + "balance_loss_clip": 1.04953289, + "balance_loss_mlp": 1.03162169, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 3.1616540519376586, + "language_loss": 0.73167825, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75297451, + "num_input_tokens_seen": 74404460, + "step": 3443, + "time_per_iteration": 2.6307239532470703 + }, + { + "auxiliary_loss_clip": 0.01127073, + "auxiliary_loss_mlp": 0.00784078, + "balance_loss_clip": 1.05282068, + "balance_loss_mlp": 1.00059104, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 1.7579326662037158, + "language_loss": 0.85134858, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87046009, + "num_input_tokens_seen": 74423790, + "step": 3444, + "time_per_iteration": 2.549915313720703 + }, + { + "auxiliary_loss_clip": 0.01037915, + "auxiliary_loss_mlp": 0.01009231, + "balance_loss_clip": 1.04091668, + "balance_loss_mlp": 1.00635815, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6973511494687517, + "language_loss": 0.57085937, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59133083, + "num_input_tokens_seen": 74488130, + "step": 3445, + "time_per_iteration": 3.112220287322998 + }, + { + "auxiliary_loss_clip": 0.01157548, + "auxiliary_loss_mlp": 0.00785753, + "balance_loss_clip": 1.05752695, + "balance_loss_mlp": 1.0006094, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.5754556506544213, + "language_loss": 0.78409231, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80352533, + "num_input_tokens_seen": 74506720, + "step": 3446, + "time_per_iteration": 2.456000328063965 + }, + { + "auxiliary_loss_clip": 0.01153847, + "auxiliary_loss_mlp": 0.01047555, + "balance_loss_clip": 1.05527568, + "balance_loss_mlp": 1.02625275, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 2.1339723099487338, + "language_loss": 0.62629879, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64831281, + "num_input_tokens_seen": 74525330, + "step": 3447, + "time_per_iteration": 2.53129506111145 + }, + { + "auxiliary_loss_clip": 0.01110585, + "auxiliary_loss_mlp": 0.01063574, + "balance_loss_clip": 1.04902911, + "balance_loss_mlp": 1.0427835, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.8044648826572263, + "language_loss": 0.86203921, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88378078, + "num_input_tokens_seen": 74544535, + "step": 3448, + "time_per_iteration": 2.603425979614258 + }, + { + "auxiliary_loss_clip": 0.01131175, + "auxiliary_loss_mlp": 0.01047258, + "balance_loss_clip": 1.04925942, + "balance_loss_mlp": 1.02672982, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 2.5766423891966874, + "language_loss": 0.75390738, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77569169, + "num_input_tokens_seen": 74562300, + "step": 3449, + "time_per_iteration": 2.540797710418701 + }, + { + "auxiliary_loss_clip": 0.01142112, + "auxiliary_loss_mlp": 0.01050355, + "balance_loss_clip": 1.0530417, + "balance_loss_mlp": 1.03075743, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 1.8636265018213507, + "language_loss": 0.76608151, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78800619, + "num_input_tokens_seen": 74580080, + "step": 3450, + "time_per_iteration": 2.5215768814086914 + }, + { + "auxiliary_loss_clip": 0.01126829, + "auxiliary_loss_mlp": 0.01046089, + "balance_loss_clip": 1.05089116, + "balance_loss_mlp": 1.02718258, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.5774096719745656, + "language_loss": 0.82302898, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84475815, + "num_input_tokens_seen": 74598980, + "step": 3451, + "time_per_iteration": 2.5619683265686035 + }, + { + "auxiliary_loss_clip": 0.01059714, + "auxiliary_loss_mlp": 0.01004681, + "balance_loss_clip": 1.02391112, + "balance_loss_mlp": 1.00251138, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7891275528847044, + "language_loss": 0.5660255, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58666945, + "num_input_tokens_seen": 74655275, + "step": 3452, + "time_per_iteration": 2.9333291053771973 + }, + { + "auxiliary_loss_clip": 0.01124086, + "auxiliary_loss_mlp": 0.00785296, + "balance_loss_clip": 1.05135489, + "balance_loss_mlp": 1.00061131, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 2.195528297079523, + "language_loss": 0.8856461, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90473992, + "num_input_tokens_seen": 74674560, + "step": 3453, + "time_per_iteration": 2.5733611583709717 + }, + { + "auxiliary_loss_clip": 0.01147622, + "auxiliary_loss_mlp": 0.01046203, + "balance_loss_clip": 1.05452514, + "balance_loss_mlp": 1.02666426, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.7861348921644566, + "language_loss": 0.8027035, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82464182, + "num_input_tokens_seen": 74694500, + "step": 3454, + "time_per_iteration": 3.9907310009002686 + }, + { + "auxiliary_loss_clip": 0.01108384, + "auxiliary_loss_mlp": 0.00786556, + "balance_loss_clip": 1.0425992, + "balance_loss_mlp": 1.00063527, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 4.323187144003694, + "language_loss": 0.76655078, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78550017, + "num_input_tokens_seen": 74710485, + "step": 3455, + "time_per_iteration": 2.519514322280884 + }, + { + "auxiliary_loss_clip": 0.0111946, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.04811573, + "balance_loss_mlp": 1.03010774, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 1.7515336901669696, + "language_loss": 0.80755311, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82923436, + "num_input_tokens_seen": 74727450, + "step": 3456, + "time_per_iteration": 2.543368101119995 + }, + { + "auxiliary_loss_clip": 0.01117269, + "auxiliary_loss_mlp": 0.00786443, + "balance_loss_clip": 1.0522933, + "balance_loss_mlp": 1.00070858, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.7065725192623915, + "language_loss": 0.78158915, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.80062628, + "num_input_tokens_seen": 74746725, + "step": 3457, + "time_per_iteration": 4.122541666030884 + }, + { + "auxiliary_loss_clip": 0.0108595, + "auxiliary_loss_mlp": 0.01053449, + "balance_loss_clip": 1.04170871, + "balance_loss_mlp": 1.03088212, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 1.965665182796254, + "language_loss": 0.83414984, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85554385, + "num_input_tokens_seen": 74765255, + "step": 3458, + "time_per_iteration": 2.5986413955688477 + }, + { + "auxiliary_loss_clip": 0.01143057, + "auxiliary_loss_mlp": 0.00785549, + "balance_loss_clip": 1.05162799, + "balance_loss_mlp": 1.00070226, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 1.8499863684301878, + "language_loss": 0.75931716, + "learning_rate": 3.676856638489272e-06, + "loss": 0.77860326, + "num_input_tokens_seen": 74785710, + "step": 3459, + "time_per_iteration": 2.533949375152588 + }, + { + "auxiliary_loss_clip": 0.01091247, + "auxiliary_loss_mlp": 0.01036404, + "balance_loss_clip": 1.04667568, + "balance_loss_mlp": 1.01860631, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 1.8595375961655243, + "language_loss": 0.76865458, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.78993112, + "num_input_tokens_seen": 74804490, + "step": 3460, + "time_per_iteration": 2.5983192920684814 + }, + { + "auxiliary_loss_clip": 0.01097739, + "auxiliary_loss_mlp": 0.01048587, + "balance_loss_clip": 1.05043745, + "balance_loss_mlp": 1.02995467, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 5.2056563437524686, + "language_loss": 0.75606626, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77752948, + "num_input_tokens_seen": 74826340, + "step": 3461, + "time_per_iteration": 2.708906650543213 + }, + { + "auxiliary_loss_clip": 0.01127372, + "auxiliary_loss_mlp": 0.01041831, + "balance_loss_clip": 1.04661179, + "balance_loss_mlp": 1.02244711, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 2.0704145077534513, + "language_loss": 0.88529742, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90698946, + "num_input_tokens_seen": 74844960, + "step": 3462, + "time_per_iteration": 2.560734987258911 + }, + { + "auxiliary_loss_clip": 0.01027258, + "auxiliary_loss_mlp": 0.00757597, + "balance_loss_clip": 1.02786171, + "balance_loss_mlp": 1.00025105, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7605899426325511, + "language_loss": 0.59053421, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.60838282, + "num_input_tokens_seen": 74909075, + "step": 3463, + "time_per_iteration": 4.731340169906616 + }, + { + "auxiliary_loss_clip": 0.01137088, + "auxiliary_loss_mlp": 0.01048636, + "balance_loss_clip": 1.05204296, + "balance_loss_mlp": 1.029145, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 10.217285680668354, + "language_loss": 0.660972, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68282926, + "num_input_tokens_seen": 74928125, + "step": 3464, + "time_per_iteration": 2.5652992725372314 + }, + { + "auxiliary_loss_clip": 0.01128407, + "auxiliary_loss_mlp": 0.01044064, + "balance_loss_clip": 1.05245328, + "balance_loss_mlp": 1.02409649, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 2.3366694587426164, + "language_loss": 0.84122121, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.86294591, + "num_input_tokens_seen": 74945090, + "step": 3465, + "time_per_iteration": 2.552669048309326 + }, + { + "auxiliary_loss_clip": 0.01100481, + "auxiliary_loss_mlp": 0.01040987, + "balance_loss_clip": 1.04528975, + "balance_loss_mlp": 1.02278328, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 2.0385717321215635, + "language_loss": 0.82020056, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.84161532, + "num_input_tokens_seen": 74963630, + "step": 3466, + "time_per_iteration": 2.6252808570861816 + }, + { + "auxiliary_loss_clip": 0.01142869, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_clip": 1.05432296, + "balance_loss_mlp": 1.02862525, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 2.0740712491924653, + "language_loss": 0.82147843, + "learning_rate": 3.675156514448716e-06, + "loss": 0.84335458, + "num_input_tokens_seen": 74981875, + "step": 3467, + "time_per_iteration": 2.477203845977783 + }, + { + "auxiliary_loss_clip": 0.01154312, + "auxiliary_loss_mlp": 0.0104157, + "balance_loss_clip": 1.05689788, + "balance_loss_mlp": 1.02451181, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 1.8569735050433889, + "language_loss": 0.81779844, + "learning_rate": 3.674943713009518e-06, + "loss": 0.8397572, + "num_input_tokens_seen": 74999155, + "step": 3468, + "time_per_iteration": 2.434610605239868 + }, + { + "auxiliary_loss_clip": 0.01146794, + "auxiliary_loss_mlp": 0.01049175, + "balance_loss_clip": 1.05373561, + "balance_loss_mlp": 1.02869463, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 1.9374015945301197, + "language_loss": 0.90179592, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92375565, + "num_input_tokens_seen": 75017850, + "step": 3469, + "time_per_iteration": 2.5046651363372803 + }, + { + "auxiliary_loss_clip": 0.01125805, + "auxiliary_loss_mlp": 0.0104632, + "balance_loss_clip": 1.05481923, + "balance_loss_mlp": 1.02791429, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 2.30560735260123, + "language_loss": 0.76282322, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78454447, + "num_input_tokens_seen": 75039270, + "step": 3470, + "time_per_iteration": 2.6495137214660645 + }, + { + "auxiliary_loss_clip": 0.0113096, + "auxiliary_loss_mlp": 0.01047712, + "balance_loss_clip": 1.0536418, + "balance_loss_mlp": 1.03011656, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 1.8210426221433875, + "language_loss": 0.7568422, + "learning_rate": 3.674304927640011e-06, + "loss": 0.77862889, + "num_input_tokens_seen": 75059350, + "step": 3471, + "time_per_iteration": 2.561749219894409 + }, + { + "auxiliary_loss_clip": 0.01124583, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.05052972, + "balance_loss_mlp": 1.02860594, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.8577476237779802, + "language_loss": 0.75611442, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77784038, + "num_input_tokens_seen": 75080150, + "step": 3472, + "time_per_iteration": 4.096347332000732 + }, + { + "auxiliary_loss_clip": 0.01145815, + "auxiliary_loss_mlp": 0.01045026, + "balance_loss_clip": 1.05473626, + "balance_loss_mlp": 1.02759719, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 2.1320825508251895, + "language_loss": 0.84410834, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86601675, + "num_input_tokens_seen": 75097920, + "step": 3473, + "time_per_iteration": 2.4651405811309814 + }, + { + "auxiliary_loss_clip": 0.01038408, + "auxiliary_loss_mlp": 0.01005383, + "balance_loss_clip": 1.03469229, + "balance_loss_mlp": 1.00332093, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.8829824767699986, + "language_loss": 0.63659918, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65703714, + "num_input_tokens_seen": 75152410, + "step": 3474, + "time_per_iteration": 3.029953718185425 + }, + { + "auxiliary_loss_clip": 0.01137823, + "auxiliary_loss_mlp": 0.01042469, + "balance_loss_clip": 1.05444098, + "balance_loss_mlp": 1.02456355, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 3.506151672517051, + "language_loss": 0.69538009, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.71718299, + "num_input_tokens_seen": 75173265, + "step": 3475, + "time_per_iteration": 2.64447283744812 + }, + { + "auxiliary_loss_clip": 0.01156844, + "auxiliary_loss_mlp": 0.01048107, + "balance_loss_clip": 1.05554128, + "balance_loss_mlp": 1.03063071, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.4623575674045923, + "language_loss": 0.7024039, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72445345, + "num_input_tokens_seen": 75193640, + "step": 3476, + "time_per_iteration": 2.471719741821289 + }, + { + "auxiliary_loss_clip": 0.0113729, + "auxiliary_loss_mlp": 0.01046223, + "balance_loss_clip": 1.05795217, + "balance_loss_mlp": 1.02866387, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 1.8511774359030206, + "language_loss": 0.89521503, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.91705012, + "num_input_tokens_seen": 75212545, + "step": 3477, + "time_per_iteration": 2.564309597015381 + }, + { + "auxiliary_loss_clip": 0.0109449, + "auxiliary_loss_mlp": 0.01053173, + "balance_loss_clip": 1.04614055, + "balance_loss_mlp": 1.03489828, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 2.305695791725471, + "language_loss": 0.67872417, + "learning_rate": 3.672812206678344e-06, + "loss": 0.7002008, + "num_input_tokens_seen": 75230865, + "step": 3478, + "time_per_iteration": 2.65191650390625 + }, + { + "auxiliary_loss_clip": 0.01100489, + "auxiliary_loss_mlp": 0.01050955, + "balance_loss_clip": 1.04191256, + "balance_loss_mlp": 1.03123784, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 2.164221583547283, + "language_loss": 0.84576499, + "learning_rate": 3.672598707029127e-06, + "loss": 0.86727947, + "num_input_tokens_seen": 75248285, + "step": 3479, + "time_per_iteration": 2.5210888385772705 + }, + { + "auxiliary_loss_clip": 0.01112804, + "auxiliary_loss_mlp": 0.01055884, + "balance_loss_clip": 1.04895735, + "balance_loss_mlp": 1.03661931, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 3.1655840518178855, + "language_loss": 0.74013436, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76182121, + "num_input_tokens_seen": 75266310, + "step": 3480, + "time_per_iteration": 2.543281078338623 + }, + { + "auxiliary_loss_clip": 0.01111216, + "auxiliary_loss_mlp": 0.010483, + "balance_loss_clip": 1.04715979, + "balance_loss_mlp": 1.03236151, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.2793276590471327, + "language_loss": 0.75881684, + "learning_rate": 3.67217151746346e-06, + "loss": 0.78041202, + "num_input_tokens_seen": 75284175, + "step": 3481, + "time_per_iteration": 2.526445150375366 + }, + { + "auxiliary_loss_clip": 0.01092956, + "auxiliary_loss_mlp": 0.01049256, + "balance_loss_clip": 1.04456186, + "balance_loss_mlp": 1.03124344, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 2.125012606500584, + "language_loss": 0.85254884, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87397099, + "num_input_tokens_seen": 75303465, + "step": 3482, + "time_per_iteration": 2.6030306816101074 + }, + { + "auxiliary_loss_clip": 0.01103345, + "auxiliary_loss_mlp": 0.01047683, + "balance_loss_clip": 1.0527184, + "balance_loss_mlp": 1.03045738, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 1.763652893099504, + "language_loss": 0.7093628, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.73087311, + "num_input_tokens_seen": 75325290, + "step": 3483, + "time_per_iteration": 2.719115972518921 + }, + { + "auxiliary_loss_clip": 0.01126192, + "auxiliary_loss_mlp": 0.0105585, + "balance_loss_clip": 1.04986477, + "balance_loss_mlp": 1.03767037, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.6907258879701619, + "language_loss": 0.74962121, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77144158, + "num_input_tokens_seen": 75343895, + "step": 3484, + "time_per_iteration": 2.515824794769287 + }, + { + "auxiliary_loss_clip": 0.01118363, + "auxiliary_loss_mlp": 0.01051196, + "balance_loss_clip": 1.05135751, + "balance_loss_mlp": 1.03301644, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.6555901306041692, + "language_loss": 0.70590878, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.72760439, + "num_input_tokens_seen": 75367100, + "step": 3485, + "time_per_iteration": 2.6061911582946777 + }, + { + "auxiliary_loss_clip": 0.01087439, + "auxiliary_loss_mlp": 0.00786533, + "balance_loss_clip": 1.04405677, + "balance_loss_mlp": 1.00079966, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 3.6343861762967653, + "language_loss": 0.83143616, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.8501758, + "num_input_tokens_seen": 75389925, + "step": 3486, + "time_per_iteration": 2.6216933727264404 + }, + { + "auxiliary_loss_clip": 0.01140591, + "auxiliary_loss_mlp": 0.01051861, + "balance_loss_clip": 1.0516026, + "balance_loss_mlp": 1.03467107, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 1.7198551155185489, + "language_loss": 0.86978829, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89171284, + "num_input_tokens_seen": 75408575, + "step": 3487, + "time_per_iteration": 2.591953754425049 + }, + { + "auxiliary_loss_clip": 0.01115881, + "auxiliary_loss_mlp": 0.01042161, + "balance_loss_clip": 1.04869366, + "balance_loss_mlp": 1.02354074, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 3.4512208624877125, + "language_loss": 0.72504753, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74662793, + "num_input_tokens_seen": 75427155, + "step": 3488, + "time_per_iteration": 2.5710582733154297 + }, + { + "auxiliary_loss_clip": 0.01122618, + "auxiliary_loss_mlp": 0.01044131, + "balance_loss_clip": 1.05145061, + "balance_loss_mlp": 1.02744138, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.1833326417147267, + "language_loss": 0.8093183, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.83098578, + "num_input_tokens_seen": 75444450, + "step": 3489, + "time_per_iteration": 2.498525619506836 + }, + { + "auxiliary_loss_clip": 0.01152955, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_clip": 1.05258489, + "balance_loss_mlp": 1.02970529, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 2.2327336192232856, + "language_loss": 0.72918558, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75117576, + "num_input_tokens_seen": 75462625, + "step": 3490, + "time_per_iteration": 2.4608709812164307 + }, + { + "auxiliary_loss_clip": 0.01124907, + "auxiliary_loss_mlp": 0.01052117, + "balance_loss_clip": 1.05246127, + "balance_loss_mlp": 1.03580892, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 2.1890218139559128, + "language_loss": 0.70738232, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72915256, + "num_input_tokens_seen": 75480640, + "step": 3491, + "time_per_iteration": 2.499541759490967 + }, + { + "auxiliary_loss_clip": 0.01139542, + "auxiliary_loss_mlp": 0.00783789, + "balance_loss_clip": 1.05049562, + "balance_loss_mlp": 1.00072265, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 3.3779081818449486, + "language_loss": 0.79895604, + "learning_rate": 3.669817442854444e-06, + "loss": 0.81818938, + "num_input_tokens_seen": 75494900, + "step": 3492, + "time_per_iteration": 2.4779021739959717 + }, + { + "auxiliary_loss_clip": 0.01143681, + "auxiliary_loss_mlp": 0.00784597, + "balance_loss_clip": 1.05452466, + "balance_loss_mlp": 1.00059342, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 2.717702382967049, + "language_loss": 0.87008655, + "learning_rate": 3.669603055991502e-06, + "loss": 0.88936931, + "num_input_tokens_seen": 75513370, + "step": 3493, + "time_per_iteration": 3.8997509479522705 + }, + { + "auxiliary_loss_clip": 0.01115138, + "auxiliary_loss_mlp": 0.01039868, + "balance_loss_clip": 1.045434, + "balance_loss_mlp": 1.02326226, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 2.192687257236449, + "language_loss": 0.68695188, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.70850199, + "num_input_tokens_seen": 75532480, + "step": 3494, + "time_per_iteration": 2.5127663612365723 + }, + { + "auxiliary_loss_clip": 0.01147887, + "auxiliary_loss_mlp": 0.01038579, + "balance_loss_clip": 1.05551207, + "balance_loss_mlp": 1.02142525, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.6675848064293102, + "language_loss": 0.79137313, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81323779, + "num_input_tokens_seen": 75552745, + "step": 3495, + "time_per_iteration": 2.5689709186553955 + }, + { + "auxiliary_loss_clip": 0.01120396, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.04711294, + "balance_loss_mlp": 1.02592325, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.7237100693097498, + "language_loss": 0.7726928, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79433274, + "num_input_tokens_seen": 75574355, + "step": 3496, + "time_per_iteration": 4.037398099899292 + }, + { + "auxiliary_loss_clip": 0.01143999, + "auxiliary_loss_mlp": 0.01049095, + "balance_loss_clip": 1.05816913, + "balance_loss_mlp": 1.03049827, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 2.1224873452539414, + "language_loss": 0.82308537, + "learning_rate": 3.668744875505915e-06, + "loss": 0.8450163, + "num_input_tokens_seen": 75592215, + "step": 3497, + "time_per_iteration": 2.512441873550415 + }, + { + "auxiliary_loss_clip": 0.01146366, + "auxiliary_loss_mlp": 0.01049724, + "balance_loss_clip": 1.05521548, + "balance_loss_mlp": 1.03155661, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 1.8016821784569221, + "language_loss": 0.66972172, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69168264, + "num_input_tokens_seen": 75610740, + "step": 3498, + "time_per_iteration": 2.5135738849639893 + }, + { + "auxiliary_loss_clip": 0.01124844, + "auxiliary_loss_mlp": 0.01045983, + "balance_loss_clip": 1.05233729, + "balance_loss_mlp": 1.02818465, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 3.4682498696734654, + "language_loss": 0.80541593, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.82712412, + "num_input_tokens_seen": 75631005, + "step": 3499, + "time_per_iteration": 2.5559887886047363 + }, + { + "auxiliary_loss_clip": 0.01142724, + "auxiliary_loss_mlp": 0.01043934, + "balance_loss_clip": 1.05514431, + "balance_loss_mlp": 1.02680361, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.9533682642538575, + "language_loss": 0.78080869, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80267525, + "num_input_tokens_seen": 75650655, + "step": 3500, + "time_per_iteration": 2.5551626682281494 + }, + { + "auxiliary_loss_clip": 0.01130652, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_clip": 1.05266964, + "balance_loss_mlp": 1.02593982, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.814579066558409, + "language_loss": 0.74071068, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76245856, + "num_input_tokens_seen": 75669895, + "step": 3501, + "time_per_iteration": 2.576176643371582 + }, + { + "auxiliary_loss_clip": 0.01141706, + "auxiliary_loss_mlp": 0.01039859, + "balance_loss_clip": 1.05456257, + "balance_loss_mlp": 1.0230267, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.785608267917209, + "language_loss": 0.75405538, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77587104, + "num_input_tokens_seen": 75689535, + "step": 3502, + "time_per_iteration": 3.9544293880462646 + }, + { + "auxiliary_loss_clip": 0.01105575, + "auxiliary_loss_mlp": 0.01038417, + "balance_loss_clip": 1.05019271, + "balance_loss_mlp": 1.02028525, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 2.6463707964375214, + "language_loss": 0.77364749, + "learning_rate": 3.667455706571316e-06, + "loss": 0.79508746, + "num_input_tokens_seen": 75709265, + "step": 3503, + "time_per_iteration": 2.6355931758880615 + }, + { + "auxiliary_loss_clip": 0.01103102, + "auxiliary_loss_mlp": 0.01046932, + "balance_loss_clip": 1.05571246, + "balance_loss_mlp": 1.02547491, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 7.209664654098529, + "language_loss": 0.78379166, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80529201, + "num_input_tokens_seen": 75727050, + "step": 3504, + "time_per_iteration": 2.620413303375244 + }, + { + "auxiliary_loss_clip": 0.0112301, + "auxiliary_loss_mlp": 0.01044277, + "balance_loss_clip": 1.05290949, + "balance_loss_mlp": 1.02610946, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 2.156079886388739, + "language_loss": 0.76684391, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.78851682, + "num_input_tokens_seen": 75747175, + "step": 3505, + "time_per_iteration": 2.5826878547668457 + }, + { + "auxiliary_loss_clip": 0.01125989, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.0531702, + "balance_loss_mlp": 1.02907944, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.8505071248033929, + "language_loss": 0.63966376, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.66139829, + "num_input_tokens_seen": 75767690, + "step": 3506, + "time_per_iteration": 2.611629009246826 + }, + { + "auxiliary_loss_clip": 0.01143616, + "auxiliary_loss_mlp": 0.01048649, + "balance_loss_clip": 1.05504322, + "balance_loss_mlp": 1.03046966, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.6680701766829107, + "language_loss": 0.81874692, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84066951, + "num_input_tokens_seen": 75787255, + "step": 3507, + "time_per_iteration": 2.582301378250122 + }, + { + "auxiliary_loss_clip": 0.0114499, + "auxiliary_loss_mlp": 0.01046157, + "balance_loss_clip": 1.05572855, + "balance_loss_mlp": 1.02845454, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 1.6859654988403778, + "language_loss": 0.75434208, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77625358, + "num_input_tokens_seen": 75805890, + "step": 3508, + "time_per_iteration": 2.4510974884033203 + }, + { + "auxiliary_loss_clip": 0.01158434, + "auxiliary_loss_mlp": 0.01041976, + "balance_loss_clip": 1.05551887, + "balance_loss_mlp": 1.02396369, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 2.4811703507059533, + "language_loss": 0.84901929, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87102342, + "num_input_tokens_seen": 75821620, + "step": 3509, + "time_per_iteration": 2.458760976791382 + }, + { + "auxiliary_loss_clip": 0.01122632, + "auxiliary_loss_mlp": 0.01039133, + "balance_loss_clip": 1.05495095, + "balance_loss_mlp": 1.02034545, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.7053646399265174, + "language_loss": 0.68076169, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.70237935, + "num_input_tokens_seen": 75842490, + "step": 3510, + "time_per_iteration": 2.6374900341033936 + }, + { + "auxiliary_loss_clip": 0.01160416, + "auxiliary_loss_mlp": 0.01042159, + "balance_loss_clip": 1.05642414, + "balance_loss_mlp": 1.02428973, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.7084692794677092, + "language_loss": 0.72042137, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74244714, + "num_input_tokens_seen": 75865985, + "step": 3511, + "time_per_iteration": 2.523637294769287 + }, + { + "auxiliary_loss_clip": 0.01077802, + "auxiliary_loss_mlp": 0.01040158, + "balance_loss_clip": 1.05024457, + "balance_loss_mlp": 1.02024961, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 2.7819314385578813, + "language_loss": 0.69025558, + "learning_rate": 3.665517685689794e-06, + "loss": 0.7114352, + "num_input_tokens_seen": 75882745, + "step": 3512, + "time_per_iteration": 4.30631422996521 + }, + { + "auxiliary_loss_clip": 0.01146674, + "auxiliary_loss_mlp": 0.01054925, + "balance_loss_clip": 1.05489266, + "balance_loss_mlp": 1.03530359, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 2.8357167925093605, + "language_loss": 0.732355, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.75437099, + "num_input_tokens_seen": 75904305, + "step": 3513, + "time_per_iteration": 2.798779249191284 + }, + { + "auxiliary_loss_clip": 0.01129691, + "auxiliary_loss_mlp": 0.01041605, + "balance_loss_clip": 1.05310392, + "balance_loss_mlp": 1.02435565, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 2.0776537923419163, + "language_loss": 0.74270368, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76441669, + "num_input_tokens_seen": 75923710, + "step": 3514, + "time_per_iteration": 2.5463063716888428 + }, + { + "auxiliary_loss_clip": 0.01143842, + "auxiliary_loss_mlp": 0.01043984, + "balance_loss_clip": 1.05922461, + "balance_loss_mlp": 1.02535164, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 7.5722288673514955, + "language_loss": 0.76978391, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.79166222, + "num_input_tokens_seen": 75942625, + "step": 3515, + "time_per_iteration": 2.5143253803253174 + }, + { + "auxiliary_loss_clip": 0.01132442, + "auxiliary_loss_mlp": 0.01044574, + "balance_loss_clip": 1.05639565, + "balance_loss_mlp": 1.0268712, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 1.8592659393221829, + "language_loss": 0.68408138, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70585155, + "num_input_tokens_seen": 75959930, + "step": 3516, + "time_per_iteration": 2.5005316734313965 + }, + { + "auxiliary_loss_clip": 0.01117497, + "auxiliary_loss_mlp": 0.0105, + "balance_loss_clip": 1.05492926, + "balance_loss_mlp": 1.03124785, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 4.467563266164674, + "language_loss": 0.85137486, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87304986, + "num_input_tokens_seen": 75980335, + "step": 3517, + "time_per_iteration": 2.589543342590332 + }, + { + "auxiliary_loss_clip": 0.01133924, + "auxiliary_loss_mlp": 0.01039035, + "balance_loss_clip": 1.05380917, + "balance_loss_mlp": 1.02124918, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 2.0167132328006403, + "language_loss": 0.62003744, + "learning_rate": 3.664222829354512e-06, + "loss": 0.64176702, + "num_input_tokens_seen": 76002095, + "step": 3518, + "time_per_iteration": 2.6334149837493896 + }, + { + "auxiliary_loss_clip": 0.01091339, + "auxiliary_loss_mlp": 0.01050756, + "balance_loss_clip": 1.05141044, + "balance_loss_mlp": 1.0340426, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 2.0243788003192003, + "language_loss": 0.89419186, + "learning_rate": 3.664006799041303e-06, + "loss": 0.91561282, + "num_input_tokens_seen": 76020425, + "step": 3519, + "time_per_iteration": 2.638925075531006 + }, + { + "auxiliary_loss_clip": 0.0113902, + "auxiliary_loss_mlp": 0.01050153, + "balance_loss_clip": 1.05481505, + "balance_loss_mlp": 1.03171182, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 3.5541303298562794, + "language_loss": 0.81599301, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.83788478, + "num_input_tokens_seen": 76041210, + "step": 3520, + "time_per_iteration": 2.5569567680358887 + }, + { + "auxiliary_loss_clip": 0.01129223, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_clip": 1.05409014, + "balance_loss_mlp": 1.03310704, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.5995704768044499, + "language_loss": 0.75792247, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.77972233, + "num_input_tokens_seen": 76062685, + "step": 3521, + "time_per_iteration": 2.5868277549743652 + }, + { + "auxiliary_loss_clip": 0.01100388, + "auxiliary_loss_mlp": 0.01046536, + "balance_loss_clip": 1.0497539, + "balance_loss_mlp": 1.03037143, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 2.0728004615476565, + "language_loss": 0.76075971, + "learning_rate": 3.663358329538626e-06, + "loss": 0.78222889, + "num_input_tokens_seen": 76082300, + "step": 3522, + "time_per_iteration": 2.656881093978882 + }, + { + "auxiliary_loss_clip": 0.01158076, + "auxiliary_loss_mlp": 0.0105432, + "balance_loss_clip": 1.05748427, + "balance_loss_mlp": 1.03606868, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 1.943083071097993, + "language_loss": 0.70177567, + "learning_rate": 3.663142046877374e-06, + "loss": 0.7238996, + "num_input_tokens_seen": 76101135, + "step": 3523, + "time_per_iteration": 2.542175769805908 + }, + { + "auxiliary_loss_clip": 0.0114788, + "auxiliary_loss_mlp": 0.0104798, + "balance_loss_clip": 1.05796373, + "balance_loss_mlp": 1.03040838, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.163697214153577, + "language_loss": 0.77233732, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.79429591, + "num_input_tokens_seen": 76119320, + "step": 3524, + "time_per_iteration": 2.504033088684082 + }, + { + "auxiliary_loss_clip": 0.01136369, + "auxiliary_loss_mlp": 0.01040497, + "balance_loss_clip": 1.05366492, + "balance_loss_mlp": 1.02285433, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 2.397761848639027, + "language_loss": 0.8130855, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83485419, + "num_input_tokens_seen": 76137445, + "step": 3525, + "time_per_iteration": 2.528733730316162 + }, + { + "auxiliary_loss_clip": 0.01101547, + "auxiliary_loss_mlp": 0.01043003, + "balance_loss_clip": 1.04964924, + "balance_loss_mlp": 1.02608705, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 1.7520069756421617, + "language_loss": 0.75771624, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77916181, + "num_input_tokens_seen": 76159500, + "step": 3526, + "time_per_iteration": 2.713932752609253 + }, + { + "auxiliary_loss_clip": 0.0116257, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.06011093, + "balance_loss_mlp": 1.02715361, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 2.3681179998883293, + "language_loss": 0.77030551, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79238057, + "num_input_tokens_seen": 76177990, + "step": 3527, + "time_per_iteration": 2.4922244548797607 + }, + { + "auxiliary_loss_clip": 0.0115781, + "auxiliary_loss_mlp": 0.01051073, + "balance_loss_clip": 1.05775249, + "balance_loss_mlp": 1.0326196, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 1.7214800797934222, + "language_loss": 0.78150499, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80359381, + "num_input_tokens_seen": 76197125, + "step": 3528, + "time_per_iteration": 2.478132486343384 + }, + { + "auxiliary_loss_clip": 0.01147892, + "auxiliary_loss_mlp": 0.01049258, + "balance_loss_clip": 1.05912614, + "balance_loss_mlp": 1.03240168, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 1.7425232956252665, + "language_loss": 0.81386852, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83584005, + "num_input_tokens_seen": 76216215, + "step": 3529, + "time_per_iteration": 2.4997971057891846 + }, + { + "auxiliary_loss_clip": 0.01141343, + "auxiliary_loss_mlp": 0.00784916, + "balance_loss_clip": 1.0571382, + "balance_loss_mlp": 1.00082386, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 1.8145687819067813, + "language_loss": 0.76433253, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78359509, + "num_input_tokens_seen": 76237010, + "step": 3530, + "time_per_iteration": 2.5591518878936768 + }, + { + "auxiliary_loss_clip": 0.01161101, + "auxiliary_loss_mlp": 0.01044236, + "balance_loss_clip": 1.06073582, + "balance_loss_mlp": 1.0273087, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.0017019015070687, + "language_loss": 0.82646579, + "learning_rate": 3.661409515882308e-06, + "loss": 0.84851921, + "num_input_tokens_seen": 76255965, + "step": 3531, + "time_per_iteration": 2.4575355052948 + }, + { + "auxiliary_loss_clip": 0.01130411, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_clip": 1.056252, + "balance_loss_mlp": 1.02323985, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.3347225538756513, + "language_loss": 0.73741347, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75914431, + "num_input_tokens_seen": 76272150, + "step": 3532, + "time_per_iteration": 2.4994425773620605 + }, + { + "auxiliary_loss_clip": 0.01135869, + "auxiliary_loss_mlp": 0.01046578, + "balance_loss_clip": 1.06086731, + "balance_loss_mlp": 1.02814817, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 3.963179960664185, + "language_loss": 0.73844248, + "learning_rate": 3.660975752961054e-06, + "loss": 0.7602669, + "num_input_tokens_seen": 76291425, + "step": 3533, + "time_per_iteration": 4.076772451400757 + }, + { + "auxiliary_loss_clip": 0.01152617, + "auxiliary_loss_mlp": 0.01039507, + "balance_loss_clip": 1.05871272, + "balance_loss_mlp": 1.02200747, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 1.9778452947616418, + "language_loss": 0.71181405, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73373532, + "num_input_tokens_seen": 76313975, + "step": 3534, + "time_per_iteration": 2.6235015392303467 + }, + { + "auxiliary_loss_clip": 0.01141713, + "auxiliary_loss_mlp": 0.01040326, + "balance_loss_clip": 1.05945468, + "balance_loss_mlp": 1.02201581, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 1.9516346897938828, + "language_loss": 0.71546042, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.73728079, + "num_input_tokens_seen": 76330955, + "step": 3535, + "time_per_iteration": 4.21394419670105 + }, + { + "auxiliary_loss_clip": 0.01147404, + "auxiliary_loss_mlp": 0.01053079, + "balance_loss_clip": 1.06020546, + "balance_loss_mlp": 1.03544831, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 3.836308474952151, + "language_loss": 0.70323491, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72523975, + "num_input_tokens_seen": 76352680, + "step": 3536, + "time_per_iteration": 2.5841000080108643 + }, + { + "auxiliary_loss_clip": 0.01163946, + "auxiliary_loss_mlp": 0.01046873, + "balance_loss_clip": 1.06067371, + "balance_loss_mlp": 1.02922964, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 2.212171564560371, + "language_loss": 0.87925565, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90136385, + "num_input_tokens_seen": 76370750, + "step": 3537, + "time_per_iteration": 2.468818187713623 + }, + { + "auxiliary_loss_clip": 0.01148139, + "auxiliary_loss_mlp": 0.00783895, + "balance_loss_clip": 1.05910063, + "balance_loss_mlp": 1.00077176, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.6261297176561065, + "language_loss": 0.8075313, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82685161, + "num_input_tokens_seen": 76390610, + "step": 3538, + "time_per_iteration": 2.512143611907959 + }, + { + "auxiliary_loss_clip": 0.01094509, + "auxiliary_loss_mlp": 0.01053274, + "balance_loss_clip": 1.05066454, + "balance_loss_mlp": 1.03427243, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 1.5220433125222739, + "language_loss": 0.87384808, + "learning_rate": 3.659672952835863e-06, + "loss": 0.8953259, + "num_input_tokens_seen": 76408860, + "step": 3539, + "time_per_iteration": 2.6623191833496094 + }, + { + "auxiliary_loss_clip": 0.01133775, + "auxiliary_loss_mlp": 0.01051018, + "balance_loss_clip": 1.05619872, + "balance_loss_mlp": 1.03335142, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 7.021315209785069, + "language_loss": 0.58033335, + "learning_rate": 3.659455599161237e-06, + "loss": 0.60218132, + "num_input_tokens_seen": 76424980, + "step": 3540, + "time_per_iteration": 2.527519702911377 + }, + { + "auxiliary_loss_clip": 0.01163382, + "auxiliary_loss_mlp": 0.01039541, + "balance_loss_clip": 1.06103325, + "balance_loss_mlp": 1.02207708, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 2.2491851200472674, + "language_loss": 0.75523293, + "learning_rate": 3.659238182559888e-06, + "loss": 0.77726215, + "num_input_tokens_seen": 76443135, + "step": 3541, + "time_per_iteration": 3.873764753341675 + }, + { + "auxiliary_loss_clip": 0.01120274, + "auxiliary_loss_mlp": 0.01047849, + "balance_loss_clip": 1.05743444, + "balance_loss_mlp": 1.03049207, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 1.863162060353586, + "language_loss": 0.69670808, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71838927, + "num_input_tokens_seen": 76462470, + "step": 3542, + "time_per_iteration": 2.603097677230835 + }, + { + "auxiliary_loss_clip": 0.01159994, + "auxiliary_loss_mlp": 0.01040529, + "balance_loss_clip": 1.06094885, + "balance_loss_mlp": 1.02400708, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 1.7891276470530693, + "language_loss": 0.75666201, + "learning_rate": 3.658803160610004e-06, + "loss": 0.77866721, + "num_input_tokens_seen": 76481995, + "step": 3543, + "time_per_iteration": 2.4815731048583984 + }, + { + "auxiliary_loss_clip": 0.01136421, + "auxiliary_loss_mlp": 0.01035305, + "balance_loss_clip": 1.05976117, + "balance_loss_mlp": 1.01854467, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 1.745413553559167, + "language_loss": 0.66680253, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68851978, + "num_input_tokens_seen": 76500245, + "step": 3544, + "time_per_iteration": 2.5143332481384277 + }, + { + "auxiliary_loss_clip": 0.01130799, + "auxiliary_loss_mlp": 0.01044951, + "balance_loss_clip": 1.05658078, + "balance_loss_mlp": 1.02776098, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.7410303969250904, + "language_loss": 0.70740592, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.72916341, + "num_input_tokens_seen": 76519535, + "step": 3545, + "time_per_iteration": 2.50761342048645 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.01051235, + "balance_loss_clip": 1.05977023, + "balance_loss_mlp": 1.0334487, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 2.106514718745417, + "language_loss": 0.71997023, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74184322, + "num_input_tokens_seen": 76542065, + "step": 3546, + "time_per_iteration": 2.5995771884918213 + }, + { + "auxiliary_loss_clip": 0.01121758, + "auxiliary_loss_mlp": 0.0103981, + "balance_loss_clip": 1.05949748, + "balance_loss_mlp": 1.02201223, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 3.095950556684566, + "language_loss": 0.80082095, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82243663, + "num_input_tokens_seen": 76560540, + "step": 3547, + "time_per_iteration": 2.6187984943389893 + }, + { + "auxiliary_loss_clip": 0.01164716, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.05969071, + "balance_loss_mlp": 1.02812243, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 20.006231438476764, + "language_loss": 0.7418403, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.76395249, + "num_input_tokens_seen": 76581760, + "step": 3548, + "time_per_iteration": 2.5312445163726807 + }, + { + "auxiliary_loss_clip": 0.01121125, + "auxiliary_loss_mlp": 0.01054759, + "balance_loss_clip": 1.05447459, + "balance_loss_mlp": 1.03590035, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 1.834045581024351, + "language_loss": 0.7447626, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76652145, + "num_input_tokens_seen": 76599940, + "step": 3549, + "time_per_iteration": 2.537552833557129 + }, + { + "auxiliary_loss_clip": 0.01129998, + "auxiliary_loss_mlp": 0.01047476, + "balance_loss_clip": 1.0603497, + "balance_loss_mlp": 1.02986884, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.9000256372673385, + "language_loss": 0.80858725, + "learning_rate": 3.657278602806357e-06, + "loss": 0.83036196, + "num_input_tokens_seen": 76619580, + "step": 3550, + "time_per_iteration": 2.586364507675171 + }, + { + "auxiliary_loss_clip": 0.01158282, + "auxiliary_loss_mlp": 0.0104774, + "balance_loss_clip": 1.06087101, + "balance_loss_mlp": 1.0307405, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.7201715016193913, + "language_loss": 0.88490129, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90696156, + "num_input_tokens_seen": 76638195, + "step": 3551, + "time_per_iteration": 4.010004281997681 + }, + { + "auxiliary_loss_clip": 0.01159288, + "auxiliary_loss_mlp": 0.01048014, + "balance_loss_clip": 1.05926979, + "balance_loss_mlp": 1.0302515, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 2.7804777099771996, + "language_loss": 0.83890295, + "learning_rate": 3.656842449140983e-06, + "loss": 0.86097598, + "num_input_tokens_seen": 76656695, + "step": 3552, + "time_per_iteration": 2.4560365676879883 + }, + { + "auxiliary_loss_clip": 0.01144111, + "auxiliary_loss_mlp": 0.01051568, + "balance_loss_clip": 1.05534863, + "balance_loss_mlp": 1.03394878, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.6862361868810394, + "language_loss": 0.76636451, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78832132, + "num_input_tokens_seen": 76677430, + "step": 3553, + "time_per_iteration": 2.5188536643981934 + }, + { + "auxiliary_loss_clip": 0.01147399, + "auxiliary_loss_mlp": 0.0104673, + "balance_loss_clip": 1.06067502, + "balance_loss_mlp": 1.03068459, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.9880061106798028, + "language_loss": 0.72409225, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.74603355, + "num_input_tokens_seen": 76697615, + "step": 3554, + "time_per_iteration": 2.54403018951416 + }, + { + "auxiliary_loss_clip": 0.01102754, + "auxiliary_loss_mlp": 0.00785001, + "balance_loss_clip": 1.05003881, + "balance_loss_mlp": 1.00093365, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 2.099646095268324, + "language_loss": 0.67602003, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69489753, + "num_input_tokens_seen": 76715685, + "step": 3555, + "time_per_iteration": 2.5882444381713867 + }, + { + "auxiliary_loss_clip": 0.01119418, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.05599654, + "balance_loss_mlp": 1.02185237, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 1.7430500575636962, + "language_loss": 0.65230304, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.67389858, + "num_input_tokens_seen": 76735405, + "step": 3556, + "time_per_iteration": 2.6104865074157715 + }, + { + "auxiliary_loss_clip": 0.01147497, + "auxiliary_loss_mlp": 0.01046946, + "balance_loss_clip": 1.05836785, + "balance_loss_mlp": 1.0286113, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.868911925957357, + "language_loss": 0.72022223, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.7421667, + "num_input_tokens_seen": 76754395, + "step": 3557, + "time_per_iteration": 2.596668004989624 + }, + { + "auxiliary_loss_clip": 0.01150237, + "auxiliary_loss_mlp": 0.00785841, + "balance_loss_clip": 1.06197882, + "balance_loss_mlp": 1.00090837, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.6874997086397712, + "language_loss": 0.67107332, + "learning_rate": 3.655532480546528e-06, + "loss": 0.6904341, + "num_input_tokens_seen": 76777210, + "step": 3558, + "time_per_iteration": 2.632140874862671 + }, + { + "auxiliary_loss_clip": 0.01163638, + "auxiliary_loss_mlp": 0.0104441, + "balance_loss_clip": 1.05799186, + "balance_loss_mlp": 1.02568197, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 2.0771183832691746, + "language_loss": 0.79840851, + "learning_rate": 3.655313932676286e-06, + "loss": 0.82048899, + "num_input_tokens_seen": 76795830, + "step": 3559, + "time_per_iteration": 2.4835290908813477 + }, + { + "auxiliary_loss_clip": 0.01156594, + "auxiliary_loss_mlp": 0.01047108, + "balance_loss_clip": 1.05680978, + "balance_loss_mlp": 1.03039467, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.6358470385069923, + "language_loss": 0.67910874, + "learning_rate": 3.655095322036373e-06, + "loss": 0.70114577, + "num_input_tokens_seen": 76814700, + "step": 3560, + "time_per_iteration": 2.4890341758728027 + }, + { + "auxiliary_loss_clip": 0.01151944, + "auxiliary_loss_mlp": 0.01046972, + "balance_loss_clip": 1.06027222, + "balance_loss_mlp": 1.02886367, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 4.97764198105894, + "language_loss": 0.72875351, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75074267, + "num_input_tokens_seen": 76833400, + "step": 3561, + "time_per_iteration": 2.489516258239746 + }, + { + "auxiliary_loss_clip": 0.01137912, + "auxiliary_loss_mlp": 0.01049082, + "balance_loss_clip": 1.05568647, + "balance_loss_mlp": 1.03096175, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 3.3401240913670933, + "language_loss": 0.77648461, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79835451, + "num_input_tokens_seen": 76850645, + "step": 3562, + "time_per_iteration": 2.5104241371154785 + }, + { + "auxiliary_loss_clip": 0.01159537, + "auxiliary_loss_mlp": 0.01042275, + "balance_loss_clip": 1.05971837, + "balance_loss_mlp": 1.02451324, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.72709274244019, + "language_loss": 0.84625041, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.86826849, + "num_input_tokens_seen": 76870135, + "step": 3563, + "time_per_iteration": 2.4562087059020996 + }, + { + "auxiliary_loss_clip": 0.01162028, + "auxiliary_loss_mlp": 0.01039799, + "balance_loss_clip": 1.06208634, + "balance_loss_mlp": 1.02259684, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.4485073807871507, + "language_loss": 0.76896179, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.7909801, + "num_input_tokens_seen": 76893905, + "step": 3564, + "time_per_iteration": 2.5902528762817383 + }, + { + "auxiliary_loss_clip": 0.01133166, + "auxiliary_loss_mlp": 0.01046694, + "balance_loss_clip": 1.05950022, + "balance_loss_mlp": 1.02906346, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.8472687685401892, + "language_loss": 0.88663059, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90842927, + "num_input_tokens_seen": 76914205, + "step": 3565, + "time_per_iteration": 2.6060166358947754 + }, + { + "auxiliary_loss_clip": 0.0106083, + "auxiliary_loss_mlp": 0.01008609, + "balance_loss_clip": 1.0474267, + "balance_loss_mlp": 1.00647497, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8297066816818143, + "language_loss": 0.52314532, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54383969, + "num_input_tokens_seen": 76975650, + "step": 3566, + "time_per_iteration": 3.0580990314483643 + }, + { + "auxiliary_loss_clip": 0.0114641, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.05961418, + "balance_loss_mlp": 1.02468991, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 1.9522719702185969, + "language_loss": 0.6741364, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69601083, + "num_input_tokens_seen": 76992615, + "step": 3567, + "time_per_iteration": 2.5086374282836914 + }, + { + "auxiliary_loss_clip": 0.01127746, + "auxiliary_loss_mlp": 0.01046453, + "balance_loss_clip": 1.05372107, + "balance_loss_mlp": 1.02879858, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.6794778579791938, + "language_loss": 0.74272275, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.76446468, + "num_input_tokens_seen": 77017005, + "step": 3568, + "time_per_iteration": 2.628667116165161 + }, + { + "auxiliary_loss_clip": 0.01144869, + "auxiliary_loss_mlp": 0.01048851, + "balance_loss_clip": 1.05882645, + "balance_loss_mlp": 1.03102911, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 1.943990891365584, + "language_loss": 0.77575064, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.79768783, + "num_input_tokens_seen": 77034990, + "step": 3569, + "time_per_iteration": 2.505002021789551 + }, + { + "auxiliary_loss_clip": 0.01154594, + "auxiliary_loss_mlp": 0.01042042, + "balance_loss_clip": 1.05845118, + "balance_loss_mlp": 1.02244437, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.1962179831596815, + "language_loss": 0.69863433, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.72060072, + "num_input_tokens_seen": 77052610, + "step": 3570, + "time_per_iteration": 2.46317458152771 + }, + { + "auxiliary_loss_clip": 0.01165455, + "auxiliary_loss_mlp": 0.01045321, + "balance_loss_clip": 1.06275487, + "balance_loss_mlp": 1.02743912, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.470108925678594, + "language_loss": 0.7843883, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80649608, + "num_input_tokens_seen": 77072475, + "step": 3571, + "time_per_iteration": 2.4534640312194824 + }, + { + "auxiliary_loss_clip": 0.01143384, + "auxiliary_loss_mlp": 0.01049134, + "balance_loss_clip": 1.0574131, + "balance_loss_mlp": 1.02784324, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 2.4554107477849834, + "language_loss": 0.83016294, + "learning_rate": 3.652467101342991e-06, + "loss": 0.85208815, + "num_input_tokens_seen": 77089930, + "step": 3572, + "time_per_iteration": 2.4549431800842285 + }, + { + "auxiliary_loss_clip": 0.01136365, + "auxiliary_loss_mlp": 0.01037052, + "balance_loss_clip": 1.05674124, + "balance_loss_mlp": 1.0186584, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.2932182185355647, + "language_loss": 0.65002877, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67176294, + "num_input_tokens_seen": 77108970, + "step": 3573, + "time_per_iteration": 4.036039590835571 + }, + { + "auxiliary_loss_clip": 0.01154595, + "auxiliary_loss_mlp": 0.01042272, + "balance_loss_clip": 1.05817938, + "balance_loss_mlp": 1.02489161, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 1.9216309723238911, + "language_loss": 0.75145042, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77341914, + "num_input_tokens_seen": 77126045, + "step": 3574, + "time_per_iteration": 2.4797732830047607 + }, + { + "auxiliary_loss_clip": 0.01142139, + "auxiliary_loss_mlp": 0.01038481, + "balance_loss_clip": 1.05528355, + "balance_loss_mlp": 1.01984882, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.79157214112844, + "language_loss": 0.71711475, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.73892093, + "num_input_tokens_seen": 77144600, + "step": 3575, + "time_per_iteration": 3.9874515533447266 + }, + { + "auxiliary_loss_clip": 0.01134961, + "auxiliary_loss_mlp": 0.01040396, + "balance_loss_clip": 1.05776405, + "balance_loss_mlp": 1.02312291, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 2.0022929896024375, + "language_loss": 0.68368095, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70543456, + "num_input_tokens_seen": 77162965, + "step": 3576, + "time_per_iteration": 2.5161309242248535 + }, + { + "auxiliary_loss_clip": 0.01145095, + "auxiliary_loss_mlp": 0.01045619, + "balance_loss_clip": 1.05621719, + "balance_loss_mlp": 1.02500772, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.0869360001751383, + "language_loss": 0.88624847, + "learning_rate": 3.651369345440292e-06, + "loss": 0.90815556, + "num_input_tokens_seen": 77179960, + "step": 3577, + "time_per_iteration": 2.46203351020813 + }, + { + "auxiliary_loss_clip": 0.01056718, + "auxiliary_loss_mlp": 0.01016592, + "balance_loss_clip": 1.03688717, + "balance_loss_mlp": 1.01450598, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8037194273921029, + "language_loss": 0.56195992, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58269304, + "num_input_tokens_seen": 77239500, + "step": 3578, + "time_per_iteration": 3.0381555557250977 + }, + { + "auxiliary_loss_clip": 0.01146165, + "auxiliary_loss_mlp": 0.00784897, + "balance_loss_clip": 1.05544901, + "balance_loss_mlp": 1.00096989, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 2.595794938450797, + "language_loss": 0.8890059, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90831655, + "num_input_tokens_seen": 77254680, + "step": 3579, + "time_per_iteration": 2.493972063064575 + }, + { + "auxiliary_loss_clip": 0.01145002, + "auxiliary_loss_mlp": 0.0104573, + "balance_loss_clip": 1.05281234, + "balance_loss_mlp": 1.02812266, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.883275772545907, + "language_loss": 0.78009683, + "learning_rate": 3.650709940390972e-06, + "loss": 0.8020041, + "num_input_tokens_seen": 77274060, + "step": 3580, + "time_per_iteration": 2.5123908519744873 + }, + { + "auxiliary_loss_clip": 0.01149675, + "auxiliary_loss_mlp": 0.01046065, + "balance_loss_clip": 1.0598948, + "balance_loss_mlp": 1.02835011, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.871498390769953, + "language_loss": 0.73286736, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.75482482, + "num_input_tokens_seen": 77293255, + "step": 3581, + "time_per_iteration": 3.9196512699127197 + }, + { + "auxiliary_loss_clip": 0.01141663, + "auxiliary_loss_mlp": 0.01046056, + "balance_loss_clip": 1.05575776, + "balance_loss_mlp": 1.02548075, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.0832123672631404, + "language_loss": 0.71236134, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.73423851, + "num_input_tokens_seen": 77312390, + "step": 3582, + "time_per_iteration": 2.4760892391204834 + }, + { + "auxiliary_loss_clip": 0.01155761, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.05660999, + "balance_loss_mlp": 1.02563441, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.2801505982479786, + "language_loss": 0.84318286, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86518091, + "num_input_tokens_seen": 77330985, + "step": 3583, + "time_per_iteration": 2.4447476863861084 + }, + { + "auxiliary_loss_clip": 0.01136785, + "auxiliary_loss_mlp": 0.01045488, + "balance_loss_clip": 1.05525684, + "balance_loss_mlp": 1.02765393, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 2.4722441157011943, + "language_loss": 0.82857478, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.85039759, + "num_input_tokens_seen": 77350770, + "step": 3584, + "time_per_iteration": 2.5740578174591064 + }, + { + "auxiliary_loss_clip": 0.01123494, + "auxiliary_loss_mlp": 0.0078426, + "balance_loss_clip": 1.05332398, + "balance_loss_mlp": 1.00106907, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 2.321087788019159, + "language_loss": 0.90277845, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.92185605, + "num_input_tokens_seen": 77370510, + "step": 3585, + "time_per_iteration": 2.5646040439605713 + }, + { + "auxiliary_loss_clip": 0.01144811, + "auxiliary_loss_mlp": 0.01045975, + "balance_loss_clip": 1.05722296, + "balance_loss_mlp": 1.02822459, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 1.7147841367661611, + "language_loss": 0.7459904, + "learning_rate": 3.649389440450277e-06, + "loss": 0.7678982, + "num_input_tokens_seen": 77390645, + "step": 3586, + "time_per_iteration": 2.5048670768737793 + }, + { + "auxiliary_loss_clip": 0.01122953, + "auxiliary_loss_mlp": 0.01048401, + "balance_loss_clip": 1.0566833, + "balance_loss_mlp": 1.0312109, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 1.762073915685756, + "language_loss": 0.83431733, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85603088, + "num_input_tokens_seen": 77409655, + "step": 3587, + "time_per_iteration": 2.599729299545288 + }, + { + "auxiliary_loss_clip": 0.01108658, + "auxiliary_loss_mlp": 0.00785754, + "balance_loss_clip": 1.04922676, + "balance_loss_mlp": 1.00089288, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 2.5160789709925937, + "language_loss": 0.75579906, + "learning_rate": 3.648948773354224e-06, + "loss": 0.7747432, + "num_input_tokens_seen": 77430560, + "step": 3588, + "time_per_iteration": 2.6863601207733154 + }, + { + "auxiliary_loss_clip": 0.0114227, + "auxiliary_loss_mlp": 0.01046056, + "balance_loss_clip": 1.05297375, + "balance_loss_mlp": 1.02767396, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.7123712990732842, + "language_loss": 0.80930424, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83118749, + "num_input_tokens_seen": 77455000, + "step": 3589, + "time_per_iteration": 2.5995421409606934 + }, + { + "auxiliary_loss_clip": 0.01159291, + "auxiliary_loss_mlp": 0.01041643, + "balance_loss_clip": 1.05888224, + "balance_loss_mlp": 1.02451229, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 3.097226895985718, + "language_loss": 0.72441375, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74642313, + "num_input_tokens_seen": 77475075, + "step": 3590, + "time_per_iteration": 4.04399037361145 + }, + { + "auxiliary_loss_clip": 0.01140377, + "auxiliary_loss_mlp": 0.0104487, + "balance_loss_clip": 1.05576193, + "balance_loss_mlp": 1.02514076, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 2.285206529643844, + "language_loss": 0.83967406, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86152649, + "num_input_tokens_seen": 77495945, + "step": 3591, + "time_per_iteration": 2.5959482192993164 + }, + { + "auxiliary_loss_clip": 0.0112792, + "auxiliary_loss_mlp": 0.01047227, + "balance_loss_clip": 1.05665827, + "balance_loss_mlp": 1.02680683, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 1.7316642611512054, + "language_loss": 0.68826669, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71001816, + "num_input_tokens_seen": 77517140, + "step": 3592, + "time_per_iteration": 2.6411831378936768 + }, + { + "auxiliary_loss_clip": 0.0111969, + "auxiliary_loss_mlp": 0.01056772, + "balance_loss_clip": 1.05021632, + "balance_loss_mlp": 1.0375793, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.7024235186033234, + "language_loss": 0.84776831, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86953288, + "num_input_tokens_seen": 77536085, + "step": 3593, + "time_per_iteration": 2.585843086242676 + }, + { + "auxiliary_loss_clip": 0.01123642, + "auxiliary_loss_mlp": 0.01048646, + "balance_loss_clip": 1.05183482, + "balance_loss_mlp": 1.02941799, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 3.1991123440100404, + "language_loss": 0.75227189, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.77399474, + "num_input_tokens_seen": 77553675, + "step": 3594, + "time_per_iteration": 2.5658621788024902 + }, + { + "auxiliary_loss_clip": 0.01145845, + "auxiliary_loss_mlp": 0.01047643, + "balance_loss_clip": 1.0587399, + "balance_loss_mlp": 1.02870083, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 1.6272771237743773, + "language_loss": 0.80613971, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82807457, + "num_input_tokens_seen": 77573360, + "step": 3595, + "time_per_iteration": 2.5391924381256104 + }, + { + "auxiliary_loss_clip": 0.01123698, + "auxiliary_loss_mlp": 0.01044776, + "balance_loss_clip": 1.05517972, + "balance_loss_mlp": 1.02582192, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 1.9329190246751202, + "language_loss": 0.79198176, + "learning_rate": 3.647183604506897e-06, + "loss": 0.81366646, + "num_input_tokens_seen": 77591865, + "step": 3596, + "time_per_iteration": 2.5351483821868896 + }, + { + "auxiliary_loss_clip": 0.01086741, + "auxiliary_loss_mlp": 0.010484, + "balance_loss_clip": 1.05224562, + "balance_loss_mlp": 1.03024459, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.6396990742354112, + "language_loss": 0.83080006, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85215151, + "num_input_tokens_seen": 77611600, + "step": 3597, + "time_per_iteration": 2.66264009475708 + }, + { + "auxiliary_loss_clip": 0.01135499, + "auxiliary_loss_mlp": 0.00785922, + "balance_loss_clip": 1.05536556, + "balance_loss_mlp": 1.00111032, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.8318549905301897, + "language_loss": 0.80680633, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.8260206, + "num_input_tokens_seen": 77630665, + "step": 3598, + "time_per_iteration": 2.5285723209381104 + }, + { + "auxiliary_loss_clip": 0.01129206, + "auxiliary_loss_mlp": 0.01053217, + "balance_loss_clip": 1.05429769, + "balance_loss_mlp": 1.03320158, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 1.7847683808071444, + "language_loss": 0.81857651, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.84040082, + "num_input_tokens_seen": 77650835, + "step": 3599, + "time_per_iteration": 2.6114344596862793 + }, + { + "auxiliary_loss_clip": 0.01111545, + "auxiliary_loss_mlp": 0.00786126, + "balance_loss_clip": 1.04863787, + "balance_loss_mlp": 1.00121152, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 2.013966890566815, + "language_loss": 0.7655074, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78448409, + "num_input_tokens_seen": 77669000, + "step": 3600, + "time_per_iteration": 2.5551626682281494 + }, + { + "auxiliary_loss_clip": 0.01111614, + "auxiliary_loss_mlp": 0.01047126, + "balance_loss_clip": 1.0484482, + "balance_loss_mlp": 1.02999592, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 2.087392563201535, + "language_loss": 0.80518115, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82676858, + "num_input_tokens_seen": 77688745, + "step": 3601, + "time_per_iteration": 2.618189811706543 + }, + { + "auxiliary_loss_clip": 0.0116202, + "auxiliary_loss_mlp": 0.01047907, + "balance_loss_clip": 1.06157899, + "balance_loss_mlp": 1.02978683, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 1.7023690081217502, + "language_loss": 0.83350348, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85560286, + "num_input_tokens_seen": 77708445, + "step": 3602, + "time_per_iteration": 2.494861364364624 + }, + { + "auxiliary_loss_clip": 0.01157798, + "auxiliary_loss_mlp": 0.01050749, + "balance_loss_clip": 1.05852818, + "balance_loss_mlp": 1.03200901, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 5.39269703626781, + "language_loss": 0.74866289, + "learning_rate": 3.645635802397693e-06, + "loss": 0.77074838, + "num_input_tokens_seen": 77728465, + "step": 3603, + "time_per_iteration": 2.5390119552612305 + }, + { + "auxiliary_loss_clip": 0.01121897, + "auxiliary_loss_mlp": 0.01049751, + "balance_loss_clip": 1.05215693, + "balance_loss_mlp": 1.03046286, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 2.0912905828217694, + "language_loss": 0.73728323, + "learning_rate": 3.645414438132855e-06, + "loss": 0.75899971, + "num_input_tokens_seen": 77746735, + "step": 3604, + "time_per_iteration": 2.5903170108795166 + }, + { + "auxiliary_loss_clip": 0.01141972, + "auxiliary_loss_mlp": 0.01040746, + "balance_loss_clip": 1.05718589, + "balance_loss_mlp": 1.02319837, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 4.02185451206015, + "language_loss": 0.79724169, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.81906891, + "num_input_tokens_seen": 77768105, + "step": 3605, + "time_per_iteration": 2.5345757007598877 + }, + { + "auxiliary_loss_clip": 0.01078071, + "auxiliary_loss_mlp": 0.01001895, + "balance_loss_clip": 1.04145503, + "balance_loss_mlp": 0.99985617, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.707840585659466, + "language_loss": 0.58362329, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60442293, + "num_input_tokens_seen": 77833750, + "step": 3606, + "time_per_iteration": 3.12777042388916 + }, + { + "auxiliary_loss_clip": 0.01160064, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.05869031, + "balance_loss_mlp": 1.02366674, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.0448118925211314, + "language_loss": 0.72880542, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75083244, + "num_input_tokens_seen": 77853780, + "step": 3607, + "time_per_iteration": 2.5112345218658447 + }, + { + "auxiliary_loss_clip": 0.01138938, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.05717397, + "balance_loss_mlp": 1.02955508, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 1.8668736770095087, + "language_loss": 0.76539379, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78726959, + "num_input_tokens_seen": 77872575, + "step": 3608, + "time_per_iteration": 2.510521650314331 + }, + { + "auxiliary_loss_clip": 0.01083486, + "auxiliary_loss_mlp": 0.01050313, + "balance_loss_clip": 1.04759479, + "balance_loss_mlp": 1.0323838, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.794154156463691, + "language_loss": 0.74266189, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76399988, + "num_input_tokens_seen": 77892700, + "step": 3609, + "time_per_iteration": 2.668292284011841 + }, + { + "auxiliary_loss_clip": 0.01136707, + "auxiliary_loss_mlp": 0.01053471, + "balance_loss_clip": 1.05527818, + "balance_loss_mlp": 1.03529143, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1.8115366659754282, + "language_loss": 0.88782454, + "learning_rate": 3.6440849425579e-06, + "loss": 0.90972632, + "num_input_tokens_seen": 77911060, + "step": 3610, + "time_per_iteration": 2.5133697986602783 + }, + { + "auxiliary_loss_clip": 0.01159629, + "auxiliary_loss_mlp": 0.01042086, + "balance_loss_clip": 1.06050968, + "balance_loss_mlp": 1.0240972, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 2.051810344962679, + "language_loss": 0.77287197, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79488909, + "num_input_tokens_seen": 77929930, + "step": 3611, + "time_per_iteration": 2.480933904647827 + }, + { + "auxiliary_loss_clip": 0.01092602, + "auxiliary_loss_mlp": 0.01047497, + "balance_loss_clip": 1.04919744, + "balance_loss_mlp": 1.02919888, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 2.008170123566162, + "language_loss": 0.63482749, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65622842, + "num_input_tokens_seen": 77949060, + "step": 3612, + "time_per_iteration": 4.0875349044799805 + }, + { + "auxiliary_loss_clip": 0.01095042, + "auxiliary_loss_mlp": 0.01053989, + "balance_loss_clip": 1.04612172, + "balance_loss_mlp": 1.03327096, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.985645352173331, + "language_loss": 0.75636375, + "learning_rate": 3.643419353014776e-06, + "loss": 0.77785408, + "num_input_tokens_seen": 77967920, + "step": 3613, + "time_per_iteration": 2.5660860538482666 + }, + { + "auxiliary_loss_clip": 0.01104922, + "auxiliary_loss_mlp": 0.01053894, + "balance_loss_clip": 1.04975605, + "balance_loss_mlp": 1.03350902, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 2.051959093247748, + "language_loss": 0.70995843, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73154652, + "num_input_tokens_seen": 77985330, + "step": 3614, + "time_per_iteration": 4.018165349960327 + }, + { + "auxiliary_loss_clip": 0.01145196, + "auxiliary_loss_mlp": 0.01046988, + "balance_loss_clip": 1.05786586, + "balance_loss_mlp": 1.02877259, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 3.5018479072423894, + "language_loss": 0.73181391, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75373572, + "num_input_tokens_seen": 78003105, + "step": 3615, + "time_per_iteration": 2.489408493041992 + }, + { + "auxiliary_loss_clip": 0.01145859, + "auxiliary_loss_mlp": 0.01044841, + "balance_loss_clip": 1.05319369, + "balance_loss_mlp": 1.02482522, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.359489860457569, + "language_loss": 0.9021728, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92407978, + "num_input_tokens_seen": 78019655, + "step": 3616, + "time_per_iteration": 2.4935495853424072 + }, + { + "auxiliary_loss_clip": 0.01103268, + "auxiliary_loss_mlp": 0.01043344, + "balance_loss_clip": 1.04674625, + "balance_loss_mlp": 1.02458072, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 1.9850776118516738, + "language_loss": 0.81404287, + "learning_rate": 3.642531027869148e-06, + "loss": 0.835509, + "num_input_tokens_seen": 78036025, + "step": 3617, + "time_per_iteration": 2.5525052547454834 + }, + { + "auxiliary_loss_clip": 0.01131537, + "auxiliary_loss_mlp": 0.01045323, + "balance_loss_clip": 1.05157173, + "balance_loss_mlp": 1.02766776, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 2.0800945474200416, + "language_loss": 0.75670469, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77847326, + "num_input_tokens_seen": 78055645, + "step": 3618, + "time_per_iteration": 2.5469071865081787 + }, + { + "auxiliary_loss_clip": 0.01147775, + "auxiliary_loss_mlp": 0.01048957, + "balance_loss_clip": 1.0557065, + "balance_loss_mlp": 1.02982426, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 2.097097060018017, + "language_loss": 0.69491911, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71688646, + "num_input_tokens_seen": 78071660, + "step": 3619, + "time_per_iteration": 2.4685003757476807 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.01044368, + "balance_loss_clip": 1.05916536, + "balance_loss_mlp": 1.02502084, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.9387200174753505, + "language_loss": 0.78185344, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80378115, + "num_input_tokens_seen": 78091265, + "step": 3620, + "time_per_iteration": 3.9967174530029297 + }, + { + "auxiliary_loss_clip": 0.01151188, + "auxiliary_loss_mlp": 0.01044668, + "balance_loss_clip": 1.05490685, + "balance_loss_mlp": 1.02710795, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 1.6093282090313263, + "language_loss": 0.80017233, + "learning_rate": 3.641641706164509e-06, + "loss": 0.82213086, + "num_input_tokens_seen": 78110095, + "step": 3621, + "time_per_iteration": 2.466987371444702 + }, + { + "auxiliary_loss_clip": 0.01144453, + "auxiliary_loss_mlp": 0.01042614, + "balance_loss_clip": 1.05563223, + "balance_loss_mlp": 1.0251615, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.6016265721122012, + "language_loss": 0.87506866, + "learning_rate": 3.641419220089221e-06, + "loss": 0.89693928, + "num_input_tokens_seen": 78129475, + "step": 3622, + "time_per_iteration": 2.5363240242004395 + }, + { + "auxiliary_loss_clip": 0.01148738, + "auxiliary_loss_mlp": 0.01043848, + "balance_loss_clip": 1.05698776, + "balance_loss_mlp": 1.02286696, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 2.1904164178972474, + "language_loss": 0.76856732, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79049313, + "num_input_tokens_seen": 78146880, + "step": 3623, + "time_per_iteration": 2.470057487487793 + }, + { + "auxiliary_loss_clip": 0.01121738, + "auxiliary_loss_mlp": 0.01056283, + "balance_loss_clip": 1.05212426, + "balance_loss_mlp": 1.03565931, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 2.0120280539535673, + "language_loss": 0.84699082, + "learning_rate": 3.640974061218741e-06, + "loss": 0.86877108, + "num_input_tokens_seen": 78165065, + "step": 3624, + "time_per_iteration": 2.5322465896606445 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.010588, + "balance_loss_clip": 1.05292106, + "balance_loss_mlp": 1.03965545, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.278318068644996, + "language_loss": 0.77626789, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79824758, + "num_input_tokens_seen": 78180005, + "step": 3625, + "time_per_iteration": 2.4707841873168945 + }, + { + "auxiliary_loss_clip": 0.01066098, + "auxiliary_loss_mlp": 0.01003098, + "balance_loss_clip": 1.03823423, + "balance_loss_mlp": 1.00098825, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8157547331608672, + "language_loss": 0.60726309, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62795508, + "num_input_tokens_seen": 78245350, + "step": 3626, + "time_per_iteration": 3.141815662384033 + }, + { + "auxiliary_loss_clip": 0.0112584, + "auxiliary_loss_mlp": 0.00787009, + "balance_loss_clip": 1.0508256, + "balance_loss_mlp": 1.00116909, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 3.818572611636492, + "language_loss": 0.90472531, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92385387, + "num_input_tokens_seen": 78264165, + "step": 3627, + "time_per_iteration": 2.535130500793457 + }, + { + "auxiliary_loss_clip": 0.01100345, + "auxiliary_loss_mlp": 0.01041699, + "balance_loss_clip": 1.05319238, + "balance_loss_mlp": 1.02216065, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 2.0084509280624854, + "language_loss": 0.73608243, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75750291, + "num_input_tokens_seen": 78283745, + "step": 3628, + "time_per_iteration": 2.57487154006958 + }, + { + "auxiliary_loss_clip": 0.0115424, + "auxiliary_loss_mlp": 0.01042759, + "balance_loss_clip": 1.05539906, + "balance_loss_mlp": 1.02405512, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 2.055124203966678, + "language_loss": 0.77054918, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.79251921, + "num_input_tokens_seen": 78302900, + "step": 3629, + "time_per_iteration": 2.485816478729248 + }, + { + "auxiliary_loss_clip": 0.01144758, + "auxiliary_loss_mlp": 0.01041045, + "balance_loss_clip": 1.05670643, + "balance_loss_mlp": 1.02342558, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.7061858308850177, + "language_loss": 0.71712077, + "learning_rate": 3.63963709145597e-06, + "loss": 0.7389788, + "num_input_tokens_seen": 78326470, + "step": 3630, + "time_per_iteration": 4.148780345916748 + }, + { + "auxiliary_loss_clip": 0.01089368, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_clip": 1.04559612, + "balance_loss_mlp": 1.0242424, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 1.8010925090954135, + "language_loss": 0.7681042, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78941274, + "num_input_tokens_seen": 78345810, + "step": 3631, + "time_per_iteration": 2.6530117988586426 + }, + { + "auxiliary_loss_clip": 0.01154122, + "auxiliary_loss_mlp": 0.01042033, + "balance_loss_clip": 1.05462694, + "balance_loss_mlp": 1.02379394, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 2.0353605765458265, + "language_loss": 0.754462, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77642357, + "num_input_tokens_seen": 78364085, + "step": 3632, + "time_per_iteration": 2.442458391189575 + }, + { + "auxiliary_loss_clip": 0.01151589, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.0544095, + "balance_loss_mlp": 1.02287912, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 2.0807828991471324, + "language_loss": 0.83507526, + "learning_rate": 3.638967767095249e-06, + "loss": 0.85699552, + "num_input_tokens_seen": 78381385, + "step": 3633, + "time_per_iteration": 2.4437766075134277 + }, + { + "auxiliary_loss_clip": 0.01120628, + "auxiliary_loss_mlp": 0.01048838, + "balance_loss_clip": 1.05183768, + "balance_loss_mlp": 1.03055096, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 6.643787160077499, + "language_loss": 0.81476504, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.8364597, + "num_input_tokens_seen": 78400500, + "step": 3634, + "time_per_iteration": 2.5517523288726807 + }, + { + "auxiliary_loss_clip": 0.01146487, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.05694306, + "balance_loss_mlp": 1.01930845, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 1.8587606589038208, + "language_loss": 0.74751079, + "learning_rate": 3.638521240091558e-06, + "loss": 0.76935172, + "num_input_tokens_seen": 78418340, + "step": 3635, + "time_per_iteration": 2.4669361114501953 + }, + { + "auxiliary_loss_clip": 0.01129027, + "auxiliary_loss_mlp": 0.01054624, + "balance_loss_clip": 1.05410469, + "balance_loss_mlp": 1.03633761, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 2.170311193666717, + "language_loss": 0.88298029, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90481681, + "num_input_tokens_seen": 78434375, + "step": 3636, + "time_per_iteration": 2.496354818344116 + }, + { + "auxiliary_loss_clip": 0.01117978, + "auxiliary_loss_mlp": 0.00785977, + "balance_loss_clip": 1.05270731, + "balance_loss_mlp": 1.00092125, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 2.1099743347152193, + "language_loss": 0.75894153, + "learning_rate": 3.638074464556311e-06, + "loss": 0.77798116, + "num_input_tokens_seen": 78451735, + "step": 3637, + "time_per_iteration": 2.5860705375671387 + }, + { + "auxiliary_loss_clip": 0.01140032, + "auxiliary_loss_mlp": 0.01042386, + "balance_loss_clip": 1.0574131, + "balance_loss_mlp": 1.02237058, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 2.73376670187268, + "language_loss": 0.90201259, + "learning_rate": 3.63785098361053e-06, + "loss": 0.92383683, + "num_input_tokens_seen": 78462730, + "step": 3638, + "time_per_iteration": 2.4598755836486816 + }, + { + "auxiliary_loss_clip": 0.01139442, + "auxiliary_loss_mlp": 0.01050668, + "balance_loss_clip": 1.0513041, + "balance_loss_mlp": 1.03173804, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.5256856495346973, + "language_loss": 0.89588344, + "learning_rate": 3.637627440557275e-06, + "loss": 0.91778457, + "num_input_tokens_seen": 78476300, + "step": 3639, + "time_per_iteration": 2.4304237365722656 + }, + { + "auxiliary_loss_clip": 0.01130119, + "auxiliary_loss_mlp": 0.00785475, + "balance_loss_clip": 1.0513618, + "balance_loss_mlp": 1.00101304, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.7328228841098665, + "language_loss": 0.79408193, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81323791, + "num_input_tokens_seen": 78496135, + "step": 3640, + "time_per_iteration": 2.571579933166504 + }, + { + "auxiliary_loss_clip": 0.01150963, + "auxiliary_loss_mlp": 0.01054515, + "balance_loss_clip": 1.06046999, + "balance_loss_mlp": 1.03393948, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.4051835748494117, + "language_loss": 0.72273481, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74478966, + "num_input_tokens_seen": 78513855, + "step": 3641, + "time_per_iteration": 2.468034267425537 + }, + { + "auxiliary_loss_clip": 0.01132164, + "auxiliary_loss_mlp": 0.01041806, + "balance_loss_clip": 1.05437386, + "balance_loss_mlp": 1.02379358, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 1.821855112033954, + "language_loss": 0.81012571, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83186543, + "num_input_tokens_seen": 78531740, + "step": 3642, + "time_per_iteration": 2.537022829055786 + }, + { + "auxiliary_loss_clip": 0.01152823, + "auxiliary_loss_mlp": 0.01050317, + "balance_loss_clip": 1.06069994, + "balance_loss_mlp": 1.03037333, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 2.8946793147728243, + "language_loss": 0.72144914, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.74348056, + "num_input_tokens_seen": 78549600, + "step": 3643, + "time_per_iteration": 2.5055153369903564 + }, + { + "auxiliary_loss_clip": 0.01157376, + "auxiliary_loss_mlp": 0.01047013, + "balance_loss_clip": 1.05780733, + "balance_loss_mlp": 1.02811837, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 2.1472337075288417, + "language_loss": 0.68272406, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70476794, + "num_input_tokens_seen": 78573350, + "step": 3644, + "time_per_iteration": 2.686793088912964 + }, + { + "auxiliary_loss_clip": 0.01156688, + "auxiliary_loss_mlp": 0.01048379, + "balance_loss_clip": 1.05531824, + "balance_loss_mlp": 1.02948451, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.374323251764297, + "language_loss": 0.77673811, + "learning_rate": 3.636284878455669e-06, + "loss": 0.79878879, + "num_input_tokens_seen": 78591005, + "step": 3645, + "time_per_iteration": 2.448847532272339 + }, + { + "auxiliary_loss_clip": 0.01139022, + "auxiliary_loss_mlp": 0.01049592, + "balance_loss_clip": 1.05347824, + "balance_loss_mlp": 1.03148365, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.5885502330755137, + "language_loss": 0.82084644, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84273255, + "num_input_tokens_seen": 78610645, + "step": 3646, + "time_per_iteration": 2.476433277130127 + }, + { + "auxiliary_loss_clip": 0.01140792, + "auxiliary_loss_mlp": 0.0103985, + "balance_loss_clip": 1.05383384, + "balance_loss_mlp": 1.0221591, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.7454353122127366, + "language_loss": 0.82966948, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85147589, + "num_input_tokens_seen": 78628340, + "step": 3647, + "time_per_iteration": 2.4679832458496094 + }, + { + "auxiliary_loss_clip": 0.01149221, + "auxiliary_loss_mlp": 0.01050353, + "balance_loss_clip": 1.05247605, + "balance_loss_mlp": 1.03247142, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.7585310145445399, + "language_loss": 0.72551334, + "learning_rate": 3.635612759641123e-06, + "loss": 0.747509, + "num_input_tokens_seen": 78649355, + "step": 3648, + "time_per_iteration": 2.5133676528930664 + }, + { + "auxiliary_loss_clip": 0.01113442, + "auxiliary_loss_mlp": 0.01052903, + "balance_loss_clip": 1.05067635, + "balance_loss_mlp": 1.02989554, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.3435971847591937, + "language_loss": 0.74068344, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76234692, + "num_input_tokens_seen": 78664915, + "step": 3649, + "time_per_iteration": 2.5305445194244385 + }, + { + "auxiliary_loss_clip": 0.01133112, + "auxiliary_loss_mlp": 0.01041011, + "balance_loss_clip": 1.05031013, + "balance_loss_mlp": 1.02396417, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 1.9211938095123942, + "language_loss": 0.86295277, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88469398, + "num_input_tokens_seen": 78681475, + "step": 3650, + "time_per_iteration": 2.4831180572509766 + }, + { + "auxiliary_loss_clip": 0.01126625, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_clip": 1.0476315, + "balance_loss_mlp": 1.02428734, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 2.1561360321667062, + "language_loss": 0.83552992, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.85722959, + "num_input_tokens_seen": 78702300, + "step": 3651, + "time_per_iteration": 3.979912281036377 + }, + { + "auxiliary_loss_clip": 0.01140872, + "auxiliary_loss_mlp": 0.010474, + "balance_loss_clip": 1.05266738, + "balance_loss_mlp": 1.02896988, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 1.7618627097771977, + "language_loss": 0.74407417, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76595688, + "num_input_tokens_seen": 78720230, + "step": 3652, + "time_per_iteration": 2.4776206016540527 + }, + { + "auxiliary_loss_clip": 0.01034688, + "auxiliary_loss_mlp": 0.01004501, + "balance_loss_clip": 1.0343225, + "balance_loss_mlp": 1.00247431, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7663210641648077, + "language_loss": 0.51547986, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53587174, + "num_input_tokens_seen": 78780200, + "step": 3653, + "time_per_iteration": 3.1321983337402344 + }, + { + "auxiliary_loss_clip": 0.01123652, + "auxiliary_loss_mlp": 0.01055448, + "balance_loss_clip": 1.05173481, + "balance_loss_mlp": 1.03638685, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.8124496724275019, + "language_loss": 0.75475961, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77655065, + "num_input_tokens_seen": 78800575, + "step": 3654, + "time_per_iteration": 4.1094810962677 + }, + { + "auxiliary_loss_clip": 0.01150045, + "auxiliary_loss_mlp": 0.01046092, + "balance_loss_clip": 1.05947518, + "balance_loss_mlp": 1.02759075, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 1.857104789762749, + "language_loss": 0.73106056, + "learning_rate": 3.634042312013064e-06, + "loss": 0.7530219, + "num_input_tokens_seen": 78819585, + "step": 3655, + "time_per_iteration": 2.4795725345611572 + }, + { + "auxiliary_loss_clip": 0.01130181, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_clip": 1.05239356, + "balance_loss_mlp": 1.02660012, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.6891541799885945, + "language_loss": 0.81080198, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.8325597, + "num_input_tokens_seen": 78837330, + "step": 3656, + "time_per_iteration": 2.5662426948547363 + }, + { + "auxiliary_loss_clip": 0.01115414, + "auxiliary_loss_mlp": 0.00786048, + "balance_loss_clip": 1.05228996, + "balance_loss_mlp": 1.00109458, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.114627408485928, + "language_loss": 0.84997803, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86899263, + "num_input_tokens_seen": 78854955, + "step": 3657, + "time_per_iteration": 2.5459234714508057 + }, + { + "auxiliary_loss_clip": 0.01136386, + "auxiliary_loss_mlp": 0.01037555, + "balance_loss_clip": 1.0542686, + "balance_loss_mlp": 1.01885152, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.5200680719511832, + "language_loss": 0.80476665, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.82650602, + "num_input_tokens_seen": 78874965, + "step": 3658, + "time_per_iteration": 2.5068280696868896 + }, + { + "auxiliary_loss_clip": 0.01043427, + "auxiliary_loss_mlp": 0.01016379, + "balance_loss_clip": 1.03350282, + "balance_loss_mlp": 1.01407814, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.7865271785297224, + "language_loss": 0.58246922, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60306728, + "num_input_tokens_seen": 78937740, + "step": 3659, + "time_per_iteration": 4.610206127166748 + }, + { + "auxiliary_loss_clip": 0.01111365, + "auxiliary_loss_mlp": 0.01052174, + "balance_loss_clip": 1.04943883, + "balance_loss_mlp": 1.03286195, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.218373024007041, + "language_loss": 0.74411345, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76574886, + "num_input_tokens_seen": 78955055, + "step": 3660, + "time_per_iteration": 2.5645625591278076 + }, + { + "auxiliary_loss_clip": 0.01139774, + "auxiliary_loss_mlp": 0.01041953, + "balance_loss_clip": 1.0516516, + "balance_loss_mlp": 1.0233444, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 1.9530168226296194, + "language_loss": 0.81249702, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83431423, + "num_input_tokens_seen": 78974895, + "step": 3661, + "time_per_iteration": 2.537353754043579 + }, + { + "auxiliary_loss_clip": 0.01122018, + "auxiliary_loss_mlp": 0.01051681, + "balance_loss_clip": 1.0474081, + "balance_loss_mlp": 1.03251243, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.7416094934073298, + "language_loss": 0.73474425, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75648123, + "num_input_tokens_seen": 78994990, + "step": 3662, + "time_per_iteration": 2.5801122188568115 + }, + { + "auxiliary_loss_clip": 0.0113144, + "auxiliary_loss_mlp": 0.0105432, + "balance_loss_clip": 1.05384338, + "balance_loss_mlp": 1.03766668, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.5710516152334548, + "language_loss": 0.78330714, + "learning_rate": 3.632243797111929e-06, + "loss": 0.80516469, + "num_input_tokens_seen": 79014405, + "step": 3663, + "time_per_iteration": 2.5317232608795166 + }, + { + "auxiliary_loss_clip": 0.01141309, + "auxiliary_loss_mlp": 0.01059202, + "balance_loss_clip": 1.05636716, + "balance_loss_mlp": 1.03874612, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 8.73858681720546, + "language_loss": 0.80630738, + "learning_rate": 3.632018704132908e-06, + "loss": 0.82831252, + "num_input_tokens_seen": 79032375, + "step": 3664, + "time_per_iteration": 2.5118329524993896 + }, + { + "auxiliary_loss_clip": 0.01133761, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_clip": 1.05494571, + "balance_loss_mlp": 1.02481794, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.9696238217499786, + "language_loss": 0.7714799, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.79327607, + "num_input_tokens_seen": 79049635, + "step": 3665, + "time_per_iteration": 2.5053467750549316 + }, + { + "auxiliary_loss_clip": 0.01124016, + "auxiliary_loss_mlp": 0.01053221, + "balance_loss_clip": 1.05399299, + "balance_loss_mlp": 1.03468418, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.524538544767128, + "language_loss": 0.98074549, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00251794, + "num_input_tokens_seen": 79062890, + "step": 3666, + "time_per_iteration": 2.4686496257781982 + }, + { + "auxiliary_loss_clip": 0.01142725, + "auxiliary_loss_mlp": 0.00786586, + "balance_loss_clip": 1.05499685, + "balance_loss_mlp": 1.00103593, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 1.8880725119640966, + "language_loss": 0.80500776, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82430089, + "num_input_tokens_seen": 79085495, + "step": 3667, + "time_per_iteration": 2.685692071914673 + }, + { + "auxiliary_loss_clip": 0.01141891, + "auxiliary_loss_mlp": 0.01052352, + "balance_loss_clip": 1.0564574, + "balance_loss_mlp": 1.03280187, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.8930140525600874, + "language_loss": 0.77600747, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79794997, + "num_input_tokens_seen": 79101820, + "step": 3668, + "time_per_iteration": 2.480224847793579 + }, + { + "auxiliary_loss_clip": 0.01142704, + "auxiliary_loss_mlp": 0.01043779, + "balance_loss_clip": 1.06085181, + "balance_loss_mlp": 1.02457428, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.5780946631245778, + "language_loss": 0.71044695, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73231184, + "num_input_tokens_seen": 79123320, + "step": 3669, + "time_per_iteration": 2.5771358013153076 + }, + { + "auxiliary_loss_clip": 0.01155307, + "auxiliary_loss_mlp": 0.01042178, + "balance_loss_clip": 1.05629385, + "balance_loss_mlp": 1.02427268, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 1.7040224595500753, + "language_loss": 0.85735261, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87932748, + "num_input_tokens_seen": 79141615, + "step": 3670, + "time_per_iteration": 3.9888687133789062 + }, + { + "auxiliary_loss_clip": 0.01135174, + "auxiliary_loss_mlp": 0.01038786, + "balance_loss_clip": 1.05283809, + "balance_loss_mlp": 1.02028489, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 4.674058125333272, + "language_loss": 0.76924253, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79098213, + "num_input_tokens_seen": 79164910, + "step": 3671, + "time_per_iteration": 2.660672187805176 + }, + { + "auxiliary_loss_clip": 0.01126007, + "auxiliary_loss_mlp": 0.01044119, + "balance_loss_clip": 1.05643487, + "balance_loss_mlp": 1.02597535, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.121048375606678, + "language_loss": 0.81159544, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.83329666, + "num_input_tokens_seen": 79179685, + "step": 3672, + "time_per_iteration": 2.472507953643799 + }, + { + "auxiliary_loss_clip": 0.01146571, + "auxiliary_loss_mlp": 0.01055173, + "balance_loss_clip": 1.05606961, + "balance_loss_mlp": 1.03695822, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 2.0358440831534206, + "language_loss": 0.73774171, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75975919, + "num_input_tokens_seen": 79196285, + "step": 3673, + "time_per_iteration": 2.5095748901367188 + }, + { + "auxiliary_loss_clip": 0.0112174, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.05436409, + "balance_loss_mlp": 1.02002358, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 3.1770455546761283, + "language_loss": 0.76664972, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78825831, + "num_input_tokens_seen": 79216060, + "step": 3674, + "time_per_iteration": 2.6631669998168945 + }, + { + "auxiliary_loss_clip": 0.01157148, + "auxiliary_loss_mlp": 0.01045509, + "balance_loss_clip": 1.05882096, + "balance_loss_mlp": 1.02610207, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 1.7324998752367768, + "language_loss": 0.74137497, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76340151, + "num_input_tokens_seen": 79235145, + "step": 3675, + "time_per_iteration": 2.4669129848480225 + }, + { + "auxiliary_loss_clip": 0.01157892, + "auxiliary_loss_mlp": 0.01042975, + "balance_loss_clip": 1.05839658, + "balance_loss_mlp": 1.02495027, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.6690973199195063, + "language_loss": 0.8011477, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82315636, + "num_input_tokens_seen": 79256960, + "step": 3676, + "time_per_iteration": 2.532395839691162 + }, + { + "auxiliary_loss_clip": 0.01131605, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.05267954, + "balance_loss_mlp": 1.03127074, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 2.3089059603316557, + "language_loss": 0.75094098, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77274728, + "num_input_tokens_seen": 79274860, + "step": 3677, + "time_per_iteration": 2.504789113998413 + }, + { + "auxiliary_loss_clip": 0.01116472, + "auxiliary_loss_mlp": 0.01047238, + "balance_loss_clip": 1.04967308, + "balance_loss_mlp": 1.02880776, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 2.1220552307321925, + "language_loss": 0.83207142, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85370851, + "num_input_tokens_seen": 79294005, + "step": 3678, + "time_per_iteration": 2.578688859939575 + }, + { + "auxiliary_loss_clip": 0.01093259, + "auxiliary_loss_mlp": 0.01052886, + "balance_loss_clip": 1.04680347, + "balance_loss_mlp": 1.03354979, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.8953880953693871, + "language_loss": 0.89225954, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91372097, + "num_input_tokens_seen": 79314005, + "step": 3679, + "time_per_iteration": 2.626110315322876 + }, + { + "auxiliary_loss_clip": 0.01150761, + "auxiliary_loss_mlp": 0.01050864, + "balance_loss_clip": 1.05947065, + "balance_loss_mlp": 1.0317781, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.162074562122549, + "language_loss": 0.86912972, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.891146, + "num_input_tokens_seen": 79331030, + "step": 3680, + "time_per_iteration": 2.502061605453491 + }, + { + "auxiliary_loss_clip": 0.01115578, + "auxiliary_loss_mlp": 0.0104976, + "balance_loss_clip": 1.05664086, + "balance_loss_mlp": 1.03185463, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 2.4009977035471963, + "language_loss": 0.81022608, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.8318795, + "num_input_tokens_seen": 79348560, + "step": 3681, + "time_per_iteration": 2.566317081451416 + }, + { + "auxiliary_loss_clip": 0.01151129, + "auxiliary_loss_mlp": 0.0078409, + "balance_loss_clip": 1.05709505, + "balance_loss_mlp": 1.00111437, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.1543898886399138, + "language_loss": 0.794958, + "learning_rate": 3.62795645623335e-06, + "loss": 0.81431019, + "num_input_tokens_seen": 79367175, + "step": 3682, + "time_per_iteration": 2.4837608337402344 + }, + { + "auxiliary_loss_clip": 0.01122402, + "auxiliary_loss_mlp": 0.01049438, + "balance_loss_clip": 1.0485791, + "balance_loss_mlp": 1.02959001, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 1.6781419899008854, + "language_loss": 0.7780478, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79976624, + "num_input_tokens_seen": 79388435, + "step": 3683, + "time_per_iteration": 2.5655558109283447 + }, + { + "auxiliary_loss_clip": 0.01129742, + "auxiliary_loss_mlp": 0.01042743, + "balance_loss_clip": 1.05047667, + "balance_loss_mlp": 1.02508819, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 2.902229190180043, + "language_loss": 0.72554129, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74726611, + "num_input_tokens_seen": 79407910, + "step": 3684, + "time_per_iteration": 2.5967655181884766 + }, + { + "auxiliary_loss_clip": 0.01084217, + "auxiliary_loss_mlp": 0.01045925, + "balance_loss_clip": 1.04484415, + "balance_loss_mlp": 1.02712607, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 2.0133511055799884, + "language_loss": 0.80595624, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82725769, + "num_input_tokens_seen": 79424020, + "step": 3685, + "time_per_iteration": 2.6252617835998535 + }, + { + "auxiliary_loss_clip": 0.01148133, + "auxiliary_loss_mlp": 0.01046813, + "balance_loss_clip": 1.05315018, + "balance_loss_mlp": 1.0302546, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.419592721146243, + "language_loss": 0.87413609, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89608556, + "num_input_tokens_seen": 79445605, + "step": 3686, + "time_per_iteration": 2.5181095600128174 + }, + { + "auxiliary_loss_clip": 0.01133645, + "auxiliary_loss_mlp": 0.01045847, + "balance_loss_clip": 1.04963279, + "balance_loss_mlp": 1.02816868, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 2.145538839975765, + "language_loss": 0.7770611, + "learning_rate": 3.626824502298707e-06, + "loss": 0.79885602, + "num_input_tokens_seen": 79463850, + "step": 3687, + "time_per_iteration": 2.629599094390869 + }, + { + "auxiliary_loss_clip": 0.01125175, + "auxiliary_loss_mlp": 0.01052857, + "balance_loss_clip": 1.04904473, + "balance_loss_mlp": 1.03319955, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.842058695816128, + "language_loss": 0.84851694, + "learning_rate": 3.626597926409383e-06, + "loss": 0.87029731, + "num_input_tokens_seen": 79482845, + "step": 3688, + "time_per_iteration": 2.5491557121276855 + }, + { + "auxiliary_loss_clip": 0.01112097, + "auxiliary_loss_mlp": 0.01051887, + "balance_loss_clip": 1.05109167, + "balance_loss_mlp": 1.03324294, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 2.521785825247337, + "language_loss": 0.81279278, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.8344326, + "num_input_tokens_seen": 79501550, + "step": 3689, + "time_per_iteration": 2.6009464263916016 + }, + { + "auxiliary_loss_clip": 0.01124966, + "auxiliary_loss_mlp": 0.01045827, + "balance_loss_clip": 1.05198097, + "balance_loss_mlp": 1.02867293, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 1.8470433933561368, + "language_loss": 0.70464301, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72635096, + "num_input_tokens_seen": 79519680, + "step": 3690, + "time_per_iteration": 4.063467741012573 + }, + { + "auxiliary_loss_clip": 0.01149198, + "auxiliary_loss_mlp": 0.00786861, + "balance_loss_clip": 1.05623126, + "balance_loss_mlp": 1.00125265, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 2.382511873319812, + "language_loss": 0.72338337, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74274397, + "num_input_tokens_seen": 79539000, + "step": 3691, + "time_per_iteration": 2.515124797821045 + }, + { + "auxiliary_loss_clip": 0.01142237, + "auxiliary_loss_mlp": 0.01050592, + "balance_loss_clip": 1.05570447, + "balance_loss_mlp": 1.03219771, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 1.8728787050643085, + "language_loss": 0.71309763, + "learning_rate": 3.625691006130477e-06, + "loss": 0.73502594, + "num_input_tokens_seen": 79559695, + "step": 3692, + "time_per_iteration": 2.518810987472534 + }, + { + "auxiliary_loss_clip": 0.01146076, + "auxiliary_loss_mlp": 0.01048882, + "balance_loss_clip": 1.05412447, + "balance_loss_mlp": 1.03115594, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 1.6519423242606903, + "language_loss": 0.87146151, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.8934111, + "num_input_tokens_seen": 79579095, + "step": 3693, + "time_per_iteration": 3.9950265884399414 + }, + { + "auxiliary_loss_clip": 0.01140058, + "auxiliary_loss_mlp": 0.01040454, + "balance_loss_clip": 1.05497789, + "balance_loss_mlp": 1.02453995, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 2.2240324197847507, + "language_loss": 0.85171044, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.87351555, + "num_input_tokens_seen": 79596430, + "step": 3694, + "time_per_iteration": 2.443242311477661 + }, + { + "auxiliary_loss_clip": 0.01110388, + "auxiliary_loss_mlp": 0.01048411, + "balance_loss_clip": 1.04328239, + "balance_loss_mlp": 1.02921867, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 1.8015022985614688, + "language_loss": 0.69104743, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71263546, + "num_input_tokens_seen": 79615825, + "step": 3695, + "time_per_iteration": 2.537489175796509 + }, + { + "auxiliary_loss_clip": 0.01118067, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.05062342, + "balance_loss_mlp": 1.02586544, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.5787256472548041, + "language_loss": 0.71716118, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73876786, + "num_input_tokens_seen": 79637875, + "step": 3696, + "time_per_iteration": 2.576617956161499 + }, + { + "auxiliary_loss_clip": 0.01138212, + "auxiliary_loss_mlp": 0.0104081, + "balance_loss_clip": 1.05005884, + "balance_loss_mlp": 1.02267802, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.731164711290256, + "language_loss": 0.87414443, + "learning_rate": 3.624555968803217e-06, + "loss": 0.89593464, + "num_input_tokens_seen": 79656970, + "step": 3697, + "time_per_iteration": 2.5176894664764404 + }, + { + "auxiliary_loss_clip": 0.011209, + "auxiliary_loss_mlp": 0.01046578, + "balance_loss_clip": 1.04861164, + "balance_loss_mlp": 1.03002024, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.5426177892447108, + "language_loss": 0.66138977, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68306452, + "num_input_tokens_seen": 79680275, + "step": 3698, + "time_per_iteration": 2.6996328830718994 + }, + { + "auxiliary_loss_clip": 0.01136078, + "auxiliary_loss_mlp": 0.01042529, + "balance_loss_clip": 1.05208743, + "balance_loss_mlp": 1.02374125, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.8495853424829278, + "language_loss": 0.8224324, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84421849, + "num_input_tokens_seen": 79701255, + "step": 3699, + "time_per_iteration": 4.027231216430664 + }, + { + "auxiliary_loss_clip": 0.01128751, + "auxiliary_loss_mlp": 0.0104101, + "balance_loss_clip": 1.05012083, + "balance_loss_mlp": 1.02279472, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 2.806357232142837, + "language_loss": 0.79250038, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81419802, + "num_input_tokens_seen": 79721315, + "step": 3700, + "time_per_iteration": 2.512688398361206 + }, + { + "auxiliary_loss_clip": 0.01113616, + "auxiliary_loss_mlp": 0.0104607, + "balance_loss_clip": 1.05075693, + "balance_loss_mlp": 1.0269134, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 1.971470494473508, + "language_loss": 0.72778922, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74938607, + "num_input_tokens_seen": 79742705, + "step": 3701, + "time_per_iteration": 2.7813217639923096 + }, + { + "auxiliary_loss_clip": 0.01137586, + "auxiliary_loss_mlp": 0.01043773, + "balance_loss_clip": 1.04925084, + "balance_loss_mlp": 1.02699995, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.67628630685803, + "language_loss": 0.80276644, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.82458001, + "num_input_tokens_seen": 79763000, + "step": 3702, + "time_per_iteration": 2.5801048278808594 + }, + { + "auxiliary_loss_clip": 0.01128068, + "auxiliary_loss_mlp": 0.01038418, + "balance_loss_clip": 1.04724133, + "balance_loss_mlp": 1.02065599, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 1.7624583343871614, + "language_loss": 0.78208452, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80374932, + "num_input_tokens_seen": 79781335, + "step": 3703, + "time_per_iteration": 2.4637372493743896 + }, + { + "auxiliary_loss_clip": 0.01140411, + "auxiliary_loss_mlp": 0.01042437, + "balance_loss_clip": 1.05172968, + "balance_loss_mlp": 1.02292299, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 1.9833984036857906, + "language_loss": 0.73961073, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.76143926, + "num_input_tokens_seen": 79800150, + "step": 3704, + "time_per_iteration": 2.4938161373138428 + }, + { + "auxiliary_loss_clip": 0.01105738, + "auxiliary_loss_mlp": 0.01043211, + "balance_loss_clip": 1.0492394, + "balance_loss_mlp": 1.02665329, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.850022486316099, + "language_loss": 0.64044869, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66193819, + "num_input_tokens_seen": 79822390, + "step": 3705, + "time_per_iteration": 2.7908504009246826 + }, + { + "auxiliary_loss_clip": 0.01040512, + "auxiliary_loss_mlp": 0.01004558, + "balance_loss_clip": 1.04295945, + "balance_loss_mlp": 1.00268674, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.2688412177733446, + "language_loss": 0.65228033, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67273104, + "num_input_tokens_seen": 79873350, + "step": 3706, + "time_per_iteration": 2.9858758449554443 + }, + { + "auxiliary_loss_clip": 0.01112579, + "auxiliary_loss_mlp": 0.01043262, + "balance_loss_clip": 1.04601383, + "balance_loss_mlp": 1.02546406, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 1.8660001236569304, + "language_loss": 0.8076452, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82920361, + "num_input_tokens_seen": 79891715, + "step": 3707, + "time_per_iteration": 2.5089175701141357 + }, + { + "auxiliary_loss_clip": 0.01150637, + "auxiliary_loss_mlp": 0.01038674, + "balance_loss_clip": 1.0529654, + "balance_loss_mlp": 1.02104342, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 1.9941320674212022, + "language_loss": 0.7848236, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80671668, + "num_input_tokens_seen": 79911175, + "step": 3708, + "time_per_iteration": 2.5086278915405273 + }, + { + "auxiliary_loss_clip": 0.01132838, + "auxiliary_loss_mlp": 0.01043737, + "balance_loss_clip": 1.05325913, + "balance_loss_mlp": 1.02546239, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 2.147116983624498, + "language_loss": 0.80418575, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82595146, + "num_input_tokens_seen": 79931875, + "step": 3709, + "time_per_iteration": 4.0136559009552 + }, + { + "auxiliary_loss_clip": 0.01136111, + "auxiliary_loss_mlp": 0.00785947, + "balance_loss_clip": 1.05060768, + "balance_loss_mlp": 1.00111771, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 1.8373151875061622, + "language_loss": 0.69318187, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.71240246, + "num_input_tokens_seen": 79952445, + "step": 3710, + "time_per_iteration": 2.5372931957244873 + }, + { + "auxiliary_loss_clip": 0.01114315, + "auxiliary_loss_mlp": 0.01052782, + "balance_loss_clip": 1.04586327, + "balance_loss_mlp": 1.03400707, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.1344376661276714, + "language_loss": 0.90721041, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.92888141, + "num_input_tokens_seen": 79971030, + "step": 3711, + "time_per_iteration": 2.541043519973755 + }, + { + "auxiliary_loss_clip": 0.01122579, + "auxiliary_loss_mlp": 0.01055277, + "balance_loss_clip": 1.05346262, + "balance_loss_mlp": 1.03495204, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 2.922480275940382, + "language_loss": 0.89894009, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.92071867, + "num_input_tokens_seen": 79982085, + "step": 3712, + "time_per_iteration": 2.447312355041504 + }, + { + "auxiliary_loss_clip": 0.01151226, + "auxiliary_loss_mlp": 0.01049083, + "balance_loss_clip": 1.05598128, + "balance_loss_mlp": 1.03059363, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 3.36610655396184, + "language_loss": 0.75029212, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77229524, + "num_input_tokens_seen": 79997460, + "step": 3713, + "time_per_iteration": 2.4043924808502197 + }, + { + "auxiliary_loss_clip": 0.01108097, + "auxiliary_loss_mlp": 0.01043925, + "balance_loss_clip": 1.05607069, + "balance_loss_mlp": 1.02587628, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 2.6282524318676326, + "language_loss": 0.62351239, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64503258, + "num_input_tokens_seen": 80022450, + "step": 3714, + "time_per_iteration": 2.80964732170105 + }, + { + "auxiliary_loss_clip": 0.01127119, + "auxiliary_loss_mlp": 0.01034592, + "balance_loss_clip": 1.05777895, + "balance_loss_mlp": 1.01759267, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.920405200727473, + "language_loss": 0.79395628, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81557333, + "num_input_tokens_seen": 80042100, + "step": 3715, + "time_per_iteration": 2.593567371368408 + }, + { + "auxiliary_loss_clip": 0.01115463, + "auxiliary_loss_mlp": 0.01052043, + "balance_loss_clip": 1.05907023, + "balance_loss_mlp": 1.03416157, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 1.784677043002099, + "language_loss": 0.77219689, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79387194, + "num_input_tokens_seen": 80059690, + "step": 3716, + "time_per_iteration": 2.533154010772705 + }, + { + "auxiliary_loss_clip": 0.01128923, + "auxiliary_loss_mlp": 0.01048384, + "balance_loss_clip": 1.05186021, + "balance_loss_mlp": 1.03055072, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 2.1233924430981035, + "language_loss": 0.79128659, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81305969, + "num_input_tokens_seen": 80076060, + "step": 3717, + "time_per_iteration": 2.4998292922973633 + }, + { + "auxiliary_loss_clip": 0.01084346, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_clip": 1.04794014, + "balance_loss_mlp": 1.02760684, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 1.915433949564482, + "language_loss": 0.67854571, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.69986039, + "num_input_tokens_seen": 80094760, + "step": 3718, + "time_per_iteration": 2.6354870796203613 + }, + { + "auxiliary_loss_clip": 0.01132091, + "auxiliary_loss_mlp": 0.01047936, + "balance_loss_clip": 1.04987979, + "balance_loss_mlp": 1.02871978, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.5400355480510048, + "language_loss": 0.80667841, + "learning_rate": 3.619543522896045e-06, + "loss": 0.82847869, + "num_input_tokens_seen": 80114475, + "step": 3719, + "time_per_iteration": 2.6282079219818115 + }, + { + "auxiliary_loss_clip": 0.01132247, + "auxiliary_loss_mlp": 0.01055664, + "balance_loss_clip": 1.05420232, + "balance_loss_mlp": 1.03523159, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 1.6904120109006442, + "language_loss": 0.86864626, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.89052534, + "num_input_tokens_seen": 80132920, + "step": 3720, + "time_per_iteration": 2.515144109725952 + }, + { + "auxiliary_loss_clip": 0.0112151, + "auxiliary_loss_mlp": 0.01039376, + "balance_loss_clip": 1.05105913, + "balance_loss_mlp": 1.02118504, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 2.0465355500599354, + "language_loss": 0.74444366, + "learning_rate": 3.619086370692945e-06, + "loss": 0.76605248, + "num_input_tokens_seen": 80152845, + "step": 3721, + "time_per_iteration": 2.5338332653045654 + }, + { + "auxiliary_loss_clip": 0.01158429, + "auxiliary_loss_mlp": 0.01046456, + "balance_loss_clip": 1.05673122, + "balance_loss_mlp": 1.02867007, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.492329099188852, + "language_loss": 0.79230261, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81435144, + "num_input_tokens_seen": 80170680, + "step": 3722, + "time_per_iteration": 2.45858097076416 + }, + { + "auxiliary_loss_clip": 0.01121766, + "auxiliary_loss_mlp": 0.01041162, + "balance_loss_clip": 1.05909443, + "balance_loss_mlp": 1.02419877, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.2073417636615433, + "language_loss": 0.82506675, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84669602, + "num_input_tokens_seen": 80189030, + "step": 3723, + "time_per_iteration": 2.5321273803710938 + }, + { + "auxiliary_loss_clip": 0.0115523, + "auxiliary_loss_mlp": 0.01053473, + "balance_loss_clip": 1.05591989, + "balance_loss_mlp": 1.03513813, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 2.1062349454792653, + "language_loss": 0.84651554, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86860263, + "num_input_tokens_seen": 80208365, + "step": 3724, + "time_per_iteration": 2.487299680709839 + }, + { + "auxiliary_loss_clip": 0.0112853, + "auxiliary_loss_mlp": 0.01046865, + "balance_loss_clip": 1.05179548, + "balance_loss_mlp": 1.02804184, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 1.8081047395143126, + "language_loss": 0.79190028, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81365418, + "num_input_tokens_seen": 80228685, + "step": 3725, + "time_per_iteration": 2.551506757736206 + }, + { + "auxiliary_loss_clip": 0.01094615, + "auxiliary_loss_mlp": 0.010424, + "balance_loss_clip": 1.05265212, + "balance_loss_mlp": 1.02330303, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 2.4169985097077697, + "language_loss": 0.7731111, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79448128, + "num_input_tokens_seen": 80247635, + "step": 3726, + "time_per_iteration": 2.6489553451538086 + }, + { + "auxiliary_loss_clip": 0.01150904, + "auxiliary_loss_mlp": 0.01049149, + "balance_loss_clip": 1.05432272, + "balance_loss_mlp": 1.02852619, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 3.6416869123157647, + "language_loss": 0.72343451, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74543506, + "num_input_tokens_seen": 80260045, + "step": 3727, + "time_per_iteration": 2.437467098236084 + }, + { + "auxiliary_loss_clip": 0.01159344, + "auxiliary_loss_mlp": 0.01046527, + "balance_loss_clip": 1.05711102, + "balance_loss_mlp": 1.02541542, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.8475592663506633, + "language_loss": 0.86570758, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.88776636, + "num_input_tokens_seen": 80277680, + "step": 3728, + "time_per_iteration": 2.4791903495788574 + }, + { + "auxiliary_loss_clip": 0.01125713, + "auxiliary_loss_mlp": 0.0105964, + "balance_loss_clip": 1.05259061, + "balance_loss_mlp": 1.03778934, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.190971686397097, + "language_loss": 0.80317765, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82503122, + "num_input_tokens_seen": 80294795, + "step": 3729, + "time_per_iteration": 2.52412486076355 + }, + { + "auxiliary_loss_clip": 0.01129426, + "auxiliary_loss_mlp": 0.01050358, + "balance_loss_clip": 1.0551517, + "balance_loss_mlp": 1.03295386, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 1.5637928260560243, + "language_loss": 0.86408824, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.88588607, + "num_input_tokens_seen": 80315425, + "step": 3730, + "time_per_iteration": 4.079046010971069 + }, + { + "auxiliary_loss_clip": 0.01127629, + "auxiliary_loss_mlp": 0.00785557, + "balance_loss_clip": 1.05493808, + "balance_loss_mlp": 1.00104785, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 2.08478872150914, + "language_loss": 0.72891629, + "learning_rate": 3.616796927310559e-06, + "loss": 0.74804819, + "num_input_tokens_seen": 80333905, + "step": 3731, + "time_per_iteration": 2.5053606033325195 + }, + { + "auxiliary_loss_clip": 0.01126862, + "auxiliary_loss_mlp": 0.01040394, + "balance_loss_clip": 1.05557919, + "balance_loss_mlp": 1.02235818, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 2.9335774670827055, + "language_loss": 0.75304699, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77471954, + "num_input_tokens_seen": 80352165, + "step": 3732, + "time_per_iteration": 4.056875467300415 + }, + { + "auxiliary_loss_clip": 0.01157846, + "auxiliary_loss_mlp": 0.01062826, + "balance_loss_clip": 1.06027496, + "balance_loss_mlp": 1.04421699, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 2.1112868792416877, + "language_loss": 0.8796615, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90186822, + "num_input_tokens_seen": 80371305, + "step": 3733, + "time_per_iteration": 2.470616579055786 + }, + { + "auxiliary_loss_clip": 0.01110483, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.05085087, + "balance_loss_mlp": 1.02725601, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 2.3184008926650517, + "language_loss": 0.84286869, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86443311, + "num_input_tokens_seen": 80391020, + "step": 3734, + "time_per_iteration": 2.5786359310150146 + }, + { + "auxiliary_loss_clip": 0.01133291, + "auxiliary_loss_mlp": 0.01047973, + "balance_loss_clip": 1.05447602, + "balance_loss_mlp": 1.02987742, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.6322286917014768, + "language_loss": 0.76764786, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.78946042, + "num_input_tokens_seen": 80411365, + "step": 3735, + "time_per_iteration": 2.5682387351989746 + }, + { + "auxiliary_loss_clip": 0.01138568, + "auxiliary_loss_mlp": 0.01046275, + "balance_loss_clip": 1.06057394, + "balance_loss_mlp": 1.02894163, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 1.617532873597607, + "language_loss": 0.84571266, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.8675611, + "num_input_tokens_seen": 80431075, + "step": 3736, + "time_per_iteration": 2.555401563644409 + }, + { + "auxiliary_loss_clip": 0.01118369, + "auxiliary_loss_mlp": 0.01045306, + "balance_loss_clip": 1.05833745, + "balance_loss_mlp": 1.02705491, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 2.356620470114721, + "language_loss": 0.86619079, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88782746, + "num_input_tokens_seen": 80449240, + "step": 3737, + "time_per_iteration": 2.5381081104278564 + }, + { + "auxiliary_loss_clip": 0.01156599, + "auxiliary_loss_mlp": 0.01052396, + "balance_loss_clip": 1.05782652, + "balance_loss_mlp": 1.03344202, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 3.6765377067679554, + "language_loss": 0.79274458, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81483454, + "num_input_tokens_seen": 80467900, + "step": 3738, + "time_per_iteration": 2.528209924697876 + }, + { + "auxiliary_loss_clip": 0.01124277, + "auxiliary_loss_mlp": 0.01049427, + "balance_loss_clip": 1.05231118, + "balance_loss_mlp": 1.03198695, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 1.554654793364992, + "language_loss": 0.76115561, + "learning_rate": 3.614960957933224e-06, + "loss": 0.7828927, + "num_input_tokens_seen": 80487100, + "step": 3739, + "time_per_iteration": 3.9523613452911377 + }, + { + "auxiliary_loss_clip": 0.01112717, + "auxiliary_loss_mlp": 0.01050628, + "balance_loss_clip": 1.04495907, + "balance_loss_mlp": 1.02995753, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 2.009009492882275, + "language_loss": 0.74802673, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76966017, + "num_input_tokens_seen": 80508625, + "step": 3740, + "time_per_iteration": 2.583660125732422 + }, + { + "auxiliary_loss_clip": 0.01153459, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.05755568, + "balance_loss_mlp": 1.02133262, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 2.9436977589673443, + "language_loss": 0.75431931, + "learning_rate": 3.614501353019939e-06, + "loss": 0.77624577, + "num_input_tokens_seen": 80527345, + "step": 3741, + "time_per_iteration": 2.4307515621185303 + }, + { + "auxiliary_loss_clip": 0.01132871, + "auxiliary_loss_mlp": 0.01037534, + "balance_loss_clip": 1.05612397, + "balance_loss_mlp": 1.02047467, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.83755201953945, + "language_loss": 0.87404358, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89574766, + "num_input_tokens_seen": 80545545, + "step": 3742, + "time_per_iteration": 2.4953818321228027 + }, + { + "auxiliary_loss_clip": 0.0109919, + "auxiliary_loss_mlp": 0.01053069, + "balance_loss_clip": 1.04770756, + "balance_loss_mlp": 1.03381658, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 1.7865866393501058, + "language_loss": 0.81559658, + "learning_rate": 3.614041503218444e-06, + "loss": 0.83711916, + "num_input_tokens_seen": 80565040, + "step": 3743, + "time_per_iteration": 2.594266891479492 + }, + { + "auxiliary_loss_clip": 0.01138824, + "auxiliary_loss_mlp": 0.01040241, + "balance_loss_clip": 1.0504148, + "balance_loss_mlp": 1.02270496, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 2.2025740557554263, + "language_loss": 0.63843274, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.66022348, + "num_input_tokens_seen": 80582815, + "step": 3744, + "time_per_iteration": 2.4561567306518555 + }, + { + "auxiliary_loss_clip": 0.01138392, + "auxiliary_loss_mlp": 0.010437, + "balance_loss_clip": 1.05609322, + "balance_loss_mlp": 1.02569914, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 2.9432118645260164, + "language_loss": 0.76397061, + "learning_rate": 3.613581408598489e-06, + "loss": 0.78579158, + "num_input_tokens_seen": 80600865, + "step": 3745, + "time_per_iteration": 2.451122760772705 + }, + { + "auxiliary_loss_clip": 0.01122891, + "auxiliary_loss_mlp": 0.01044375, + "balance_loss_clip": 1.05617404, + "balance_loss_mlp": 1.02656472, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 5.873764021395962, + "language_loss": 0.80664766, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.82832032, + "num_input_tokens_seen": 80617455, + "step": 3746, + "time_per_iteration": 2.4711368083953857 + }, + { + "auxiliary_loss_clip": 0.01144743, + "auxiliary_loss_mlp": 0.01047497, + "balance_loss_clip": 1.05674052, + "balance_loss_mlp": 1.02972269, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.408916189976434, + "language_loss": 0.85867333, + "learning_rate": 3.613121069229862e-06, + "loss": 0.88059568, + "num_input_tokens_seen": 80635125, + "step": 3747, + "time_per_iteration": 2.5081067085266113 + }, + { + "auxiliary_loss_clip": 0.01141704, + "auxiliary_loss_mlp": 0.00784656, + "balance_loss_clip": 1.05522978, + "balance_loss_mlp": 1.00123727, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 2.640932507331204, + "language_loss": 0.76169097, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78095454, + "num_input_tokens_seen": 80656370, + "step": 3748, + "time_per_iteration": 4.013699293136597 + }, + { + "auxiliary_loss_clip": 0.01161899, + "auxiliary_loss_mlp": 0.0104683, + "balance_loss_clip": 1.06212997, + "balance_loss_mlp": 1.02910352, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.6588590605289977, + "language_loss": 0.79636645, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.81845379, + "num_input_tokens_seen": 80676495, + "step": 3749, + "time_per_iteration": 2.453291893005371 + }, + { + "auxiliary_loss_clip": 0.0112723, + "auxiliary_loss_mlp": 0.01046292, + "balance_loss_clip": 1.05470991, + "balance_loss_mlp": 1.02974606, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.5880941536012942, + "language_loss": 0.79359043, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.81532562, + "num_input_tokens_seen": 80694755, + "step": 3750, + "time_per_iteration": 2.5635287761688232 + }, + { + "auxiliary_loss_clip": 0.01094765, + "auxiliary_loss_mlp": 0.01050868, + "balance_loss_clip": 1.04897141, + "balance_loss_mlp": 1.03277218, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 2.09502563921708, + "language_loss": 0.82164419, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.84310055, + "num_input_tokens_seen": 80713670, + "step": 3751, + "time_per_iteration": 2.640841007232666 + }, + { + "auxiliary_loss_clip": 0.01121449, + "auxiliary_loss_mlp": 0.0104896, + "balance_loss_clip": 1.05385685, + "balance_loss_mlp": 1.03033948, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 1.8176431242261064, + "language_loss": 0.83958149, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86128557, + "num_input_tokens_seen": 80731450, + "step": 3752, + "time_per_iteration": 2.5243709087371826 + }, + { + "auxiliary_loss_clip": 0.01157447, + "auxiliary_loss_mlp": 0.01039836, + "balance_loss_clip": 1.06051159, + "balance_loss_mlp": 1.02337337, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.7732129742700002, + "language_loss": 0.78650022, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80847305, + "num_input_tokens_seen": 80748415, + "step": 3753, + "time_per_iteration": 2.419821262359619 + }, + { + "auxiliary_loss_clip": 0.01131225, + "auxiliary_loss_mlp": 0.01042503, + "balance_loss_clip": 1.05691373, + "balance_loss_mlp": 1.0245626, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 2.143813409552078, + "language_loss": 0.78718197, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80891919, + "num_input_tokens_seen": 80770835, + "step": 3754, + "time_per_iteration": 2.662667989730835 + }, + { + "auxiliary_loss_clip": 0.01133944, + "auxiliary_loss_mlp": 0.01047907, + "balance_loss_clip": 1.06052101, + "balance_loss_mlp": 1.02984703, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.7632744095471342, + "language_loss": 0.70128143, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72309995, + "num_input_tokens_seen": 80787840, + "step": 3755, + "time_per_iteration": 2.522169828414917 + }, + { + "auxiliary_loss_clip": 0.01130407, + "auxiliary_loss_mlp": 0.01056465, + "balance_loss_clip": 1.05938649, + "balance_loss_mlp": 1.03879797, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 2.3727777950795352, + "language_loss": 0.77207041, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79393911, + "num_input_tokens_seen": 80806335, + "step": 3756, + "time_per_iteration": 2.5927600860595703 + }, + { + "auxiliary_loss_clip": 0.01138255, + "auxiliary_loss_mlp": 0.01047018, + "balance_loss_clip": 1.0589447, + "balance_loss_mlp": 1.02865946, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.8574311456448915, + "language_loss": 0.8259666, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.84781939, + "num_input_tokens_seen": 80825355, + "step": 3757, + "time_per_iteration": 2.536557912826538 + }, + { + "auxiliary_loss_clip": 0.01148089, + "auxiliary_loss_mlp": 0.01045396, + "balance_loss_clip": 1.06163108, + "balance_loss_mlp": 1.02668047, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 1.707500641551068, + "language_loss": 0.72758245, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.74951732, + "num_input_tokens_seen": 80842570, + "step": 3758, + "time_per_iteration": 2.484140157699585 + }, + { + "auxiliary_loss_clip": 0.01140404, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_clip": 1.05890262, + "balance_loss_mlp": 1.03274345, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.565322666074132, + "language_loss": 0.77154225, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79346275, + "num_input_tokens_seen": 80858745, + "step": 3759, + "time_per_iteration": 2.5274131298065186 + }, + { + "auxiliary_loss_clip": 0.01108627, + "auxiliary_loss_mlp": 0.0104415, + "balance_loss_clip": 1.04934263, + "balance_loss_mlp": 1.02444506, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.7597169529052537, + "language_loss": 0.78301883, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80454659, + "num_input_tokens_seen": 80880085, + "step": 3760, + "time_per_iteration": 2.704610824584961 + }, + { + "auxiliary_loss_clip": 0.01039643, + "auxiliary_loss_mlp": 0.01014625, + "balance_loss_clip": 1.029778, + "balance_loss_mlp": 1.01278913, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9469339487029315, + "language_loss": 0.60077012, + "learning_rate": 3.609891846556569e-06, + "loss": 0.6213128, + "num_input_tokens_seen": 80937660, + "step": 3761, + "time_per_iteration": 3.0493037700653076 + }, + { + "auxiliary_loss_clip": 0.01123959, + "auxiliary_loss_mlp": 0.010472, + "balance_loss_clip": 1.05660021, + "balance_loss_mlp": 1.0281384, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.2319882187436395, + "language_loss": 0.77604723, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79775882, + "num_input_tokens_seen": 80956265, + "step": 3762, + "time_per_iteration": 2.619976043701172 + }, + { + "auxiliary_loss_clip": 0.01136784, + "auxiliary_loss_mlp": 0.01045042, + "balance_loss_clip": 1.05870855, + "balance_loss_mlp": 1.02463329, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 2.079536997967268, + "language_loss": 0.78973997, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.81155813, + "num_input_tokens_seen": 80975185, + "step": 3763, + "time_per_iteration": 2.5236170291900635 + }, + { + "auxiliary_loss_clip": 0.01152628, + "auxiliary_loss_mlp": 0.01059692, + "balance_loss_clip": 1.0659436, + "balance_loss_mlp": 1.03972435, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.9079819919108592, + "language_loss": 0.91276598, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.9348892, + "num_input_tokens_seen": 80992830, + "step": 3764, + "time_per_iteration": 2.469844102859497 + }, + { + "auxiliary_loss_clip": 0.01141697, + "auxiliary_loss_mlp": 0.0105779, + "balance_loss_clip": 1.05915642, + "balance_loss_mlp": 1.03767931, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 2.2386353750012034, + "language_loss": 0.75053108, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77252603, + "num_input_tokens_seen": 81013675, + "step": 3765, + "time_per_iteration": 2.5461554527282715 + }, + { + "auxiliary_loss_clip": 0.01148249, + "auxiliary_loss_mlp": 0.01047721, + "balance_loss_clip": 1.06184614, + "balance_loss_mlp": 1.02993464, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 2.014611533925385, + "language_loss": 0.89698744, + "learning_rate": 3.608735651752494e-06, + "loss": 0.9189471, + "num_input_tokens_seen": 81030345, + "step": 3766, + "time_per_iteration": -0.17982029914855957 + }, + { + "auxiliary_loss_clip": 0.01135468, + "auxiliary_loss_mlp": 0.01040011, + "balance_loss_clip": 1.06358171, + "balance_loss_mlp": 1.02177215, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.5350155172991276, + "language_loss": 0.74745381, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76920861, + "num_input_tokens_seen": 81051000, + "step": 3767, + "time_per_iteration": 2.5612502098083496 + }, + { + "auxiliary_loss_clip": 0.01148162, + "auxiliary_loss_mlp": 0.01042724, + "balance_loss_clip": 1.0584656, + "balance_loss_mlp": 1.02367401, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.4603797970124384, + "language_loss": 0.71938205, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.74129093, + "num_input_tokens_seen": 81071205, + "step": 3768, + "time_per_iteration": 2.50273060798645 + }, + { + "auxiliary_loss_clip": 0.01152871, + "auxiliary_loss_mlp": 0.01056642, + "balance_loss_clip": 1.06558979, + "balance_loss_mlp": 1.03732967, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 2.1334485468564113, + "language_loss": 0.78624201, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80833709, + "num_input_tokens_seen": 81091880, + "step": 3769, + "time_per_iteration": 3.9589805603027344 + }, + { + "auxiliary_loss_clip": 0.01137145, + "auxiliary_loss_mlp": 0.01048002, + "balance_loss_clip": 1.05396295, + "balance_loss_mlp": 1.02807009, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.8041851880989703, + "language_loss": 0.68218988, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70404136, + "num_input_tokens_seen": 81113290, + "step": 3770, + "time_per_iteration": 2.6270222663879395 + }, + { + "auxiliary_loss_clip": 0.01160486, + "auxiliary_loss_mlp": 0.01041306, + "balance_loss_clip": 1.05960548, + "balance_loss_mlp": 1.02299571, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 2.720991594062706, + "language_loss": 0.80431908, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82633698, + "num_input_tokens_seen": 81133535, + "step": 3771, + "time_per_iteration": 2.509523868560791 + }, + { + "auxiliary_loss_clip": 0.0111816, + "auxiliary_loss_mlp": 0.01050423, + "balance_loss_clip": 1.05505741, + "balance_loss_mlp": 1.03218353, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.5980402845823112, + "language_loss": 0.79030508, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81199086, + "num_input_tokens_seen": 81154650, + "step": 3772, + "time_per_iteration": 4.080235958099365 + }, + { + "auxiliary_loss_clip": 0.010296, + "auxiliary_loss_mlp": 0.01011556, + "balance_loss_clip": 1.03814745, + "balance_loss_mlp": 1.00938654, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.656245864878136, + "language_loss": 0.54362857, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56404006, + "num_input_tokens_seen": 81221240, + "step": 3773, + "time_per_iteration": 3.2535197734832764 + }, + { + "auxiliary_loss_clip": 0.01125402, + "auxiliary_loss_mlp": 0.01041055, + "balance_loss_clip": 1.05757797, + "balance_loss_mlp": 1.02287602, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 1.9095944113861285, + "language_loss": 0.70169777, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72336233, + "num_input_tokens_seen": 81241520, + "step": 3774, + "time_per_iteration": 2.5739023685455322 + }, + { + "auxiliary_loss_clip": 0.01131146, + "auxiliary_loss_mlp": 0.0105071, + "balance_loss_clip": 1.05870318, + "balance_loss_mlp": 1.03275681, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.2177130755833603, + "language_loss": 0.74204934, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76386786, + "num_input_tokens_seen": 81256825, + "step": 3775, + "time_per_iteration": 2.521857976913452 + }, + { + "auxiliary_loss_clip": 0.01159352, + "auxiliary_loss_mlp": 0.01050965, + "balance_loss_clip": 1.0599885, + "balance_loss_mlp": 1.0330956, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 2.016892629343319, + "language_loss": 0.82218957, + "learning_rate": 3.606418687985928e-06, + "loss": 0.84429276, + "num_input_tokens_seen": 81275695, + "step": 3776, + "time_per_iteration": 2.463881015777588 + }, + { + "auxiliary_loss_clip": 0.01138073, + "auxiliary_loss_mlp": 0.01040534, + "balance_loss_clip": 1.05582678, + "balance_loss_mlp": 1.02324867, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 1.7216030245643636, + "language_loss": 0.82763976, + "learning_rate": 3.606186656428641e-06, + "loss": 0.84942579, + "num_input_tokens_seen": 81294920, + "step": 3777, + "time_per_iteration": 2.570711851119995 + }, + { + "auxiliary_loss_clip": 0.01133147, + "auxiliary_loss_mlp": 0.01044764, + "balance_loss_clip": 1.06054831, + "balance_loss_mlp": 1.02645373, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 2.6816847462711237, + "language_loss": 0.72236049, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74413955, + "num_input_tokens_seen": 81314275, + "step": 3778, + "time_per_iteration": 2.582850694656372 + }, + { + "auxiliary_loss_clip": 0.01115286, + "auxiliary_loss_mlp": 0.01048516, + "balance_loss_clip": 1.05341268, + "balance_loss_mlp": 1.02872682, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.71361969713294, + "language_loss": 0.64357656, + "learning_rate": 3.605722410602591e-06, + "loss": 0.6652146, + "num_input_tokens_seen": 81333890, + "step": 3779, + "time_per_iteration": 3.9868810176849365 + }, + { + "auxiliary_loss_clip": 0.01139659, + "auxiliary_loss_mlp": 0.01053086, + "balance_loss_clip": 1.05806279, + "balance_loss_mlp": 1.0352639, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.6913444190322457, + "language_loss": 0.70839119, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.73031867, + "num_input_tokens_seen": 81353640, + "step": 3780, + "time_per_iteration": 2.4962849617004395 + }, + { + "auxiliary_loss_clip": 0.01150883, + "auxiliary_loss_mlp": 0.0105463, + "balance_loss_clip": 1.06215727, + "balance_loss_mlp": 1.03549743, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 1.8875064371325165, + "language_loss": 0.89876097, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.92081618, + "num_input_tokens_seen": 81371595, + "step": 3781, + "time_per_iteration": 2.525136709213257 + }, + { + "auxiliary_loss_clip": 0.01160667, + "auxiliary_loss_mlp": 0.01049332, + "balance_loss_clip": 1.05936921, + "balance_loss_mlp": 1.03060412, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 2.0077537332839563, + "language_loss": 0.74448985, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76658988, + "num_input_tokens_seen": 81388435, + "step": 3782, + "time_per_iteration": 2.431529998779297 + }, + { + "auxiliary_loss_clip": 0.011342, + "auxiliary_loss_mlp": 0.01051342, + "balance_loss_clip": 1.05594552, + "balance_loss_mlp": 1.03396094, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.4887136632541882, + "language_loss": 0.82806289, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84991837, + "num_input_tokens_seen": 81410195, + "step": 3783, + "time_per_iteration": 2.5576517581939697 + }, + { + "auxiliary_loss_clip": 0.0112975, + "auxiliary_loss_mlp": 0.01048843, + "balance_loss_clip": 1.05519009, + "balance_loss_mlp": 1.03012729, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 3.461361843295034, + "language_loss": 0.75861979, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78040576, + "num_input_tokens_seen": 81430060, + "step": 3784, + "time_per_iteration": 2.566967248916626 + }, + { + "auxiliary_loss_clip": 0.01152879, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_clip": 1.05568027, + "balance_loss_mlp": 1.03080618, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.6622862029987757, + "language_loss": 0.70868278, + "learning_rate": 3.604328212066594e-06, + "loss": 0.7307049, + "num_input_tokens_seen": 81447375, + "step": 3785, + "time_per_iteration": 2.506258249282837 + }, + { + "auxiliary_loss_clip": 0.01045181, + "auxiliary_loss_mlp": 0.01024274, + "balance_loss_clip": 1.03493118, + "balance_loss_mlp": 1.02216399, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8244344361080648, + "language_loss": 0.61834747, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63904196, + "num_input_tokens_seen": 81505235, + "step": 3786, + "time_per_iteration": 3.111138105392456 + }, + { + "auxiliary_loss_clip": 0.01140528, + "auxiliary_loss_mlp": 0.01045877, + "balance_loss_clip": 1.05716228, + "balance_loss_mlp": 1.02686357, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 3.129530447458822, + "language_loss": 0.86237592, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88423991, + "num_input_tokens_seen": 81518685, + "step": 3787, + "time_per_iteration": 2.5027174949645996 + }, + { + "auxiliary_loss_clip": 0.01131677, + "auxiliary_loss_mlp": 0.01042425, + "balance_loss_clip": 1.05372453, + "balance_loss_mlp": 1.02547336, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.240873370426789, + "language_loss": 0.7276715, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74941248, + "num_input_tokens_seen": 81538940, + "step": 3788, + "time_per_iteration": 4.098814249038696 + }, + { + "auxiliary_loss_clip": 0.01133574, + "auxiliary_loss_mlp": 0.01036121, + "balance_loss_clip": 1.05653214, + "balance_loss_mlp": 1.01810849, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.783850388073628, + "language_loss": 0.67048872, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69218564, + "num_input_tokens_seen": 81555525, + "step": 3789, + "time_per_iteration": 2.501065254211426 + }, + { + "auxiliary_loss_clip": 0.01123088, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.0512929, + "balance_loss_mlp": 1.02763498, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 1.9452854037407155, + "language_loss": 0.75839317, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78009212, + "num_input_tokens_seen": 81576305, + "step": 3790, + "time_per_iteration": 2.5645525455474854 + }, + { + "auxiliary_loss_clip": 0.01099571, + "auxiliary_loss_mlp": 0.01045249, + "balance_loss_clip": 1.05106127, + "balance_loss_mlp": 1.02480507, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 2.017609582283508, + "language_loss": 0.90878201, + "learning_rate": 3.602931823424522e-06, + "loss": 0.9302302, + "num_input_tokens_seen": 81594115, + "step": 3791, + "time_per_iteration": 2.596674919128418 + }, + { + "auxiliary_loss_clip": 0.01145592, + "auxiliary_loss_mlp": 0.01039917, + "balance_loss_clip": 1.05388284, + "balance_loss_mlp": 1.02185678, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.6883371526040651, + "language_loss": 0.82462049, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.84647566, + "num_input_tokens_seen": 81615355, + "step": 3792, + "time_per_iteration": 2.587031126022339 + }, + { + "auxiliary_loss_clip": 0.01072131, + "auxiliary_loss_mlp": 0.01004939, + "balance_loss_clip": 1.03469086, + "balance_loss_mlp": 1.00262594, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.137394785616626, + "language_loss": 0.65641928, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67719001, + "num_input_tokens_seen": 81662075, + "step": 3793, + "time_per_iteration": 2.789313316345215 + }, + { + "auxiliary_loss_clip": 0.01158838, + "auxiliary_loss_mlp": 0.01047835, + "balance_loss_clip": 1.05625272, + "balance_loss_mlp": 1.02869034, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 8.596537881918882, + "language_loss": 0.76655191, + "learning_rate": 3.602232808409293e-06, + "loss": 0.78861868, + "num_input_tokens_seen": 81681625, + "step": 3794, + "time_per_iteration": 2.508838653564453 + }, + { + "auxiliary_loss_clip": 0.01113767, + "auxiliary_loss_mlp": 0.01049196, + "balance_loss_clip": 1.04964948, + "balance_loss_mlp": 1.02842999, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 2.1021416234354855, + "language_loss": 0.80743909, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82906878, + "num_input_tokens_seen": 81701170, + "step": 3795, + "time_per_iteration": 2.614386796951294 + }, + { + "auxiliary_loss_clip": 0.01142513, + "auxiliary_loss_mlp": 0.01048573, + "balance_loss_clip": 1.05709624, + "balance_loss_mlp": 1.03058374, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.652895039939836, + "language_loss": 0.77482653, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79673737, + "num_input_tokens_seen": 81721265, + "step": 3796, + "time_per_iteration": 2.5580036640167236 + }, + { + "auxiliary_loss_clip": 0.01117812, + "auxiliary_loss_mlp": 0.00785949, + "balance_loss_clip": 1.05416453, + "balance_loss_mlp": 1.00108266, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 5.236494665974503, + "language_loss": 0.96248198, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.98151958, + "num_input_tokens_seen": 81736565, + "step": 3797, + "time_per_iteration": 2.537555694580078 + }, + { + "auxiliary_loss_clip": 0.01143422, + "auxiliary_loss_mlp": 0.00784949, + "balance_loss_clip": 1.05438638, + "balance_loss_mlp": 1.00117636, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.6102836283150173, + "language_loss": 0.81617749, + "learning_rate": 3.601299937834666e-06, + "loss": 0.8354612, + "num_input_tokens_seen": 81756240, + "step": 3798, + "time_per_iteration": 2.5715835094451904 + }, + { + "auxiliary_loss_clip": 0.01117681, + "auxiliary_loss_mlp": 0.01040727, + "balance_loss_clip": 1.04844201, + "balance_loss_mlp": 1.02199888, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.003091702852071, + "language_loss": 0.79109502, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.81267917, + "num_input_tokens_seen": 81775720, + "step": 3799, + "time_per_iteration": 2.6205873489379883 + }, + { + "auxiliary_loss_clip": 0.01128817, + "auxiliary_loss_mlp": 0.01051508, + "balance_loss_clip": 1.054286, + "balance_loss_mlp": 1.0306704, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.6176611775026983, + "language_loss": 0.75046331, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77226663, + "num_input_tokens_seen": 81795830, + "step": 3800, + "time_per_iteration": 2.5520856380462646 + }, + { + "auxiliary_loss_clip": 0.01131661, + "auxiliary_loss_mlp": 0.01041081, + "balance_loss_clip": 1.05565405, + "balance_loss_mlp": 1.02428401, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.7497627358343404, + "language_loss": 0.63632566, + "learning_rate": 3.600599647297484e-06, + "loss": 0.65805304, + "num_input_tokens_seen": 81815745, + "step": 3801, + "time_per_iteration": 2.57523512840271 + }, + { + "auxiliary_loss_clip": 0.0113234, + "auxiliary_loss_mlp": 0.01035394, + "balance_loss_clip": 1.05666804, + "balance_loss_mlp": 1.01812005, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.6526238661520147, + "language_loss": 0.81453788, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83621526, + "num_input_tokens_seen": 81835155, + "step": 3802, + "time_per_iteration": 2.5715672969818115 + }, + { + "auxiliary_loss_clip": 0.01127615, + "auxiliary_loss_mlp": 0.01054052, + "balance_loss_clip": 1.05136633, + "balance_loss_mlp": 1.03551507, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.6897345510802468, + "language_loss": 0.78362465, + "learning_rate": 3.600132483450114e-06, + "loss": 0.80544132, + "num_input_tokens_seen": 81855655, + "step": 3803, + "time_per_iteration": 2.589974880218506 + }, + { + "auxiliary_loss_clip": 0.01118793, + "auxiliary_loss_mlp": 0.01047703, + "balance_loss_clip": 1.05020893, + "balance_loss_mlp": 1.02888012, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.6034950866270048, + "language_loss": 0.85328078, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.8749457, + "num_input_tokens_seen": 81876385, + "step": 3804, + "time_per_iteration": 2.5727365016937256 + }, + { + "auxiliary_loss_clip": 0.01148288, + "auxiliary_loss_mlp": 0.01044402, + "balance_loss_clip": 1.05775261, + "balance_loss_mlp": 1.02697313, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.4103644072685886, + "language_loss": 0.76422024, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78614724, + "num_input_tokens_seen": 81893225, + "step": 3805, + "time_per_iteration": 2.4655134677886963 + }, + { + "auxiliary_loss_clip": 0.01136251, + "auxiliary_loss_mlp": 0.00785677, + "balance_loss_clip": 1.05712175, + "balance_loss_mlp": 1.00108421, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.416003147970622, + "language_loss": 0.79150355, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81072289, + "num_input_tokens_seen": 81911350, + "step": 3806, + "time_per_iteration": 2.509540557861328 + }, + { + "auxiliary_loss_clip": 0.0113344, + "auxiliary_loss_mlp": 0.01056947, + "balance_loss_clip": 1.05852175, + "balance_loss_mlp": 1.03716993, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 1.8591040149412437, + "language_loss": 0.69759762, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.71950155, + "num_input_tokens_seen": 81935420, + "step": 3807, + "time_per_iteration": 2.6979758739471436 + }, + { + "auxiliary_loss_clip": 0.01150955, + "auxiliary_loss_mlp": 0.01053352, + "balance_loss_clip": 1.06132889, + "balance_loss_mlp": 1.03476703, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 2.442491133146274, + "language_loss": 0.66115475, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.68319786, + "num_input_tokens_seen": 81953845, + "step": 3808, + "time_per_iteration": 2.528177261352539 + }, + { + "auxiliary_loss_clip": 0.0110578, + "auxiliary_loss_mlp": 0.01052633, + "balance_loss_clip": 1.05255818, + "balance_loss_mlp": 1.03378582, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 2.944763867294607, + "language_loss": 0.75035095, + "learning_rate": 3.598729535939222e-06, + "loss": 0.77193505, + "num_input_tokens_seen": 81972100, + "step": 3809, + "time_per_iteration": 3.982325315475464 + }, + { + "auxiliary_loss_clip": 0.01132638, + "auxiliary_loss_mlp": 0.01045959, + "balance_loss_clip": 1.05501914, + "balance_loss_mlp": 1.02907884, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.787086094450145, + "language_loss": 0.81547916, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83726513, + "num_input_tokens_seen": 81992760, + "step": 3810, + "time_per_iteration": 2.5664329528808594 + }, + { + "auxiliary_loss_clip": 0.01141095, + "auxiliary_loss_mlp": 0.01041971, + "balance_loss_clip": 1.0609324, + "balance_loss_mlp": 1.02528191, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.0496368200639377, + "language_loss": 0.78562766, + "learning_rate": 3.598261401682441e-06, + "loss": 0.80745828, + "num_input_tokens_seen": 82009080, + "step": 3811, + "time_per_iteration": 3.9627201557159424 + }, + { + "auxiliary_loss_clip": 0.01133746, + "auxiliary_loss_mlp": 0.00785179, + "balance_loss_clip": 1.05658674, + "balance_loss_mlp": 1.00105453, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.9273546299659434, + "language_loss": 0.82883656, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.8480258, + "num_input_tokens_seen": 82026705, + "step": 3812, + "time_per_iteration": 2.554037094116211 + }, + { + "auxiliary_loss_clip": 0.01101793, + "auxiliary_loss_mlp": 0.01070437, + "balance_loss_clip": 1.05750346, + "balance_loss_mlp": 1.04969418, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 2.718028530585827, + "language_loss": 0.82471514, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84643745, + "num_input_tokens_seen": 82043245, + "step": 3813, + "time_per_iteration": 2.5673604011535645 + }, + { + "auxiliary_loss_clip": 0.01140824, + "auxiliary_loss_mlp": 0.01051402, + "balance_loss_clip": 1.05499315, + "balance_loss_mlp": 1.03433084, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 1.7656592309763, + "language_loss": 0.70241445, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72433674, + "num_input_tokens_seen": 82066870, + "step": 3814, + "time_per_iteration": 2.6207854747772217 + }, + { + "auxiliary_loss_clip": 0.0114227, + "auxiliary_loss_mlp": 0.01047799, + "balance_loss_clip": 1.05361164, + "balance_loss_mlp": 1.03004861, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 3.112754642487348, + "language_loss": 0.6718089, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69370961, + "num_input_tokens_seen": 82083180, + "step": 3815, + "time_per_iteration": 2.511121988296509 + }, + { + "auxiliary_loss_clip": 0.01148635, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_clip": 1.05883098, + "balance_loss_mlp": 1.02765965, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.745135891682468, + "language_loss": 0.83717787, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85911167, + "num_input_tokens_seen": 82102950, + "step": 3816, + "time_per_iteration": 2.548097848892212 + }, + { + "auxiliary_loss_clip": 0.01144112, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.05760574, + "balance_loss_mlp": 1.02019608, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.246198984378179, + "language_loss": 0.87021768, + "learning_rate": 3.596855544646742e-06, + "loss": 0.89204097, + "num_input_tokens_seen": 82119510, + "step": 3817, + "time_per_iteration": 2.4575791358947754 + }, + { + "auxiliary_loss_clip": 0.01130621, + "auxiliary_loss_mlp": 0.01049535, + "balance_loss_clip": 1.05414498, + "balance_loss_mlp": 1.03189182, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 1.6420819414185603, + "language_loss": 0.74445623, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.76625776, + "num_input_tokens_seen": 82140095, + "step": 3818, + "time_per_iteration": 3.978693723678589 + }, + { + "auxiliary_loss_clip": 0.01145829, + "auxiliary_loss_mlp": 0.01042083, + "balance_loss_clip": 1.05674505, + "balance_loss_mlp": 1.02366495, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.6312163926412586, + "language_loss": 0.74423504, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76611412, + "num_input_tokens_seen": 82159510, + "step": 3819, + "time_per_iteration": 2.507749557495117 + }, + { + "auxiliary_loss_clip": 0.01146427, + "auxiliary_loss_mlp": 0.01044355, + "balance_loss_clip": 1.05915844, + "balance_loss_mlp": 1.02674782, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.7197152058923955, + "language_loss": 0.81027102, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.83217883, + "num_input_tokens_seen": 82179580, + "step": 3820, + "time_per_iteration": 2.574378490447998 + }, + { + "auxiliary_loss_clip": 0.01134349, + "auxiliary_loss_mlp": 0.01043667, + "balance_loss_clip": 1.05706239, + "balance_loss_mlp": 1.02489114, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 1.888357363827342, + "language_loss": 0.69227123, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71405143, + "num_input_tokens_seen": 82195585, + "step": 3821, + "time_per_iteration": 2.4989118576049805 + }, + { + "auxiliary_loss_clip": 0.01098682, + "auxiliary_loss_mlp": 0.01036986, + "balance_loss_clip": 1.05097175, + "balance_loss_mlp": 1.01838946, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.5384404844623543, + "language_loss": 0.83126152, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.85261816, + "num_input_tokens_seen": 82217530, + "step": 3822, + "time_per_iteration": 2.6433091163635254 + }, + { + "auxiliary_loss_clip": 0.01154994, + "auxiliary_loss_mlp": 0.01045302, + "balance_loss_clip": 1.05773926, + "balance_loss_mlp": 1.02678847, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.5336365205152276, + "language_loss": 0.66662294, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68862593, + "num_input_tokens_seen": 82237980, + "step": 3823, + "time_per_iteration": 2.480154514312744 + }, + { + "auxiliary_loss_clip": 0.01058936, + "auxiliary_loss_mlp": 0.01004139, + "balance_loss_clip": 1.0372442, + "balance_loss_mlp": 1.00201726, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 1.2184247774074988, + "language_loss": 0.567644, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58827478, + "num_input_tokens_seen": 82301785, + "step": 3824, + "time_per_iteration": 3.1327273845672607 + }, + { + "auxiliary_loss_clip": 0.01127377, + "auxiliary_loss_mlp": 0.01039399, + "balance_loss_clip": 1.0544914, + "balance_loss_mlp": 1.02218533, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.6089429235611235, + "language_loss": 0.72735715, + "learning_rate": 3.594977677968009e-06, + "loss": 0.74902487, + "num_input_tokens_seen": 82317355, + "step": 3825, + "time_per_iteration": 2.4952046871185303 + }, + { + "auxiliary_loss_clip": 0.01149367, + "auxiliary_loss_mlp": 0.01048827, + "balance_loss_clip": 1.06068444, + "balance_loss_mlp": 1.02915692, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 2.142268282249844, + "language_loss": 0.87764311, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.899625, + "num_input_tokens_seen": 82336645, + "step": 3826, + "time_per_iteration": 2.5269646644592285 + }, + { + "auxiliary_loss_clip": 0.01134775, + "auxiliary_loss_mlp": 0.01047434, + "balance_loss_clip": 1.05759525, + "balance_loss_mlp": 1.02840805, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.328922944187515, + "language_loss": 0.81559873, + "learning_rate": 3.594507606303083e-06, + "loss": 0.83742082, + "num_input_tokens_seen": 82354225, + "step": 3827, + "time_per_iteration": 3.8548660278320312 + }, + { + "auxiliary_loss_clip": 0.01087935, + "auxiliary_loss_mlp": 0.01047467, + "balance_loss_clip": 1.05285716, + "balance_loss_mlp": 1.02861953, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.820874351917951, + "language_loss": 0.86522639, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88658041, + "num_input_tokens_seen": 82370240, + "step": 3828, + "time_per_iteration": 2.5891549587249756 + }, + { + "auxiliary_loss_clip": 0.01127917, + "auxiliary_loss_mlp": 0.01047075, + "balance_loss_clip": 1.05559373, + "balance_loss_mlp": 1.02767968, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 3.2243993168447207, + "language_loss": 0.70576817, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72751808, + "num_input_tokens_seen": 82389145, + "step": 3829, + "time_per_iteration": 2.545708417892456 + }, + { + "auxiliary_loss_clip": 0.01088261, + "auxiliary_loss_mlp": 0.01043647, + "balance_loss_clip": 1.04767418, + "balance_loss_mlp": 1.02689219, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.7658479109889298, + "language_loss": 0.84265047, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86396956, + "num_input_tokens_seen": 82409185, + "step": 3830, + "time_per_iteration": 2.702144145965576 + }, + { + "auxiliary_loss_clip": 0.01140501, + "auxiliary_loss_mlp": 0.01053646, + "balance_loss_clip": 1.0589366, + "balance_loss_mlp": 1.03571749, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.6983975251773598, + "language_loss": 0.66932064, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69126213, + "num_input_tokens_seen": 82432070, + "step": 3831, + "time_per_iteration": 2.717650890350342 + }, + { + "auxiliary_loss_clip": 0.01115114, + "auxiliary_loss_mlp": 0.01050518, + "balance_loss_clip": 1.05418241, + "balance_loss_mlp": 1.03162301, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 2.39968100980789, + "language_loss": 0.75419617, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.7758525, + "num_input_tokens_seen": 82450625, + "step": 3832, + "time_per_iteration": 2.6097025871276855 + }, + { + "auxiliary_loss_clip": 0.01103479, + "auxiliary_loss_mlp": 0.01039905, + "balance_loss_clip": 1.05161619, + "balance_loss_mlp": 1.02092671, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 2.022145150417908, + "language_loss": 0.87317437, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89460826, + "num_input_tokens_seen": 82468575, + "step": 3833, + "time_per_iteration": 2.5789921283721924 + }, + { + "auxiliary_loss_clip": 0.01118637, + "auxiliary_loss_mlp": 0.0104616, + "balance_loss_clip": 1.05047441, + "balance_loss_mlp": 1.02750397, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 1.710121753762833, + "language_loss": 0.74937582, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77102381, + "num_input_tokens_seen": 82488655, + "step": 3834, + "time_per_iteration": 2.6068170070648193 + }, + { + "auxiliary_loss_clip": 0.01109287, + "auxiliary_loss_mlp": 0.01059743, + "balance_loss_clip": 1.04833376, + "balance_loss_mlp": 1.03790438, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.888918686839905, + "language_loss": 0.85495627, + "learning_rate": 3.592624901801432e-06, + "loss": 0.87664658, + "num_input_tokens_seen": 82507220, + "step": 3835, + "time_per_iteration": 2.5785951614379883 + }, + { + "auxiliary_loss_clip": 0.01120301, + "auxiliary_loss_mlp": 0.01055489, + "balance_loss_clip": 1.05109429, + "balance_loss_mlp": 1.03415024, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.9338260762179216, + "language_loss": 0.82496572, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84672356, + "num_input_tokens_seen": 82527920, + "step": 3836, + "time_per_iteration": 2.5927700996398926 + }, + { + "auxiliary_loss_clip": 0.01145677, + "auxiliary_loss_mlp": 0.01050158, + "balance_loss_clip": 1.06104088, + "balance_loss_mlp": 1.03225279, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.8017567793291072, + "language_loss": 0.7959078, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81786609, + "num_input_tokens_seen": 82549040, + "step": 3837, + "time_per_iteration": 2.532169818878174 + }, + { + "auxiliary_loss_clip": 0.01062096, + "auxiliary_loss_mlp": 0.01022821, + "balance_loss_clip": 1.05347311, + "balance_loss_mlp": 1.02085423, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9111376196035372, + "language_loss": 0.65449953, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67534864, + "num_input_tokens_seen": 82604070, + "step": 3838, + "time_per_iteration": 3.0602569580078125 + }, + { + "auxiliary_loss_clip": 0.01139898, + "auxiliary_loss_mlp": 0.01049816, + "balance_loss_clip": 1.05633116, + "balance_loss_mlp": 1.03232813, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 1.8991128126558714, + "language_loss": 0.75356805, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77546513, + "num_input_tokens_seen": 82619665, + "step": 3839, + "time_per_iteration": 2.4715137481689453 + }, + { + "auxiliary_loss_clip": 0.01129093, + "auxiliary_loss_mlp": 0.01048569, + "balance_loss_clip": 1.05657935, + "balance_loss_mlp": 1.03010368, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 1.8422949600756506, + "language_loss": 0.69070071, + "learning_rate": 3.591446248441752e-06, + "loss": 0.71247733, + "num_input_tokens_seen": 82637530, + "step": 3840, + "time_per_iteration": 2.5074353218078613 + }, + { + "auxiliary_loss_clip": 0.01158456, + "auxiliary_loss_mlp": 0.01047255, + "balance_loss_clip": 1.05956435, + "balance_loss_mlp": 1.02763367, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 2.1759701542126106, + "language_loss": 0.79503983, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81709695, + "num_input_tokens_seen": 82656130, + "step": 3841, + "time_per_iteration": 2.45642352104187 + }, + { + "auxiliary_loss_clip": 0.01146183, + "auxiliary_loss_mlp": 0.01042952, + "balance_loss_clip": 1.05951834, + "balance_loss_mlp": 1.02645385, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 1.978899001478149, + "language_loss": 0.82863617, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85052758, + "num_input_tokens_seen": 82675295, + "step": 3842, + "time_per_iteration": 2.5240352153778076 + }, + { + "auxiliary_loss_clip": 0.01146954, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_clip": 1.05697632, + "balance_loss_mlp": 1.02654898, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.546971669181229, + "language_loss": 0.66417265, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68609536, + "num_input_tokens_seen": 82703260, + "step": 3843, + "time_per_iteration": 2.702266216278076 + }, + { + "auxiliary_loss_clip": 0.01140928, + "auxiliary_loss_mlp": 0.01043967, + "balance_loss_clip": 1.05472362, + "balance_loss_mlp": 1.02582335, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.9264155034143022, + "language_loss": 0.77408254, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79593152, + "num_input_tokens_seen": 82725060, + "step": 3844, + "time_per_iteration": 2.6316139698028564 + }, + { + "auxiliary_loss_clip": 0.0114367, + "auxiliary_loss_mlp": 0.0104497, + "balance_loss_clip": 1.05601168, + "balance_loss_mlp": 1.02571762, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.6878375233684768, + "language_loss": 0.785209, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80709541, + "num_input_tokens_seen": 82742960, + "step": 3845, + "time_per_iteration": 2.499101400375366 + }, + { + "auxiliary_loss_clip": 0.01123107, + "auxiliary_loss_mlp": 0.01036006, + "balance_loss_clip": 1.05795491, + "balance_loss_mlp": 1.01965022, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.212267787288095, + "language_loss": 0.7667402, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78833139, + "num_input_tokens_seen": 82760205, + "step": 3846, + "time_per_iteration": 2.580841064453125 + }, + { + "auxiliary_loss_clip": 0.01133106, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.05466104, + "balance_loss_mlp": 1.02880406, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 1.9475566454480253, + "language_loss": 0.69504356, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71683538, + "num_input_tokens_seen": 82778590, + "step": 3847, + "time_per_iteration": 2.5264346599578857 + }, + { + "auxiliary_loss_clip": 0.01062853, + "auxiliary_loss_mlp": 0.01004574, + "balance_loss_clip": 1.04452252, + "balance_loss_mlp": 1.00245225, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7882570668872865, + "language_loss": 0.61030227, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63097656, + "num_input_tokens_seen": 82833925, + "step": 3848, + "time_per_iteration": 4.419454097747803 + }, + { + "auxiliary_loss_clip": 0.01139752, + "auxiliary_loss_mlp": 0.01046706, + "balance_loss_clip": 1.05398083, + "balance_loss_mlp": 1.02832413, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 2.635463139847976, + "language_loss": 0.77982169, + "learning_rate": 3.589320871234923e-06, + "loss": 0.80168629, + "num_input_tokens_seen": 82850625, + "step": 3849, + "time_per_iteration": 2.4847731590270996 + }, + { + "auxiliary_loss_clip": 0.01139818, + "auxiliary_loss_mlp": 0.01042397, + "balance_loss_clip": 1.05461502, + "balance_loss_mlp": 1.02458704, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 2.146692756369339, + "language_loss": 0.71219885, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73402101, + "num_input_tokens_seen": 82872105, + "step": 3850, + "time_per_iteration": 2.7016327381134033 + }, + { + "auxiliary_loss_clip": 0.01122021, + "auxiliary_loss_mlp": 0.00786487, + "balance_loss_clip": 1.05019522, + "balance_loss_mlp": 1.00081813, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 2.1292652956812312, + "language_loss": 0.7641325, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78321755, + "num_input_tokens_seen": 82890595, + "step": 3851, + "time_per_iteration": 4.011887788772583 + }, + { + "auxiliary_loss_clip": 0.01151633, + "auxiliary_loss_mlp": 0.01039513, + "balance_loss_clip": 1.05762422, + "balance_loss_mlp": 1.02198958, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.7410909009275564, + "language_loss": 0.6948474, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71675885, + "num_input_tokens_seen": 82908910, + "step": 3852, + "time_per_iteration": 2.503011465072632 + }, + { + "auxiliary_loss_clip": 0.01111683, + "auxiliary_loss_mlp": 0.01050813, + "balance_loss_clip": 1.05096686, + "balance_loss_mlp": 1.03172755, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 2.4662335747148774, + "language_loss": 0.67245966, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69408464, + "num_input_tokens_seen": 82925405, + "step": 3853, + "time_per_iteration": 2.5344481468200684 + }, + { + "auxiliary_loss_clip": 0.01145596, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.05754566, + "balance_loss_mlp": 1.02495742, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.7832913704609525, + "language_loss": 0.7978251, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.81971407, + "num_input_tokens_seen": 82945615, + "step": 3854, + "time_per_iteration": 2.601773500442505 + }, + { + "auxiliary_loss_clip": 0.01121162, + "auxiliary_loss_mlp": 0.01057606, + "balance_loss_clip": 1.05007625, + "balance_loss_mlp": 1.03580284, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 2.4787798579292466, + "language_loss": 0.64887619, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67066383, + "num_input_tokens_seen": 82967570, + "step": 3855, + "time_per_iteration": 2.5626561641693115 + }, + { + "auxiliary_loss_clip": 0.01154379, + "auxiliary_loss_mlp": 0.01047891, + "balance_loss_clip": 1.0558989, + "balance_loss_mlp": 1.03008127, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 2.3163733029848226, + "language_loss": 0.70876861, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.73079121, + "num_input_tokens_seen": 82987435, + "step": 3856, + "time_per_iteration": 2.567153215408325 + }, + { + "auxiliary_loss_clip": 0.01110396, + "auxiliary_loss_mlp": 0.01039839, + "balance_loss_clip": 1.05819559, + "balance_loss_mlp": 1.0240674, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 1.800007836525421, + "language_loss": 0.77583861, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79734099, + "num_input_tokens_seen": 83010505, + "step": 3857, + "time_per_iteration": 4.142800569534302 + }, + { + "auxiliary_loss_clip": 0.01142983, + "auxiliary_loss_mlp": 0.00786122, + "balance_loss_clip": 1.05861771, + "balance_loss_mlp": 1.00080585, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 2.3652908285982415, + "language_loss": 0.91494215, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93423325, + "num_input_tokens_seen": 83026705, + "step": 3858, + "time_per_iteration": 2.5137076377868652 + }, + { + "auxiliary_loss_clip": 0.01100649, + "auxiliary_loss_mlp": 0.01038701, + "balance_loss_clip": 1.05569303, + "balance_loss_mlp": 1.02166593, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 1.8904044245504459, + "language_loss": 0.764884, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78627753, + "num_input_tokens_seen": 83046500, + "step": 3859, + "time_per_iteration": 2.60605788230896 + }, + { + "auxiliary_loss_clip": 0.01140003, + "auxiliary_loss_mlp": 0.01038214, + "balance_loss_clip": 1.05419195, + "balance_loss_mlp": 1.02054691, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.6883148704177282, + "language_loss": 0.84182835, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86361051, + "num_input_tokens_seen": 83065280, + "step": 3860, + "time_per_iteration": 2.51719331741333 + }, + { + "auxiliary_loss_clip": 0.01095963, + "auxiliary_loss_mlp": 0.0104648, + "balance_loss_clip": 1.04895878, + "balance_loss_mlp": 1.02794266, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 1.8321833596459107, + "language_loss": 0.83271438, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85413879, + "num_input_tokens_seen": 83082310, + "step": 3861, + "time_per_iteration": 2.579359769821167 + }, + { + "auxiliary_loss_clip": 0.01134468, + "auxiliary_loss_mlp": 0.0078597, + "balance_loss_clip": 1.05419016, + "balance_loss_mlp": 1.00077891, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.7542236626155612, + "language_loss": 0.85954642, + "learning_rate": 3.586242265438576e-06, + "loss": 0.8787508, + "num_input_tokens_seen": 83102065, + "step": 3862, + "time_per_iteration": 2.523968458175659 + }, + { + "auxiliary_loss_clip": 0.01115311, + "auxiliary_loss_mlp": 0.01042369, + "balance_loss_clip": 1.05288792, + "balance_loss_mlp": 1.02680039, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 2.5814217624128766, + "language_loss": 0.75035703, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.77193379, + "num_input_tokens_seen": 83121445, + "step": 3863, + "time_per_iteration": 2.601102590560913 + }, + { + "auxiliary_loss_clip": 0.01115673, + "auxiliary_loss_mlp": 0.01046069, + "balance_loss_clip": 1.05772507, + "balance_loss_mlp": 1.02980852, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 1.801027874810135, + "language_loss": 0.74610549, + "learning_rate": 3.58576773102631e-06, + "loss": 0.76772296, + "num_input_tokens_seen": 83138175, + "step": 3864, + "time_per_iteration": 2.5541727542877197 + }, + { + "auxiliary_loss_clip": 0.01150182, + "auxiliary_loss_mlp": 0.01040374, + "balance_loss_clip": 1.05486059, + "balance_loss_mlp": 1.02366042, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 3.258839402260576, + "language_loss": 0.71084881, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.73275435, + "num_input_tokens_seen": 83161975, + "step": 3865, + "time_per_iteration": 2.600959300994873 + }, + { + "auxiliary_loss_clip": 0.01161296, + "auxiliary_loss_mlp": 0.01051614, + "balance_loss_clip": 1.05905116, + "balance_loss_mlp": 1.03249276, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 2.0708152531648962, + "language_loss": 0.94684482, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.96897388, + "num_input_tokens_seen": 83180905, + "step": 3866, + "time_per_iteration": 4.021198749542236 + }, + { + "auxiliary_loss_clip": 0.01138387, + "auxiliary_loss_mlp": 0.01048181, + "balance_loss_clip": 1.05450821, + "balance_loss_mlp": 1.03118205, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 2.5764720318450856, + "language_loss": 0.7358886, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.75775427, + "num_input_tokens_seen": 83196390, + "step": 3867, + "time_per_iteration": 2.489424467086792 + }, + { + "auxiliary_loss_clip": 0.01137447, + "auxiliary_loss_mlp": 0.01046315, + "balance_loss_clip": 1.05810511, + "balance_loss_mlp": 1.02860057, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.8122439479267556, + "language_loss": 0.82297885, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84481657, + "num_input_tokens_seen": 83216165, + "step": 3868, + "time_per_iteration": 2.6404621601104736 + }, + { + "auxiliary_loss_clip": 0.01131299, + "auxiliary_loss_mlp": 0.01048649, + "balance_loss_clip": 1.05294609, + "balance_loss_mlp": 1.03112531, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.6139001710597485, + "language_loss": 0.73056149, + "learning_rate": 3.58458034283495e-06, + "loss": 0.752361, + "num_input_tokens_seen": 83233845, + "step": 3869, + "time_per_iteration": 2.558760404586792 + }, + { + "auxiliary_loss_clip": 0.01137353, + "auxiliary_loss_mlp": 0.0104946, + "balance_loss_clip": 1.0550108, + "balance_loss_mlp": 1.03237772, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 1.7540180511462606, + "language_loss": 0.79628038, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81814849, + "num_input_tokens_seen": 83254930, + "step": 3870, + "time_per_iteration": 2.653184413909912 + }, + { + "auxiliary_loss_clip": 0.01158148, + "auxiliary_loss_mlp": 0.01042514, + "balance_loss_clip": 1.05906296, + "balance_loss_mlp": 1.02460873, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 2.172274499952217, + "language_loss": 0.70474279, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72674936, + "num_input_tokens_seen": 83272095, + "step": 3871, + "time_per_iteration": 2.4969823360443115 + }, + { + "auxiliary_loss_clip": 0.01143134, + "auxiliary_loss_mlp": 0.01057838, + "balance_loss_clip": 1.05855322, + "balance_loss_mlp": 1.03931284, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 1.869056649006176, + "language_loss": 0.69193447, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.7139442, + "num_input_tokens_seen": 83290980, + "step": 3872, + "time_per_iteration": 2.5726194381713867 + }, + { + "auxiliary_loss_clip": 0.01150357, + "auxiliary_loss_mlp": 0.01044935, + "balance_loss_clip": 1.05792522, + "balance_loss_mlp": 1.02582574, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.5118001709968036, + "language_loss": 0.77750158, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.79945451, + "num_input_tokens_seen": 83315175, + "step": 3873, + "time_per_iteration": 2.7350428104400635 + }, + { + "auxiliary_loss_clip": 0.01046824, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.03087592, + "balance_loss_mlp": 1.03646564, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.8547550204935557, + "language_loss": 0.60550237, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.62635267, + "num_input_tokens_seen": 83372060, + "step": 3874, + "time_per_iteration": 3.096970796585083 + }, + { + "auxiliary_loss_clip": 0.01131617, + "auxiliary_loss_mlp": 0.01045904, + "balance_loss_clip": 1.05506897, + "balance_loss_mlp": 1.02778435, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.243696645246746, + "language_loss": 0.80769742, + "learning_rate": 3.583153494218927e-06, + "loss": 0.82947254, + "num_input_tokens_seen": 83389795, + "step": 3875, + "time_per_iteration": 2.6357555389404297 + }, + { + "auxiliary_loss_clip": 0.01153304, + "auxiliary_loss_mlp": 0.00783704, + "balance_loss_clip": 1.05861413, + "balance_loss_mlp": 1.00087214, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.8045062298102348, + "language_loss": 0.61293232, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.6323024, + "num_input_tokens_seen": 83410005, + "step": 3876, + "time_per_iteration": 2.5858349800109863 + }, + { + "auxiliary_loss_clip": 0.01121008, + "auxiliary_loss_mlp": 0.01047257, + "balance_loss_clip": 1.05208707, + "balance_loss_mlp": 1.02836275, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.6459604262890852, + "language_loss": 0.70668381, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.72836649, + "num_input_tokens_seen": 83430250, + "step": 3877, + "time_per_iteration": 2.6055655479431152 + }, + { + "auxiliary_loss_clip": 0.01143768, + "auxiliary_loss_mlp": 0.01049656, + "balance_loss_clip": 1.05600309, + "balance_loss_mlp": 1.0310235, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.5031908734422927, + "language_loss": 0.80634236, + "learning_rate": 3.582439259339073e-06, + "loss": 0.82827657, + "num_input_tokens_seen": 83447950, + "step": 3878, + "time_per_iteration": 2.4847214221954346 + }, + { + "auxiliary_loss_clip": 0.01093397, + "auxiliary_loss_mlp": 0.0104997, + "balance_loss_clip": 1.04489636, + "balance_loss_mlp": 1.03009772, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 1.5297791143129027, + "language_loss": 0.75137591, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.77280962, + "num_input_tokens_seen": 83467785, + "step": 3879, + "time_per_iteration": 2.7450215816497803 + }, + { + "auxiliary_loss_clip": 0.01100565, + "auxiliary_loss_mlp": 0.01043014, + "balance_loss_clip": 1.04805994, + "balance_loss_mlp": 1.02410722, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.223728492710815, + "language_loss": 0.89819407, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.91962981, + "num_input_tokens_seen": 83485390, + "step": 3880, + "time_per_iteration": 2.6126232147216797 + }, + { + "auxiliary_loss_clip": 0.01129728, + "auxiliary_loss_mlp": 0.01046961, + "balance_loss_clip": 1.05353606, + "balance_loss_mlp": 1.02953279, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.6366381478435947, + "language_loss": 0.72187966, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.7436465, + "num_input_tokens_seen": 83504890, + "step": 3881, + "time_per_iteration": 2.5322086811065674 + }, + { + "auxiliary_loss_clip": 0.0115026, + "auxiliary_loss_mlp": 0.0104243, + "balance_loss_clip": 1.05261374, + "balance_loss_mlp": 1.02402377, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.4986035951030252, + "language_loss": 0.67687726, + "learning_rate": 3.581486106120537e-06, + "loss": 0.69880414, + "num_input_tokens_seen": 83526475, + "step": 3882, + "time_per_iteration": 2.5308260917663574 + }, + { + "auxiliary_loss_clip": 0.01111532, + "auxiliary_loss_mlp": 0.01053912, + "balance_loss_clip": 1.04653585, + "balance_loss_mlp": 1.0348978, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 1.961768121586717, + "language_loss": 0.77306741, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.79472184, + "num_input_tokens_seen": 83546620, + "step": 3883, + "time_per_iteration": 2.649127244949341 + }, + { + "auxiliary_loss_clip": 0.01048808, + "auxiliary_loss_mlp": 0.01014171, + "balance_loss_clip": 1.03168046, + "balance_loss_mlp": 1.01191807, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.8004473277086966, + "language_loss": 0.5913536, + "learning_rate": 3.58100916965445e-06, + "loss": 0.61198336, + "num_input_tokens_seen": 83616160, + "step": 3884, + "time_per_iteration": 3.2456326484680176 + }, + { + "auxiliary_loss_clip": 0.01118081, + "auxiliary_loss_mlp": 0.01036331, + "balance_loss_clip": 1.04952347, + "balance_loss_mlp": 1.01873565, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 1.6666208568642507, + "language_loss": 0.80300575, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82454991, + "num_input_tokens_seen": 83636795, + "step": 3885, + "time_per_iteration": 2.6091673374176025 + }, + { + "auxiliary_loss_clip": 0.0113573, + "auxiliary_loss_mlp": 0.01034879, + "balance_loss_clip": 1.05355191, + "balance_loss_mlp": 1.01678348, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.1190224304734127, + "language_loss": 0.88003886, + "learning_rate": 3.580531993380261e-06, + "loss": 0.90174496, + "num_input_tokens_seen": 83654050, + "step": 3886, + "time_per_iteration": 2.4778778553009033 + }, + { + "auxiliary_loss_clip": 0.01152893, + "auxiliary_loss_mlp": 0.01041598, + "balance_loss_clip": 1.0556829, + "balance_loss_mlp": 1.02374053, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 2.007050840658286, + "language_loss": 0.73680377, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75874871, + "num_input_tokens_seen": 83673720, + "step": 3887, + "time_per_iteration": 2.535423994064331 + }, + { + "auxiliary_loss_clip": 0.01140888, + "auxiliary_loss_mlp": 0.01045653, + "balance_loss_clip": 1.05296922, + "balance_loss_mlp": 1.02773619, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 1.7450553732901721, + "language_loss": 0.84267569, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86454117, + "num_input_tokens_seen": 83693470, + "step": 3888, + "time_per_iteration": 4.065736532211304 + }, + { + "auxiliary_loss_clip": 0.01124499, + "auxiliary_loss_mlp": 0.01055967, + "balance_loss_clip": 1.05083156, + "balance_loss_mlp": 1.0362618, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 4.407817121698441, + "language_loss": 0.87267387, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89447856, + "num_input_tokens_seen": 83711620, + "step": 3889, + "time_per_iteration": 2.5164365768432617 + }, + { + "auxiliary_loss_clip": 0.01139171, + "auxiliary_loss_mlp": 0.01040418, + "balance_loss_clip": 1.05109286, + "balance_loss_mlp": 1.02321577, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 2.80335478110673, + "language_loss": 0.76765478, + "learning_rate": 3.579576921697125e-06, + "loss": 0.78945071, + "num_input_tokens_seen": 83727890, + "step": 3890, + "time_per_iteration": 2.48508882522583 + }, + { + "auxiliary_loss_clip": 0.01109577, + "auxiliary_loss_mlp": 0.00784609, + "balance_loss_clip": 1.0512712, + "balance_loss_mlp": 1.00088072, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 1.8525473494060964, + "language_loss": 0.74050975, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75945157, + "num_input_tokens_seen": 83749370, + "step": 3891, + "time_per_iteration": 4.308088541030884 + }, + { + "auxiliary_loss_clip": 0.01147036, + "auxiliary_loss_mlp": 0.01043045, + "balance_loss_clip": 1.05230558, + "balance_loss_mlp": 1.02469873, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.5945349024031146, + "language_loss": 0.82611454, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.84801531, + "num_input_tokens_seen": 83769560, + "step": 3892, + "time_per_iteration": 2.483414649963379 + }, + { + "auxiliary_loss_clip": 0.01103292, + "auxiliary_loss_mlp": 0.01053513, + "balance_loss_clip": 1.04366791, + "balance_loss_mlp": 1.03281808, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 1.814245061507953, + "language_loss": 0.647026, + "learning_rate": 3.578859988977082e-06, + "loss": 0.66859406, + "num_input_tokens_seen": 83795635, + "step": 3893, + "time_per_iteration": 2.741717576980591 + }, + { + "auxiliary_loss_clip": 0.01108099, + "auxiliary_loss_mlp": 0.01040714, + "balance_loss_clip": 1.04849637, + "balance_loss_mlp": 1.02218878, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 1.927572564958424, + "language_loss": 0.79368079, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81516892, + "num_input_tokens_seen": 83814090, + "step": 3894, + "time_per_iteration": 2.570476770401001 + }, + { + "auxiliary_loss_clip": 0.0113592, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_clip": 1.04822588, + "balance_loss_mlp": 1.02292395, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.4755521030701797, + "language_loss": 0.82068837, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.84245199, + "num_input_tokens_seen": 83836870, + "step": 3895, + "time_per_iteration": 2.6189064979553223 + }, + { + "auxiliary_loss_clip": 0.01138587, + "auxiliary_loss_mlp": 0.01043523, + "balance_loss_clip": 1.05310464, + "balance_loss_mlp": 1.02586806, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 8.15939693440127, + "language_loss": 0.80109107, + "learning_rate": 3.578142517422292e-06, + "loss": 0.8229121, + "num_input_tokens_seen": 83853275, + "step": 3896, + "time_per_iteration": 2.4739878177642822 + }, + { + "auxiliary_loss_clip": 0.01127524, + "auxiliary_loss_mlp": 0.01043069, + "balance_loss_clip": 1.04714334, + "balance_loss_mlp": 1.02441335, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.6893855553303567, + "language_loss": 0.83071148, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85241747, + "num_input_tokens_seen": 83872340, + "step": 3897, + "time_per_iteration": 3.944967031478882 + }, + { + "auxiliary_loss_clip": 0.0113511, + "auxiliary_loss_mlp": 0.01047686, + "balance_loss_clip": 1.05245888, + "balance_loss_mlp": 1.02956605, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 1.6221897642407428, + "language_loss": 0.7909143, + "learning_rate": 3.577663903820705e-06, + "loss": 0.81274229, + "num_input_tokens_seen": 83888795, + "step": 3898, + "time_per_iteration": 2.449551582336426 + }, + { + "auxiliary_loss_clip": 0.01112163, + "auxiliary_loss_mlp": 0.01055653, + "balance_loss_clip": 1.04715967, + "balance_loss_mlp": 1.03781939, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 2.3807341185789577, + "language_loss": 0.73808759, + "learning_rate": 3.577424507277614e-06, + "loss": 0.75976574, + "num_input_tokens_seen": 83906820, + "step": 3899, + "time_per_iteration": 2.5695016384124756 + }, + { + "auxiliary_loss_clip": 0.01113799, + "auxiliary_loss_mlp": 0.01053163, + "balance_loss_clip": 1.04863763, + "balance_loss_mlp": 1.03543639, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.745683828428116, + "language_loss": 0.75330484, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77497447, + "num_input_tokens_seen": 83926370, + "step": 3900, + "time_per_iteration": 2.57889461517334 + }, + { + "auxiliary_loss_clip": 0.01098207, + "auxiliary_loss_mlp": 0.01052799, + "balance_loss_clip": 1.04728365, + "balance_loss_mlp": 1.03563285, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 2.6073085071309174, + "language_loss": 0.66897553, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.6904856, + "num_input_tokens_seen": 83944600, + "step": 3901, + "time_per_iteration": 2.648724317550659 + }, + { + "auxiliary_loss_clip": 0.01020883, + "auxiliary_loss_mlp": 0.01011879, + "balance_loss_clip": 1.02143347, + "balance_loss_mlp": 1.0099721, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7602317024004573, + "language_loss": 0.58249187, + "learning_rate": 3.576705958788091e-06, + "loss": 0.6028195, + "num_input_tokens_seen": 84005100, + "step": 3902, + "time_per_iteration": 3.11734938621521 + }, + { + "auxiliary_loss_clip": 0.01125887, + "auxiliary_loss_mlp": 0.01044856, + "balance_loss_clip": 1.04987478, + "balance_loss_mlp": 1.0267477, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 1.918434544490414, + "language_loss": 0.800421, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82212842, + "num_input_tokens_seen": 84023775, + "step": 3903, + "time_per_iteration": 2.52140212059021 + }, + { + "auxiliary_loss_clip": 0.01094316, + "auxiliary_loss_mlp": 0.01045049, + "balance_loss_clip": 1.04064536, + "balance_loss_mlp": 1.02707195, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 1.8968864846774411, + "language_loss": 0.82070017, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84209377, + "num_input_tokens_seen": 84042605, + "step": 3904, + "time_per_iteration": 2.607509136199951 + }, + { + "auxiliary_loss_clip": 0.01147186, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_clip": 1.05167317, + "balance_loss_mlp": 1.02897704, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 2.0991001668545715, + "language_loss": 0.71177667, + "learning_rate": 3.57598687219895e-06, + "loss": 0.73371363, + "num_input_tokens_seen": 84061520, + "step": 3905, + "time_per_iteration": 2.470345973968506 + }, + { + "auxiliary_loss_clip": 0.01143809, + "auxiliary_loss_mlp": 0.01038444, + "balance_loss_clip": 1.05015731, + "balance_loss_mlp": 1.02133751, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 2.4686914548163204, + "language_loss": 0.71154445, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73336697, + "num_input_tokens_seen": 84081800, + "step": 3906, + "time_per_iteration": 3.956364393234253 + }, + { + "auxiliary_loss_clip": 0.01139327, + "auxiliary_loss_mlp": 0.0103989, + "balance_loss_clip": 1.04739141, + "balance_loss_mlp": 1.02001739, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 2.440365736848873, + "language_loss": 0.73375106, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75554317, + "num_input_tokens_seen": 84102340, + "step": 3907, + "time_per_iteration": 2.5598156452178955 + }, + { + "auxiliary_loss_clip": 0.011347, + "auxiliary_loss_mlp": 0.01045314, + "balance_loss_clip": 1.0485177, + "balance_loss_mlp": 1.02661014, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.6549392874029532, + "language_loss": 0.72918969, + "learning_rate": 3.575267247755601e-06, + "loss": 0.75098991, + "num_input_tokens_seen": 84120370, + "step": 3908, + "time_per_iteration": 2.5057554244995117 + }, + { + "auxiliary_loss_clip": 0.01048303, + "auxiliary_loss_mlp": 0.01006084, + "balance_loss_clip": 1.03160548, + "balance_loss_mlp": 1.00386667, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.031745811332319, + "language_loss": 0.73372865, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75427252, + "num_input_tokens_seen": 84165515, + "step": 3909, + "time_per_iteration": 2.8369321823120117 + }, + { + "auxiliary_loss_clip": 0.0113848, + "auxiliary_loss_mlp": 0.01041934, + "balance_loss_clip": 1.05020404, + "balance_loss_mlp": 1.02379048, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.5237643777462508, + "language_loss": 0.88138664, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.90319073, + "num_input_tokens_seen": 84184540, + "step": 3910, + "time_per_iteration": 2.511261463165283 + }, + { + "auxiliary_loss_clip": 0.01139339, + "auxiliary_loss_mlp": 0.01042758, + "balance_loss_clip": 1.05277443, + "balance_loss_mlp": 1.0256989, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.035396611334757, + "language_loss": 0.7599076, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.78172857, + "num_input_tokens_seen": 84202025, + "step": 3911, + "time_per_iteration": 2.484623670578003 + }, + { + "auxiliary_loss_clip": 0.01132288, + "auxiliary_loss_mlp": 0.01044356, + "balance_loss_clip": 1.04961872, + "balance_loss_mlp": 1.02808404, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.495931175534552, + "language_loss": 0.81915843, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.84092486, + "num_input_tokens_seen": 84221895, + "step": 3912, + "time_per_iteration": 2.495968818664551 + }, + { + "auxiliary_loss_clip": 0.01125528, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_clip": 1.05082202, + "balance_loss_mlp": 1.03348613, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 2.136780114576803, + "language_loss": 0.71750385, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73927504, + "num_input_tokens_seen": 84240455, + "step": 3913, + "time_per_iteration": 2.546982526779175 + }, + { + "auxiliary_loss_clip": 0.01144001, + "auxiliary_loss_mlp": 0.0078629, + "balance_loss_clip": 1.05125737, + "balance_loss_mlp": 1.00084472, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 2.0311316580728973, + "language_loss": 0.76044738, + "learning_rate": 3.57382638628884e-06, + "loss": 0.77975023, + "num_input_tokens_seen": 84261605, + "step": 3914, + "time_per_iteration": 2.520328998565674 + }, + { + "auxiliary_loss_clip": 0.01092684, + "auxiliary_loss_mlp": 0.01044142, + "balance_loss_clip": 1.05099344, + "balance_loss_mlp": 1.02555764, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.7299703635776567, + "language_loss": 0.89844608, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.91981435, + "num_input_tokens_seen": 84278675, + "step": 3915, + "time_per_iteration": 2.610725164413452 + }, + { + "auxiliary_loss_clip": 0.01028233, + "auxiliary_loss_mlp": 0.010156, + "balance_loss_clip": 1.01332724, + "balance_loss_mlp": 1.01375198, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.9352358768493254, + "language_loss": 0.59398925, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61442763, + "num_input_tokens_seen": 84329765, + "step": 3916, + "time_per_iteration": 3.0043892860412598 + }, + { + "auxiliary_loss_clip": 0.01015826, + "auxiliary_loss_mlp": 0.0101336, + "balance_loss_clip": 1.02403009, + "balance_loss_mlp": 1.0115124, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7647990922189434, + "language_loss": 0.49463528, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51492715, + "num_input_tokens_seen": 84393680, + "step": 3917, + "time_per_iteration": 3.1635918617248535 + }, + { + "auxiliary_loss_clip": 0.01110859, + "auxiliary_loss_mlp": 0.01056542, + "balance_loss_clip": 1.0475477, + "balance_loss_mlp": 1.03943598, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 1.8596142801490312, + "language_loss": 0.76769358, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78936756, + "num_input_tokens_seen": 84412640, + "step": 3918, + "time_per_iteration": 2.5634074211120605 + }, + { + "auxiliary_loss_clip": 0.01102449, + "auxiliary_loss_mlp": 0.01051693, + "balance_loss_clip": 1.04456496, + "balance_loss_mlp": 1.0344193, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 2.709972911550876, + "language_loss": 0.68963647, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.71117795, + "num_input_tokens_seen": 84431605, + "step": 3919, + "time_per_iteration": 2.5695624351501465 + }, + { + "auxiliary_loss_clip": 0.01111064, + "auxiliary_loss_mlp": 0.01041955, + "balance_loss_clip": 1.05218005, + "balance_loss_mlp": 1.02419233, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 2.402126190872193, + "language_loss": 0.70579076, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72732091, + "num_input_tokens_seen": 84454210, + "step": 3920, + "time_per_iteration": 2.6832947731018066 + }, + { + "auxiliary_loss_clip": 0.01122455, + "auxiliary_loss_mlp": 0.01042218, + "balance_loss_clip": 1.04928076, + "balance_loss_mlp": 1.02529049, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.5467354326221545, + "language_loss": 0.77011877, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79176551, + "num_input_tokens_seen": 84475540, + "step": 3921, + "time_per_iteration": 2.5749778747558594 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.05240357, + "balance_loss_mlp": 1.02797234, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.594837017546704, + "language_loss": 0.75071704, + "learning_rate": 3.571901895946612e-06, + "loss": 0.77234185, + "num_input_tokens_seen": 84494580, + "step": 3922, + "time_per_iteration": 2.534848213195801 + }, + { + "auxiliary_loss_clip": 0.01117596, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.04984725, + "balance_loss_mlp": 1.02341032, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 1.9902942859651205, + "language_loss": 0.80670524, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82828969, + "num_input_tokens_seen": 84513850, + "step": 3923, + "time_per_iteration": 2.653245210647583 + }, + { + "auxiliary_loss_clip": 0.01092598, + "auxiliary_loss_mlp": 0.01050474, + "balance_loss_clip": 1.04745054, + "balance_loss_mlp": 1.03196096, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 1.611988359734243, + "language_loss": 0.74629009, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76772082, + "num_input_tokens_seen": 84532315, + "step": 3924, + "time_per_iteration": 2.5623056888580322 + }, + { + "auxiliary_loss_clip": 0.01147243, + "auxiliary_loss_mlp": 0.01048285, + "balance_loss_clip": 1.05181694, + "balance_loss_mlp": 1.03119087, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 1.7438823099310528, + "language_loss": 0.82483906, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.84679437, + "num_input_tokens_seen": 84550970, + "step": 3925, + "time_per_iteration": 2.439176082611084 + }, + { + "auxiliary_loss_clip": 0.01125702, + "auxiliary_loss_mlp": 0.01047757, + "balance_loss_clip": 1.04944909, + "balance_loss_mlp": 1.02930331, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.8686678555478187, + "language_loss": 0.59546977, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61720443, + "num_input_tokens_seen": 84571655, + "step": 3926, + "time_per_iteration": 2.5347585678100586 + }, + { + "auxiliary_loss_clip": 0.01127142, + "auxiliary_loss_mlp": 0.01047576, + "balance_loss_clip": 1.05103159, + "balance_loss_mlp": 1.03068411, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 2.6593074413438837, + "language_loss": 0.71223557, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73398268, + "num_input_tokens_seen": 84593130, + "step": 3927, + "time_per_iteration": 2.560487985610962 + }, + { + "auxiliary_loss_clip": 0.01122914, + "auxiliary_loss_mlp": 0.01046923, + "balance_loss_clip": 1.04772699, + "balance_loss_mlp": 1.03056777, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.9949635685072356, + "language_loss": 0.75211906, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77381742, + "num_input_tokens_seen": 84612410, + "step": 3928, + "time_per_iteration": 3.9891767501831055 + }, + { + "auxiliary_loss_clip": 0.01123146, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_clip": 1.05251884, + "balance_loss_mlp": 1.02864122, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.353444535749335, + "language_loss": 0.81415594, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.83585918, + "num_input_tokens_seen": 84627610, + "step": 3929, + "time_per_iteration": 2.497981309890747 + }, + { + "auxiliary_loss_clip": 0.01155127, + "auxiliary_loss_mlp": 0.01049457, + "balance_loss_clip": 1.05407274, + "balance_loss_mlp": 1.02937007, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 2.089722511804951, + "language_loss": 0.71795356, + "learning_rate": 3.569973590777789e-06, + "loss": 0.73999941, + "num_input_tokens_seen": 84648415, + "step": 3930, + "time_per_iteration": 3.95375657081604 + }, + { + "auxiliary_loss_clip": 0.01146935, + "auxiliary_loss_mlp": 0.010409, + "balance_loss_clip": 1.05049443, + "balance_loss_mlp": 1.02272081, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 1.7937923502294288, + "language_loss": 0.74165785, + "learning_rate": 3.569732284634665e-06, + "loss": 0.76353621, + "num_input_tokens_seen": 84670080, + "step": 3931, + "time_per_iteration": 2.6223976612091064 + }, + { + "auxiliary_loss_clip": 0.01137798, + "auxiliary_loss_mlp": 0.01039315, + "balance_loss_clip": 1.0522294, + "balance_loss_mlp": 1.02053952, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 2.329365818915707, + "language_loss": 0.80024898, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82202017, + "num_input_tokens_seen": 84686465, + "step": 3932, + "time_per_iteration": 2.514275074005127 + }, + { + "auxiliary_loss_clip": 0.01110784, + "auxiliary_loss_mlp": 0.01039943, + "balance_loss_clip": 1.05080116, + "balance_loss_mlp": 1.02389741, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.5156710050228985, + "language_loss": 0.85422969, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87573695, + "num_input_tokens_seen": 84708825, + "step": 3933, + "time_per_iteration": 2.5910444259643555 + }, + { + "auxiliary_loss_clip": 0.01105482, + "auxiliary_loss_mlp": 0.01048646, + "balance_loss_clip": 1.04844022, + "balance_loss_mlp": 1.02820206, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 1.7984780950161632, + "language_loss": 0.8268922, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.84843343, + "num_input_tokens_seen": 84726165, + "step": 3934, + "time_per_iteration": 2.6136133670806885 + }, + { + "auxiliary_loss_clip": 0.01149692, + "auxiliary_loss_mlp": 0.0104106, + "balance_loss_clip": 1.05328882, + "balance_loss_mlp": 1.0226903, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 2.2073787253267243, + "language_loss": 0.79129267, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.81320018, + "num_input_tokens_seen": 84745815, + "step": 3935, + "time_per_iteration": 2.4674620628356934 + }, + { + "auxiliary_loss_clip": 0.01135675, + "auxiliary_loss_mlp": 0.01039195, + "balance_loss_clip": 1.0520072, + "balance_loss_mlp": 1.02206492, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.6769415836436106, + "language_loss": 0.79492384, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81667256, + "num_input_tokens_seen": 84765415, + "step": 3936, + "time_per_iteration": 3.966391086578369 + }, + { + "auxiliary_loss_clip": 0.01129254, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.05126679, + "balance_loss_mlp": 1.02065742, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.4907687673253225, + "language_loss": 0.79420877, + "learning_rate": 3.568283198083826e-06, + "loss": 0.81588513, + "num_input_tokens_seen": 84787080, + "step": 3937, + "time_per_iteration": 2.5476815700531006 + }, + { + "auxiliary_loss_clip": 0.01134706, + "auxiliary_loss_mlp": 0.01039029, + "balance_loss_clip": 1.05303741, + "balance_loss_mlp": 1.02281666, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 2.0703331134080405, + "language_loss": 0.85208237, + "learning_rate": 3.568041475462147e-06, + "loss": 0.87381971, + "num_input_tokens_seen": 84805395, + "step": 3938, + "time_per_iteration": 2.4551548957824707 + }, + { + "auxiliary_loss_clip": 0.01145507, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_clip": 1.05135369, + "balance_loss_mlp": 1.03215837, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.596921393263434, + "language_loss": 0.93844682, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.96040148, + "num_input_tokens_seen": 84818090, + "step": 3939, + "time_per_iteration": 2.487335681915283 + }, + { + "auxiliary_loss_clip": 0.01149038, + "auxiliary_loss_mlp": 0.01048568, + "balance_loss_clip": 1.05022895, + "balance_loss_mlp": 1.02993524, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.6926972698301734, + "language_loss": 0.82432371, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84629977, + "num_input_tokens_seen": 84837695, + "step": 3940, + "time_per_iteration": 2.4587278366088867 + }, + { + "auxiliary_loss_clip": 0.01128574, + "auxiliary_loss_mlp": 0.0078684, + "balance_loss_clip": 1.05045211, + "balance_loss_mlp": 1.00075948, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 4.559556326932442, + "language_loss": 0.89563811, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91479224, + "num_input_tokens_seen": 84854630, + "step": 3941, + "time_per_iteration": 2.5068235397338867 + }, + { + "auxiliary_loss_clip": 0.01146818, + "auxiliary_loss_mlp": 0.01043728, + "balance_loss_clip": 1.04867625, + "balance_loss_mlp": 1.02492833, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 1.9276527404146515, + "language_loss": 0.84833038, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.8702358, + "num_input_tokens_seen": 84871805, + "step": 3942, + "time_per_iteration": 2.416477918624878 + }, + { + "auxiliary_loss_clip": 0.01110846, + "auxiliary_loss_mlp": 0.01051226, + "balance_loss_clip": 1.04613662, + "balance_loss_mlp": 1.03118682, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.8556608848213516, + "language_loss": 0.80856979, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83019054, + "num_input_tokens_seen": 84889815, + "step": 3943, + "time_per_iteration": 2.5809946060180664 + }, + { + "auxiliary_loss_clip": 0.0111861, + "auxiliary_loss_mlp": 0.01049465, + "balance_loss_clip": 1.04898214, + "balance_loss_mlp": 1.02924693, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 2.1082792579579768, + "language_loss": 0.67192459, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69360542, + "num_input_tokens_seen": 84904380, + "step": 3944, + "time_per_iteration": 2.4705724716186523 + }, + { + "auxiliary_loss_clip": 0.01121985, + "auxiliary_loss_mlp": 0.01044763, + "balance_loss_clip": 1.04814827, + "balance_loss_mlp": 1.0249033, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 1.6947563487932298, + "language_loss": 0.75506163, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77672917, + "num_input_tokens_seen": 84922935, + "step": 3945, + "time_per_iteration": 4.002439022064209 + }, + { + "auxiliary_loss_clip": 0.01129045, + "auxiliary_loss_mlp": 0.01045302, + "balance_loss_clip": 1.04812169, + "balance_loss_mlp": 1.02776647, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.4436024135011576, + "language_loss": 0.63886201, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.66060549, + "num_input_tokens_seen": 84943685, + "step": 3946, + "time_per_iteration": 2.527846574783325 + }, + { + "auxiliary_loss_clip": 0.01133977, + "auxiliary_loss_mlp": 0.01048134, + "balance_loss_clip": 1.04683197, + "balance_loss_mlp": 1.02879834, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.103462937857494, + "language_loss": 0.77548462, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79730582, + "num_input_tokens_seen": 84959505, + "step": 3947, + "time_per_iteration": 2.469944477081299 + }, + { + "auxiliary_loss_clip": 0.01147463, + "auxiliary_loss_mlp": 0.01048901, + "balance_loss_clip": 1.05678308, + "balance_loss_mlp": 1.03031635, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.4767403307872204, + "language_loss": 0.80611038, + "learning_rate": 3.565620980442944e-06, + "loss": 0.8280741, + "num_input_tokens_seen": 84982130, + "step": 3948, + "time_per_iteration": 2.578301429748535 + }, + { + "auxiliary_loss_clip": 0.01131607, + "auxiliary_loss_mlp": 0.01047941, + "balance_loss_clip": 1.05238223, + "balance_loss_mlp": 1.02940392, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 2.1101507490100206, + "language_loss": 0.80527431, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82706976, + "num_input_tokens_seen": 85000640, + "step": 3949, + "time_per_iteration": 2.557807445526123 + }, + { + "auxiliary_loss_clip": 0.01124689, + "auxiliary_loss_mlp": 0.01043158, + "balance_loss_clip": 1.05050778, + "balance_loss_mlp": 1.0239768, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.9729901784219488, + "language_loss": 0.72693181, + "learning_rate": 3.565136168723163e-06, + "loss": 0.74861026, + "num_input_tokens_seen": 85018970, + "step": 3950, + "time_per_iteration": 2.526254653930664 + }, + { + "auxiliary_loss_clip": 0.01147901, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.05093527, + "balance_loss_mlp": 1.02721894, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 2.5086526778893794, + "language_loss": 0.73170245, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75362432, + "num_input_tokens_seen": 85035905, + "step": 3951, + "time_per_iteration": 2.4554779529571533 + }, + { + "auxiliary_loss_clip": 0.01127856, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.05584133, + "balance_loss_mlp": 1.02325928, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.804636694945097, + "language_loss": 0.74469745, + "learning_rate": 3.564651119602903e-06, + "loss": 0.76640153, + "num_input_tokens_seen": 85054560, + "step": 3952, + "time_per_iteration": 2.5375800132751465 + }, + { + "auxiliary_loss_clip": 0.01104537, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.04337955, + "balance_loss_mlp": 1.02783012, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.8483091742171838, + "language_loss": 0.70695829, + "learning_rate": 3.564408506040583e-06, + "loss": 0.72846401, + "num_input_tokens_seen": 85074425, + "step": 3953, + "time_per_iteration": 2.6247763633728027 + }, + { + "auxiliary_loss_clip": 0.01152349, + "auxiliary_loss_mlp": 0.01050336, + "balance_loss_clip": 1.05225134, + "balance_loss_mlp": 1.03063047, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.9279720339575679, + "language_loss": 0.81810474, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.84013158, + "num_input_tokens_seen": 85092865, + "step": 3954, + "time_per_iteration": 2.511368751525879 + }, + { + "auxiliary_loss_clip": 0.01131955, + "auxiliary_loss_mlp": 0.01047615, + "balance_loss_clip": 1.05311513, + "balance_loss_mlp": 1.02730155, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 3.0827712069652504, + "language_loss": 0.66387379, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.68566954, + "num_input_tokens_seen": 85110175, + "step": 3955, + "time_per_iteration": 2.5043017864227295 + }, + { + "auxiliary_loss_clip": 0.01149401, + "auxiliary_loss_mlp": 0.01054388, + "balance_loss_clip": 1.05131388, + "balance_loss_mlp": 1.03495741, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.6969848354703183, + "language_loss": 0.84142482, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.86346275, + "num_input_tokens_seen": 85129925, + "step": 3956, + "time_per_iteration": 2.4730541706085205 + }, + { + "auxiliary_loss_clip": 0.0110565, + "auxiliary_loss_mlp": 0.01047508, + "balance_loss_clip": 1.04724407, + "balance_loss_mlp": 1.02883959, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 2.102660688677662, + "language_loss": 0.84779882, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.86933035, + "num_input_tokens_seen": 85147755, + "step": 3957, + "time_per_iteration": 2.5660595893859863 + }, + { + "auxiliary_loss_clip": 0.01092857, + "auxiliary_loss_mlp": 0.01048169, + "balance_loss_clip": 1.0479902, + "balance_loss_mlp": 1.03102684, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 1.9178511853703535, + "language_loss": 0.70252109, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72393131, + "num_input_tokens_seen": 85165270, + "step": 3958, + "time_per_iteration": 2.615403175354004 + }, + { + "auxiliary_loss_clip": 0.01106564, + "auxiliary_loss_mlp": 0.01051037, + "balance_loss_clip": 1.04568386, + "balance_loss_mlp": 1.02907908, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 5.327273439219305, + "language_loss": 0.6575675, + "learning_rate": 3.562951579215745e-06, + "loss": 0.67914349, + "num_input_tokens_seen": 85181555, + "step": 3959, + "time_per_iteration": 2.567490339279175 + }, + { + "auxiliary_loss_clip": 0.01108977, + "auxiliary_loss_mlp": 0.01046374, + "balance_loss_clip": 1.05082643, + "balance_loss_mlp": 1.02820671, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 2.2149844011793665, + "language_loss": 0.72646701, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74802053, + "num_input_tokens_seen": 85199455, + "step": 3960, + "time_per_iteration": 2.611571788787842 + }, + { + "auxiliary_loss_clip": 0.01082214, + "auxiliary_loss_mlp": 0.01044483, + "balance_loss_clip": 1.05995357, + "balance_loss_mlp": 1.02508712, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.5808921025365843, + "language_loss": 0.73949319, + "learning_rate": 3.562465462704307e-06, + "loss": 0.76076013, + "num_input_tokens_seen": 85219170, + "step": 3961, + "time_per_iteration": 2.831743001937866 + }, + { + "auxiliary_loss_clip": 0.01149943, + "auxiliary_loss_mlp": 0.01054744, + "balance_loss_clip": 1.04863727, + "balance_loss_mlp": 1.03397822, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 4.9241592036704125, + "language_loss": 0.65498263, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.67702949, + "num_input_tokens_seen": 85238480, + "step": 3962, + "time_per_iteration": 2.662792444229126 + }, + { + "auxiliary_loss_clip": 0.01123148, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.04693699, + "balance_loss_mlp": 1.0257585, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.6022588058492662, + "language_loss": 0.74440235, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76607299, + "num_input_tokens_seen": 85259180, + "step": 3963, + "time_per_iteration": 2.561892032623291 + }, + { + "auxiliary_loss_clip": 0.01122217, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.05381131, + "balance_loss_mlp": 1.02557349, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 2.054826660516512, + "language_loss": 0.77091658, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79258335, + "num_input_tokens_seen": 85278550, + "step": 3964, + "time_per_iteration": 2.552797555923462 + }, + { + "auxiliary_loss_clip": 0.01109883, + "auxiliary_loss_mlp": 0.01046368, + "balance_loss_clip": 1.0473609, + "balance_loss_mlp": 1.02783108, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 1.932673099968209, + "language_loss": 0.71438813, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73595065, + "num_input_tokens_seen": 85297345, + "step": 3965, + "time_per_iteration": 2.5679759979248047 + }, + { + "auxiliary_loss_clip": 0.01116464, + "auxiliary_loss_mlp": 0.01044909, + "balance_loss_clip": 1.04306149, + "balance_loss_mlp": 1.02675295, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 2.0382395692878252, + "language_loss": 0.78346103, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80507475, + "num_input_tokens_seen": 85315105, + "step": 3966, + "time_per_iteration": 2.530977725982666 + }, + { + "auxiliary_loss_clip": 0.01124141, + "auxiliary_loss_mlp": 0.0104392, + "balance_loss_clip": 1.04950511, + "balance_loss_mlp": 1.02557325, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 1.6111253982113352, + "language_loss": 0.68586725, + "learning_rate": 3.561005691492797e-06, + "loss": 0.70754784, + "num_input_tokens_seen": 85334735, + "step": 3967, + "time_per_iteration": 3.962651491165161 + }, + { + "auxiliary_loss_clip": 0.01120153, + "auxiliary_loss_mlp": 0.0105937, + "balance_loss_clip": 1.04933763, + "balance_loss_mlp": 1.03949785, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 1.9400739270259206, + "language_loss": 0.6756779, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.69747317, + "num_input_tokens_seen": 85352875, + "step": 3968, + "time_per_iteration": 2.5211868286132812 + }, + { + "auxiliary_loss_clip": 0.01101429, + "auxiliary_loss_mlp": 0.01049688, + "balance_loss_clip": 1.05164623, + "balance_loss_mlp": 1.03008986, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 3.0712169266591958, + "language_loss": 0.76711798, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78862917, + "num_input_tokens_seen": 85372205, + "step": 3969, + "time_per_iteration": 4.190165758132935 + }, + { + "auxiliary_loss_clip": 0.01122491, + "auxiliary_loss_mlp": 0.01040079, + "balance_loss_clip": 1.04826272, + "balance_loss_mlp": 1.0224123, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.6257846511641296, + "language_loss": 0.75997019, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78159583, + "num_input_tokens_seen": 85389705, + "step": 3970, + "time_per_iteration": 2.5258378982543945 + }, + { + "auxiliary_loss_clip": 0.01111769, + "auxiliary_loss_mlp": 0.01050108, + "balance_loss_clip": 1.04407239, + "balance_loss_mlp": 1.03024817, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 2.339100757442227, + "language_loss": 0.8533386, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87495744, + "num_input_tokens_seen": 85407855, + "step": 3971, + "time_per_iteration": 2.613579273223877 + }, + { + "auxiliary_loss_clip": 0.0105837, + "auxiliary_loss_mlp": 0.01026337, + "balance_loss_clip": 1.03298211, + "balance_loss_mlp": 1.02409542, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.7672485673903974, + "language_loss": 0.62762535, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.64847243, + "num_input_tokens_seen": 85470885, + "step": 3972, + "time_per_iteration": 3.1255176067352295 + }, + { + "auxiliary_loss_clip": 0.01125789, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_clip": 1.05073214, + "balance_loss_mlp": 1.026124, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 6.273078220228447, + "language_loss": 0.81671119, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.83841097, + "num_input_tokens_seen": 85488460, + "step": 3973, + "time_per_iteration": 2.5127439498901367 + }, + { + "auxiliary_loss_clip": 0.01118376, + "auxiliary_loss_mlp": 0.01058146, + "balance_loss_clip": 1.04743826, + "balance_loss_mlp": 1.03869116, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 2.0973557237674334, + "language_loss": 0.79384112, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81560642, + "num_input_tokens_seen": 85508590, + "step": 3974, + "time_per_iteration": 2.5239434242248535 + }, + { + "auxiliary_loss_clip": 0.01133465, + "auxiliary_loss_mlp": 0.01056256, + "balance_loss_clip": 1.04667115, + "balance_loss_mlp": 1.03661048, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 1.9039463564610242, + "language_loss": 0.84381223, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86570942, + "num_input_tokens_seen": 85525970, + "step": 3975, + "time_per_iteration": 2.445819616317749 + }, + { + "auxiliary_loss_clip": 0.01126974, + "auxiliary_loss_mlp": 0.01054564, + "balance_loss_clip": 1.04565549, + "balance_loss_mlp": 1.03559804, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.113004914904524, + "language_loss": 0.83470958, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.85652494, + "num_input_tokens_seen": 85543700, + "step": 3976, + "time_per_iteration": 3.9025466442108154 + }, + { + "auxiliary_loss_clip": 0.01078566, + "auxiliary_loss_mlp": 0.01053425, + "balance_loss_clip": 1.04586112, + "balance_loss_mlp": 1.03623462, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.7742528990768505, + "language_loss": 0.74958277, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.77090263, + "num_input_tokens_seen": 85562765, + "step": 3977, + "time_per_iteration": 2.6287684440612793 + }, + { + "auxiliary_loss_clip": 0.01149243, + "auxiliary_loss_mlp": 0.01059097, + "balance_loss_clip": 1.05082011, + "balance_loss_mlp": 1.03949916, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.8526235189510938, + "language_loss": 0.72045451, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74253792, + "num_input_tokens_seen": 85581755, + "step": 3978, + "time_per_iteration": 2.4613115787506104 + }, + { + "auxiliary_loss_clip": 0.01127723, + "auxiliary_loss_mlp": 0.01059712, + "balance_loss_clip": 1.04660261, + "balance_loss_mlp": 1.04137743, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.6727218982602774, + "language_loss": 0.78789306, + "learning_rate": 3.558079758168997e-06, + "loss": 0.80976748, + "num_input_tokens_seen": 85599455, + "step": 3979, + "time_per_iteration": 2.515293836593628 + }, + { + "auxiliary_loss_clip": 0.01123664, + "auxiliary_loss_mlp": 0.01067513, + "balance_loss_clip": 1.04593635, + "balance_loss_mlp": 1.04758155, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.783674221106205, + "language_loss": 0.81676614, + "learning_rate": 3.557835546134977e-06, + "loss": 0.83867794, + "num_input_tokens_seen": 85619970, + "step": 3980, + "time_per_iteration": 2.551555633544922 + }, + { + "auxiliary_loss_clip": 0.01095568, + "auxiliary_loss_mlp": 0.01056911, + "balance_loss_clip": 1.04487538, + "balance_loss_mlp": 1.03820753, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.6845549272721712, + "language_loss": 0.8391732, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86069798, + "num_input_tokens_seen": 85638850, + "step": 3981, + "time_per_iteration": 2.5406463146209717 + }, + { + "auxiliary_loss_clip": 0.01126468, + "auxiliary_loss_mlp": 0.01059875, + "balance_loss_clip": 1.04788244, + "balance_loss_mlp": 1.04121888, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 1.8383432704543912, + "language_loss": 0.76893842, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79080188, + "num_input_tokens_seen": 85656285, + "step": 3982, + "time_per_iteration": 2.5892581939697266 + }, + { + "auxiliary_loss_clip": 0.01110845, + "auxiliary_loss_mlp": 0.01050628, + "balance_loss_clip": 1.04918277, + "balance_loss_mlp": 1.03311563, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 2.15716802521109, + "language_loss": 0.77455187, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.79616654, + "num_input_tokens_seen": 85673020, + "step": 3983, + "time_per_iteration": 2.540262460708618 + }, + { + "auxiliary_loss_clip": 0.01133731, + "auxiliary_loss_mlp": 0.00786814, + "balance_loss_clip": 1.04816103, + "balance_loss_mlp": 1.00088501, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.929353529311776, + "language_loss": 0.73513162, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75433707, + "num_input_tokens_seen": 85692565, + "step": 3984, + "time_per_iteration": 2.4818713665008545 + }, + { + "auxiliary_loss_clip": 0.01101152, + "auxiliary_loss_mlp": 0.01064323, + "balance_loss_clip": 1.04473603, + "balance_loss_mlp": 1.04452252, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 1.8472217728689904, + "language_loss": 0.78781533, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.80947012, + "num_input_tokens_seen": 85709730, + "step": 3985, + "time_per_iteration": 4.086170196533203 + }, + { + "auxiliary_loss_clip": 0.011063, + "auxiliary_loss_mlp": 0.01055649, + "balance_loss_clip": 1.04686856, + "balance_loss_mlp": 1.03679037, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 2.0182890886077782, + "language_loss": 0.7349143, + "learning_rate": 3.556369033716254e-06, + "loss": 0.75653374, + "num_input_tokens_seen": 85730045, + "step": 3986, + "time_per_iteration": 2.6264777183532715 + }, + { + "auxiliary_loss_clip": 0.01138837, + "auxiliary_loss_mlp": 0.01051336, + "balance_loss_clip": 1.04883671, + "balance_loss_mlp": 1.03341877, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 2.0017471869289665, + "language_loss": 0.87442851, + "learning_rate": 3.556124408363871e-06, + "loss": 0.89633024, + "num_input_tokens_seen": 85747590, + "step": 3987, + "time_per_iteration": 2.505223035812378 + }, + { + "auxiliary_loss_clip": 0.01129164, + "auxiliary_loss_mlp": 0.01041306, + "balance_loss_clip": 1.04741907, + "balance_loss_mlp": 1.02526021, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 2.241297852051308, + "language_loss": 0.83047348, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85217822, + "num_input_tokens_seen": 85763460, + "step": 3988, + "time_per_iteration": 2.4553637504577637 + }, + { + "auxiliary_loss_clip": 0.0113299, + "auxiliary_loss_mlp": 0.01045554, + "balance_loss_clip": 1.0482713, + "balance_loss_mlp": 1.02731466, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.7213280252995247, + "language_loss": 0.85332465, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87511015, + "num_input_tokens_seen": 85782050, + "step": 3989, + "time_per_iteration": 2.485203266143799 + }, + { + "auxiliary_loss_clip": 0.01144852, + "auxiliary_loss_mlp": 0.0104076, + "balance_loss_clip": 1.04903388, + "balance_loss_mlp": 1.0233916, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.1330078103892847, + "language_loss": 0.84679496, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86865115, + "num_input_tokens_seen": 85797400, + "step": 3990, + "time_per_iteration": 2.442373275756836 + }, + { + "auxiliary_loss_clip": 0.0112902, + "auxiliary_loss_mlp": 0.01046397, + "balance_loss_clip": 1.04412341, + "balance_loss_mlp": 1.02917147, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 1.4380815993833276, + "language_loss": 0.75480568, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.77655989, + "num_input_tokens_seen": 85818995, + "step": 3991, + "time_per_iteration": 2.5083119869232178 + }, + { + "auxiliary_loss_clip": 0.0102903, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.01751328, + "balance_loss_mlp": 1.03803492, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.9047331738827393, + "language_loss": 0.63763154, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65831971, + "num_input_tokens_seen": 85876695, + "step": 3992, + "time_per_iteration": 2.993396043777466 + }, + { + "auxiliary_loss_clip": 0.01044609, + "auxiliary_loss_mlp": 0.01017671, + "balance_loss_clip": 1.01945615, + "balance_loss_mlp": 1.01583481, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7623791972285464, + "language_loss": 0.6294502, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65007299, + "num_input_tokens_seen": 85940990, + "step": 3993, + "time_per_iteration": 3.1372082233428955 + }, + { + "auxiliary_loss_clip": 0.01109727, + "auxiliary_loss_mlp": 0.01046443, + "balance_loss_clip": 1.04809618, + "balance_loss_mlp": 1.02670228, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.6208110062731005, + "language_loss": 0.76699507, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.78855681, + "num_input_tokens_seen": 85961165, + "step": 3994, + "time_per_iteration": 2.5804224014282227 + }, + { + "auxiliary_loss_clip": 0.01120309, + "auxiliary_loss_mlp": 0.01054399, + "balance_loss_clip": 1.04453683, + "balance_loss_mlp": 1.03442001, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.6706531770702369, + "language_loss": 0.78102094, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80276799, + "num_input_tokens_seen": 85982710, + "step": 3995, + "time_per_iteration": 2.5532636642456055 + }, + { + "auxiliary_loss_clip": 0.01023305, + "auxiliary_loss_mlp": 0.01015618, + "balance_loss_clip": 1.01759672, + "balance_loss_mlp": 1.01343691, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.9364246332644247, + "language_loss": 0.63507748, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65546668, + "num_input_tokens_seen": 86046935, + "step": 3996, + "time_per_iteration": 3.1764140129089355 + }, + { + "auxiliary_loss_clip": 0.01123584, + "auxiliary_loss_mlp": 0.01054343, + "balance_loss_clip": 1.04644763, + "balance_loss_mlp": 1.03610384, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.2330462562307503, + "language_loss": 0.70245719, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72423649, + "num_input_tokens_seen": 86064355, + "step": 3997, + "time_per_iteration": 2.495546817779541 + }, + { + "auxiliary_loss_clip": 0.01135843, + "auxiliary_loss_mlp": 0.01057097, + "balance_loss_clip": 1.0480001, + "balance_loss_mlp": 1.03876281, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.7218666061066286, + "language_loss": 0.87284571, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89477509, + "num_input_tokens_seen": 86081340, + "step": 3998, + "time_per_iteration": 2.4702675342559814 + }, + { + "auxiliary_loss_clip": 0.01127203, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.04240656, + "balance_loss_mlp": 1.02934527, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.538712085490315, + "language_loss": 0.76029629, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.78204584, + "num_input_tokens_seen": 86102260, + "step": 3999, + "time_per_iteration": 2.4958152770996094 + }, + { + "auxiliary_loss_clip": 0.01117321, + "auxiliary_loss_mlp": 0.01045552, + "balance_loss_clip": 1.04467845, + "balance_loss_mlp": 1.02709818, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 3.2945664298235964, + "language_loss": 0.72391748, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74554622, + "num_input_tokens_seen": 86123400, + "step": 4000, + "time_per_iteration": 2.5401811599731445 + }, + { + "auxiliary_loss_clip": 0.01134792, + "auxiliary_loss_mlp": 0.01050218, + "balance_loss_clip": 1.0494833, + "balance_loss_mlp": 1.03194284, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 1.7329492662674797, + "language_loss": 0.66652203, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68837219, + "num_input_tokens_seen": 86144060, + "step": 4001, + "time_per_iteration": 2.499210834503174 + }, + { + "auxiliary_loss_clip": 0.01143886, + "auxiliary_loss_mlp": 0.01045418, + "balance_loss_clip": 1.04665041, + "balance_loss_mlp": 1.02651191, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 2.0286808436152963, + "language_loss": 0.83270884, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.85460186, + "num_input_tokens_seen": 86163005, + "step": 4002, + "time_per_iteration": 2.451343536376953 + }, + { + "auxiliary_loss_clip": 0.01105735, + "auxiliary_loss_mlp": 0.01045337, + "balance_loss_clip": 1.04582882, + "balance_loss_mlp": 1.02767015, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.9073892781885002, + "language_loss": 0.83196902, + "learning_rate": 3.552202383898897e-06, + "loss": 0.85347974, + "num_input_tokens_seen": 86182580, + "step": 4003, + "time_per_iteration": 2.549283742904663 + }, + { + "auxiliary_loss_clip": 0.01113642, + "auxiliary_loss_mlp": 0.01038926, + "balance_loss_clip": 1.04420197, + "balance_loss_mlp": 1.0201273, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 1.844398631348441, + "language_loss": 0.87204581, + "learning_rate": 3.551956756667215e-06, + "loss": 0.8935715, + "num_input_tokens_seen": 86200665, + "step": 4004, + "time_per_iteration": 2.4844884872436523 + }, + { + "auxiliary_loss_clip": 0.01114358, + "auxiliary_loss_mlp": 0.01061141, + "balance_loss_clip": 1.04277098, + "balance_loss_mlp": 1.04275918, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 1.997854536818559, + "language_loss": 0.77772915, + "learning_rate": 3.551711070585177e-06, + "loss": 0.79948413, + "num_input_tokens_seen": 86221640, + "step": 4005, + "time_per_iteration": 2.5501465797424316 + }, + { + "auxiliary_loss_clip": 0.01090983, + "auxiliary_loss_mlp": 0.01042548, + "balance_loss_clip": 1.0418371, + "balance_loss_mlp": 1.02420223, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 2.17078218081594, + "language_loss": 0.78941041, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81074578, + "num_input_tokens_seen": 86240795, + "step": 4006, + "time_per_iteration": 3.9869539737701416 + }, + { + "auxiliary_loss_clip": 0.01127608, + "auxiliary_loss_mlp": 0.00787944, + "balance_loss_clip": 1.04443741, + "balance_loss_mlp": 1.00106859, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 1.6974846592863602, + "language_loss": 0.71518028, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73433578, + "num_input_tokens_seen": 86262000, + "step": 4007, + "time_per_iteration": 2.55434250831604 + }, + { + "auxiliary_loss_clip": 0.01100594, + "auxiliary_loss_mlp": 0.01050633, + "balance_loss_clip": 1.04237962, + "balance_loss_mlp": 1.0335747, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.7596971563187036, + "language_loss": 0.76198554, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.78349775, + "num_input_tokens_seen": 86279680, + "step": 4008, + "time_per_iteration": 2.51000714302063 + }, + { + "auxiliary_loss_clip": 0.01133433, + "auxiliary_loss_mlp": 0.01039844, + "balance_loss_clip": 1.04705358, + "balance_loss_mlp": 1.02253461, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 2.6107516114827054, + "language_loss": 0.7498256, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.77155834, + "num_input_tokens_seen": 86297180, + "step": 4009, + "time_per_iteration": 3.9992599487304688 + }, + { + "auxiliary_loss_clip": 0.01134085, + "auxiliary_loss_mlp": 0.01042692, + "balance_loss_clip": 1.04973018, + "balance_loss_mlp": 1.02651548, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 1.7760648776825914, + "language_loss": 0.8045373, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82630509, + "num_input_tokens_seen": 86317660, + "step": 4010, + "time_per_iteration": 2.4787707328796387 + }, + { + "auxiliary_loss_clip": 0.0111705, + "auxiliary_loss_mlp": 0.01053466, + "balance_loss_clip": 1.04577971, + "balance_loss_mlp": 1.0342617, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 2.065929599276349, + "language_loss": 0.70314604, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.72485125, + "num_input_tokens_seen": 86338325, + "step": 4011, + "time_per_iteration": 2.5524544715881348 + }, + { + "auxiliary_loss_clip": 0.01066477, + "auxiliary_loss_mlp": 0.01052587, + "balance_loss_clip": 1.04060185, + "balance_loss_mlp": 1.03360891, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.6364936174844504, + "language_loss": 0.69659662, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71778727, + "num_input_tokens_seen": 86357615, + "step": 4012, + "time_per_iteration": 2.702481508255005 + }, + { + "auxiliary_loss_clip": 0.01133737, + "auxiliary_loss_mlp": 0.01045377, + "balance_loss_clip": 1.04596329, + "balance_loss_mlp": 1.02577913, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 1.5384743869571007, + "language_loss": 0.73259437, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75438547, + "num_input_tokens_seen": 86380355, + "step": 4013, + "time_per_iteration": 2.635589361190796 + }, + { + "auxiliary_loss_clip": 0.01146893, + "auxiliary_loss_mlp": 0.01039703, + "balance_loss_clip": 1.04918504, + "balance_loss_mlp": 1.02337146, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.8396389355228309, + "language_loss": 0.88104081, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.90290678, + "num_input_tokens_seen": 86399125, + "step": 4014, + "time_per_iteration": 2.410160541534424 + }, + { + "auxiliary_loss_clip": 0.01114909, + "auxiliary_loss_mlp": 0.01050981, + "balance_loss_clip": 1.04076278, + "balance_loss_mlp": 1.03189552, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 2.1217496008408814, + "language_loss": 0.94806826, + "learning_rate": 3.549250975045952e-06, + "loss": 0.96972722, + "num_input_tokens_seen": 86418625, + "step": 4015, + "time_per_iteration": 4.01144552230835 + }, + { + "auxiliary_loss_clip": 0.0111923, + "auxiliary_loss_mlp": 0.0104872, + "balance_loss_clip": 1.04328549, + "balance_loss_mlp": 1.03139901, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 1.6210397808545025, + "language_loss": 0.82688862, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84856814, + "num_input_tokens_seen": 86438375, + "step": 4016, + "time_per_iteration": 2.543766975402832 + }, + { + "auxiliary_loss_clip": 0.01099635, + "auxiliary_loss_mlp": 0.01047776, + "balance_loss_clip": 1.04244518, + "balance_loss_mlp": 1.02969146, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 1.9675398774622026, + "language_loss": 0.69327873, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71475285, + "num_input_tokens_seen": 86463230, + "step": 4017, + "time_per_iteration": 2.690546989440918 + }, + { + "auxiliary_loss_clip": 0.01138454, + "auxiliary_loss_mlp": 0.01050072, + "balance_loss_clip": 1.04743767, + "balance_loss_mlp": 1.03184533, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 1.653182200580224, + "language_loss": 0.85143918, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87332445, + "num_input_tokens_seen": 86481230, + "step": 4018, + "time_per_iteration": 2.441819667816162 + }, + { + "auxiliary_loss_clip": 0.01034545, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.01072097, + "balance_loss_mlp": 1.02641773, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8210047910860027, + "language_loss": 0.6064353, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62706423, + "num_input_tokens_seen": 86541260, + "step": 4019, + "time_per_iteration": 3.098292112350464 + }, + { + "auxiliary_loss_clip": 0.01115204, + "auxiliary_loss_mlp": 0.01052964, + "balance_loss_clip": 1.04359317, + "balance_loss_mlp": 1.03393841, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.9369526614064603, + "language_loss": 0.73703015, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75871181, + "num_input_tokens_seen": 86559580, + "step": 4020, + "time_per_iteration": 2.5274851322174072 + }, + { + "auxiliary_loss_clip": 0.01109906, + "auxiliary_loss_mlp": 0.01044466, + "balance_loss_clip": 1.04791248, + "balance_loss_mlp": 1.02682304, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 3.712703834347189, + "language_loss": 0.81747669, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.83902043, + "num_input_tokens_seen": 86577560, + "step": 4021, + "time_per_iteration": 2.5388987064361572 + }, + { + "auxiliary_loss_clip": 0.01149012, + "auxiliary_loss_mlp": 0.01054753, + "balance_loss_clip": 1.04888928, + "balance_loss_mlp": 1.03504801, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 2.2683330873815217, + "language_loss": 0.76565707, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78769475, + "num_input_tokens_seen": 86595350, + "step": 4022, + "time_per_iteration": 2.447603225708008 + }, + { + "auxiliary_loss_clip": 0.01107297, + "auxiliary_loss_mlp": 0.01056258, + "balance_loss_clip": 1.0420028, + "balance_loss_mlp": 1.034109, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 2.595188682619979, + "language_loss": 0.7535162, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77515179, + "num_input_tokens_seen": 86614805, + "step": 4023, + "time_per_iteration": 2.522338390350342 + }, + { + "auxiliary_loss_clip": 0.01120275, + "auxiliary_loss_mlp": 0.01047855, + "balance_loss_clip": 1.0448761, + "balance_loss_mlp": 1.02994955, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 2.944733874050914, + "language_loss": 0.82495618, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.84663749, + "num_input_tokens_seen": 86633700, + "step": 4024, + "time_per_iteration": 3.9898030757904053 + }, + { + "auxiliary_loss_clip": 0.01133737, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.04919481, + "balance_loss_mlp": 1.02918053, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 3.4752694895523493, + "language_loss": 0.85828561, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88009906, + "num_input_tokens_seen": 86650905, + "step": 4025, + "time_per_iteration": 2.443338394165039 + }, + { + "auxiliary_loss_clip": 0.01095122, + "auxiliary_loss_mlp": 0.01062645, + "balance_loss_clip": 1.04234195, + "balance_loss_mlp": 1.04192686, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 1.8250010116184199, + "language_loss": 0.71421164, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73578936, + "num_input_tokens_seen": 86669185, + "step": 4026, + "time_per_iteration": 2.505873680114746 + }, + { + "auxiliary_loss_clip": 0.01136945, + "auxiliary_loss_mlp": 0.01043555, + "balance_loss_clip": 1.05024242, + "balance_loss_mlp": 1.02616286, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.9878704207207523, + "language_loss": 0.64541417, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66721916, + "num_input_tokens_seen": 86686805, + "step": 4027, + "time_per_iteration": 2.4535768032073975 + }, + { + "auxiliary_loss_clip": 0.01136874, + "auxiliary_loss_mlp": 0.00786527, + "balance_loss_clip": 1.05176342, + "balance_loss_mlp": 1.0008688, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.320297304801823, + "language_loss": 0.71092552, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.73015958, + "num_input_tokens_seen": 86705520, + "step": 4028, + "time_per_iteration": 2.450906991958618 + }, + { + "auxiliary_loss_clip": 0.01037889, + "auxiliary_loss_mlp": 0.01007419, + "balance_loss_clip": 1.01178813, + "balance_loss_mlp": 1.00529659, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8562167629200134, + "language_loss": 0.5532304, + "learning_rate": 3.545796973765623e-06, + "loss": 0.5736835, + "num_input_tokens_seen": 86767320, + "step": 4029, + "time_per_iteration": 3.0362064838409424 + }, + { + "auxiliary_loss_clip": 0.01134916, + "auxiliary_loss_mlp": 0.01045622, + "balance_loss_clip": 1.04709721, + "balance_loss_mlp": 1.02607155, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 1.714903986847517, + "language_loss": 0.74075568, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76256108, + "num_input_tokens_seen": 86788110, + "step": 4030, + "time_per_iteration": 2.504459857940674 + }, + { + "auxiliary_loss_clip": 0.01147678, + "auxiliary_loss_mlp": 0.01051838, + "balance_loss_clip": 1.04903793, + "balance_loss_mlp": 1.03307414, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 2.483545359794074, + "language_loss": 0.76725644, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.78925157, + "num_input_tokens_seen": 86807640, + "step": 4031, + "time_per_iteration": 2.4660234451293945 + }, + { + "auxiliary_loss_clip": 0.01132574, + "auxiliary_loss_mlp": 0.00787081, + "balance_loss_clip": 1.04893315, + "balance_loss_mlp": 1.00129294, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 2.101650455042184, + "language_loss": 0.65995145, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.67914796, + "num_input_tokens_seen": 86826795, + "step": 4032, + "time_per_iteration": 2.5304946899414062 + }, + { + "auxiliary_loss_clip": 0.01130142, + "auxiliary_loss_mlp": 0.01046775, + "balance_loss_clip": 1.04537356, + "balance_loss_mlp": 1.02841723, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 2.4114693663546904, + "language_loss": 0.81716263, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83893174, + "num_input_tokens_seen": 86843175, + "step": 4033, + "time_per_iteration": 2.4456582069396973 + }, + { + "auxiliary_loss_clip": 0.01102071, + "auxiliary_loss_mlp": 0.01044653, + "balance_loss_clip": 1.03986168, + "balance_loss_mlp": 1.02597308, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 2.4502303694049035, + "language_loss": 0.69876808, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.72023535, + "num_input_tokens_seen": 86863185, + "step": 4034, + "time_per_iteration": 2.6005349159240723 + }, + { + "auxiliary_loss_clip": 0.01127082, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.04918456, + "balance_loss_mlp": 1.01623225, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.010619649806704, + "language_loss": 0.9625476, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.98417294, + "num_input_tokens_seen": 86880040, + "step": 4035, + "time_per_iteration": 2.4757883548736572 + }, + { + "auxiliary_loss_clip": 0.01114369, + "auxiliary_loss_mlp": 0.0105081, + "balance_loss_clip": 1.04505444, + "balance_loss_mlp": 1.03333402, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 1.5553739391324028, + "language_loss": 0.77921385, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80086565, + "num_input_tokens_seen": 86900610, + "step": 4036, + "time_per_iteration": 2.516425371170044 + }, + { + "auxiliary_loss_clip": 0.011388, + "auxiliary_loss_mlp": 0.010542, + "balance_loss_clip": 1.04938185, + "balance_loss_mlp": 1.03538895, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.8108393629224342, + "language_loss": 0.73965764, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76158762, + "num_input_tokens_seen": 86919385, + "step": 4037, + "time_per_iteration": 2.474591016769409 + }, + { + "auxiliary_loss_clip": 0.01103909, + "auxiliary_loss_mlp": 0.01046065, + "balance_loss_clip": 1.03978181, + "balance_loss_mlp": 1.02637148, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.009205147030067, + "language_loss": 0.7677213, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78922105, + "num_input_tokens_seen": 86938885, + "step": 4038, + "time_per_iteration": 2.527338743209839 + }, + { + "auxiliary_loss_clip": 0.01133784, + "auxiliary_loss_mlp": 0.01047415, + "balance_loss_clip": 1.05092907, + "balance_loss_mlp": 1.02762616, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 1.7615708694398169, + "language_loss": 0.72014683, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74195886, + "num_input_tokens_seen": 86957705, + "step": 4039, + "time_per_iteration": 2.4593677520751953 + }, + { + "auxiliary_loss_clip": 0.01131064, + "auxiliary_loss_mlp": 0.01051152, + "balance_loss_clip": 1.04671466, + "balance_loss_mlp": 1.03129196, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.66000159582679, + "language_loss": 0.78667277, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80849493, + "num_input_tokens_seen": 86975845, + "step": 4040, + "time_per_iteration": 2.4623863697052 + }, + { + "auxiliary_loss_clip": 0.0108983, + "auxiliary_loss_mlp": 0.01041147, + "balance_loss_clip": 1.04371762, + "balance_loss_mlp": 1.02462494, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.7452078653630227, + "language_loss": 0.80615407, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82746381, + "num_input_tokens_seen": 86994800, + "step": 4041, + "time_per_iteration": 2.6035854816436768 + }, + { + "auxiliary_loss_clip": 0.01111647, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.04587162, + "balance_loss_mlp": 1.03143013, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 2.256977148602171, + "language_loss": 0.76903212, + "learning_rate": 3.542579399075957e-06, + "loss": 0.7906478, + "num_input_tokens_seen": 87016845, + "step": 4042, + "time_per_iteration": 2.569761276245117 + }, + { + "auxiliary_loss_clip": 0.01057063, + "auxiliary_loss_mlp": 0.01043202, + "balance_loss_clip": 1.04021287, + "balance_loss_mlp": 1.0265367, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.7719663608843261, + "language_loss": 0.81366909, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83467168, + "num_input_tokens_seen": 87036270, + "step": 4043, + "time_per_iteration": 2.7886059284210205 + }, + { + "auxiliary_loss_clip": 0.01127236, + "auxiliary_loss_mlp": 0.01042171, + "balance_loss_clip": 1.04516268, + "balance_loss_mlp": 1.02312124, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 4.541554391675126, + "language_loss": 0.72898096, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75067502, + "num_input_tokens_seen": 87049920, + "step": 4044, + "time_per_iteration": 2.746976137161255 + }, + { + "auxiliary_loss_clip": 0.01136026, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_clip": 1.0505147, + "balance_loss_mlp": 1.02665639, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 3.5441585080877926, + "language_loss": 0.83414555, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85595441, + "num_input_tokens_seen": 87068230, + "step": 4045, + "time_per_iteration": 2.5220465660095215 + }, + { + "auxiliary_loss_clip": 0.01074822, + "auxiliary_loss_mlp": 0.01053925, + "balance_loss_clip": 1.04610133, + "balance_loss_mlp": 1.03605556, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 1.6956676374877926, + "language_loss": 0.86836648, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88965398, + "num_input_tokens_seen": 87086435, + "step": 4046, + "time_per_iteration": 4.190885066986084 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.01041554, + "balance_loss_clip": 1.04540849, + "balance_loss_mlp": 1.02366018, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 1.9145064895114765, + "language_loss": 0.73002982, + "learning_rate": 3.5413392369578e-06, + "loss": 0.7516557, + "num_input_tokens_seen": 87105340, + "step": 4047, + "time_per_iteration": 2.589425563812256 + }, + { + "auxiliary_loss_clip": 0.01125405, + "auxiliary_loss_mlp": 0.01042929, + "balance_loss_clip": 1.0438906, + "balance_loss_mlp": 1.02343869, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 3.0516921208697445, + "language_loss": 0.73176777, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75345117, + "num_input_tokens_seen": 87125780, + "step": 4048, + "time_per_iteration": 4.250971794128418 + }, + { + "auxiliary_loss_clip": 0.01115782, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_clip": 1.05109644, + "balance_loss_mlp": 1.03137147, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 2.1790879411260264, + "language_loss": 0.73616666, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75780988, + "num_input_tokens_seen": 87144470, + "step": 4049, + "time_per_iteration": 2.5434672832489014 + }, + { + "auxiliary_loss_clip": 0.01096816, + "auxiliary_loss_mlp": 0.01045613, + "balance_loss_clip": 1.0436132, + "balance_loss_mlp": 1.02817297, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 1.5889550570099689, + "language_loss": 0.7332288, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.7546531, + "num_input_tokens_seen": 87162830, + "step": 4050, + "time_per_iteration": 2.5515811443328857 + }, + { + "auxiliary_loss_clip": 0.01117504, + "auxiliary_loss_mlp": 0.01048288, + "balance_loss_clip": 1.04575443, + "balance_loss_mlp": 1.03192115, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 2.7855757500145804, + "language_loss": 0.7480222, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.76968014, + "num_input_tokens_seen": 87180905, + "step": 4051, + "time_per_iteration": 2.5186333656311035 + }, + { + "auxiliary_loss_clip": 0.0109674, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.04402125, + "balance_loss_mlp": 1.02807856, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 3.1019901831992227, + "language_loss": 0.70445859, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72588634, + "num_input_tokens_seen": 87202290, + "step": 4052, + "time_per_iteration": 2.6760833263397217 + }, + { + "auxiliary_loss_clip": 0.01119408, + "auxiliary_loss_mlp": 0.01048897, + "balance_loss_clip": 1.05068111, + "balance_loss_mlp": 1.03016973, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.5611641948993318, + "language_loss": 0.81102324, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83270633, + "num_input_tokens_seen": 87221650, + "step": 4053, + "time_per_iteration": 2.528942823410034 + }, + { + "auxiliary_loss_clip": 0.01150149, + "auxiliary_loss_mlp": 0.01038862, + "balance_loss_clip": 1.05146015, + "balance_loss_mlp": 1.02107549, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.5395377726838624, + "language_loss": 0.77595949, + "learning_rate": 3.539600555451172e-06, + "loss": 0.7978496, + "num_input_tokens_seen": 87238515, + "step": 4054, + "time_per_iteration": 2.4449665546417236 + }, + { + "auxiliary_loss_clip": 0.01099913, + "auxiliary_loss_mlp": 0.0105194, + "balance_loss_clip": 1.04160547, + "balance_loss_mlp": 1.03405905, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.6103165513563809, + "language_loss": 0.8405695, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.86208808, + "num_input_tokens_seen": 87256290, + "step": 4055, + "time_per_iteration": 3.970608711242676 + }, + { + "auxiliary_loss_clip": 0.01114022, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.04479003, + "balance_loss_mlp": 1.02364993, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 3.4050336794066633, + "language_loss": 0.55475003, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57631642, + "num_input_tokens_seen": 87277085, + "step": 4056, + "time_per_iteration": 2.6247010231018066 + }, + { + "auxiliary_loss_clip": 0.01139131, + "auxiliary_loss_mlp": 0.01049203, + "balance_loss_clip": 1.049492, + "balance_loss_mlp": 1.03107178, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.073432421915559, + "language_loss": 0.79910141, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82098478, + "num_input_tokens_seen": 87293020, + "step": 4057, + "time_per_iteration": 2.541788101196289 + }, + { + "auxiliary_loss_clip": 0.01133729, + "auxiliary_loss_mlp": 0.01046378, + "balance_loss_clip": 1.04920554, + "balance_loss_mlp": 1.02865183, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.8108758695624627, + "language_loss": 0.79428482, + "learning_rate": 3.538605738554673e-06, + "loss": 0.81608593, + "num_input_tokens_seen": 87311445, + "step": 4058, + "time_per_iteration": 2.4805445671081543 + }, + { + "auxiliary_loss_clip": 0.01148232, + "auxiliary_loss_mlp": 0.01040276, + "balance_loss_clip": 1.04815912, + "balance_loss_mlp": 1.02374196, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.6313073783041199, + "language_loss": 0.85517293, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87705803, + "num_input_tokens_seen": 87332055, + "step": 4059, + "time_per_iteration": 2.520454168319702 + }, + { + "auxiliary_loss_clip": 0.0112814, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.05001199, + "balance_loss_mlp": 1.02092361, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.6388366209804537, + "language_loss": 0.74379134, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.76544517, + "num_input_tokens_seen": 87351295, + "step": 4060, + "time_per_iteration": 2.5044443607330322 + }, + { + "auxiliary_loss_clip": 0.01116098, + "auxiliary_loss_mlp": 0.01051731, + "balance_loss_clip": 1.04829109, + "balance_loss_mlp": 1.03106022, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 1.7001649691604936, + "language_loss": 0.73293293, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75461119, + "num_input_tokens_seen": 87370650, + "step": 4061, + "time_per_iteration": 2.603724241256714 + }, + { + "auxiliary_loss_clip": 0.01146374, + "auxiliary_loss_mlp": 0.01048735, + "balance_loss_clip": 1.05247223, + "balance_loss_mlp": 1.0320456, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.7888163661201981, + "language_loss": 0.76539552, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78734666, + "num_input_tokens_seen": 87389020, + "step": 4062, + "time_per_iteration": 2.4604697227478027 + }, + { + "auxiliary_loss_clip": 0.01108272, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_clip": 1.04687643, + "balance_loss_mlp": 1.025056, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 1.687400962709504, + "language_loss": 0.85159308, + "learning_rate": 3.537360904763011e-06, + "loss": 0.8731029, + "num_input_tokens_seen": 87409695, + "step": 4063, + "time_per_iteration": 4.163123369216919 + }, + { + "auxiliary_loss_clip": 0.01121124, + "auxiliary_loss_mlp": 0.01049115, + "balance_loss_clip": 1.04588699, + "balance_loss_mlp": 1.02884936, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 7.897794688757461, + "language_loss": 0.68299818, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70470059, + "num_input_tokens_seen": 87428250, + "step": 4064, + "time_per_iteration": 2.5059359073638916 + }, + { + "auxiliary_loss_clip": 0.01141402, + "auxiliary_loss_mlp": 0.01041389, + "balance_loss_clip": 1.04874372, + "balance_loss_mlp": 1.02263772, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 16.9659805526246, + "language_loss": 0.69661289, + "learning_rate": 3.536862563102088e-06, + "loss": 0.71844077, + "num_input_tokens_seen": 87449380, + "step": 4065, + "time_per_iteration": 2.5247652530670166 + }, + { + "auxiliary_loss_clip": 0.01151561, + "auxiliary_loss_mlp": 0.01054247, + "balance_loss_clip": 1.04989219, + "balance_loss_mlp": 1.03371894, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 1.9092942912103346, + "language_loss": 0.84311265, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86517072, + "num_input_tokens_seen": 87465365, + "step": 4066, + "time_per_iteration": 2.433784246444702 + }, + { + "auxiliary_loss_clip": 0.01052274, + "auxiliary_loss_mlp": 0.01010943, + "balance_loss_clip": 1.01846123, + "balance_loss_mlp": 1.00852323, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7377317752600676, + "language_loss": 0.5226981, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54333025, + "num_input_tokens_seen": 87522525, + "step": 4067, + "time_per_iteration": 2.9336414337158203 + }, + { + "auxiliary_loss_clip": 0.01123767, + "auxiliary_loss_mlp": 0.01048656, + "balance_loss_clip": 1.05015647, + "balance_loss_mlp": 1.03015494, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 2.7562367114746666, + "language_loss": 0.72533035, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74705458, + "num_input_tokens_seen": 87539170, + "step": 4068, + "time_per_iteration": 2.4989075660705566 + }, + { + "auxiliary_loss_clip": 0.01091531, + "auxiliary_loss_mlp": 0.01048781, + "balance_loss_clip": 1.04630554, + "balance_loss_mlp": 1.03016031, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.5396002268041764, + "language_loss": 0.77515262, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79655576, + "num_input_tokens_seen": 87558875, + "step": 4069, + "time_per_iteration": 2.6569020748138428 + }, + { + "auxiliary_loss_clip": 0.01120672, + "auxiliary_loss_mlp": 0.01050947, + "balance_loss_clip": 1.05181193, + "balance_loss_mlp": 1.03240955, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 2.2769756891388435, + "language_loss": 0.8041662, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82588243, + "num_input_tokens_seen": 87576485, + "step": 4070, + "time_per_iteration": 2.521566867828369 + }, + { + "auxiliary_loss_clip": 0.01125636, + "auxiliary_loss_mlp": 0.01049579, + "balance_loss_clip": 1.04445946, + "balance_loss_mlp": 1.03099465, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.5174293505773042, + "language_loss": 0.84430796, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86606008, + "num_input_tokens_seen": 87598620, + "step": 4071, + "time_per_iteration": 2.566999912261963 + }, + { + "auxiliary_loss_clip": 0.01121939, + "auxiliary_loss_mlp": 0.0105784, + "balance_loss_clip": 1.04651046, + "balance_loss_mlp": 1.03540468, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.9733137947402528, + "language_loss": 0.80045998, + "learning_rate": 3.535116532028798e-06, + "loss": 0.82225776, + "num_input_tokens_seen": 87616595, + "step": 4072, + "time_per_iteration": 2.4996860027313232 + }, + { + "auxiliary_loss_clip": 0.01132938, + "auxiliary_loss_mlp": 0.01047307, + "balance_loss_clip": 1.05012178, + "balance_loss_mlp": 1.03041458, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 1.675709668467361, + "language_loss": 0.70226383, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.72406626, + "num_input_tokens_seen": 87635755, + "step": 4073, + "time_per_iteration": 2.4850869178771973 + }, + { + "auxiliary_loss_clip": 0.01113527, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.04625535, + "balance_loss_mlp": 1.02787209, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.294270697823174, + "language_loss": 0.67366701, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69525671, + "num_input_tokens_seen": 87652885, + "step": 4074, + "time_per_iteration": 2.527615785598755 + }, + { + "auxiliary_loss_clip": 0.01056262, + "auxiliary_loss_mlp": 0.01001654, + "balance_loss_clip": 1.02238941, + "balance_loss_mlp": 0.99928206, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.8990971800538426, + "language_loss": 0.68729347, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70787263, + "num_input_tokens_seen": 87713220, + "step": 4075, + "time_per_iteration": 3.1044726371765137 + }, + { + "auxiliary_loss_clip": 0.01148195, + "auxiliary_loss_mlp": 0.01045433, + "balance_loss_clip": 1.05317748, + "balance_loss_mlp": 1.02730155, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 1.7728285593327922, + "language_loss": 0.79711652, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81905282, + "num_input_tokens_seen": 87732680, + "step": 4076, + "time_per_iteration": 2.4824330806732178 + }, + { + "auxiliary_loss_clip": 0.01131127, + "auxiliary_loss_mlp": 0.00787409, + "balance_loss_clip": 1.04907799, + "balance_loss_mlp": 1.00149274, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 2.336632638921792, + "language_loss": 0.82039678, + "learning_rate": 3.533867620434151e-06, + "loss": 0.83958209, + "num_input_tokens_seen": 87751880, + "step": 4077, + "time_per_iteration": 2.5365869998931885 + }, + { + "auxiliary_loss_clip": 0.01149575, + "auxiliary_loss_mlp": 0.01045847, + "balance_loss_clip": 1.05139184, + "balance_loss_mlp": 1.02614212, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 2.0194377069212366, + "language_loss": 0.62104028, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64299452, + "num_input_tokens_seen": 87771795, + "step": 4078, + "time_per_iteration": 2.510728597640991 + }, + { + "auxiliary_loss_clip": 0.01118229, + "auxiliary_loss_mlp": 0.01040086, + "balance_loss_clip": 1.05233479, + "balance_loss_mlp": 1.02257466, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.6107481715935146, + "language_loss": 0.757918, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77950114, + "num_input_tokens_seen": 87793640, + "step": 4079, + "time_per_iteration": 2.549133777618408 + }, + { + "auxiliary_loss_clip": 0.01145693, + "auxiliary_loss_mlp": 0.01044421, + "balance_loss_clip": 1.05027103, + "balance_loss_mlp": 1.0259788, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 2.3234007499484304, + "language_loss": 0.7486918, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77059293, + "num_input_tokens_seen": 87812390, + "step": 4080, + "time_per_iteration": 2.4208335876464844 + }, + { + "auxiliary_loss_clip": 0.01116131, + "auxiliary_loss_mlp": 0.01044408, + "balance_loss_clip": 1.04635811, + "balance_loss_mlp": 1.02702713, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 1.818076147502872, + "language_loss": 0.82850385, + "learning_rate": 3.532867444142186e-06, + "loss": 0.85010922, + "num_input_tokens_seen": 87830640, + "step": 4081, + "time_per_iteration": 2.497587203979492 + }, + { + "auxiliary_loss_clip": 0.01121317, + "auxiliary_loss_mlp": 0.0104075, + "balance_loss_clip": 1.05283165, + "balance_loss_mlp": 1.02353585, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 4.273985115378027, + "language_loss": 0.73154724, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75316787, + "num_input_tokens_seen": 87850450, + "step": 4082, + "time_per_iteration": 2.6383941173553467 + }, + { + "auxiliary_loss_clip": 0.01107301, + "auxiliary_loss_mlp": 0.01046468, + "balance_loss_clip": 1.04567182, + "balance_loss_mlp": 1.03015971, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.6977085381172545, + "language_loss": 0.72034729, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.74188495, + "num_input_tokens_seen": 87868810, + "step": 4083, + "time_per_iteration": 2.5452935695648193 + }, + { + "auxiliary_loss_clip": 0.01118442, + "auxiliary_loss_mlp": 0.01047372, + "balance_loss_clip": 1.04433024, + "balance_loss_mlp": 1.02710676, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 1.9319337913067163, + "language_loss": 0.74761081, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76926899, + "num_input_tokens_seen": 87885685, + "step": 4084, + "time_per_iteration": 2.5025360584259033 + }, + { + "auxiliary_loss_clip": 0.01130429, + "auxiliary_loss_mlp": 0.01037909, + "balance_loss_clip": 1.04646611, + "balance_loss_mlp": 1.02017069, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 2.243515042702007, + "language_loss": 0.85635114, + "learning_rate": 3.531866337826471e-06, + "loss": 0.87803459, + "num_input_tokens_seen": 87903715, + "step": 4085, + "time_per_iteration": 2.4919204711914062 + }, + { + "auxiliary_loss_clip": 0.01115941, + "auxiliary_loss_mlp": 0.01053625, + "balance_loss_clip": 1.05177784, + "balance_loss_mlp": 1.03623235, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 1.741941368235826, + "language_loss": 0.79711175, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.81880742, + "num_input_tokens_seen": 87923375, + "step": 4086, + "time_per_iteration": 4.039304971694946 + }, + { + "auxiliary_loss_clip": 0.01089519, + "auxiliary_loss_mlp": 0.01048124, + "balance_loss_clip": 1.04738116, + "balance_loss_mlp": 1.03082669, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.5821801124251937, + "language_loss": 0.74945283, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77082932, + "num_input_tokens_seen": 87943115, + "step": 4087, + "time_per_iteration": 2.645427942276001 + }, + { + "auxiliary_loss_clip": 0.01094693, + "auxiliary_loss_mlp": 0.01048539, + "balance_loss_clip": 1.05102313, + "balance_loss_mlp": 1.0295732, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.5899489107937312, + "language_loss": 0.79427809, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81571037, + "num_input_tokens_seen": 87959505, + "step": 4088, + "time_per_iteration": 4.042985916137695 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01035479, + "balance_loss_clip": 1.045946, + "balance_loss_mlp": 1.01923084, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.6554192693224792, + "language_loss": 0.76726019, + "learning_rate": 3.5308643020944e-06, + "loss": 0.78865945, + "num_input_tokens_seen": 87979725, + "step": 4089, + "time_per_iteration": 2.590350866317749 + }, + { + "auxiliary_loss_clip": 0.01132665, + "auxiliary_loss_mlp": 0.01044335, + "balance_loss_clip": 1.0525775, + "balance_loss_mlp": 1.0265255, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 1.8940534977346029, + "language_loss": 0.81645125, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83822125, + "num_input_tokens_seen": 87998270, + "step": 4090, + "time_per_iteration": 2.6780149936676025 + }, + { + "auxiliary_loss_clip": 0.01125615, + "auxiliary_loss_mlp": 0.01051779, + "balance_loss_clip": 1.0478853, + "balance_loss_mlp": 1.03321767, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.682224716884262, + "language_loss": 0.72777146, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.7495454, + "num_input_tokens_seen": 88016760, + "step": 4091, + "time_per_iteration": 2.515341281890869 + }, + { + "auxiliary_loss_clip": 0.01116797, + "auxiliary_loss_mlp": 0.01047991, + "balance_loss_clip": 1.05510926, + "balance_loss_mlp": 1.02946591, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.8987757681450268, + "language_loss": 0.77127588, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.79292375, + "num_input_tokens_seen": 88036465, + "step": 4092, + "time_per_iteration": 2.5492148399353027 + }, + { + "auxiliary_loss_clip": 0.01116645, + "auxiliary_loss_mlp": 0.01045711, + "balance_loss_clip": 1.04164541, + "balance_loss_mlp": 1.02754354, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 2.506487543320959, + "language_loss": 0.80881894, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83044243, + "num_input_tokens_seen": 88053270, + "step": 4093, + "time_per_iteration": 2.519472360610962 + }, + { + "auxiliary_loss_clip": 0.01139946, + "auxiliary_loss_mlp": 0.01043324, + "balance_loss_clip": 1.05068851, + "balance_loss_mlp": 1.0252043, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 1.955190690307996, + "language_loss": 0.87523466, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89706737, + "num_input_tokens_seen": 88072305, + "step": 4094, + "time_per_iteration": 3.899768829345703 + }, + { + "auxiliary_loss_clip": 0.01005752, + "auxiliary_loss_mlp": 0.01116061, + "balance_loss_clip": 1.03041148, + "balance_loss_mlp": 1.11378407, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.8030985120819645, + "language_loss": 0.57494569, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59616387, + "num_input_tokens_seen": 88137995, + "step": 4095, + "time_per_iteration": 3.3565030097961426 + }, + { + "auxiliary_loss_clip": 0.01031958, + "auxiliary_loss_mlp": 0.01029241, + "balance_loss_clip": 1.01587057, + "balance_loss_mlp": 1.02679753, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.6590019384024876, + "language_loss": 0.56268418, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58329618, + "num_input_tokens_seen": 88208490, + "step": 4096, + "time_per_iteration": 3.3711912631988525 + }, + { + "auxiliary_loss_clip": 0.01128621, + "auxiliary_loss_mlp": 0.01036958, + "balance_loss_clip": 1.05288029, + "balance_loss_mlp": 1.01960135, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 4.574080878588904, + "language_loss": 0.77347946, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79513526, + "num_input_tokens_seen": 88228050, + "step": 4097, + "time_per_iteration": 2.561404228210449 + }, + { + "auxiliary_loss_clip": 0.0111678, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.04944372, + "balance_loss_mlp": 1.02417767, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 1.7091019184109881, + "language_loss": 0.76671791, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78832704, + "num_input_tokens_seen": 88248090, + "step": 4098, + "time_per_iteration": 2.570596218109131 + }, + { + "auxiliary_loss_clip": 0.01130425, + "auxiliary_loss_mlp": 0.01044444, + "balance_loss_clip": 1.05291057, + "balance_loss_mlp": 1.02742112, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 3.6459953179640046, + "language_loss": 0.68230039, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70404911, + "num_input_tokens_seen": 88267545, + "step": 4099, + "time_per_iteration": 2.5969719886779785 + }, + { + "auxiliary_loss_clip": 0.01133161, + "auxiliary_loss_mlp": 0.01044569, + "balance_loss_clip": 1.05055559, + "balance_loss_mlp": 1.02737856, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.459602579562185, + "language_loss": 0.66610652, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68788379, + "num_input_tokens_seen": 88289785, + "step": 4100, + "time_per_iteration": 2.5862696170806885 + }, + { + "auxiliary_loss_clip": 0.01037877, + "auxiliary_loss_mlp": 0.01094473, + "balance_loss_clip": 1.02081227, + "balance_loss_mlp": 1.0917666, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7926584349133917, + "language_loss": 0.61525375, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63657731, + "num_input_tokens_seen": 88357320, + "step": 4101, + "time_per_iteration": 3.166417360305786 + }, + { + "auxiliary_loss_clip": 0.01144752, + "auxiliary_loss_mlp": 0.01052313, + "balance_loss_clip": 1.05002916, + "balance_loss_mlp": 1.03383517, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 1.7409485708673036, + "language_loss": 0.73107541, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75304604, + "num_input_tokens_seen": 88377040, + "step": 4102, + "time_per_iteration": 2.4484896659851074 + }, + { + "auxiliary_loss_clip": 0.0112728, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_clip": 1.05047107, + "balance_loss_mlp": 1.0265255, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.1377516670961216, + "language_loss": 0.75998056, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.78168762, + "num_input_tokens_seen": 88395085, + "step": 4103, + "time_per_iteration": 4.121765851974487 + }, + { + "auxiliary_loss_clip": 0.01131496, + "auxiliary_loss_mlp": 0.01045431, + "balance_loss_clip": 1.04976857, + "balance_loss_mlp": 1.02652407, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.021924873729905, + "language_loss": 0.78400213, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80577135, + "num_input_tokens_seen": 88413205, + "step": 4104, + "time_per_iteration": 2.4922049045562744 + }, + { + "auxiliary_loss_clip": 0.01133728, + "auxiliary_loss_mlp": 0.0104524, + "balance_loss_clip": 1.05030036, + "balance_loss_mlp": 1.02582073, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.8114670037278797, + "language_loss": 0.83867753, + "learning_rate": 3.526846877170133e-06, + "loss": 0.8604672, + "num_input_tokens_seen": 88431525, + "step": 4105, + "time_per_iteration": 2.4837937355041504 + }, + { + "auxiliary_loss_clip": 0.01150141, + "auxiliary_loss_mlp": 0.01046017, + "balance_loss_clip": 1.05453086, + "balance_loss_mlp": 1.02906573, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.8105246858956103, + "language_loss": 0.7639094, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78587103, + "num_input_tokens_seen": 88451210, + "step": 4106, + "time_per_iteration": 2.4377570152282715 + }, + { + "auxiliary_loss_clip": 0.01108404, + "auxiliary_loss_mlp": 0.01064903, + "balance_loss_clip": 1.04208755, + "balance_loss_mlp": 1.04344559, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.7100663994056617, + "language_loss": 0.72314978, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74488282, + "num_input_tokens_seen": 88467790, + "step": 4107, + "time_per_iteration": 2.462275505065918 + }, + { + "auxiliary_loss_clip": 0.01148061, + "auxiliary_loss_mlp": 0.01045021, + "balance_loss_clip": 1.05187297, + "balance_loss_mlp": 1.02772331, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 1.4534175311103579, + "language_loss": 0.65403277, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67596358, + "num_input_tokens_seen": 88490330, + "step": 4108, + "time_per_iteration": 2.525167942047119 + }, + { + "auxiliary_loss_clip": 0.0110192, + "auxiliary_loss_mlp": 0.01050509, + "balance_loss_clip": 1.04969633, + "balance_loss_mlp": 1.03211498, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 1.7697859874957254, + "language_loss": 0.72702861, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.74855292, + "num_input_tokens_seen": 88512435, + "step": 4109, + "time_per_iteration": 2.826651096343994 + }, + { + "auxiliary_loss_clip": 0.01115029, + "auxiliary_loss_mlp": 0.01052537, + "balance_loss_clip": 1.04954815, + "balance_loss_mlp": 1.03507268, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 1.7890089239511489, + "language_loss": 0.78913975, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81081545, + "num_input_tokens_seen": 88529780, + "step": 4110, + "time_per_iteration": 2.545719623565674 + }, + { + "auxiliary_loss_clip": 0.0111719, + "auxiliary_loss_mlp": 0.01045418, + "balance_loss_clip": 1.04742265, + "balance_loss_mlp": 1.02667832, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.1152600594234183, + "language_loss": 0.8093822, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.83100832, + "num_input_tokens_seen": 88547200, + "step": 4111, + "time_per_iteration": 2.5324699878692627 + }, + { + "auxiliary_loss_clip": 0.01146089, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.05009341, + "balance_loss_mlp": 1.02877235, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 1.8387730146513166, + "language_loss": 0.75013602, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.772053, + "num_input_tokens_seen": 88566415, + "step": 4112, + "time_per_iteration": 2.464177131652832 + }, + { + "auxiliary_loss_clip": 0.0111308, + "auxiliary_loss_mlp": 0.00786869, + "balance_loss_clip": 1.04773045, + "balance_loss_mlp": 1.00111353, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 1.789575637676334, + "language_loss": 0.82921946, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.84821898, + "num_input_tokens_seen": 88585225, + "step": 4113, + "time_per_iteration": 2.5555763244628906 + }, + { + "auxiliary_loss_clip": 0.01144392, + "auxiliary_loss_mlp": 0.01045236, + "balance_loss_clip": 1.04891622, + "balance_loss_mlp": 1.02785575, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 5.629612688378033, + "language_loss": 0.87289149, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89478773, + "num_input_tokens_seen": 88603280, + "step": 4114, + "time_per_iteration": 2.4236154556274414 + }, + { + "auxiliary_loss_clip": 0.01097472, + "auxiliary_loss_mlp": 0.0104326, + "balance_loss_clip": 1.04750109, + "balance_loss_mlp": 1.02616525, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 1.531667666144519, + "language_loss": 0.75648499, + "learning_rate": 3.524328457352734e-06, + "loss": 0.77789235, + "num_input_tokens_seen": 88624925, + "step": 4115, + "time_per_iteration": 2.6183431148529053 + }, + { + "auxiliary_loss_clip": 0.01013397, + "auxiliary_loss_mlp": 0.01027936, + "balance_loss_clip": 1.02455175, + "balance_loss_mlp": 1.02598143, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6687648996121698, + "language_loss": 0.58223546, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60264874, + "num_input_tokens_seen": 88691475, + "step": 4116, + "time_per_iteration": 3.3283307552337646 + }, + { + "auxiliary_loss_clip": 0.01120794, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.04627514, + "balance_loss_mlp": 1.02181959, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.4582272695639682, + "language_loss": 0.83635712, + "learning_rate": 3.523824079451235e-06, + "loss": 0.85795051, + "num_input_tokens_seen": 88713425, + "step": 4117, + "time_per_iteration": 2.6872880458831787 + }, + { + "auxiliary_loss_clip": 0.01040621, + "auxiliary_loss_mlp": 0.00760201, + "balance_loss_clip": 1.02594781, + "balance_loss_mlp": 1.00158966, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.8989257342513636, + "language_loss": 0.63396072, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65196896, + "num_input_tokens_seen": 88769995, + "step": 4118, + "time_per_iteration": 2.9409563541412354 + }, + { + "auxiliary_loss_clip": 0.01129936, + "auxiliary_loss_mlp": 0.01049679, + "balance_loss_clip": 1.04528058, + "balance_loss_mlp": 1.03226244, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 1.5753312712900385, + "language_loss": 0.79491276, + "learning_rate": 3.523319470415491e-06, + "loss": 0.81670886, + "num_input_tokens_seen": 88789970, + "step": 4119, + "time_per_iteration": 2.505052089691162 + }, + { + "auxiliary_loss_clip": 0.01132037, + "auxiliary_loss_mlp": 0.01042026, + "balance_loss_clip": 1.05001092, + "balance_loss_mlp": 1.02524114, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.4286655070347505, + "language_loss": 0.74281567, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76455629, + "num_input_tokens_seen": 88810000, + "step": 4120, + "time_per_iteration": 2.4698922634124756 + }, + { + "auxiliary_loss_clip": 0.01136815, + "auxiliary_loss_mlp": 0.010507, + "balance_loss_clip": 1.04904664, + "balance_loss_mlp": 1.03254437, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 2.0631644640036613, + "language_loss": 0.87732643, + "learning_rate": 3.522814630322041e-06, + "loss": 0.89920163, + "num_input_tokens_seen": 88827515, + "step": 4121, + "time_per_iteration": 2.446699619293213 + }, + { + "auxiliary_loss_clip": 0.01145884, + "auxiliary_loss_mlp": 0.01042316, + "balance_loss_clip": 1.04845667, + "balance_loss_mlp": 1.02348137, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 2.256536098881901, + "language_loss": 0.6921463, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71402836, + "num_input_tokens_seen": 88845025, + "step": 4122, + "time_per_iteration": 2.444694757461548 + }, + { + "auxiliary_loss_clip": 0.01146174, + "auxiliary_loss_mlp": 0.01042523, + "balance_loss_clip": 1.04915643, + "balance_loss_mlp": 1.02331877, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.48407397798051, + "language_loss": 0.8024286, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82431555, + "num_input_tokens_seen": 88861740, + "step": 4123, + "time_per_iteration": 2.425071954727173 + }, + { + "auxiliary_loss_clip": 0.01086451, + "auxiliary_loss_mlp": 0.01047337, + "balance_loss_clip": 1.04686618, + "balance_loss_mlp": 1.0299207, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 3.1599627647768527, + "language_loss": 0.75027794, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77161586, + "num_input_tokens_seen": 88879740, + "step": 4124, + "time_per_iteration": 2.5965778827667236 + }, + { + "auxiliary_loss_clip": 0.01130909, + "auxiliary_loss_mlp": 0.01043704, + "balance_loss_clip": 1.04726934, + "balance_loss_mlp": 1.02690744, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.460740928825043, + "language_loss": 0.73694617, + "learning_rate": 3.521804257268357e-06, + "loss": 0.75869232, + "num_input_tokens_seen": 88904095, + "step": 4125, + "time_per_iteration": 4.07455039024353 + }, + { + "auxiliary_loss_clip": 0.01111089, + "auxiliary_loss_mlp": 0.00786054, + "balance_loss_clip": 1.04311907, + "balance_loss_mlp": 1.00124836, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 1.7969014610098746, + "language_loss": 0.69177067, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.710742, + "num_input_tokens_seen": 88920740, + "step": 4126, + "time_per_iteration": 2.540375232696533 + }, + { + "auxiliary_loss_clip": 0.01133479, + "auxiliary_loss_mlp": 0.010489, + "balance_loss_clip": 1.04698801, + "balance_loss_mlp": 1.03134012, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.0164070033485895, + "language_loss": 0.81044662, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83227032, + "num_input_tokens_seen": 88938510, + "step": 4127, + "time_per_iteration": 4.378163814544678 + }, + { + "auxiliary_loss_clip": 0.01137557, + "auxiliary_loss_mlp": 0.00784331, + "balance_loss_clip": 1.04970527, + "balance_loss_mlp": 1.00107968, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 2.239419356876861, + "language_loss": 0.84500039, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86421925, + "num_input_tokens_seen": 88955235, + "step": 4128, + "time_per_iteration": 2.507948875427246 + }, + { + "auxiliary_loss_clip": 0.01115757, + "auxiliary_loss_mlp": 0.01052315, + "balance_loss_clip": 1.04419982, + "balance_loss_mlp": 1.03338408, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 2.1058307251037616, + "language_loss": 0.65754962, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67923033, + "num_input_tokens_seen": 88975210, + "step": 4129, + "time_per_iteration": 2.5595481395721436 + }, + { + "auxiliary_loss_clip": 0.01097045, + "auxiliary_loss_mlp": 0.01048716, + "balance_loss_clip": 1.04416871, + "balance_loss_mlp": 1.03016663, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 1.5764546328199867, + "language_loss": 0.75176048, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77321804, + "num_input_tokens_seen": 88996120, + "step": 4130, + "time_per_iteration": 2.634805917739868 + }, + { + "auxiliary_loss_clip": 0.01079646, + "auxiliary_loss_mlp": 0.01046709, + "balance_loss_clip": 1.04184222, + "balance_loss_mlp": 1.02854133, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 1.989733147552307, + "language_loss": 0.76991093, + "learning_rate": 3.520286966670535e-06, + "loss": 0.79117453, + "num_input_tokens_seen": 89008685, + "step": 4131, + "time_per_iteration": 2.6474082469940186 + }, + { + "auxiliary_loss_clip": 0.01128856, + "auxiliary_loss_mlp": 0.0104234, + "balance_loss_clip": 1.04700875, + "balance_loss_mlp": 1.02578211, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.6666035513133508, + "language_loss": 0.83742237, + "learning_rate": 3.520033883075255e-06, + "loss": 0.85913432, + "num_input_tokens_seen": 89031160, + "step": 4132, + "time_per_iteration": 2.5598180294036865 + }, + { + "auxiliary_loss_clip": 0.01120405, + "auxiliary_loss_mlp": 0.01039366, + "balance_loss_clip": 1.04511225, + "balance_loss_mlp": 1.0211513, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 2.381358315831822, + "language_loss": 0.70871937, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73031706, + "num_input_tokens_seen": 89047235, + "step": 4133, + "time_per_iteration": 2.5035240650177 + }, + { + "auxiliary_loss_clip": 0.01152281, + "auxiliary_loss_mlp": 0.01043222, + "balance_loss_clip": 1.05033064, + "balance_loss_mlp": 1.02241993, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.43523717236909, + "language_loss": 0.61959159, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.64154661, + "num_input_tokens_seen": 89064790, + "step": 4134, + "time_per_iteration": 3.9190409183502197 + }, + { + "auxiliary_loss_clip": 0.01138248, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.04977107, + "balance_loss_mlp": 1.02081931, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 1.8567032882001362, + "language_loss": 0.78642845, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.8081885, + "num_input_tokens_seen": 89083250, + "step": 4135, + "time_per_iteration": 2.535567045211792 + }, + { + "auxiliary_loss_clip": 0.01126013, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.05382919, + "balance_loss_mlp": 1.01949716, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.2760871264757916, + "language_loss": 0.83084732, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.85246664, + "num_input_tokens_seen": 89100905, + "step": 4136, + "time_per_iteration": 2.483304023742676 + }, + { + "auxiliary_loss_clip": 0.01117732, + "auxiliary_loss_mlp": 0.01043138, + "balance_loss_clip": 1.04868555, + "balance_loss_mlp": 1.02559066, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.7619907470356395, + "language_loss": 0.71101105, + "learning_rate": 3.518767600693314e-06, + "loss": 0.73261982, + "num_input_tokens_seen": 89122630, + "step": 4137, + "time_per_iteration": 2.6622796058654785 + }, + { + "auxiliary_loss_clip": 0.0113645, + "auxiliary_loss_mlp": 0.00784337, + "balance_loss_clip": 1.04624128, + "balance_loss_mlp": 1.00094926, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 2.1140374610598123, + "language_loss": 0.67021823, + "learning_rate": 3.518514171403042e-06, + "loss": 0.68942606, + "num_input_tokens_seen": 89141050, + "step": 4138, + "time_per_iteration": 2.4742252826690674 + }, + { + "auxiliary_loss_clip": 0.01106394, + "auxiliary_loss_mlp": 0.01037103, + "balance_loss_clip": 1.04858387, + "balance_loss_mlp": 1.02128971, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 1.7724721539794022, + "language_loss": 0.8386035, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86003852, + "num_input_tokens_seen": 89160810, + "step": 4139, + "time_per_iteration": 2.603557825088501 + }, + { + "auxiliary_loss_clip": 0.01115399, + "auxiliary_loss_mlp": 0.01044542, + "balance_loss_clip": 1.04685235, + "balance_loss_mlp": 1.02626765, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.6030194203217032, + "language_loss": 0.78804374, + "learning_rate": 3.518007140085481e-06, + "loss": 0.80964321, + "num_input_tokens_seen": 89180610, + "step": 4140, + "time_per_iteration": 2.567558765411377 + }, + { + "auxiliary_loss_clip": 0.01047752, + "auxiliary_loss_mlp": 0.01008902, + "balance_loss_clip": 1.02925634, + "balance_loss_mlp": 1.00706673, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8270642982184896, + "language_loss": 0.60928965, + "learning_rate": 3.51775353807742e-06, + "loss": 0.62985623, + "num_input_tokens_seen": 89241880, + "step": 4141, + "time_per_iteration": 3.1445744037628174 + }, + { + "auxiliary_loss_clip": 0.0114887, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_clip": 1.05118203, + "balance_loss_mlp": 1.03528881, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 1.9385643741763774, + "language_loss": 0.72641498, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.7484259, + "num_input_tokens_seen": 89263340, + "step": 4142, + "time_per_iteration": 4.185422897338867 + }, + { + "auxiliary_loss_clip": 0.0113446, + "auxiliary_loss_mlp": 0.01047854, + "balance_loss_clip": 1.04732716, + "balance_loss_mlp": 1.03061688, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 1.926188361128317, + "language_loss": 0.81185842, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83368158, + "num_input_tokens_seen": 89282870, + "step": 4143, + "time_per_iteration": 2.505704402923584 + }, + { + "auxiliary_loss_clip": 0.01117894, + "auxiliary_loss_mlp": 0.01044595, + "balance_loss_clip": 1.04662013, + "balance_loss_mlp": 1.02929449, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 2.0642678505798697, + "language_loss": 0.5876537, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.60927862, + "num_input_tokens_seen": 89303830, + "step": 4144, + "time_per_iteration": 2.571864604949951 + }, + { + "auxiliary_loss_clip": 0.01131527, + "auxiliary_loss_mlp": 0.01053518, + "balance_loss_clip": 1.04482603, + "balance_loss_mlp": 1.03642356, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 2.0874001902149164, + "language_loss": 0.78600854, + "learning_rate": 3.516738554607708e-06, + "loss": 0.80785894, + "num_input_tokens_seen": 89324350, + "step": 4145, + "time_per_iteration": 2.5839807987213135 + }, + { + "auxiliary_loss_clip": 0.01143602, + "auxiliary_loss_mlp": 0.0078868, + "balance_loss_clip": 1.050946, + "balance_loss_mlp": 1.00113451, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.0590884087701635, + "language_loss": 0.65526175, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.67458463, + "num_input_tokens_seen": 89342875, + "step": 4146, + "time_per_iteration": 2.5650393962860107 + }, + { + "auxiliary_loss_clip": 0.01032277, + "auxiliary_loss_mlp": 0.01056219, + "balance_loss_clip": 1.0200038, + "balance_loss_mlp": 1.05412054, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 0.9656859858799551, + "language_loss": 0.67293644, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69382131, + "num_input_tokens_seen": 89404925, + "step": 4147, + "time_per_iteration": 3.222310781478882 + }, + { + "auxiliary_loss_clip": 0.01125932, + "auxiliary_loss_mlp": 0.0105816, + "balance_loss_clip": 1.05036271, + "balance_loss_mlp": 1.04080296, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 1.7371571027323902, + "language_loss": 0.89302826, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91486913, + "num_input_tokens_seen": 89425090, + "step": 4148, + "time_per_iteration": 2.6051135063171387 + }, + { + "auxiliary_loss_clip": 0.01102773, + "auxiliary_loss_mlp": 0.01056319, + "balance_loss_clip": 1.04796839, + "balance_loss_mlp": 1.03586292, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 1.8724585170956858, + "language_loss": 0.6799159, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70150679, + "num_input_tokens_seen": 89442615, + "step": 4149, + "time_per_iteration": 2.6062862873077393 + }, + { + "auxiliary_loss_clip": 0.01133905, + "auxiliary_loss_mlp": 0.01047018, + "balance_loss_clip": 1.04876518, + "balance_loss_mlp": 1.03022158, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 2.2937068122213295, + "language_loss": 0.71221924, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73402846, + "num_input_tokens_seen": 89463025, + "step": 4150, + "time_per_iteration": 2.543612480163574 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01049484, + "balance_loss_clip": 1.04427481, + "balance_loss_mlp": 1.03135228, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 1.710900516446868, + "language_loss": 0.7278316, + "learning_rate": 3.515214354149478e-06, + "loss": 0.74932957, + "num_input_tokens_seen": 89480225, + "step": 4151, + "time_per_iteration": 2.5709688663482666 + }, + { + "auxiliary_loss_clip": 0.01140287, + "auxiliary_loss_mlp": 0.01054067, + "balance_loss_clip": 1.04732513, + "balance_loss_mlp": 1.03573215, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 3.5529739166671934, + "language_loss": 0.63243288, + "learning_rate": 3.514960119583781e-06, + "loss": 0.65437639, + "num_input_tokens_seen": 89496985, + "step": 4152, + "time_per_iteration": 2.50262188911438 + }, + { + "auxiliary_loss_clip": 0.01128917, + "auxiliary_loss_mlp": 0.01037434, + "balance_loss_clip": 1.0503943, + "balance_loss_mlp": 1.02054203, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 1.9489625081964501, + "language_loss": 0.76844442, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79010797, + "num_input_tokens_seen": 89514420, + "step": 4153, + "time_per_iteration": 2.4909799098968506 + }, + { + "auxiliary_loss_clip": 0.01135626, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.05069375, + "balance_loss_mlp": 1.02298963, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 2.8132118623314253, + "language_loss": 0.76224923, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78401411, + "num_input_tokens_seen": 89532925, + "step": 4154, + "time_per_iteration": 2.5074691772460938 + }, + { + "auxiliary_loss_clip": 0.01134764, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_clip": 1.04821754, + "balance_loss_mlp": 1.02356017, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.1288931782458804, + "language_loss": 0.71065563, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.7324295, + "num_input_tokens_seen": 89552855, + "step": 4155, + "time_per_iteration": 2.609874963760376 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01050136, + "balance_loss_clip": 1.05087793, + "balance_loss_mlp": 1.03176618, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.5739355910511612, + "language_loss": 0.74567425, + "learning_rate": 3.513942606943036e-06, + "loss": 0.76746368, + "num_input_tokens_seen": 89572830, + "step": 4156, + "time_per_iteration": 2.5484790802001953 + }, + { + "auxiliary_loss_clip": 0.01128662, + "auxiliary_loss_mlp": 0.01041024, + "balance_loss_clip": 1.04823875, + "balance_loss_mlp": 1.02423894, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 2.0954197465778357, + "language_loss": 0.76873779, + "learning_rate": 3.513688085236591e-06, + "loss": 0.7904346, + "num_input_tokens_seen": 89590345, + "step": 4157, + "time_per_iteration": 2.491041421890259 + }, + { + "auxiliary_loss_clip": 0.01087233, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_clip": 1.04322696, + "balance_loss_mlp": 1.0295341, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.6100357084767871, + "language_loss": 0.81269139, + "learning_rate": 3.513433506130942e-06, + "loss": 0.83403134, + "num_input_tokens_seen": 89610295, + "step": 4158, + "time_per_iteration": 2.606600522994995 + }, + { + "auxiliary_loss_clip": 0.01116661, + "auxiliary_loss_mlp": 0.0103794, + "balance_loss_clip": 1.04655612, + "balance_loss_mlp": 1.02080941, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 1.77115476708043, + "language_loss": 0.75395298, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.77549899, + "num_input_tokens_seen": 89627795, + "step": 4159, + "time_per_iteration": 2.4825422763824463 + }, + { + "auxiliary_loss_clip": 0.01139697, + "auxiliary_loss_mlp": 0.01040514, + "balance_loss_clip": 1.04982269, + "balance_loss_mlp": 1.02226353, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 1.7484919426236962, + "language_loss": 0.71440315, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73620534, + "num_input_tokens_seen": 89648090, + "step": 4160, + "time_per_iteration": 2.5198967456817627 + }, + { + "auxiliary_loss_clip": 0.0105259, + "auxiliary_loss_mlp": 0.01024224, + "balance_loss_clip": 1.0184257, + "balance_loss_mlp": 1.02188742, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.8346721974530318, + "language_loss": 0.56744039, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.5882085, + "num_input_tokens_seen": 89710345, + "step": 4161, + "time_per_iteration": 3.0833563804626465 + }, + { + "auxiliary_loss_clip": 0.01143618, + "auxiliary_loss_mlp": 0.01045803, + "balance_loss_clip": 1.0534637, + "balance_loss_mlp": 1.02801728, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 2.338033058999873, + "language_loss": 0.80824262, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.83013684, + "num_input_tokens_seen": 89729390, + "step": 4162, + "time_per_iteration": 2.4879071712493896 + }, + { + "auxiliary_loss_clip": 0.01128047, + "auxiliary_loss_mlp": 0.00786669, + "balance_loss_clip": 1.04717684, + "balance_loss_mlp": 1.00113881, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.7406954392949183, + "language_loss": 0.87417394, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.8933211, + "num_input_tokens_seen": 89742805, + "step": 4163, + "time_per_iteration": 2.4929027557373047 + }, + { + "auxiliary_loss_clip": 0.01133329, + "auxiliary_loss_mlp": 0.01037119, + "balance_loss_clip": 1.05318654, + "balance_loss_mlp": 1.01960707, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.64901843888157, + "language_loss": 0.84022784, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.86193228, + "num_input_tokens_seen": 89761145, + "step": 4164, + "time_per_iteration": 4.078903913497925 + }, + { + "auxiliary_loss_clip": 0.0113307, + "auxiliary_loss_mlp": 0.01046427, + "balance_loss_clip": 1.05759108, + "balance_loss_mlp": 1.03005946, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 3.766897560561676, + "language_loss": 0.7397607, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76155561, + "num_input_tokens_seen": 89780905, + "step": 4165, + "time_per_iteration": 3.979234457015991 + }, + { + "auxiliary_loss_clip": 0.01111118, + "auxiliary_loss_mlp": 0.01048178, + "balance_loss_clip": 1.04586959, + "balance_loss_mlp": 1.03080964, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 1.8018269792869457, + "language_loss": 0.73996282, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76155573, + "num_input_tokens_seen": 89799230, + "step": 4166, + "time_per_iteration": 2.5583434104919434 + }, + { + "auxiliary_loss_clip": 0.01110727, + "auxiliary_loss_mlp": 0.01047008, + "balance_loss_clip": 1.04916739, + "balance_loss_mlp": 1.02993774, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.6844219579121882, + "language_loss": 0.82163131, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84320861, + "num_input_tokens_seen": 89818240, + "step": 4167, + "time_per_iteration": 2.5892493724823 + }, + { + "auxiliary_loss_clip": 0.01130936, + "auxiliary_loss_mlp": 0.01046829, + "balance_loss_clip": 1.0508244, + "balance_loss_mlp": 1.0308131, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 9.96258665795811, + "language_loss": 0.79293245, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81471014, + "num_input_tokens_seen": 89834485, + "step": 4168, + "time_per_iteration": 2.4975898265838623 + }, + { + "auxiliary_loss_clip": 0.01138944, + "auxiliary_loss_mlp": 0.01046786, + "balance_loss_clip": 1.05074155, + "balance_loss_mlp": 1.0274024, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.2409382014089783, + "language_loss": 0.699633, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72149032, + "num_input_tokens_seen": 89855645, + "step": 4169, + "time_per_iteration": 2.662720203399658 + }, + { + "auxiliary_loss_clip": 0.01112405, + "auxiliary_loss_mlp": 0.0105361, + "balance_loss_clip": 1.04785645, + "balance_loss_mlp": 1.03672981, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.877509665048157, + "language_loss": 0.78469855, + "learning_rate": 3.510374083241361e-06, + "loss": 0.8063587, + "num_input_tokens_seen": 89874895, + "step": 4170, + "time_per_iteration": 2.5956332683563232 + }, + { + "auxiliary_loss_clip": 0.01127071, + "auxiliary_loss_mlp": 0.01049277, + "balance_loss_clip": 1.05056679, + "balance_loss_mlp": 1.03193212, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.5097568092147258, + "language_loss": 0.76475841, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78652191, + "num_input_tokens_seen": 89891700, + "step": 4171, + "time_per_iteration": 2.502798318862915 + }, + { + "auxiliary_loss_clip": 0.01050811, + "auxiliary_loss_mlp": 0.01047034, + "balance_loss_clip": 1.01706576, + "balance_loss_mlp": 1.04492378, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8418259374502635, + "language_loss": 0.60021406, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62119251, + "num_input_tokens_seen": 89955775, + "step": 4172, + "time_per_iteration": 3.0321011543273926 + }, + { + "auxiliary_loss_clip": 0.01124471, + "auxiliary_loss_mlp": 0.01049992, + "balance_loss_clip": 1.04650378, + "balance_loss_mlp": 1.03214622, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 2.162754441407278, + "language_loss": 0.78959674, + "learning_rate": 3.509607938211409e-06, + "loss": 0.81134135, + "num_input_tokens_seen": 89977150, + "step": 4173, + "time_per_iteration": 3.970900297164917 + }, + { + "auxiliary_loss_clip": 0.01148519, + "auxiliary_loss_mlp": 0.01051313, + "balance_loss_clip": 1.05251741, + "balance_loss_mlp": 1.0335989, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 1.9439378487565475, + "language_loss": 0.83262718, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85462558, + "num_input_tokens_seen": 89994925, + "step": 4174, + "time_per_iteration": 2.4245073795318604 + }, + { + "auxiliary_loss_clip": 0.01098449, + "auxiliary_loss_mlp": 0.01046244, + "balance_loss_clip": 1.04676175, + "balance_loss_mlp": 1.02762389, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.148989527219702, + "language_loss": 0.71554929, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73699629, + "num_input_tokens_seen": 90013235, + "step": 4175, + "time_per_iteration": 2.6013448238372803 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.01038478, + "balance_loss_clip": 1.0460465, + "balance_loss_mlp": 1.02056122, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 1.885021178836003, + "language_loss": 0.81334496, + "learning_rate": 3.50884127798111e-06, + "loss": 0.83489221, + "num_input_tokens_seen": 90032150, + "step": 4176, + "time_per_iteration": 2.5988972187042236 + }, + { + "auxiliary_loss_clip": 0.01128817, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.05258441, + "balance_loss_mlp": 1.02118719, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.225549582794512, + "language_loss": 0.82862628, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.85032403, + "num_input_tokens_seen": 90049085, + "step": 4177, + "time_per_iteration": 2.5220916271209717 + }, + { + "auxiliary_loss_clip": 0.01107272, + "auxiliary_loss_mlp": 0.01051299, + "balance_loss_clip": 1.04737329, + "balance_loss_mlp": 1.03358412, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.1431518223492483, + "language_loss": 0.82743186, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84901756, + "num_input_tokens_seen": 90067695, + "step": 4178, + "time_per_iteration": 2.598783493041992 + }, + { + "auxiliary_loss_clip": 0.0114134, + "auxiliary_loss_mlp": 0.00785524, + "balance_loss_clip": 1.04861295, + "balance_loss_mlp": 1.00138235, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.27230871603115, + "language_loss": 0.75792855, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77719718, + "num_input_tokens_seen": 90083890, + "step": 4179, + "time_per_iteration": 2.4810893535614014 + }, + { + "auxiliary_loss_clip": 0.01108737, + "auxiliary_loss_mlp": 0.01053793, + "balance_loss_clip": 1.04645813, + "balance_loss_mlp": 1.03545845, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 1.7773385192704834, + "language_loss": 0.70231295, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72393829, + "num_input_tokens_seen": 90100995, + "step": 4180, + "time_per_iteration": 2.5421011447906494 + }, + { + "auxiliary_loss_clip": 0.01144906, + "auxiliary_loss_mlp": 0.01048458, + "balance_loss_clip": 1.05079031, + "balance_loss_mlp": 1.03050506, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 1.8127797937291308, + "language_loss": 0.8574959, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.87942958, + "num_input_tokens_seen": 90120365, + "step": 4181, + "time_per_iteration": 2.4481825828552246 + }, + { + "auxiliary_loss_clip": 0.01148819, + "auxiliary_loss_mlp": 0.0104419, + "balance_loss_clip": 1.05289292, + "balance_loss_mlp": 1.02690506, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 3.946439460293631, + "language_loss": 0.67784369, + "learning_rate": 3.507306412966238e-06, + "loss": 0.69977379, + "num_input_tokens_seen": 90142610, + "step": 4182, + "time_per_iteration": 4.102197885513306 + }, + { + "auxiliary_loss_clip": 0.01030624, + "auxiliary_loss_mlp": 0.01002476, + "balance_loss_clip": 1.01358557, + "balance_loss_mlp": 1.00023508, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.846990559514065, + "language_loss": 0.70149541, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72182643, + "num_input_tokens_seen": 90200555, + "step": 4183, + "time_per_iteration": 3.089541435241699 + }, + { + "auxiliary_loss_clip": 0.01130388, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.04959238, + "balance_loss_mlp": 1.02353847, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.5741439390734966, + "language_loss": 0.74279082, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76451927, + "num_input_tokens_seen": 90218120, + "step": 4184, + "time_per_iteration": 2.514235734939575 + }, + { + "auxiliary_loss_clip": 0.01138443, + "auxiliary_loss_mlp": 0.01052015, + "balance_loss_clip": 1.05238795, + "balance_loss_mlp": 1.03369296, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.7739414644924605, + "language_loss": 0.83183765, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85374224, + "num_input_tokens_seen": 90236790, + "step": 4185, + "time_per_iteration": 2.5073320865631104 + }, + { + "auxiliary_loss_clip": 0.01011764, + "auxiliary_loss_mlp": 0.01011934, + "balance_loss_clip": 1.02604532, + "balance_loss_mlp": 1.00972867, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.8000059357731768, + "language_loss": 0.61491048, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63514745, + "num_input_tokens_seen": 90297070, + "step": 4186, + "time_per_iteration": 3.033989667892456 + }, + { + "auxiliary_loss_clip": 0.01109126, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_clip": 1.04897809, + "balance_loss_mlp": 1.02312672, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 2.0050530493567127, + "language_loss": 0.79062533, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81213421, + "num_input_tokens_seen": 90315255, + "step": 4187, + "time_per_iteration": 2.5805776119232178 + }, + { + "auxiliary_loss_clip": 0.01089592, + "auxiliary_loss_mlp": 0.0105687, + "balance_loss_clip": 1.04641283, + "balance_loss_mlp": 1.03824997, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.568448211054377, + "language_loss": 0.79915273, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82061744, + "num_input_tokens_seen": 90334990, + "step": 4188, + "time_per_iteration": 2.6349778175354004 + }, + { + "auxiliary_loss_clip": 0.01135374, + "auxiliary_loss_mlp": 0.01048, + "balance_loss_clip": 1.05170429, + "balance_loss_mlp": 1.0305599, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.946460553141685, + "language_loss": 0.74624419, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76807797, + "num_input_tokens_seen": 90351825, + "step": 4189, + "time_per_iteration": 2.5331075191497803 + }, + { + "auxiliary_loss_clip": 0.0112264, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.04957008, + "balance_loss_mlp": 1.03284824, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.8704347954873133, + "language_loss": 0.84116697, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86289263, + "num_input_tokens_seen": 90369860, + "step": 4190, + "time_per_iteration": 2.583521842956543 + }, + { + "auxiliary_loss_clip": 0.01125365, + "auxiliary_loss_mlp": 0.01049962, + "balance_loss_clip": 1.04856408, + "balance_loss_mlp": 1.03049564, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 1.8219123559238888, + "language_loss": 0.75223017, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77398342, + "num_input_tokens_seen": 90389245, + "step": 4191, + "time_per_iteration": 2.5354340076446533 + }, + { + "auxiliary_loss_clip": 0.01042713, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.01812053, + "balance_loss_mlp": 1.03187966, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7250281200128408, + "language_loss": 0.57059073, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59135616, + "num_input_tokens_seen": 90456735, + "step": 4192, + "time_per_iteration": 3.136788845062256 + }, + { + "auxiliary_loss_clip": 0.01121731, + "auxiliary_loss_mlp": 0.01049091, + "balance_loss_clip": 1.05455256, + "balance_loss_mlp": 1.03074455, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 2.791585335055776, + "language_loss": 0.76249284, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78420103, + "num_input_tokens_seen": 90474165, + "step": 4193, + "time_per_iteration": 2.514895439147949 + }, + { + "auxiliary_loss_clip": 0.01139745, + "auxiliary_loss_mlp": 0.01051748, + "balance_loss_clip": 1.05255604, + "balance_loss_mlp": 1.03440285, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.1620253813803165, + "language_loss": 0.84457326, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86648822, + "num_input_tokens_seen": 90491660, + "step": 4194, + "time_per_iteration": 2.458416700363159 + }, + { + "auxiliary_loss_clip": 0.01151306, + "auxiliary_loss_mlp": 0.01056103, + "balance_loss_clip": 1.05310977, + "balance_loss_mlp": 1.03916335, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 2.285272424827999, + "language_loss": 0.88366491, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90573901, + "num_input_tokens_seen": 90514025, + "step": 4195, + "time_per_iteration": 2.495453119277954 + }, + { + "auxiliary_loss_clip": 0.01151039, + "auxiliary_loss_mlp": 0.01047639, + "balance_loss_clip": 1.05385804, + "balance_loss_mlp": 1.02726579, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 1.9692699172304382, + "language_loss": 0.86013579, + "learning_rate": 3.503717062883053e-06, + "loss": 0.88212258, + "num_input_tokens_seen": 90533530, + "step": 4196, + "time_per_iteration": 2.445268154144287 + }, + { + "auxiliary_loss_clip": 0.01139195, + "auxiliary_loss_mlp": 0.01048929, + "balance_loss_clip": 1.05301976, + "balance_loss_mlp": 1.03146493, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.6853405912094988, + "language_loss": 0.83073223, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85261345, + "num_input_tokens_seen": 90554025, + "step": 4197, + "time_per_iteration": 2.5175728797912598 + }, + { + "auxiliary_loss_clip": 0.01140525, + "auxiliary_loss_mlp": 0.0105577, + "balance_loss_clip": 1.05209112, + "balance_loss_mlp": 1.03575456, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 2.1245339496518523, + "language_loss": 0.72833985, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.75030279, + "num_input_tokens_seen": 90576930, + "step": 4198, + "time_per_iteration": 2.7435271739959717 + }, + { + "auxiliary_loss_clip": 0.01154816, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_clip": 1.05410898, + "balance_loss_mlp": 1.03211999, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 1.9111497506971624, + "language_loss": 0.76938164, + "learning_rate": 3.50294646148888e-06, + "loss": 0.79144788, + "num_input_tokens_seen": 90595710, + "step": 4199, + "time_per_iteration": 2.454005002975464 + }, + { + "auxiliary_loss_clip": 0.01128757, + "auxiliary_loss_mlp": 0.00786697, + "balance_loss_clip": 1.05476189, + "balance_loss_mlp": 1.00102639, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 2.0725984165253313, + "language_loss": 0.73048854, + "learning_rate": 3.502689480360739e-06, + "loss": 0.74964309, + "num_input_tokens_seen": 90617945, + "step": 4200, + "time_per_iteration": 2.6184909343719482 + }, + { + "auxiliary_loss_clip": 0.01140306, + "auxiliary_loss_mlp": 0.01052807, + "balance_loss_clip": 1.05264258, + "balance_loss_mlp": 1.03564048, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 2.1413044171842484, + "language_loss": 0.82636237, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84829342, + "num_input_tokens_seen": 90640855, + "step": 4201, + "time_per_iteration": 2.6942567825317383 + }, + { + "auxiliary_loss_clip": 0.01100121, + "auxiliary_loss_mlp": 0.01050918, + "balance_loss_clip": 1.05471587, + "balance_loss_mlp": 1.03270245, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.8037027386897562, + "language_loss": 0.75585461, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77736497, + "num_input_tokens_seen": 90661350, + "step": 4202, + "time_per_iteration": 2.5985515117645264 + }, + { + "auxiliary_loss_clip": 0.01139003, + "auxiliary_loss_mlp": 0.01043524, + "balance_loss_clip": 1.05512762, + "balance_loss_mlp": 1.02524877, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 1.811935677583664, + "language_loss": 0.73001003, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75183535, + "num_input_tokens_seen": 90680540, + "step": 4203, + "time_per_iteration": 2.4715802669525146 + }, + { + "auxiliary_loss_clip": 0.01131448, + "auxiliary_loss_mlp": 0.01040883, + "balance_loss_clip": 1.05470538, + "balance_loss_mlp": 1.02276373, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.3989993489202148, + "language_loss": 0.77711046, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79883379, + "num_input_tokens_seen": 90703460, + "step": 4204, + "time_per_iteration": 4.020840883255005 + }, + { + "auxiliary_loss_clip": 0.01123157, + "auxiliary_loss_mlp": 0.01049195, + "balance_loss_clip": 1.05867112, + "balance_loss_mlp": 1.03128982, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 1.972532166507055, + "language_loss": 0.72265422, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74437773, + "num_input_tokens_seen": 90718815, + "step": 4205, + "time_per_iteration": 4.00031304359436 + }, + { + "auxiliary_loss_clip": 0.01127119, + "auxiliary_loss_mlp": 0.01039539, + "balance_loss_clip": 1.050776, + "balance_loss_mlp": 1.02244425, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.33723049340745, + "language_loss": 0.75463045, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77629709, + "num_input_tokens_seen": 90742125, + "step": 4206, + "time_per_iteration": 2.739218235015869 + }, + { + "auxiliary_loss_clip": 0.01110515, + "auxiliary_loss_mlp": 0.01044637, + "balance_loss_clip": 1.04870033, + "balance_loss_mlp": 1.02743518, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.7491456812655901, + "language_loss": 0.79237336, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81392485, + "num_input_tokens_seen": 90760785, + "step": 4207, + "time_per_iteration": 2.5421462059020996 + }, + { + "auxiliary_loss_clip": 0.01133655, + "auxiliary_loss_mlp": 0.01046377, + "balance_loss_clip": 1.05404115, + "balance_loss_mlp": 1.02929473, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.7713473986626445, + "language_loss": 0.75933558, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78113592, + "num_input_tokens_seen": 90780045, + "step": 4208, + "time_per_iteration": 2.5085315704345703 + }, + { + "auxiliary_loss_clip": 0.0113363, + "auxiliary_loss_mlp": 0.01042022, + "balance_loss_clip": 1.05270696, + "balance_loss_mlp": 1.02497542, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 2.0392019063335547, + "language_loss": 0.6999824, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.72173893, + "num_input_tokens_seen": 90797980, + "step": 4209, + "time_per_iteration": 2.5201916694641113 + }, + { + "auxiliary_loss_clip": 0.01052137, + "auxiliary_loss_mlp": 0.01007788, + "balance_loss_clip": 1.02601397, + "balance_loss_mlp": 1.00563014, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7644064413714408, + "language_loss": 0.55133933, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57193857, + "num_input_tokens_seen": 90864865, + "step": 4210, + "time_per_iteration": 3.131107807159424 + }, + { + "auxiliary_loss_clip": 0.01125391, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.06018281, + "balance_loss_mlp": 1.02234185, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 1.9892927717349331, + "language_loss": 0.80549681, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82714617, + "num_input_tokens_seen": 90882885, + "step": 4211, + "time_per_iteration": 2.544372320175171 + }, + { + "auxiliary_loss_clip": 0.01096948, + "auxiliary_loss_mlp": 0.01044814, + "balance_loss_clip": 1.04573679, + "balance_loss_mlp": 1.02825582, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.5327880328433139, + "language_loss": 0.78772032, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80913794, + "num_input_tokens_seen": 90902985, + "step": 4212, + "time_per_iteration": 4.020469903945923 + }, + { + "auxiliary_loss_clip": 0.01138069, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.05087662, + "balance_loss_mlp": 1.02079964, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 2.5596873921383687, + "language_loss": 0.5369705, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55874294, + "num_input_tokens_seen": 90923550, + "step": 4213, + "time_per_iteration": 2.523982524871826 + }, + { + "auxiliary_loss_clip": 0.01123016, + "auxiliary_loss_mlp": 0.01045243, + "balance_loss_clip": 1.04971182, + "balance_loss_mlp": 1.0267303, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.989338843118205, + "language_loss": 0.6509459, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67262852, + "num_input_tokens_seen": 90943260, + "step": 4214, + "time_per_iteration": 2.5137643814086914 + }, + { + "auxiliary_loss_clip": 0.01045568, + "auxiliary_loss_mlp": 0.01000493, + "balance_loss_clip": 1.01975965, + "balance_loss_mlp": 0.99819243, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8649701703506227, + "language_loss": 0.58057785, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60103846, + "num_input_tokens_seen": 90996295, + "step": 4215, + "time_per_iteration": 2.791050910949707 + }, + { + "auxiliary_loss_clip": 0.01129376, + "auxiliary_loss_mlp": 0.01047305, + "balance_loss_clip": 1.05641675, + "balance_loss_mlp": 1.02964997, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.5066316465599394, + "language_loss": 0.83679974, + "learning_rate": 3.498570039373066e-06, + "loss": 0.85856658, + "num_input_tokens_seen": 91017545, + "step": 4216, + "time_per_iteration": 2.714120626449585 + }, + { + "auxiliary_loss_clip": 0.01136346, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.05632734, + "balance_loss_mlp": 1.02238226, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 1.8559259249414164, + "language_loss": 0.79805791, + "learning_rate": 3.498312090875666e-06, + "loss": 0.81982231, + "num_input_tokens_seen": 91037715, + "step": 4217, + "time_per_iteration": 2.5028791427612305 + }, + { + "auxiliary_loss_clip": 0.01121517, + "auxiliary_loss_mlp": 0.01043028, + "balance_loss_clip": 1.04598331, + "balance_loss_mlp": 1.02544475, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 4.891964550285274, + "language_loss": 0.75380158, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.77544713, + "num_input_tokens_seen": 91055295, + "step": 4218, + "time_per_iteration": 2.551560878753662 + }, + { + "auxiliary_loss_clip": 0.01140437, + "auxiliary_loss_mlp": 0.01041856, + "balance_loss_clip": 1.05200398, + "balance_loss_mlp": 1.02331924, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.6717746252657582, + "language_loss": 0.74226069, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76408356, + "num_input_tokens_seen": 91075485, + "step": 4219, + "time_per_iteration": 2.5130860805511475 + }, + { + "auxiliary_loss_clip": 0.01142705, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.05385971, + "balance_loss_mlp": 1.03144133, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 1.8027841784723526, + "language_loss": 0.8113215, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83324772, + "num_input_tokens_seen": 91093620, + "step": 4220, + "time_per_iteration": 2.5034022331237793 + }, + { + "auxiliary_loss_clip": 0.01114024, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_clip": 1.05880916, + "balance_loss_mlp": 1.03197765, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.1012553079225267, + "language_loss": 0.7062403, + "learning_rate": 3.497279728822468e-06, + "loss": 0.7278949, + "num_input_tokens_seen": 91114110, + "step": 4221, + "time_per_iteration": 4.142006874084473 + }, + { + "auxiliary_loss_clip": 0.01149901, + "auxiliary_loss_mlp": 0.01040862, + "balance_loss_clip": 1.05417919, + "balance_loss_mlp": 1.02381539, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 1.8188676703041589, + "language_loss": 0.61695099, + "learning_rate": 3.497021496342202e-06, + "loss": 0.63885868, + "num_input_tokens_seen": 91133135, + "step": 4222, + "time_per_iteration": 2.431489944458008 + }, + { + "auxiliary_loss_clip": 0.01144032, + "auxiliary_loss_mlp": 0.0105154, + "balance_loss_clip": 1.05738759, + "balance_loss_mlp": 1.03361142, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.5812051079614864, + "language_loss": 0.74310374, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76505947, + "num_input_tokens_seen": 91151805, + "step": 4223, + "time_per_iteration": 2.4979374408721924 + }, + { + "auxiliary_loss_clip": 0.01096136, + "auxiliary_loss_mlp": 0.010394, + "balance_loss_clip": 1.04836059, + "balance_loss_mlp": 1.02259147, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.5769974487344673, + "language_loss": 0.79839104, + "learning_rate": 3.49650486108985e-06, + "loss": 0.81974643, + "num_input_tokens_seen": 91172270, + "step": 4224, + "time_per_iteration": 2.6210360527038574 + }, + { + "auxiliary_loss_clip": 0.01136821, + "auxiliary_loss_mlp": 0.00784596, + "balance_loss_clip": 1.05467296, + "balance_loss_mlp": 1.00083351, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.456196133226205, + "language_loss": 0.77468491, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79389906, + "num_input_tokens_seen": 91192080, + "step": 4225, + "time_per_iteration": 2.57766056060791 + }, + { + "auxiliary_loss_clip": 0.01134668, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_clip": 1.05229139, + "balance_loss_mlp": 1.03571272, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 11.374393337141111, + "language_loss": 0.84563172, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86751413, + "num_input_tokens_seen": 91211450, + "step": 4226, + "time_per_iteration": 2.559020519256592 + }, + { + "auxiliary_loss_clip": 0.01146777, + "auxiliary_loss_mlp": 0.01051237, + "balance_loss_clip": 1.05284929, + "balance_loss_mlp": 1.03368974, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 1.4442073798297355, + "language_loss": 0.70953184, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.73151195, + "num_input_tokens_seen": 91231835, + "step": 4227, + "time_per_iteration": 2.5211479663848877 + }, + { + "auxiliary_loss_clip": 0.01061391, + "auxiliary_loss_mlp": 0.01019659, + "balance_loss_clip": 1.02643919, + "balance_loss_mlp": 1.01745343, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 1.0104393221264578, + "language_loss": 0.61867779, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.63948828, + "num_input_tokens_seen": 91288755, + "step": 4228, + "time_per_iteration": 2.8615856170654297 + }, + { + "auxiliary_loss_clip": 0.0113111, + "auxiliary_loss_mlp": 0.01045557, + "balance_loss_clip": 1.05087554, + "balance_loss_mlp": 1.02645934, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 2.5156745760257766, + "language_loss": 0.85849607, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88026273, + "num_input_tokens_seen": 91302485, + "step": 4229, + "time_per_iteration": 2.5576093196868896 + }, + { + "auxiliary_loss_clip": 0.01111701, + "auxiliary_loss_mlp": 0.01053227, + "balance_loss_clip": 1.0541048, + "balance_loss_mlp": 1.03434467, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 2.2450704526205953, + "language_loss": 0.7711705, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79281974, + "num_input_tokens_seen": 91321120, + "step": 4230, + "time_per_iteration": 2.566293716430664 + }, + { + "auxiliary_loss_clip": 0.01138409, + "auxiliary_loss_mlp": 0.01045687, + "balance_loss_clip": 1.0547204, + "balance_loss_mlp": 1.02778125, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 1.9074504105393295, + "language_loss": 0.75009215, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77193314, + "num_input_tokens_seen": 91338575, + "step": 4231, + "time_per_iteration": 2.4723281860351562 + }, + { + "auxiliary_loss_clip": 0.0113947, + "auxiliary_loss_mlp": 0.0103871, + "balance_loss_clip": 1.05508614, + "balance_loss_mlp": 1.02148426, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 1.9473293777575, + "language_loss": 0.74215686, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76393867, + "num_input_tokens_seen": 91357355, + "step": 4232, + "time_per_iteration": 2.4598240852355957 + }, + { + "auxiliary_loss_clip": 0.01154388, + "auxiliary_loss_mlp": 0.01046442, + "balance_loss_clip": 1.05906963, + "balance_loss_mlp": 1.02705836, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 2.493096629247247, + "language_loss": 0.86593163, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88793993, + "num_input_tokens_seen": 91376515, + "step": 4233, + "time_per_iteration": 2.509673595428467 + }, + { + "auxiliary_loss_clip": 0.01087946, + "auxiliary_loss_mlp": 0.01040516, + "balance_loss_clip": 1.05153668, + "balance_loss_mlp": 1.02433944, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.701801411754851, + "language_loss": 0.75154853, + "learning_rate": 3.493918281539737e-06, + "loss": 0.77283311, + "num_input_tokens_seen": 91397595, + "step": 4234, + "time_per_iteration": 2.655348777770996 + }, + { + "auxiliary_loss_clip": 0.01122884, + "auxiliary_loss_mlp": 0.01043686, + "balance_loss_clip": 1.05718994, + "balance_loss_mlp": 1.02698445, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.4518470466110753, + "language_loss": 0.74620008, + "learning_rate": 3.493659311850379e-06, + "loss": 0.76786584, + "num_input_tokens_seen": 91417775, + "step": 4235, + "time_per_iteration": 2.616102695465088 + }, + { + "auxiliary_loss_clip": 0.01129771, + "auxiliary_loss_mlp": 0.00787173, + "balance_loss_clip": 1.0562973, + "balance_loss_mlp": 1.00088227, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 1.8933064677734055, + "language_loss": 0.65039331, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.6695627, + "num_input_tokens_seen": 91437665, + "step": 4236, + "time_per_iteration": 2.6292784214019775 + }, + { + "auxiliary_loss_clip": 0.01148791, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.05553067, + "balance_loss_mlp": 1.02109957, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.5403325714129257, + "language_loss": 0.66746449, + "learning_rate": 3.493141202562354e-06, + "loss": 0.68932164, + "num_input_tokens_seen": 91456705, + "step": 4237, + "time_per_iteration": 2.459176540374756 + }, + { + "auxiliary_loss_clip": 0.01154293, + "auxiliary_loss_mlp": 0.01047297, + "balance_loss_clip": 1.05737782, + "balance_loss_mlp": 1.03002405, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 1.9384615928624243, + "language_loss": 0.7512008, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77321666, + "num_input_tokens_seen": 91475535, + "step": 4238, + "time_per_iteration": 2.501699209213257 + }, + { + "auxiliary_loss_clip": 0.01148947, + "auxiliary_loss_mlp": 0.01046208, + "balance_loss_clip": 1.06425738, + "balance_loss_mlp": 1.02784944, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 1.8281676158055582, + "language_loss": 0.80429363, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82624513, + "num_input_tokens_seen": 91499140, + "step": 4239, + "time_per_iteration": 2.563126564025879 + }, + { + "auxiliary_loss_clip": 0.01134471, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.05805826, + "balance_loss_mlp": 1.02825868, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.710879466143337, + "language_loss": 0.77195334, + "learning_rate": 3.492363614004407e-06, + "loss": 0.793764, + "num_input_tokens_seen": 91518335, + "step": 4240, + "time_per_iteration": 2.519099235534668 + }, + { + "auxiliary_loss_clip": 0.01155801, + "auxiliary_loss_mlp": 0.01040914, + "balance_loss_clip": 1.05624354, + "balance_loss_mlp": 1.02226925, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 3.116487077038156, + "language_loss": 0.83579624, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85776341, + "num_input_tokens_seen": 91537655, + "step": 4241, + "time_per_iteration": 2.48996901512146 + }, + { + "auxiliary_loss_clip": 0.01142623, + "auxiliary_loss_mlp": 0.01045805, + "balance_loss_clip": 1.0570693, + "balance_loss_mlp": 1.02873445, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.4963083462402806, + "language_loss": 0.73420358, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.7560879, + "num_input_tokens_seen": 91557545, + "step": 4242, + "time_per_iteration": 2.5822653770446777 + }, + { + "auxiliary_loss_clip": 0.01150803, + "auxiliary_loss_mlp": 0.00785082, + "balance_loss_clip": 1.05475712, + "balance_loss_mlp": 1.00102496, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.1358802791894105, + "language_loss": 0.72136962, + "learning_rate": 3.491585516131273e-06, + "loss": 0.7407285, + "num_input_tokens_seen": 91574405, + "step": 4243, + "time_per_iteration": 3.9812285900115967 + }, + { + "auxiliary_loss_clip": 0.0113503, + "auxiliary_loss_mlp": 0.01045897, + "balance_loss_clip": 1.0513134, + "balance_loss_mlp": 1.02782524, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 1.904700044265258, + "language_loss": 0.81562138, + "learning_rate": 3.491326037038301e-06, + "loss": 0.8374306, + "num_input_tokens_seen": 91593755, + "step": 4244, + "time_per_iteration": 2.6393589973449707 + }, + { + "auxiliary_loss_clip": 0.010429, + "auxiliary_loss_mlp": 0.01001761, + "balance_loss_clip": 1.02047276, + "balance_loss_mlp": 0.9997102, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.7150240690585302, + "language_loss": 0.57738906, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.59783572, + "num_input_tokens_seen": 91660335, + "step": 4245, + "time_per_iteration": 4.596735715866089 + }, + { + "auxiliary_loss_clip": 0.01150513, + "auxiliary_loss_mlp": 0.01052754, + "balance_loss_clip": 1.0543319, + "balance_loss_mlp": 1.03508723, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 2.1900948791966908, + "language_loss": 0.65016484, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67219758, + "num_input_tokens_seen": 91678500, + "step": 4246, + "time_per_iteration": 2.5287554264068604 + }, + { + "auxiliary_loss_clip": 0.01133182, + "auxiliary_loss_mlp": 0.0104732, + "balance_loss_clip": 1.05223298, + "balance_loss_mlp": 1.03111947, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.9199777444925346, + "language_loss": 0.81790018, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83970523, + "num_input_tokens_seen": 91696430, + "step": 4247, + "time_per_iteration": 2.5114729404449463 + }, + { + "auxiliary_loss_clip": 0.01144534, + "auxiliary_loss_mlp": 0.01049574, + "balance_loss_clip": 1.05490768, + "balance_loss_mlp": 1.02993989, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.4970531387884196, + "language_loss": 0.83257031, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85451132, + "num_input_tokens_seen": 91713270, + "step": 4248, + "time_per_iteration": 2.4522154331207275 + }, + { + "auxiliary_loss_clip": 0.01121218, + "auxiliary_loss_mlp": 0.01053137, + "balance_loss_clip": 1.04747808, + "balance_loss_mlp": 1.03486228, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 1.8117993005916004, + "language_loss": 0.84052229, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86226583, + "num_input_tokens_seen": 91728865, + "step": 4249, + "time_per_iteration": 2.4895169734954834 + }, + { + "auxiliary_loss_clip": 0.00995144, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.01527524, + "balance_loss_mlp": 1.02331626, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.757879542572879, + "language_loss": 0.56305182, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58326054, + "num_input_tokens_seen": 91787470, + "step": 4250, + "time_per_iteration": 3.14204740524292 + }, + { + "auxiliary_loss_clip": 0.0112338, + "auxiliary_loss_mlp": 0.01040296, + "balance_loss_clip": 1.05326068, + "balance_loss_mlp": 1.02199769, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.2045506884685206, + "language_loss": 0.80809867, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82973552, + "num_input_tokens_seen": 91805640, + "step": 4251, + "time_per_iteration": 2.5470354557037354 + }, + { + "auxiliary_loss_clip": 0.01036952, + "auxiliary_loss_mlp": 0.01005068, + "balance_loss_clip": 1.02963781, + "balance_loss_mlp": 1.00293386, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.8061892843351257, + "language_loss": 0.66120195, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.68162215, + "num_input_tokens_seen": 91869695, + "step": 4252, + "time_per_iteration": 4.5481250286102295 + }, + { + "auxiliary_loss_clip": 0.0113168, + "auxiliary_loss_mlp": 0.01040731, + "balance_loss_clip": 1.04966927, + "balance_loss_mlp": 1.02433991, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 1.8012682420330617, + "language_loss": 0.73582113, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75754523, + "num_input_tokens_seen": 91889920, + "step": 4253, + "time_per_iteration": 2.634843349456787 + }, + { + "auxiliary_loss_clip": 0.01106426, + "auxiliary_loss_mlp": 0.01048322, + "balance_loss_clip": 1.04803467, + "balance_loss_mlp": 1.03053617, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 1.9389554963792208, + "language_loss": 0.72568697, + "learning_rate": 3.488728137415357e-06, + "loss": 0.74723446, + "num_input_tokens_seen": 91908665, + "step": 4254, + "time_per_iteration": 2.6489336490631104 + }, + { + "auxiliary_loss_clip": 0.01107461, + "auxiliary_loss_mlp": 0.00784709, + "balance_loss_clip": 1.05148447, + "balance_loss_mlp": 1.0010879, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 2.2368698855001448, + "language_loss": 0.81044406, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.82936573, + "num_input_tokens_seen": 91927855, + "step": 4255, + "time_per_iteration": 2.598292589187622 + }, + { + "auxiliary_loss_clip": 0.01128192, + "auxiliary_loss_mlp": 0.01041816, + "balance_loss_clip": 1.05458403, + "balance_loss_mlp": 1.02427983, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.4059427555113426, + "language_loss": 0.85230279, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87400281, + "num_input_tokens_seen": 91948500, + "step": 4256, + "time_per_iteration": 2.562544584274292 + }, + { + "auxiliary_loss_clip": 0.01107911, + "auxiliary_loss_mlp": 0.01048338, + "balance_loss_clip": 1.04838002, + "balance_loss_mlp": 1.02901399, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 1.9816553116168671, + "language_loss": 0.74787891, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.76944143, + "num_input_tokens_seen": 91968375, + "step": 4257, + "time_per_iteration": 2.583467483520508 + }, + { + "auxiliary_loss_clip": 0.01020704, + "auxiliary_loss_mlp": 0.01008834, + "balance_loss_clip": 1.03140533, + "balance_loss_mlp": 1.00673568, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.7975603543215121, + "language_loss": 0.65260261, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67289799, + "num_input_tokens_seen": 92028490, + "step": 4258, + "time_per_iteration": 3.0972113609313965 + }, + { + "auxiliary_loss_clip": 0.01098271, + "auxiliary_loss_mlp": 0.00783897, + "balance_loss_clip": 1.04949045, + "balance_loss_mlp": 1.00122678, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.6238074682529702, + "language_loss": 0.76817095, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78699261, + "num_input_tokens_seen": 92048060, + "step": 4259, + "time_per_iteration": 2.672785520553589 + }, + { + "auxiliary_loss_clip": 0.01021273, + "auxiliary_loss_mlp": 0.01012485, + "balance_loss_clip": 1.01653266, + "balance_loss_mlp": 1.01045859, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7957626111747687, + "language_loss": 0.58454323, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60488081, + "num_input_tokens_seen": 92118180, + "step": 4260, + "time_per_iteration": 3.1926217079162598 + }, + { + "auxiliary_loss_clip": 0.01140986, + "auxiliary_loss_mlp": 0.0105169, + "balance_loss_clip": 1.05380368, + "balance_loss_mlp": 1.03324866, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 1.8728238079578237, + "language_loss": 0.77213627, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.79406303, + "num_input_tokens_seen": 92137570, + "step": 4261, + "time_per_iteration": 4.0762529373168945 + }, + { + "auxiliary_loss_clip": 0.01148096, + "auxiliary_loss_mlp": 0.01044575, + "balance_loss_clip": 1.05455792, + "balance_loss_mlp": 1.0281117, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.707696724056662, + "language_loss": 0.83219874, + "learning_rate": 3.486645752648842e-06, + "loss": 0.8541255, + "num_input_tokens_seen": 92157625, + "step": 4262, + "time_per_iteration": 2.504525899887085 + }, + { + "auxiliary_loss_clip": 0.01139279, + "auxiliary_loss_mlp": 0.0104524, + "balance_loss_clip": 1.0553062, + "balance_loss_mlp": 1.02605939, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.7724188655475808, + "language_loss": 0.7357015, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.75754672, + "num_input_tokens_seen": 92175350, + "step": 4263, + "time_per_iteration": 2.548959493637085 + }, + { + "auxiliary_loss_clip": 0.01122952, + "auxiliary_loss_mlp": 0.00784411, + "balance_loss_clip": 1.06211972, + "balance_loss_mlp": 1.00103712, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.5738107842429954, + "language_loss": 0.82783854, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84691215, + "num_input_tokens_seen": 92196070, + "step": 4264, + "time_per_iteration": 2.667299270629883 + }, + { + "auxiliary_loss_clip": 0.01136819, + "auxiliary_loss_mlp": 0.01045304, + "balance_loss_clip": 1.05622411, + "balance_loss_mlp": 1.02712417, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.7220265961786176, + "language_loss": 0.74255049, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76437163, + "num_input_tokens_seen": 92216310, + "step": 4265, + "time_per_iteration": 2.546844244003296 + }, + { + "auxiliary_loss_clip": 0.0111739, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.05209601, + "balance_loss_mlp": 1.02011919, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 2.307484163601693, + "language_loss": 0.81715399, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83869529, + "num_input_tokens_seen": 92234510, + "step": 4266, + "time_per_iteration": 2.5421433448791504 + }, + { + "auxiliary_loss_clip": 0.01084559, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_clip": 1.04685044, + "balance_loss_mlp": 1.02775574, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 1.6530202166477808, + "language_loss": 0.79021883, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.8115375, + "num_input_tokens_seen": 92254070, + "step": 4267, + "time_per_iteration": 2.6138057708740234 + }, + { + "auxiliary_loss_clip": 0.01101625, + "auxiliary_loss_mlp": 0.01050059, + "balance_loss_clip": 1.05024934, + "balance_loss_mlp": 1.03166544, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.5638129034215889, + "language_loss": 0.79171836, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.81323516, + "num_input_tokens_seen": 92275060, + "step": 4268, + "time_per_iteration": 2.557682514190674 + }, + { + "auxiliary_loss_clip": 0.01108087, + "auxiliary_loss_mlp": 0.00789726, + "balance_loss_clip": 1.05004072, + "balance_loss_mlp": 1.00097871, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 5.65117493200262, + "language_loss": 0.6803264, + "learning_rate": 3.484820706183595e-06, + "loss": 0.69930458, + "num_input_tokens_seen": 92293610, + "step": 4269, + "time_per_iteration": 2.5842230319976807 + }, + { + "auxiliary_loss_clip": 0.01125795, + "auxiliary_loss_mlp": 0.01039329, + "balance_loss_clip": 1.05199671, + "balance_loss_mlp": 1.02122104, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 3.027426363183545, + "language_loss": 0.78563499, + "learning_rate": 3.484559759962666e-06, + "loss": 0.80728626, + "num_input_tokens_seen": 92308305, + "step": 4270, + "time_per_iteration": 2.467323064804077 + }, + { + "auxiliary_loss_clip": 0.01100906, + "auxiliary_loss_mlp": 0.01043712, + "balance_loss_clip": 1.0452559, + "balance_loss_mlp": 1.02353036, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 1.9456316088254126, + "language_loss": 0.68006003, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.70150626, + "num_input_tokens_seen": 92329875, + "step": 4271, + "time_per_iteration": 2.6713297367095947 + }, + { + "auxiliary_loss_clip": 0.01137193, + "auxiliary_loss_mlp": 0.00785316, + "balance_loss_clip": 1.0524013, + "balance_loss_mlp": 1.00117648, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 1.3608678212631247, + "language_loss": 0.87269962, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.89192468, + "num_input_tokens_seen": 92348780, + "step": 4272, + "time_per_iteration": 2.5223581790924072 + }, + { + "auxiliary_loss_clip": 0.0112738, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_clip": 1.05698419, + "balance_loss_mlp": 1.02669048, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.8007691617189263, + "language_loss": 0.81788266, + "learning_rate": 3.483776583571541e-06, + "loss": 0.83961451, + "num_input_tokens_seen": 92368175, + "step": 4273, + "time_per_iteration": 2.5267250537872314 + }, + { + "auxiliary_loss_clip": 0.01104379, + "auxiliary_loss_mlp": 0.01043666, + "balance_loss_clip": 1.04607487, + "balance_loss_mlp": 1.02566552, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.5655470165703806, + "language_loss": 0.77453381, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79601431, + "num_input_tokens_seen": 92387755, + "step": 4274, + "time_per_iteration": 2.6068427562713623 + }, + { + "auxiliary_loss_clip": 0.01117554, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.04610765, + "balance_loss_mlp": 1.02075934, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.989866522115164, + "language_loss": 0.83879352, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86035752, + "num_input_tokens_seen": 92409850, + "step": 4275, + "time_per_iteration": 2.5668723583221436 + }, + { + "auxiliary_loss_clip": 0.01123244, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.05384839, + "balance_loss_mlp": 1.01998615, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 2.096585336735831, + "language_loss": 0.78505731, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80666649, + "num_input_tokens_seen": 92431250, + "step": 4276, + "time_per_iteration": 2.5784027576446533 + }, + { + "auxiliary_loss_clip": 0.01137171, + "auxiliary_loss_mlp": 0.01043351, + "balance_loss_clip": 1.05362511, + "balance_loss_mlp": 1.02581573, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 1.6914297479340539, + "language_loss": 0.79256427, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.8143695, + "num_input_tokens_seen": 92452065, + "step": 4277, + "time_per_iteration": 2.5631396770477295 + }, + { + "auxiliary_loss_clip": 0.0114668, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_clip": 1.05265534, + "balance_loss_mlp": 1.02794552, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.755296566732077, + "language_loss": 0.78808677, + "learning_rate": 3.482470164419295e-06, + "loss": 0.81000102, + "num_input_tokens_seen": 92470025, + "step": 4278, + "time_per_iteration": 2.469327449798584 + }, + { + "auxiliary_loss_clip": 0.01127985, + "auxiliary_loss_mlp": 0.01040591, + "balance_loss_clip": 1.05396438, + "balance_loss_mlp": 1.02315128, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 2.705277230739619, + "language_loss": 0.74630553, + "learning_rate": 3.482208711902952e-06, + "loss": 0.7679913, + "num_input_tokens_seen": 92489825, + "step": 4279, + "time_per_iteration": 2.5543646812438965 + }, + { + "auxiliary_loss_clip": 0.0113488, + "auxiliary_loss_mlp": 0.01048627, + "balance_loss_clip": 1.05054045, + "balance_loss_mlp": 1.03124642, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.009760198029986, + "language_loss": 0.8518033, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87363839, + "num_input_tokens_seen": 92507270, + "step": 4280, + "time_per_iteration": 2.4912052154541016 + }, + { + "auxiliary_loss_clip": 0.01136965, + "auxiliary_loss_mlp": 0.01039605, + "balance_loss_clip": 1.05168343, + "balance_loss_mlp": 1.02178288, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 2.1686958414776685, + "language_loss": 0.7864061, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.80817175, + "num_input_tokens_seen": 92526300, + "step": 4281, + "time_per_iteration": 2.5071704387664795 + }, + { + "auxiliary_loss_clip": 0.01113125, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.05071831, + "balance_loss_mlp": 1.02063847, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 1.6152849125878166, + "language_loss": 0.87311864, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89463103, + "num_input_tokens_seen": 92546465, + "step": 4282, + "time_per_iteration": 4.0128655433654785 + }, + { + "auxiliary_loss_clip": 0.0114946, + "auxiliary_loss_mlp": 0.01043474, + "balance_loss_clip": 1.05288768, + "balance_loss_mlp": 1.02664161, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.4073136943629674, + "language_loss": 0.70283985, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72476923, + "num_input_tokens_seen": 92567260, + "step": 4283, + "time_per_iteration": 2.4702985286712646 + }, + { + "auxiliary_loss_clip": 0.01143438, + "auxiliary_loss_mlp": 0.00783241, + "balance_loss_clip": 1.05405009, + "balance_loss_mlp": 1.00096226, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 1.7465251683144558, + "language_loss": 0.80623639, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82550311, + "num_input_tokens_seen": 92585425, + "step": 4284, + "time_per_iteration": 3.9944214820861816 + }, + { + "auxiliary_loss_clip": 0.01105232, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.0572648, + "balance_loss_mlp": 1.0235343, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 3.4155812298843204, + "language_loss": 0.70932549, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.73077077, + "num_input_tokens_seen": 92604770, + "step": 4285, + "time_per_iteration": 2.763275384902954 + }, + { + "auxiliary_loss_clip": 0.01130998, + "auxiliary_loss_mlp": 0.01045374, + "balance_loss_clip": 1.05442119, + "balance_loss_mlp": 1.02876782, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 1.9974648339438306, + "language_loss": 0.58822203, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60998577, + "num_input_tokens_seen": 92622635, + "step": 4286, + "time_per_iteration": 2.5110626220703125 + }, + { + "auxiliary_loss_clip": 0.01138982, + "auxiliary_loss_mlp": 0.0105018, + "balance_loss_clip": 1.05280554, + "balance_loss_mlp": 1.03304935, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.5007438349275148, + "language_loss": 0.64248991, + "learning_rate": 3.480115069207354e-06, + "loss": 0.6643815, + "num_input_tokens_seen": 92642960, + "step": 4287, + "time_per_iteration": 2.497556686401367 + }, + { + "auxiliary_loss_clip": 0.01127268, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_clip": 1.05301094, + "balance_loss_mlp": 1.02843654, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 1.6930903314164252, + "language_loss": 0.71786201, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.73960608, + "num_input_tokens_seen": 92662455, + "step": 4288, + "time_per_iteration": 2.5303261280059814 + }, + { + "auxiliary_loss_clip": 0.01111805, + "auxiliary_loss_mlp": 0.01038703, + "balance_loss_clip": 1.05146909, + "balance_loss_mlp": 1.02258611, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.4321976073117673, + "language_loss": 0.77299494, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79449999, + "num_input_tokens_seen": 92683520, + "step": 4289, + "time_per_iteration": 2.59500789642334 + }, + { + "auxiliary_loss_clip": 0.0114551, + "auxiliary_loss_mlp": 0.00784832, + "balance_loss_clip": 1.05200696, + "balance_loss_mlp": 1.0009923, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 1.954609210419254, + "language_loss": 0.85725904, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87656242, + "num_input_tokens_seen": 92701450, + "step": 4290, + "time_per_iteration": 2.4671168327331543 + }, + { + "auxiliary_loss_clip": 0.01115793, + "auxiliary_loss_mlp": 0.01059771, + "balance_loss_clip": 1.05043578, + "balance_loss_mlp": 1.0391953, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 2.5632505692453744, + "language_loss": 0.72238702, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74414259, + "num_input_tokens_seen": 92720355, + "step": 4291, + "time_per_iteration": 3.947413206100464 + }, + { + "auxiliary_loss_clip": 0.01151235, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.05448687, + "balance_loss_mlp": 1.02799988, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.428593563717132, + "language_loss": 0.80779117, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.82975829, + "num_input_tokens_seen": 92736755, + "step": 4292, + "time_per_iteration": 2.4145405292510986 + }, + { + "auxiliary_loss_clip": 0.01152718, + "auxiliary_loss_mlp": 0.01044634, + "balance_loss_clip": 1.05645108, + "balance_loss_mlp": 1.02691925, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 1.9005808572848233, + "language_loss": 0.67611009, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69808364, + "num_input_tokens_seen": 92757655, + "step": 4293, + "time_per_iteration": 2.550565719604492 + }, + { + "auxiliary_loss_clip": 0.0110474, + "auxiliary_loss_mlp": 0.01044077, + "balance_loss_clip": 1.0499239, + "balance_loss_mlp": 1.02803183, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.872931201777213, + "language_loss": 0.75470418, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77619231, + "num_input_tokens_seen": 92776100, + "step": 4294, + "time_per_iteration": 2.5686025619506836 + }, + { + "auxiliary_loss_clip": 0.01094537, + "auxiliary_loss_mlp": 0.01065131, + "balance_loss_clip": 1.04695487, + "balance_loss_mlp": 1.04541397, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 1.8199804653201888, + "language_loss": 0.80476356, + "learning_rate": 3.478017834441318e-06, + "loss": 0.82636023, + "num_input_tokens_seen": 92798880, + "step": 4295, + "time_per_iteration": 2.7038867473602295 + }, + { + "auxiliary_loss_clip": 0.01075025, + "auxiliary_loss_mlp": 0.0105058, + "balance_loss_clip": 1.05653715, + "balance_loss_mlp": 1.03231764, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 3.8770830022428497, + "language_loss": 0.72947562, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.75073159, + "num_input_tokens_seen": 92817750, + "step": 4296, + "time_per_iteration": 2.733715057373047 + }, + { + "auxiliary_loss_clip": 0.01093649, + "auxiliary_loss_mlp": 0.01039503, + "balance_loss_clip": 1.05539405, + "balance_loss_mlp": 1.02201533, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.8294855008820847, + "language_loss": 0.86901963, + "learning_rate": 3.477492965085067e-06, + "loss": 0.89035112, + "num_input_tokens_seen": 92837995, + "step": 4297, + "time_per_iteration": 2.67280650138855 + }, + { + "auxiliary_loss_clip": 0.01148887, + "auxiliary_loss_mlp": 0.01050518, + "balance_loss_clip": 1.0542047, + "balance_loss_mlp": 1.03409076, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 1.7035454416866367, + "language_loss": 0.84766191, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86965597, + "num_input_tokens_seen": 92857245, + "step": 4298, + "time_per_iteration": 2.5043516159057617 + }, + { + "auxiliary_loss_clip": 0.01140401, + "auxiliary_loss_mlp": 0.00784475, + "balance_loss_clip": 1.0565027, + "balance_loss_mlp": 1.00096774, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 4.239070565648704, + "language_loss": 0.837143, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85639179, + "num_input_tokens_seen": 92873265, + "step": 4299, + "time_per_iteration": 2.483638286590576 + }, + { + "auxiliary_loss_clip": 0.01116534, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.04858196, + "balance_loss_mlp": 1.02114367, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.230138330380157, + "language_loss": 0.82926047, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.85080796, + "num_input_tokens_seen": 92890880, + "step": 4300, + "time_per_iteration": 4.017697334289551 + }, + { + "auxiliary_loss_clip": 0.01138848, + "auxiliary_loss_mlp": 0.01041947, + "balance_loss_clip": 1.05397463, + "balance_loss_mlp": 1.02475655, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.1618360831590895, + "language_loss": 0.67288464, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69469261, + "num_input_tokens_seen": 92910770, + "step": 4301, + "time_per_iteration": 2.601411819458008 + }, + { + "auxiliary_loss_clip": 0.01137446, + "auxiliary_loss_mlp": 0.01044718, + "balance_loss_clip": 1.05054235, + "balance_loss_mlp": 1.02677715, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.60343645409517, + "language_loss": 0.81353581, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83535743, + "num_input_tokens_seen": 92929520, + "step": 4302, + "time_per_iteration": 2.45104718208313 + }, + { + "auxiliary_loss_clip": 0.01109742, + "auxiliary_loss_mlp": 0.01051379, + "balance_loss_clip": 1.0560534, + "balance_loss_mlp": 1.03362894, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 1.6741952200575585, + "language_loss": 0.92165267, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94326389, + "num_input_tokens_seen": 92947890, + "step": 4303, + "time_per_iteration": 2.5736641883850098 + }, + { + "auxiliary_loss_clip": 0.01143736, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.05751443, + "balance_loss_mlp": 1.02128041, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 1.8360756254848865, + "language_loss": 0.67777473, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69959837, + "num_input_tokens_seen": 92967690, + "step": 4304, + "time_per_iteration": 2.543250799179077 + }, + { + "auxiliary_loss_clip": 0.01124511, + "auxiliary_loss_mlp": 0.01044925, + "balance_loss_clip": 1.05943859, + "balance_loss_mlp": 1.02774668, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 3.2987369985523, + "language_loss": 0.72203553, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.74372983, + "num_input_tokens_seen": 92986830, + "step": 4305, + "time_per_iteration": 2.6081931591033936 + }, + { + "auxiliary_loss_clip": 0.011131, + "auxiliary_loss_mlp": 0.00786606, + "balance_loss_clip": 1.05398774, + "balance_loss_mlp": 1.00112796, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 2.1952581702830103, + "language_loss": 0.75707382, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77607089, + "num_input_tokens_seen": 93002740, + "step": 4306, + "time_per_iteration": 2.56027889251709 + }, + { + "auxiliary_loss_clip": 0.01049113, + "auxiliary_loss_mlp": 0.01010061, + "balance_loss_clip": 1.03420639, + "balance_loss_mlp": 1.00764072, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.852429975227473, + "language_loss": 0.57172793, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59231973, + "num_input_tokens_seen": 93058645, + "step": 4307, + "time_per_iteration": 3.0087146759033203 + }, + { + "auxiliary_loss_clip": 0.01128346, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_clip": 1.05528021, + "balance_loss_mlp": 1.02782154, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.5989598410626427, + "language_loss": 0.71803558, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73976433, + "num_input_tokens_seen": 93077140, + "step": 4308, + "time_per_iteration": 2.5280613899230957 + }, + { + "auxiliary_loss_clip": 0.01153764, + "auxiliary_loss_mlp": 0.01050663, + "balance_loss_clip": 1.05617845, + "balance_loss_mlp": 1.03340149, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 1.976118326444215, + "language_loss": 0.84368813, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86573243, + "num_input_tokens_seen": 93093580, + "step": 4309, + "time_per_iteration": 2.4314279556274414 + }, + { + "auxiliary_loss_clip": 0.01138509, + "auxiliary_loss_mlp": 0.01044027, + "balance_loss_clip": 1.05636823, + "balance_loss_mlp": 1.02787375, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.5527318822675478, + "language_loss": 0.84687924, + "learning_rate": 3.474075855228966e-06, + "loss": 0.86870462, + "num_input_tokens_seen": 93112345, + "step": 4310, + "time_per_iteration": 2.483252763748169 + }, + { + "auxiliary_loss_clip": 0.01141355, + "auxiliary_loss_mlp": 0.01048907, + "balance_loss_clip": 1.05562162, + "balance_loss_mlp": 1.03227723, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 1.8712135565110397, + "language_loss": 0.77082479, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79272741, + "num_input_tokens_seen": 93131545, + "step": 4311, + "time_per_iteration": 2.529372215270996 + }, + { + "auxiliary_loss_clip": 0.01112444, + "auxiliary_loss_mlp": 0.01051224, + "balance_loss_clip": 1.04541087, + "balance_loss_mlp": 1.03317547, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 1.6681453037771328, + "language_loss": 0.72303152, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74466825, + "num_input_tokens_seen": 93150730, + "step": 4312, + "time_per_iteration": 2.586435556411743 + }, + { + "auxiliary_loss_clip": 0.0114907, + "auxiliary_loss_mlp": 0.01046046, + "balance_loss_clip": 1.05381298, + "balance_loss_mlp": 1.02897549, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 1.7668158623907493, + "language_loss": 0.69901079, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72096199, + "num_input_tokens_seen": 93167895, + "step": 4313, + "time_per_iteration": 2.4111430644989014 + }, + { + "auxiliary_loss_clip": 0.0114888, + "auxiliary_loss_mlp": 0.01049507, + "balance_loss_clip": 1.05538249, + "balance_loss_mlp": 1.03353274, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.764229783371169, + "language_loss": 0.80263591, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82461977, + "num_input_tokens_seen": 93187650, + "step": 4314, + "time_per_iteration": 2.4736714363098145 + }, + { + "auxiliary_loss_clip": 0.01110116, + "auxiliary_loss_mlp": 0.01048879, + "balance_loss_clip": 1.04769576, + "balance_loss_mlp": 1.03041315, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.054364756984374, + "language_loss": 0.67653787, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69812787, + "num_input_tokens_seen": 93207370, + "step": 4315, + "time_per_iteration": 2.6336207389831543 + }, + { + "auxiliary_loss_clip": 0.01103706, + "auxiliary_loss_mlp": 0.01049066, + "balance_loss_clip": 1.05475998, + "balance_loss_mlp": 1.03241229, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 2.201229547052533, + "language_loss": 0.79375517, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81528282, + "num_input_tokens_seen": 93227925, + "step": 4316, + "time_per_iteration": 2.592108726501465 + }, + { + "auxiliary_loss_clip": 0.01099411, + "auxiliary_loss_mlp": 0.0104445, + "balance_loss_clip": 1.0522244, + "balance_loss_mlp": 1.02701008, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 2.654199194274498, + "language_loss": 0.771896, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.7933346, + "num_input_tokens_seen": 93250020, + "step": 4317, + "time_per_iteration": 2.6521756649017334 + }, + { + "auxiliary_loss_clip": 0.01149691, + "auxiliary_loss_mlp": 0.01049756, + "balance_loss_clip": 1.05556154, + "balance_loss_mlp": 1.03189802, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.06046133206744, + "language_loss": 0.77616644, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.79816091, + "num_input_tokens_seen": 93269070, + "step": 4318, + "time_per_iteration": 2.4678714275360107 + }, + { + "auxiliary_loss_clip": 0.0114663, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.05252028, + "balance_loss_mlp": 1.02620339, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.6687338862508108, + "language_loss": 0.76381832, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78572953, + "num_input_tokens_seen": 93290250, + "step": 4319, + "time_per_iteration": 2.4738268852233887 + }, + { + "auxiliary_loss_clip": 0.01122458, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.05131125, + "balance_loss_mlp": 1.02401662, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.7410437372069298, + "language_loss": 0.7617408, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78336531, + "num_input_tokens_seen": 93310090, + "step": 4320, + "time_per_iteration": 2.5253331661224365 + }, + { + "auxiliary_loss_clip": 0.01119457, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.05330098, + "balance_loss_mlp": 1.02241778, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.920787590979671, + "language_loss": 0.7116918, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73328024, + "num_input_tokens_seen": 93329570, + "step": 4321, + "time_per_iteration": 4.0393226146698 + }, + { + "auxiliary_loss_clip": 0.01127869, + "auxiliary_loss_mlp": 0.01047953, + "balance_loss_clip": 1.05329013, + "balance_loss_mlp": 1.02856922, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 2.0672392831622903, + "language_loss": 0.74778318, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76954138, + "num_input_tokens_seen": 93347920, + "step": 4322, + "time_per_iteration": 2.6153087615966797 + }, + { + "auxiliary_loss_clip": 0.01119371, + "auxiliary_loss_mlp": 0.01047557, + "balance_loss_clip": 1.05472946, + "balance_loss_mlp": 1.02986622, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 2.6638978389483325, + "language_loss": 0.73189211, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75356138, + "num_input_tokens_seen": 93367145, + "step": 4323, + "time_per_iteration": 4.127241849899292 + }, + { + "auxiliary_loss_clip": 0.01143989, + "auxiliary_loss_mlp": 0.00786117, + "balance_loss_clip": 1.05471587, + "balance_loss_mlp": 1.00117862, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 2.2880580695165573, + "language_loss": 0.66617441, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.68547553, + "num_input_tokens_seen": 93386555, + "step": 4324, + "time_per_iteration": 2.528109550476074 + }, + { + "auxiliary_loss_clip": 0.01101724, + "auxiliary_loss_mlp": 0.01039641, + "balance_loss_clip": 1.05564547, + "balance_loss_mlp": 1.02454317, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 1.8773611677511985, + "language_loss": 0.71023875, + "learning_rate": 3.470121299177082e-06, + "loss": 0.73165238, + "num_input_tokens_seen": 93405590, + "step": 4325, + "time_per_iteration": 2.6618854999542236 + }, + { + "auxiliary_loss_clip": 0.01137307, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.05266929, + "balance_loss_mlp": 1.02057362, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 2.237960325885443, + "language_loss": 0.73284268, + "learning_rate": 3.469857215756257e-06, + "loss": 0.7545951, + "num_input_tokens_seen": 93424750, + "step": 4326, + "time_per_iteration": 2.5619940757751465 + }, + { + "auxiliary_loss_clip": 0.01120791, + "auxiliary_loss_mlp": 0.00786192, + "balance_loss_clip": 1.05162024, + "balance_loss_mlp": 1.00109613, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.8633376368761947, + "language_loss": 0.86718738, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88625723, + "num_input_tokens_seen": 93443465, + "step": 4327, + "time_per_iteration": 2.563509225845337 + }, + { + "auxiliary_loss_clip": 0.01151144, + "auxiliary_loss_mlp": 0.00785535, + "balance_loss_clip": 1.05342054, + "balance_loss_mlp": 1.0009017, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 1.5296176994836044, + "language_loss": 0.80284595, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82221276, + "num_input_tokens_seen": 93462580, + "step": 4328, + "time_per_iteration": 2.4585459232330322 + }, + { + "auxiliary_loss_clip": 0.01125713, + "auxiliary_loss_mlp": 0.00784713, + "balance_loss_clip": 1.05292571, + "balance_loss_mlp": 1.00097704, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.4838917877347257, + "language_loss": 0.87938356, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89848781, + "num_input_tokens_seen": 93482790, + "step": 4329, + "time_per_iteration": 2.5663352012634277 + }, + { + "auxiliary_loss_clip": 0.01147105, + "auxiliary_loss_mlp": 0.01043112, + "balance_loss_clip": 1.05568171, + "balance_loss_mlp": 1.0271852, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 2.049984300441374, + "language_loss": 0.77959192, + "learning_rate": 3.468800324801802e-06, + "loss": 0.80149412, + "num_input_tokens_seen": 93498795, + "step": 4330, + "time_per_iteration": 3.894934892654419 + }, + { + "auxiliary_loss_clip": 0.01152474, + "auxiliary_loss_mlp": 0.01053907, + "balance_loss_clip": 1.05655336, + "balance_loss_mlp": 1.03639483, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 1.5269218301920382, + "language_loss": 0.75603759, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77810138, + "num_input_tokens_seen": 93518335, + "step": 4331, + "time_per_iteration": 2.470248222351074 + }, + { + "auxiliary_loss_clip": 0.01132059, + "auxiliary_loss_mlp": 0.01042759, + "balance_loss_clip": 1.06005073, + "balance_loss_mlp": 1.02653456, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.4541229411823569, + "language_loss": 0.69147545, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71322358, + "num_input_tokens_seen": 93539170, + "step": 4332, + "time_per_iteration": 2.546664237976074 + }, + { + "auxiliary_loss_clip": 0.01120917, + "auxiliary_loss_mlp": 0.01044386, + "balance_loss_clip": 1.0512501, + "balance_loss_mlp": 1.02722025, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 1.914250245184138, + "language_loss": 0.798356, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.82000911, + "num_input_tokens_seen": 93558480, + "step": 4333, + "time_per_iteration": 2.604072332382202 + }, + { + "auxiliary_loss_clip": 0.01147362, + "auxiliary_loss_mlp": 0.01039219, + "balance_loss_clip": 1.0555495, + "balance_loss_mlp": 1.02329278, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 2.178561261459584, + "language_loss": 0.80866289, + "learning_rate": 3.467742542694501e-06, + "loss": 0.83052874, + "num_input_tokens_seen": 93575220, + "step": 4334, + "time_per_iteration": 2.4010841846466064 + }, + { + "auxiliary_loss_clip": 0.01125927, + "auxiliary_loss_mlp": 0.01042444, + "balance_loss_clip": 1.05258346, + "balance_loss_mlp": 1.0250392, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.9689128190715197, + "language_loss": 0.80295038, + "learning_rate": 3.46747795800024e-06, + "loss": 0.82463408, + "num_input_tokens_seen": 93597015, + "step": 4335, + "time_per_iteration": 2.5431251525878906 + }, + { + "auxiliary_loss_clip": 0.01049975, + "auxiliary_loss_mlp": 0.01007542, + "balance_loss_clip": 1.0233779, + "balance_loss_mlp": 1.00537252, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.8470162188085442, + "language_loss": 0.60901338, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62958854, + "num_input_tokens_seen": 93657775, + "step": 4336, + "time_per_iteration": 3.0296366214752197 + }, + { + "auxiliary_loss_clip": 0.01115168, + "auxiliary_loss_mlp": 0.01053734, + "balance_loss_clip": 1.05408561, + "balance_loss_mlp": 1.03669882, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 2.3131830041730854, + "language_loss": 0.77122474, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79291379, + "num_input_tokens_seen": 93676145, + "step": 4337, + "time_per_iteration": 2.5657029151916504 + }, + { + "auxiliary_loss_clip": 0.01131436, + "auxiliary_loss_mlp": 0.0104528, + "balance_loss_clip": 1.05845928, + "balance_loss_mlp": 1.02726769, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 2.1737281163511355, + "language_loss": 0.74333447, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76510167, + "num_input_tokens_seen": 93692480, + "step": 4338, + "time_per_iteration": 2.493720054626465 + }, + { + "auxiliary_loss_clip": 0.0114103, + "auxiliary_loss_mlp": 0.0104331, + "balance_loss_clip": 1.05284286, + "balance_loss_mlp": 1.02613246, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.5815044989359466, + "language_loss": 0.80631185, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82815528, + "num_input_tokens_seen": 93710165, + "step": 4339, + "time_per_iteration": 2.4629428386688232 + }, + { + "auxiliary_loss_clip": 0.01098172, + "auxiliary_loss_mlp": 0.01041881, + "balance_loss_clip": 1.0540055, + "balance_loss_mlp": 1.0264852, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 3.3961797304449455, + "language_loss": 0.76758444, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78898495, + "num_input_tokens_seen": 93730185, + "step": 4340, + "time_per_iteration": 4.189427852630615 + }, + { + "auxiliary_loss_clip": 0.01084398, + "auxiliary_loss_mlp": 0.01043381, + "balance_loss_clip": 1.04675567, + "balance_loss_mlp": 1.02658486, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.437782950995744, + "language_loss": 0.82637131, + "learning_rate": 3.465889281600845e-06, + "loss": 0.8476491, + "num_input_tokens_seen": 93747690, + "step": 4341, + "time_per_iteration": 2.6518304347991943 + }, + { + "auxiliary_loss_clip": 0.01152053, + "auxiliary_loss_mlp": 0.01043453, + "balance_loss_clip": 1.05851948, + "balance_loss_mlp": 1.02636981, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 2.3618626245122667, + "language_loss": 0.76851308, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.79046816, + "num_input_tokens_seen": 93767405, + "step": 4342, + "time_per_iteration": 2.5210578441619873 + }, + { + "auxiliary_loss_clip": 0.01139706, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.05525219, + "balance_loss_mlp": 1.01676655, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.8033066402838482, + "language_loss": 0.66087294, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68261409, + "num_input_tokens_seen": 93789950, + "step": 4343, + "time_per_iteration": 2.6444671154022217 + }, + { + "auxiliary_loss_clip": 0.01080605, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.04550123, + "balance_loss_mlp": 1.03118479, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2.0802797502140833, + "language_loss": 0.73422921, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75552493, + "num_input_tokens_seen": 93807835, + "step": 4344, + "time_per_iteration": 2.677457094192505 + }, + { + "auxiliary_loss_clip": 0.01152686, + "auxiliary_loss_mlp": 0.01039788, + "balance_loss_clip": 1.05782747, + "balance_loss_mlp": 1.02300322, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 2.4479267606243713, + "language_loss": 0.86712307, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88904786, + "num_input_tokens_seen": 93825670, + "step": 4345, + "time_per_iteration": 2.450141668319702 + }, + { + "auxiliary_loss_clip": 0.0112546, + "auxiliary_loss_mlp": 0.01045561, + "balance_loss_clip": 1.05406415, + "balance_loss_mlp": 1.02869296, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 1.8950926919222375, + "language_loss": 0.76362741, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78533763, + "num_input_tokens_seen": 93844045, + "step": 4346, + "time_per_iteration": 2.511852741241455 + }, + { + "auxiliary_loss_clip": 0.01142264, + "auxiliary_loss_mlp": 0.01043663, + "balance_loss_clip": 1.05667341, + "balance_loss_mlp": 1.02722418, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.838858951628969, + "language_loss": 0.75657791, + "learning_rate": 3.464298604081606e-06, + "loss": 0.77843714, + "num_input_tokens_seen": 93864380, + "step": 4347, + "time_per_iteration": 2.521695613861084 + }, + { + "auxiliary_loss_clip": 0.01109681, + "auxiliary_loss_mlp": 0.01043792, + "balance_loss_clip": 1.05119371, + "balance_loss_mlp": 1.02667308, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.3146426626925387, + "language_loss": 0.73347193, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75500667, + "num_input_tokens_seen": 93885475, + "step": 4348, + "time_per_iteration": 2.620659112930298 + }, + { + "auxiliary_loss_clip": 0.01113004, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.0537827, + "balance_loss_mlp": 1.0319109, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 1.8166773259650184, + "language_loss": 0.91559535, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93721342, + "num_input_tokens_seen": 93905545, + "step": 4349, + "time_per_iteration": 2.6055514812469482 + }, + { + "auxiliary_loss_clip": 0.0113769, + "auxiliary_loss_mlp": 0.01048119, + "balance_loss_clip": 1.05476975, + "balance_loss_mlp": 1.03177524, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 2.0733085912402496, + "language_loss": 0.80381191, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82567, + "num_input_tokens_seen": 93924185, + "step": 4350, + "time_per_iteration": 2.467235565185547 + }, + { + "auxiliary_loss_clip": 0.01134006, + "auxiliary_loss_mlp": 0.01046973, + "balance_loss_clip": 1.05439055, + "balance_loss_mlp": 1.030164, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.7572406497064696, + "language_loss": 0.62182605, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64363581, + "num_input_tokens_seen": 93942825, + "step": 4351, + "time_per_iteration": 2.4680063724517822 + }, + { + "auxiliary_loss_clip": 0.01141485, + "auxiliary_loss_mlp": 0.01043695, + "balance_loss_clip": 1.0538795, + "balance_loss_mlp": 1.02697039, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 1.8451328957648605, + "language_loss": 0.84045023, + "learning_rate": 3.462971512415555e-06, + "loss": 0.86230206, + "num_input_tokens_seen": 93962045, + "step": 4352, + "time_per_iteration": 2.503002166748047 + }, + { + "auxiliary_loss_clip": 0.01052894, + "auxiliary_loss_mlp": 0.01018261, + "balance_loss_clip": 1.02747369, + "balance_loss_mlp": 1.01582909, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.7943191765165677, + "language_loss": 0.70591652, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72662807, + "num_input_tokens_seen": 94021175, + "step": 4353, + "time_per_iteration": 2.980511426925659 + }, + { + "auxiliary_loss_clip": 0.01117382, + "auxiliary_loss_mlp": 0.01061485, + "balance_loss_clip": 1.0472002, + "balance_loss_mlp": 1.04176748, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 2.4578668268297585, + "language_loss": 0.7742815, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79607016, + "num_input_tokens_seen": 94043370, + "step": 4354, + "time_per_iteration": 2.556792974472046 + }, + { + "auxiliary_loss_clip": 0.01090148, + "auxiliary_loss_mlp": 0.01049231, + "balance_loss_clip": 1.04486239, + "balance_loss_mlp": 1.03168368, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 2.349639409352032, + "language_loss": 0.67793334, + "learning_rate": 3.462174591623085e-06, + "loss": 0.69932717, + "num_input_tokens_seen": 94063510, + "step": 4355, + "time_per_iteration": 2.617372512817383 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.05225265, + "balance_loss_mlp": 1.01984096, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.91966593607315, + "language_loss": 0.67258871, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69397897, + "num_input_tokens_seen": 94083865, + "step": 4356, + "time_per_iteration": 2.624020576477051 + }, + { + "auxiliary_loss_clip": 0.01052346, + "auxiliary_loss_mlp": 0.01001873, + "balance_loss_clip": 1.02698457, + "balance_loss_mlp": 0.99944162, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6842145733805646, + "language_loss": 0.53147012, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55201226, + "num_input_tokens_seen": 94144095, + "step": 4357, + "time_per_iteration": 2.962489366531372 + }, + { + "auxiliary_loss_clip": 0.0113905, + "auxiliary_loss_mlp": 0.01045784, + "balance_loss_clip": 1.05264068, + "balance_loss_mlp": 1.0289396, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 2.176743576425972, + "language_loss": 0.84315181, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86500013, + "num_input_tokens_seen": 94163035, + "step": 4358, + "time_per_iteration": 2.563415765762329 + }, + { + "auxiliary_loss_clip": 0.01127367, + "auxiliary_loss_mlp": 0.0104471, + "balance_loss_clip": 1.04680741, + "balance_loss_mlp": 1.02546942, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 3.2460694013577016, + "language_loss": 0.67294812, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69466889, + "num_input_tokens_seen": 94182520, + "step": 4359, + "time_per_iteration": 2.6085143089294434 + }, + { + "auxiliary_loss_clip": 0.01118662, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.04482198, + "balance_loss_mlp": 1.02453327, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 1.8155932460034345, + "language_loss": 0.7840305, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80562848, + "num_input_tokens_seen": 94201795, + "step": 4360, + "time_per_iteration": 2.5615334510803223 + }, + { + "auxiliary_loss_clip": 0.01119166, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.04754019, + "balance_loss_mlp": 1.03044355, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.6955910227520556, + "language_loss": 0.68184131, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70350164, + "num_input_tokens_seen": 94222390, + "step": 4361, + "time_per_iteration": 4.028102397918701 + }, + { + "auxiliary_loss_clip": 0.01138344, + "auxiliary_loss_mlp": 0.01052359, + "balance_loss_clip": 1.05254912, + "balance_loss_mlp": 1.03502655, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 1.761925595678345, + "language_loss": 0.84256977, + "learning_rate": 3.46031316964119e-06, + "loss": 0.8644768, + "num_input_tokens_seen": 94239980, + "step": 4362, + "time_per_iteration": 2.490452289581299 + }, + { + "auxiliary_loss_clip": 0.01107168, + "auxiliary_loss_mlp": 0.0104963, + "balance_loss_clip": 1.0481714, + "balance_loss_mlp": 1.03164077, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 3.9548495246653776, + "language_loss": 0.64853334, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67010134, + "num_input_tokens_seen": 94260715, + "step": 4363, + "time_per_iteration": 4.177897214889526 + }, + { + "auxiliary_loss_clip": 0.01038314, + "auxiliary_loss_mlp": 0.01012283, + "balance_loss_clip": 1.0240314, + "balance_loss_mlp": 1.01003027, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8915227672468254, + "language_loss": 0.61118948, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63169539, + "num_input_tokens_seen": 94321285, + "step": 4364, + "time_per_iteration": 3.2271807193756104 + }, + { + "auxiliary_loss_clip": 0.01151868, + "auxiliary_loss_mlp": 0.01053214, + "balance_loss_clip": 1.05564106, + "balance_loss_mlp": 1.03462946, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.587989471116019, + "language_loss": 0.71860641, + "learning_rate": 3.459514586533184e-06, + "loss": 0.74065721, + "num_input_tokens_seen": 94335420, + "step": 4365, + "time_per_iteration": 2.490250587463379 + }, + { + "auxiliary_loss_clip": 0.01124038, + "auxiliary_loss_mlp": 0.00786536, + "balance_loss_clip": 1.05154741, + "balance_loss_mlp": 1.00101042, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.64860029675423, + "language_loss": 0.77134234, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79044807, + "num_input_tokens_seen": 94357440, + "step": 4366, + "time_per_iteration": 2.6729514598846436 + }, + { + "auxiliary_loss_clip": 0.01148891, + "auxiliary_loss_mlp": 0.0104673, + "balance_loss_clip": 1.0541476, + "balance_loss_mlp": 1.02953982, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 1.6075265022103897, + "language_loss": 0.76102871, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78298497, + "num_input_tokens_seen": 94375690, + "step": 4367, + "time_per_iteration": 2.5086495876312256 + }, + { + "auxiliary_loss_clip": 0.01137951, + "auxiliary_loss_mlp": 0.01043466, + "balance_loss_clip": 1.05380201, + "balance_loss_mlp": 1.02734888, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 1.5850121095186058, + "language_loss": 0.69566768, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71748185, + "num_input_tokens_seen": 94393190, + "step": 4368, + "time_per_iteration": 2.539346218109131 + }, + { + "auxiliary_loss_clip": 0.01126966, + "auxiliary_loss_mlp": 0.01041888, + "balance_loss_clip": 1.05424881, + "balance_loss_mlp": 1.02471018, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 2.17656690283496, + "language_loss": 0.78722537, + "learning_rate": 3.458449034273841e-06, + "loss": 0.80891395, + "num_input_tokens_seen": 94410975, + "step": 4369, + "time_per_iteration": 3.988262414932251 + }, + { + "auxiliary_loss_clip": 0.01119751, + "auxiliary_loss_mlp": 0.01044485, + "balance_loss_clip": 1.05387783, + "balance_loss_mlp": 1.02809405, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 1.825550104464965, + "language_loss": 0.83276463, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85440695, + "num_input_tokens_seen": 94429985, + "step": 4370, + "time_per_iteration": 2.5268282890319824 + }, + { + "auxiliary_loss_clip": 0.01141038, + "auxiliary_loss_mlp": 0.01054077, + "balance_loss_clip": 1.05354857, + "balance_loss_mlp": 1.03427601, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 2.0632986755785074, + "language_loss": 0.71301228, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73496342, + "num_input_tokens_seen": 94448660, + "step": 4371, + "time_per_iteration": 2.452887535095215 + }, + { + "auxiliary_loss_clip": 0.01053287, + "auxiliary_loss_mlp": 0.01004541, + "balance_loss_clip": 1.01800632, + "balance_loss_mlp": 1.00253797, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.6868978288301767, + "language_loss": 0.56392848, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58450675, + "num_input_tokens_seen": 94515630, + "step": 4372, + "time_per_iteration": 3.1277830600738525 + }, + { + "auxiliary_loss_clip": 0.01126215, + "auxiliary_loss_mlp": 0.01038513, + "balance_loss_clip": 1.05531096, + "balance_loss_mlp": 1.02216935, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.5793383884768448, + "language_loss": 0.77326924, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79491651, + "num_input_tokens_seen": 94535385, + "step": 4373, + "time_per_iteration": 2.561229944229126 + }, + { + "auxiliary_loss_clip": 0.01105555, + "auxiliary_loss_mlp": 0.01042173, + "balance_loss_clip": 1.05198622, + "balance_loss_mlp": 1.02627087, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 2.2583958642101103, + "language_loss": 0.71718764, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73866487, + "num_input_tokens_seen": 94552650, + "step": 4374, + "time_per_iteration": 2.531568765640259 + }, + { + "auxiliary_loss_clip": 0.01120803, + "auxiliary_loss_mlp": 0.01045619, + "balance_loss_clip": 1.05474114, + "balance_loss_mlp": 1.02759457, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 1.7166439080756786, + "language_loss": 0.80962026, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83128446, + "num_input_tokens_seen": 94574075, + "step": 4375, + "time_per_iteration": 2.543200731277466 + }, + { + "auxiliary_loss_clip": 0.01118293, + "auxiliary_loss_mlp": 0.01038612, + "balance_loss_clip": 1.04863191, + "balance_loss_mlp": 1.02248323, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 1.6994837980228124, + "language_loss": 0.65815043, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.67971951, + "num_input_tokens_seen": 94594255, + "step": 4376, + "time_per_iteration": 2.60133695602417 + }, + { + "auxiliary_loss_clip": 0.01094427, + "auxiliary_loss_mlp": 0.0104973, + "balance_loss_clip": 1.04411685, + "balance_loss_mlp": 1.03234935, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 2.2881043229253635, + "language_loss": 0.69257081, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71401238, + "num_input_tokens_seen": 94611410, + "step": 4377, + "time_per_iteration": 2.548459053039551 + }, + { + "auxiliary_loss_clip": 0.01137508, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.05345225, + "balance_loss_mlp": 1.02704036, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 2.3127359484920444, + "language_loss": 0.78976178, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.81157732, + "num_input_tokens_seen": 94636575, + "step": 4378, + "time_per_iteration": 2.7470948696136475 + }, + { + "auxiliary_loss_clip": 0.01127409, + "auxiliary_loss_mlp": 0.01052139, + "balance_loss_clip": 1.05639648, + "balance_loss_mlp": 1.03674912, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 2.3942065140993782, + "language_loss": 0.76063037, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78242582, + "num_input_tokens_seen": 94654345, + "step": 4379, + "time_per_iteration": 4.024099826812744 + }, + { + "auxiliary_loss_clip": 0.01113525, + "auxiliary_loss_mlp": 0.01042664, + "balance_loss_clip": 1.05291772, + "balance_loss_mlp": 1.02448452, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.650615911535985, + "language_loss": 0.77635819, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.79791999, + "num_input_tokens_seen": 94673985, + "step": 4380, + "time_per_iteration": 2.6534109115600586 + }, + { + "auxiliary_loss_clip": 0.01126336, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_clip": 1.04924297, + "balance_loss_mlp": 1.02934158, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 8.515696602677119, + "language_loss": 0.63946295, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.66119373, + "num_input_tokens_seen": 94693145, + "step": 4381, + "time_per_iteration": 2.5607175827026367 + }, + { + "auxiliary_loss_clip": 0.01136641, + "auxiliary_loss_mlp": 0.01042247, + "balance_loss_clip": 1.05102599, + "balance_loss_mlp": 1.02596331, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.7357843757436158, + "language_loss": 0.82691514, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84870404, + "num_input_tokens_seen": 94710185, + "step": 4382, + "time_per_iteration": 2.4773714542388916 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01051483, + "balance_loss_clip": 1.04580951, + "balance_loss_mlp": 1.03341067, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 2.0663217626824504, + "language_loss": 0.69852525, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.72007823, + "num_input_tokens_seen": 94730280, + "step": 4383, + "time_per_iteration": 2.6785387992858887 + }, + { + "auxiliary_loss_clip": 0.01137345, + "auxiliary_loss_mlp": 0.01046728, + "balance_loss_clip": 1.05110383, + "balance_loss_mlp": 1.03089726, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 1.7976938380429432, + "language_loss": 0.69347328, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71531403, + "num_input_tokens_seen": 94748560, + "step": 4384, + "time_per_iteration": 2.4718544483184814 + }, + { + "auxiliary_loss_clip": 0.01133557, + "auxiliary_loss_mlp": 0.01037798, + "balance_loss_clip": 1.05010378, + "balance_loss_mlp": 1.02143073, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.3918752055241286, + "language_loss": 0.69968611, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72139966, + "num_input_tokens_seen": 94767570, + "step": 4385, + "time_per_iteration": 2.523627519607544 + }, + { + "auxiliary_loss_clip": 0.01116201, + "auxiliary_loss_mlp": 0.01053735, + "balance_loss_clip": 1.04965281, + "balance_loss_mlp": 1.03667593, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 1.820766104796089, + "language_loss": 0.85486746, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87656677, + "num_input_tokens_seen": 94784985, + "step": 4386, + "time_per_iteration": 2.5538125038146973 + }, + { + "auxiliary_loss_clip": 0.01125556, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.0506525, + "balance_loss_mlp": 1.03094244, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 3.8598904859741485, + "language_loss": 0.77289248, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.7946282, + "num_input_tokens_seen": 94802545, + "step": 4387, + "time_per_iteration": 2.479799270629883 + }, + { + "auxiliary_loss_clip": 0.01131195, + "auxiliary_loss_mlp": 0.01049179, + "balance_loss_clip": 1.05394125, + "balance_loss_mlp": 1.03316927, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 1.8785929030308572, + "language_loss": 0.7667433, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78854704, + "num_input_tokens_seen": 94820730, + "step": 4388, + "time_per_iteration": 2.4617838859558105 + }, + { + "auxiliary_loss_clip": 0.01142413, + "auxiliary_loss_mlp": 0.01041188, + "balance_loss_clip": 1.049124, + "balance_loss_mlp": 1.02423596, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 2.3363240149192435, + "language_loss": 0.86060399, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88244003, + "num_input_tokens_seen": 94839175, + "step": 4389, + "time_per_iteration": 2.4292171001434326 + }, + { + "auxiliary_loss_clip": 0.01046376, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.02842593, + "balance_loss_mlp": 1.03603005, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8083152419100875, + "language_loss": 0.6029647, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62380856, + "num_input_tokens_seen": 94898865, + "step": 4390, + "time_per_iteration": 3.0608839988708496 + }, + { + "auxiliary_loss_clip": 0.01129019, + "auxiliary_loss_mlp": 0.01038346, + "balance_loss_clip": 1.05097961, + "balance_loss_mlp": 1.02095282, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.502055257970001, + "language_loss": 0.77565175, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79732537, + "num_input_tokens_seen": 94917490, + "step": 4391, + "time_per_iteration": 2.532970905303955 + }, + { + "auxiliary_loss_clip": 0.01028334, + "auxiliary_loss_mlp": 0.00757581, + "balance_loss_clip": 1.01909137, + "balance_loss_mlp": 1.00113606, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.9106721174502845, + "language_loss": 0.5870055, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60486466, + "num_input_tokens_seen": 94969065, + "step": 4392, + "time_per_iteration": 3.076883316040039 + }, + { + "auxiliary_loss_clip": 0.01137209, + "auxiliary_loss_mlp": 0.01041388, + "balance_loss_clip": 1.05152202, + "balance_loss_mlp": 1.02490115, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 1.9143486024571899, + "language_loss": 0.68753332, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70931923, + "num_input_tokens_seen": 94988540, + "step": 4393, + "time_per_iteration": 2.4889614582061768 + }, + { + "auxiliary_loss_clip": 0.01135649, + "auxiliary_loss_mlp": 0.01042655, + "balance_loss_clip": 1.04967117, + "balance_loss_mlp": 1.02375984, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 1.9350037232282433, + "language_loss": 0.84153986, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.86332285, + "num_input_tokens_seen": 95004810, + "step": 4394, + "time_per_iteration": 2.446119785308838 + }, + { + "auxiliary_loss_clip": 0.01128235, + "auxiliary_loss_mlp": 0.01046714, + "balance_loss_clip": 1.05132568, + "balance_loss_mlp": 1.02709186, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 2.127021168149743, + "language_loss": 0.7022174, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.7239669, + "num_input_tokens_seen": 95024085, + "step": 4395, + "time_per_iteration": 2.499789237976074 + }, + { + "auxiliary_loss_clip": 0.01115178, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.05064976, + "balance_loss_mlp": 1.01805317, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 1.6419945426970257, + "language_loss": 0.86142576, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.88292658, + "num_input_tokens_seen": 95042515, + "step": 4396, + "time_per_iteration": 2.520883321762085 + }, + { + "auxiliary_loss_clip": 0.01009852, + "auxiliary_loss_mlp": 0.01007417, + "balance_loss_clip": 1.0170536, + "balance_loss_mlp": 1.00468683, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7843970147819423, + "language_loss": 0.5504216, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57059431, + "num_input_tokens_seen": 95094835, + "step": 4397, + "time_per_iteration": 2.9247937202453613 + }, + { + "auxiliary_loss_clip": 0.01134922, + "auxiliary_loss_mlp": 0.01049104, + "balance_loss_clip": 1.05120623, + "balance_loss_mlp": 1.03212881, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.0275101746640223, + "language_loss": 0.78483152, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80667174, + "num_input_tokens_seen": 95113480, + "step": 4398, + "time_per_iteration": 2.572181463241577 + }, + { + "auxiliary_loss_clip": 0.01137126, + "auxiliary_loss_mlp": 0.0104114, + "balance_loss_clip": 1.05254126, + "balance_loss_mlp": 1.02303231, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 1.790947251626843, + "language_loss": 0.67091876, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69270146, + "num_input_tokens_seen": 95132580, + "step": 4399, + "time_per_iteration": 2.4680752754211426 + }, + { + "auxiliary_loss_clip": 0.01096937, + "auxiliary_loss_mlp": 0.01043235, + "balance_loss_clip": 1.04833245, + "balance_loss_mlp": 1.0260216, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 1.6629825056218948, + "language_loss": 0.86155134, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88295305, + "num_input_tokens_seen": 95152375, + "step": 4400, + "time_per_iteration": 4.002071380615234 + }, + { + "auxiliary_loss_clip": 0.01122302, + "auxiliary_loss_mlp": 0.01037924, + "balance_loss_clip": 1.04898059, + "balance_loss_mlp": 1.01989961, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 4.021050575169417, + "language_loss": 0.75716949, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.77877176, + "num_input_tokens_seen": 95170265, + "step": 4401, + "time_per_iteration": 2.4898812770843506 + }, + { + "auxiliary_loss_clip": 0.01103012, + "auxiliary_loss_mlp": 0.01049882, + "balance_loss_clip": 1.04745245, + "balance_loss_mlp": 1.03073716, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.9450879197914273, + "language_loss": 0.88096559, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90249455, + "num_input_tokens_seen": 95188655, + "step": 4402, + "time_per_iteration": 4.009501695632935 + }, + { + "auxiliary_loss_clip": 0.01107104, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_clip": 1.04963446, + "balance_loss_mlp": 1.02610302, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.5695708628532092, + "language_loss": 0.78101468, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80252099, + "num_input_tokens_seen": 95209615, + "step": 4403, + "time_per_iteration": 2.5604255199432373 + }, + { + "auxiliary_loss_clip": 0.01130327, + "auxiliary_loss_mlp": 0.01038418, + "balance_loss_clip": 1.04744053, + "balance_loss_mlp": 1.02066791, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 1.9533757043211772, + "language_loss": 0.88303649, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.90472388, + "num_input_tokens_seen": 95227810, + "step": 4404, + "time_per_iteration": 2.492366075515747 + }, + { + "auxiliary_loss_clip": 0.01124976, + "auxiliary_loss_mlp": 0.01038812, + "balance_loss_clip": 1.04822898, + "balance_loss_mlp": 1.0226357, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.6541452923755282, + "language_loss": 0.76154727, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78318512, + "num_input_tokens_seen": 95245890, + "step": 4405, + "time_per_iteration": 2.4788894653320312 + }, + { + "auxiliary_loss_clip": 0.01147314, + "auxiliary_loss_mlp": 0.01043577, + "balance_loss_clip": 1.05307126, + "balance_loss_mlp": 1.0255878, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 2.5992965886146147, + "language_loss": 0.69639802, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.7183069, + "num_input_tokens_seen": 95264955, + "step": 4406, + "time_per_iteration": 2.43261456489563 + }, + { + "auxiliary_loss_clip": 0.01121725, + "auxiliary_loss_mlp": 0.01049734, + "balance_loss_clip": 1.05417562, + "balance_loss_mlp": 1.03361654, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.6207398827477841, + "language_loss": 0.83547497, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85718954, + "num_input_tokens_seen": 95284245, + "step": 4407, + "time_per_iteration": 2.5237436294555664 + }, + { + "auxiliary_loss_clip": 0.01111104, + "auxiliary_loss_mlp": 0.0103443, + "balance_loss_clip": 1.04836845, + "balance_loss_mlp": 1.01697826, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 2.3531869970794546, + "language_loss": 0.75751877, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.77897418, + "num_input_tokens_seen": 95307125, + "step": 4408, + "time_per_iteration": 2.661073684692383 + }, + { + "auxiliary_loss_clip": 0.01098028, + "auxiliary_loss_mlp": 0.01040706, + "balance_loss_clip": 1.04552686, + "balance_loss_mlp": 1.02316999, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 2.51704798755987, + "language_loss": 0.70917439, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73056173, + "num_input_tokens_seen": 95329150, + "step": 4409, + "time_per_iteration": 4.134162902832031 + }, + { + "auxiliary_loss_clip": 0.01134975, + "auxiliary_loss_mlp": 0.01040973, + "balance_loss_clip": 1.05270541, + "balance_loss_mlp": 1.02275813, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 6.730226738574389, + "language_loss": 0.73385406, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75561357, + "num_input_tokens_seen": 95349880, + "step": 4410, + "time_per_iteration": 2.505915403366089 + }, + { + "auxiliary_loss_clip": 0.01140541, + "auxiliary_loss_mlp": 0.0104685, + "balance_loss_clip": 1.05156696, + "balance_loss_mlp": 1.03002989, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 2.2144818782206928, + "language_loss": 0.73833245, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.7602064, + "num_input_tokens_seen": 95368570, + "step": 4411, + "time_per_iteration": 2.482593536376953 + }, + { + "auxiliary_loss_clip": 0.01101118, + "auxiliary_loss_mlp": 0.01044719, + "balance_loss_clip": 1.05226159, + "balance_loss_mlp": 1.02795815, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 2.0721618963116355, + "language_loss": 0.82449913, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84595752, + "num_input_tokens_seen": 95387065, + "step": 4412, + "time_per_iteration": 2.57322359085083 + }, + { + "auxiliary_loss_clip": 0.01132726, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.0501076, + "balance_loss_mlp": 1.02915335, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.692154683406137, + "language_loss": 0.7403869, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76216561, + "num_input_tokens_seen": 95406345, + "step": 4413, + "time_per_iteration": 2.467965602874756 + }, + { + "auxiliary_loss_clip": 0.01062594, + "auxiliary_loss_mlp": 0.01017535, + "balance_loss_clip": 1.03614306, + "balance_loss_mlp": 1.01518655, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8758784267462648, + "language_loss": 0.56908292, + "learning_rate": 3.446400750732793e-06, + "loss": 0.58988422, + "num_input_tokens_seen": 95463595, + "step": 4414, + "time_per_iteration": 2.9731955528259277 + }, + { + "auxiliary_loss_clip": 0.01109707, + "auxiliary_loss_mlp": 0.01048115, + "balance_loss_clip": 1.04701364, + "balance_loss_mlp": 1.03221226, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.8411503558990423, + "language_loss": 0.74639529, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76797354, + "num_input_tokens_seen": 95484115, + "step": 4415, + "time_per_iteration": 2.626398801803589 + }, + { + "auxiliary_loss_clip": 0.01097421, + "auxiliary_loss_mlp": 0.01046065, + "balance_loss_clip": 1.04302192, + "balance_loss_mlp": 1.02671778, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.37608254066515, + "language_loss": 0.86747611, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.88891095, + "num_input_tokens_seen": 95501435, + "step": 4416, + "time_per_iteration": 2.5517356395721436 + }, + { + "auxiliary_loss_clip": 0.01137635, + "auxiliary_loss_mlp": 0.01043024, + "balance_loss_clip": 1.05197954, + "balance_loss_mlp": 1.02543461, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.6826461382241233, + "language_loss": 0.76107526, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78288186, + "num_input_tokens_seen": 95520135, + "step": 4417, + "time_per_iteration": 2.5659263134002686 + }, + { + "auxiliary_loss_clip": 0.01124838, + "auxiliary_loss_mlp": 0.01044843, + "balance_loss_clip": 1.05568337, + "balance_loss_mlp": 1.02560258, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.5275814065559765, + "language_loss": 0.7994926, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.8211894, + "num_input_tokens_seen": 95541705, + "step": 4418, + "time_per_iteration": 4.112991809844971 + }, + { + "auxiliary_loss_clip": 0.01135981, + "auxiliary_loss_mlp": 0.01045029, + "balance_loss_clip": 1.05097508, + "balance_loss_mlp": 1.02779114, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.2455996941074914, + "language_loss": 0.67076635, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69257641, + "num_input_tokens_seen": 95560300, + "step": 4419, + "time_per_iteration": 2.474632740020752 + }, + { + "auxiliary_loss_clip": 0.01146655, + "auxiliary_loss_mlp": 0.01046602, + "balance_loss_clip": 1.05207515, + "balance_loss_mlp": 1.02866101, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 1.7632409423863835, + "language_loss": 0.79552829, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81746089, + "num_input_tokens_seen": 95580150, + "step": 4420, + "time_per_iteration": 2.578508138656616 + }, + { + "auxiliary_loss_clip": 0.01128664, + "auxiliary_loss_mlp": 0.01053597, + "balance_loss_clip": 1.05621004, + "balance_loss_mlp": 1.03407025, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 2.1383812257899852, + "language_loss": 0.81333196, + "learning_rate": 3.444516567560673e-06, + "loss": 0.83515459, + "num_input_tokens_seen": 95597570, + "step": 4421, + "time_per_iteration": 2.507627248764038 + }, + { + "auxiliary_loss_clip": 0.0113609, + "auxiliary_loss_mlp": 0.01050354, + "balance_loss_clip": 1.05544996, + "balance_loss_mlp": 1.03384316, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 1.515706098079723, + "language_loss": 0.65597129, + "learning_rate": 3.444247179349548e-06, + "loss": 0.67783576, + "num_input_tokens_seen": 95619415, + "step": 4422, + "time_per_iteration": 2.6962735652923584 + }, + { + "auxiliary_loss_clip": 0.01131816, + "auxiliary_loss_mlp": 0.01050605, + "balance_loss_clip": 1.05162704, + "balance_loss_mlp": 1.03416598, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.244284506027902, + "language_loss": 0.7400372, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76186138, + "num_input_tokens_seen": 95639155, + "step": 4423, + "time_per_iteration": 2.539856433868408 + }, + { + "auxiliary_loss_clip": 0.0112795, + "auxiliary_loss_mlp": 0.01062115, + "balance_loss_clip": 1.04631233, + "balance_loss_mlp": 1.04395938, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.6255331875819394, + "language_loss": 0.77235782, + "learning_rate": 3.443708238639522e-06, + "loss": 0.79425848, + "num_input_tokens_seen": 95663320, + "step": 4424, + "time_per_iteration": 2.679809808731079 + }, + { + "auxiliary_loss_clip": 0.01130758, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.04949808, + "balance_loss_mlp": 1.03308439, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.70488558504388, + "language_loss": 0.79406762, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81587994, + "num_input_tokens_seen": 95680260, + "step": 4425, + "time_per_iteration": 2.4358434677124023 + }, + { + "auxiliary_loss_clip": 0.01119112, + "auxiliary_loss_mlp": 0.01046333, + "balance_loss_clip": 1.04998708, + "balance_loss_mlp": 1.03114557, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.628523350093176, + "language_loss": 0.80221987, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82387435, + "num_input_tokens_seen": 95701140, + "step": 4426, + "time_per_iteration": 2.533320188522339 + }, + { + "auxiliary_loss_clip": 0.01147411, + "auxiliary_loss_mlp": 0.01053943, + "balance_loss_clip": 1.05416179, + "balance_loss_mlp": 1.03712213, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.7940306850966532, + "language_loss": 0.7711817, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79319519, + "num_input_tokens_seen": 95722060, + "step": 4427, + "time_per_iteration": 2.525289535522461 + }, + { + "auxiliary_loss_clip": 0.01112357, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.0523591, + "balance_loss_mlp": 1.02027869, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.7773067823898778, + "language_loss": 0.76862657, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.79011512, + "num_input_tokens_seen": 95742495, + "step": 4428, + "time_per_iteration": 2.6059038639068604 + }, + { + "auxiliary_loss_clip": 0.01109686, + "auxiliary_loss_mlp": 0.00784446, + "balance_loss_clip": 1.04764867, + "balance_loss_mlp": 1.0008775, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 2.3845714757336633, + "language_loss": 0.82868099, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.84762239, + "num_input_tokens_seen": 95761510, + "step": 4429, + "time_per_iteration": 2.5284392833709717 + }, + { + "auxiliary_loss_clip": 0.01108065, + "auxiliary_loss_mlp": 0.01044343, + "balance_loss_clip": 1.05145681, + "balance_loss_mlp": 1.02796388, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 11.618562208756579, + "language_loss": 0.72275126, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74427533, + "num_input_tokens_seen": 95782385, + "step": 4430, + "time_per_iteration": 2.641021728515625 + }, + { + "auxiliary_loss_clip": 0.01144642, + "auxiliary_loss_mlp": 0.01046491, + "balance_loss_clip": 1.05161381, + "balance_loss_mlp": 1.02889526, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 3.9158341365706266, + "language_loss": 0.81831843, + "learning_rate": 3.441820222206035e-06, + "loss": 0.84022975, + "num_input_tokens_seen": 95800595, + "step": 4431, + "time_per_iteration": 2.4167845249176025 + }, + { + "auxiliary_loss_clip": 0.01140567, + "auxiliary_loss_mlp": 0.01046293, + "balance_loss_clip": 1.05255818, + "balance_loss_mlp": 1.02880526, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.1665265850763626, + "language_loss": 0.76151419, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78338277, + "num_input_tokens_seen": 95818480, + "step": 4432, + "time_per_iteration": 2.473278760910034 + }, + { + "auxiliary_loss_clip": 0.01086873, + "auxiliary_loss_mlp": 0.01050951, + "balance_loss_clip": 1.04305625, + "balance_loss_mlp": 1.03135335, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 2.3081034528951765, + "language_loss": 0.83168763, + "learning_rate": 3.441280296720154e-06, + "loss": 0.85306585, + "num_input_tokens_seen": 95837205, + "step": 4433, + "time_per_iteration": 2.5812506675720215 + }, + { + "auxiliary_loss_clip": 0.0113605, + "auxiliary_loss_mlp": 0.01044756, + "balance_loss_clip": 1.05193198, + "balance_loss_mlp": 1.02722049, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 2.2675374367195906, + "language_loss": 0.76701182, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78881991, + "num_input_tokens_seen": 95858395, + "step": 4434, + "time_per_iteration": 2.525172233581543 + }, + { + "auxiliary_loss_clip": 0.01144168, + "auxiliary_loss_mlp": 0.01042233, + "balance_loss_clip": 1.05128717, + "balance_loss_mlp": 1.02573395, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 1.837894282229045, + "language_loss": 0.8241362, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84600025, + "num_input_tokens_seen": 95877875, + "step": 4435, + "time_per_iteration": 2.43868350982666 + }, + { + "auxiliary_loss_clip": 0.0110491, + "auxiliary_loss_mlp": 0.01055628, + "balance_loss_clip": 1.04790759, + "balance_loss_mlp": 1.03674555, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.5331908567735733, + "language_loss": 0.87545216, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89705759, + "num_input_tokens_seen": 95895820, + "step": 4436, + "time_per_iteration": 2.616894245147705 + }, + { + "auxiliary_loss_clip": 0.01122688, + "auxiliary_loss_mlp": 0.01046819, + "balance_loss_clip": 1.05057716, + "balance_loss_mlp": 1.03073764, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.528196607324217, + "language_loss": 0.78651822, + "learning_rate": 3.440199789988407e-06, + "loss": 0.80821323, + "num_input_tokens_seen": 95918025, + "step": 4437, + "time_per_iteration": 2.5885586738586426 + }, + { + "auxiliary_loss_clip": 0.01088209, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.04521477, + "balance_loss_mlp": 1.02613914, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 2.364293082387162, + "language_loss": 0.63724381, + "learning_rate": 3.439929526748556e-06, + "loss": 0.65855336, + "num_input_tokens_seen": 95937725, + "step": 4438, + "time_per_iteration": 2.7644472122192383 + }, + { + "auxiliary_loss_clip": 0.01078515, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.04148448, + "balance_loss_mlp": 1.0203321, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 2.6682430047338377, + "language_loss": 0.75782502, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.77898383, + "num_input_tokens_seen": 95956335, + "step": 4439, + "time_per_iteration": 4.181002616882324 + }, + { + "auxiliary_loss_clip": 0.01090693, + "auxiliary_loss_mlp": 0.01044475, + "balance_loss_clip": 1.05279243, + "balance_loss_mlp": 1.02604485, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.6677331809855174, + "language_loss": 0.71772468, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.73907638, + "num_input_tokens_seen": 95977135, + "step": 4440, + "time_per_iteration": 2.669436454772949 + }, + { + "auxiliary_loss_clip": 0.01118925, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.04412413, + "balance_loss_mlp": 1.02319026, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 1.9950810294720922, + "language_loss": 0.66769683, + "learning_rate": 3.439118409456376e-06, + "loss": 0.68929797, + "num_input_tokens_seen": 95995435, + "step": 4441, + "time_per_iteration": 4.025890588760376 + }, + { + "auxiliary_loss_clip": 0.01132435, + "auxiliary_loss_mlp": 0.01040697, + "balance_loss_clip": 1.04720616, + "balance_loss_mlp": 1.02231526, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.58420826992097, + "language_loss": 0.76144904, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78318036, + "num_input_tokens_seen": 96016340, + "step": 4442, + "time_per_iteration": 2.598992109298706 + }, + { + "auxiliary_loss_clip": 0.01028038, + "auxiliary_loss_mlp": 0.01052912, + "balance_loss_clip": 1.0481925, + "balance_loss_mlp": 1.05065918, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.9482300054512773, + "language_loss": 0.61250812, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63331759, + "num_input_tokens_seen": 96071205, + "step": 4443, + "time_per_iteration": 3.076490640640259 + }, + { + "auxiliary_loss_clip": 0.01117687, + "auxiliary_loss_mlp": 0.01038472, + "balance_loss_clip": 1.05197024, + "balance_loss_mlp": 1.02174687, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.614187019961944, + "language_loss": 0.76201987, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78358144, + "num_input_tokens_seen": 96094240, + "step": 4444, + "time_per_iteration": 2.751551628112793 + }, + { + "auxiliary_loss_clip": 0.01134833, + "auxiliary_loss_mlp": 0.01039758, + "balance_loss_clip": 1.05142784, + "balance_loss_mlp": 1.02196038, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 2.607037710666305, + "language_loss": 0.80589592, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82764184, + "num_input_tokens_seen": 96114105, + "step": 4445, + "time_per_iteration": 2.5191569328308105 + }, + { + "auxiliary_loss_clip": 0.01118839, + "auxiliary_loss_mlp": 0.0103771, + "balance_loss_clip": 1.04950118, + "balance_loss_mlp": 1.01943529, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 2.05712903741589, + "language_loss": 0.88506532, + "learning_rate": 3.43776545600926e-06, + "loss": 0.90663075, + "num_input_tokens_seen": 96132140, + "step": 4446, + "time_per_iteration": 2.484579563140869 + }, + { + "auxiliary_loss_clip": 0.01138582, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.05281615, + "balance_loss_mlp": 1.02831984, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 1.9079983497097528, + "language_loss": 0.68037552, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70221496, + "num_input_tokens_seen": 96152090, + "step": 4447, + "time_per_iteration": 2.539320468902588 + }, + { + "auxiliary_loss_clip": 0.01135174, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.05002809, + "balance_loss_mlp": 1.02519464, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 3.1225467230884236, + "language_loss": 0.82825398, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85002887, + "num_input_tokens_seen": 96170015, + "step": 4448, + "time_per_iteration": 3.9025964736938477 + }, + { + "auxiliary_loss_clip": 0.01112887, + "auxiliary_loss_mlp": 0.01051355, + "balance_loss_clip": 1.05161989, + "balance_loss_mlp": 1.03339016, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 1.6238274305819747, + "language_loss": 0.84679341, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86843586, + "num_input_tokens_seen": 96188065, + "step": 4449, + "time_per_iteration": 2.5864017009735107 + }, + { + "auxiliary_loss_clip": 0.01132396, + "auxiliary_loss_mlp": 0.01050905, + "balance_loss_clip": 1.05342102, + "balance_loss_mlp": 1.0309968, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.712048606236659, + "language_loss": 0.84143317, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86326623, + "num_input_tokens_seen": 96205780, + "step": 4450, + "time_per_iteration": 2.5662238597869873 + }, + { + "auxiliary_loss_clip": 0.01104087, + "auxiliary_loss_mlp": 0.01043882, + "balance_loss_clip": 1.04482579, + "balance_loss_mlp": 1.02762175, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.7418026433562168, + "language_loss": 0.80777937, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.82925904, + "num_input_tokens_seen": 96224990, + "step": 4451, + "time_per_iteration": 2.561830997467041 + }, + { + "auxiliary_loss_clip": 0.01134601, + "auxiliary_loss_mlp": 0.01046595, + "balance_loss_clip": 1.05088902, + "balance_loss_mlp": 1.03019214, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.8169668842350655, + "language_loss": 0.86353707, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88534898, + "num_input_tokens_seen": 96245345, + "step": 4452, + "time_per_iteration": 2.547192096710205 + }, + { + "auxiliary_loss_clip": 0.01128581, + "auxiliary_loss_mlp": 0.0104657, + "balance_loss_clip": 1.0511694, + "balance_loss_mlp": 1.02867627, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 2.1424976372697127, + "language_loss": 0.83056855, + "learning_rate": 3.435869031622194e-06, + "loss": 0.85232008, + "num_input_tokens_seen": 96259000, + "step": 4453, + "time_per_iteration": 2.469618797302246 + }, + { + "auxiliary_loss_clip": 0.01134897, + "auxiliary_loss_mlp": 0.01053249, + "balance_loss_clip": 1.05200243, + "balance_loss_mlp": 1.03486669, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.9363562978649411, + "language_loss": 0.79713511, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81901658, + "num_input_tokens_seen": 96277000, + "step": 4454, + "time_per_iteration": 2.481304407119751 + }, + { + "auxiliary_loss_clip": 0.01128485, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_clip": 1.05290067, + "balance_loss_mlp": 1.02698207, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.6530149187654257, + "language_loss": 0.72849238, + "learning_rate": 3.435326705894206e-06, + "loss": 0.75022316, + "num_input_tokens_seen": 96297010, + "step": 4455, + "time_per_iteration": 2.543928623199463 + }, + { + "auxiliary_loss_clip": 0.01112659, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.05036592, + "balance_loss_mlp": 1.03019023, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.5755080271319872, + "language_loss": 0.73651236, + "learning_rate": 3.435055461383471e-06, + "loss": 0.75810874, + "num_input_tokens_seen": 96315780, + "step": 4456, + "time_per_iteration": 2.571659564971924 + }, + { + "auxiliary_loss_clip": 0.01137434, + "auxiliary_loss_mlp": 0.01044726, + "balance_loss_clip": 1.05199957, + "balance_loss_mlp": 1.02664208, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 4.1957703137622975, + "language_loss": 0.71699578, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73881745, + "num_input_tokens_seen": 96333465, + "step": 4457, + "time_per_iteration": 2.4814865589141846 + }, + { + "auxiliary_loss_clip": 0.01108867, + "auxiliary_loss_mlp": 0.0105513, + "balance_loss_clip": 1.04810452, + "balance_loss_mlp": 1.03681958, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 1.6141274028467034, + "language_loss": 0.78907275, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81071275, + "num_input_tokens_seen": 96352005, + "step": 4458, + "time_per_iteration": 4.058926343917847 + }, + { + "auxiliary_loss_clip": 0.01023704, + "auxiliary_loss_mlp": 0.01047473, + "balance_loss_clip": 1.02997017, + "balance_loss_mlp": 1.04483891, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8673935974026934, + "language_loss": 0.58702672, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60773849, + "num_input_tokens_seen": 96406265, + "step": 4459, + "time_per_iteration": 3.0853021144866943 + }, + { + "auxiliary_loss_clip": 0.01092409, + "auxiliary_loss_mlp": 0.01052229, + "balance_loss_clip": 1.04604912, + "balance_loss_mlp": 1.03445458, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 2.011804731768121, + "language_loss": 0.84954268, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87098902, + "num_input_tokens_seen": 96425225, + "step": 4460, + "time_per_iteration": 2.594776153564453 + }, + { + "auxiliary_loss_clip": 0.01130169, + "auxiliary_loss_mlp": 0.0105508, + "balance_loss_clip": 1.04916871, + "balance_loss_mlp": 1.03469491, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 1.8520925584069585, + "language_loss": 0.68602288, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70787537, + "num_input_tokens_seen": 96443780, + "step": 4461, + "time_per_iteration": 2.4537739753723145 + }, + { + "auxiliary_loss_clip": 0.01112478, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.04739714, + "balance_loss_mlp": 1.03383446, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.8820439227167467, + "language_loss": 0.67021775, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69186211, + "num_input_tokens_seen": 96464530, + "step": 4462, + "time_per_iteration": 2.5921342372894287 + }, + { + "auxiliary_loss_clip": 0.01112179, + "auxiliary_loss_mlp": 0.01041802, + "balance_loss_clip": 1.050107, + "balance_loss_mlp": 1.02358687, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.726739076563061, + "language_loss": 0.69517601, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71671587, + "num_input_tokens_seen": 96483345, + "step": 4463, + "time_per_iteration": 2.5922768115997314 + }, + { + "auxiliary_loss_clip": 0.01115685, + "auxiliary_loss_mlp": 0.01048148, + "balance_loss_clip": 1.04518294, + "balance_loss_mlp": 1.02824008, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.4594081145162634, + "language_loss": 0.78148401, + "learning_rate": 3.432883547133931e-06, + "loss": 0.80312228, + "num_input_tokens_seen": 96498305, + "step": 4464, + "time_per_iteration": 2.455007791519165 + }, + { + "auxiliary_loss_clip": 0.01129771, + "auxiliary_loss_mlp": 0.01045613, + "balance_loss_clip": 1.05031657, + "balance_loss_mlp": 1.02711189, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 1.7897071620209781, + "language_loss": 0.70973843, + "learning_rate": 3.432611813236704e-06, + "loss": 0.73149228, + "num_input_tokens_seen": 96519740, + "step": 4465, + "time_per_iteration": 2.571929693222046 + }, + { + "auxiliary_loss_clip": 0.0105202, + "auxiliary_loss_mlp": 0.0101301, + "balance_loss_clip": 1.03361118, + "balance_loss_mlp": 1.01023281, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6911729020862802, + "language_loss": 0.5301007, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55075097, + "num_input_tokens_seen": 96588870, + "step": 4466, + "time_per_iteration": 3.2382030487060547 + }, + { + "auxiliary_loss_clip": 0.01115409, + "auxiliary_loss_mlp": 0.01050388, + "balance_loss_clip": 1.04505062, + "balance_loss_mlp": 1.02968144, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.065629058940034, + "language_loss": 0.74020565, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.76186359, + "num_input_tokens_seen": 96605100, + "step": 4467, + "time_per_iteration": 2.483659267425537 + }, + { + "auxiliary_loss_clip": 0.01122951, + "auxiliary_loss_mlp": 0.00787522, + "balance_loss_clip": 1.04855871, + "balance_loss_mlp": 1.00123882, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 2.3221468676315986, + "language_loss": 0.80858415, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82768881, + "num_input_tokens_seen": 96621410, + "step": 4468, + "time_per_iteration": 2.4916818141937256 + }, + { + "auxiliary_loss_clip": 0.01066461, + "auxiliary_loss_mlp": 0.01011108, + "balance_loss_clip": 1.02953315, + "balance_loss_mlp": 1.00822341, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8778931931158048, + "language_loss": 0.59603643, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61681217, + "num_input_tokens_seen": 96684810, + "step": 4469, + "time_per_iteration": 3.0907301902770996 + }, + { + "auxiliary_loss_clip": 0.01149066, + "auxiliary_loss_mlp": 0.01047717, + "balance_loss_clip": 1.05242217, + "balance_loss_mlp": 1.02834582, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 2.0898568532619546, + "language_loss": 0.81181091, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83377874, + "num_input_tokens_seen": 96701920, + "step": 4470, + "time_per_iteration": 2.4377241134643555 + }, + { + "auxiliary_loss_clip": 0.01113739, + "auxiliary_loss_mlp": 0.01040982, + "balance_loss_clip": 1.04842114, + "balance_loss_mlp": 1.02320838, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.5810122288335937, + "language_loss": 0.8306489, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.8521961, + "num_input_tokens_seen": 96721260, + "step": 4471, + "time_per_iteration": 2.512296676635742 + }, + { + "auxiliary_loss_clip": 0.01129656, + "auxiliary_loss_mlp": 0.01044277, + "balance_loss_clip": 1.05209482, + "balance_loss_mlp": 1.02708709, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 2.0650027758221805, + "language_loss": 0.69321203, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71495134, + "num_input_tokens_seen": 96740385, + "step": 4472, + "time_per_iteration": 2.5511422157287598 + }, + { + "auxiliary_loss_clip": 0.01145475, + "auxiliary_loss_mlp": 0.01040861, + "balance_loss_clip": 1.05172253, + "balance_loss_mlp": 1.02350402, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.8213708904915578, + "language_loss": 0.67929238, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70115578, + "num_input_tokens_seen": 96761860, + "step": 4473, + "time_per_iteration": 2.497408151626587 + }, + { + "auxiliary_loss_clip": 0.011146, + "auxiliary_loss_mlp": 0.01052177, + "balance_loss_clip": 1.05150485, + "balance_loss_mlp": 1.0351423, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.7471182472783309, + "language_loss": 0.83055919, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85222697, + "num_input_tokens_seen": 96781890, + "step": 4474, + "time_per_iteration": 2.510756015777588 + }, + { + "auxiliary_loss_clip": 0.01131624, + "auxiliary_loss_mlp": 0.01049737, + "balance_loss_clip": 1.04965782, + "balance_loss_mlp": 1.0323447, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 1.8023769328784967, + "language_loss": 0.70397091, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.72578454, + "num_input_tokens_seen": 96800390, + "step": 4475, + "time_per_iteration": 2.4653947353363037 + }, + { + "auxiliary_loss_clip": 0.01110526, + "auxiliary_loss_mlp": 0.0078569, + "balance_loss_clip": 1.05006492, + "balance_loss_mlp": 1.00088072, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.7733100047111465, + "language_loss": 0.73264581, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75160801, + "num_input_tokens_seen": 96816685, + "step": 4476, + "time_per_iteration": 2.541938543319702 + }, + { + "auxiliary_loss_clip": 0.01115718, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.04928195, + "balance_loss_mlp": 1.02764487, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 2.511417105766588, + "language_loss": 0.80639476, + "learning_rate": 3.429346772085922e-06, + "loss": 0.82800335, + "num_input_tokens_seen": 96836285, + "step": 4477, + "time_per_iteration": 2.5017964839935303 + }, + { + "auxiliary_loss_clip": 0.01090875, + "auxiliary_loss_mlp": 0.01054948, + "balance_loss_clip": 1.04597127, + "balance_loss_mlp": 1.03707814, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.6146230260116183, + "language_loss": 0.65541625, + "learning_rate": 3.429074332770984e-06, + "loss": 0.6768744, + "num_input_tokens_seen": 96857745, + "step": 4478, + "time_per_iteration": 4.180776596069336 + }, + { + "auxiliary_loss_clip": 0.01127308, + "auxiliary_loss_mlp": 0.01049576, + "balance_loss_clip": 1.04579735, + "balance_loss_mlp": 1.03176665, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.938809050126264, + "language_loss": 0.81000745, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.83177632, + "num_input_tokens_seen": 96877295, + "step": 4479, + "time_per_iteration": 2.5019378662109375 + }, + { + "auxiliary_loss_clip": 0.01122249, + "auxiliary_loss_mlp": 0.0078492, + "balance_loss_clip": 1.04656768, + "balance_loss_mlp": 1.00129342, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 1.976683632780144, + "language_loss": 0.80609798, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.82516968, + "num_input_tokens_seen": 96896160, + "step": 4480, + "time_per_iteration": 2.5213892459869385 + }, + { + "auxiliary_loss_clip": 0.01096987, + "auxiliary_loss_mlp": 0.01052558, + "balance_loss_clip": 1.04691625, + "balance_loss_mlp": 1.03421211, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.6523643845054352, + "language_loss": 0.77763081, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.79912627, + "num_input_tokens_seen": 96915410, + "step": 4481, + "time_per_iteration": 4.424159049987793 + }, + { + "auxiliary_loss_clip": 0.01132798, + "auxiliary_loss_mlp": 0.01050455, + "balance_loss_clip": 1.04658461, + "balance_loss_mlp": 1.03189433, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 2.307956073893963, + "language_loss": 0.73965406, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76148653, + "num_input_tokens_seen": 96937865, + "step": 4482, + "time_per_iteration": 2.5688889026641846 + }, + { + "auxiliary_loss_clip": 0.01116634, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_clip": 1.04678059, + "balance_loss_mlp": 1.0304786, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 2.64034775923599, + "language_loss": 0.73175383, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.75340414, + "num_input_tokens_seen": 96957710, + "step": 4483, + "time_per_iteration": 2.5146889686584473 + }, + { + "auxiliary_loss_clip": 0.01131266, + "auxiliary_loss_mlp": 0.01050722, + "balance_loss_clip": 1.04362535, + "balance_loss_mlp": 1.0319941, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.291363962724417, + "language_loss": 0.87120152, + "learning_rate": 3.427438559239605e-06, + "loss": 0.89302135, + "num_input_tokens_seen": 96975890, + "step": 4484, + "time_per_iteration": 2.457109212875366 + }, + { + "auxiliary_loss_clip": 0.01132061, + "auxiliary_loss_mlp": 0.01052406, + "balance_loss_clip": 1.04740906, + "balance_loss_mlp": 1.03558552, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.5475691304870263, + "language_loss": 0.6645472, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68639195, + "num_input_tokens_seen": 96998595, + "step": 4485, + "time_per_iteration": 2.568849802017212 + }, + { + "auxiliary_loss_clip": 0.01110469, + "auxiliary_loss_mlp": 0.01046825, + "balance_loss_clip": 1.0421524, + "balance_loss_mlp": 1.02892601, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 2.858730953892121, + "language_loss": 0.73164964, + "learning_rate": 3.426892868256604e-06, + "loss": 0.75322258, + "num_input_tokens_seen": 97013715, + "step": 4486, + "time_per_iteration": 2.4695682525634766 + }, + { + "auxiliary_loss_clip": 0.01147259, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.05099869, + "balance_loss_mlp": 1.02257895, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 1.976120832522292, + "language_loss": 0.84227931, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.86414611, + "num_input_tokens_seen": 97031570, + "step": 4487, + "time_per_iteration": 3.8551321029663086 + }, + { + "auxiliary_loss_clip": 0.01119455, + "auxiliary_loss_mlp": 0.01043589, + "balance_loss_clip": 1.05095124, + "balance_loss_mlp": 1.02576733, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.633000458277339, + "language_loss": 0.71633291, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73796332, + "num_input_tokens_seen": 97049815, + "step": 4488, + "time_per_iteration": 2.585075855255127 + }, + { + "auxiliary_loss_clip": 0.01067797, + "auxiliary_loss_mlp": 0.0105784, + "balance_loss_clip": 1.0419569, + "balance_loss_mlp": 1.03884983, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.7646184005756373, + "language_loss": 0.8362689, + "learning_rate": 3.426073925998578e-06, + "loss": 0.85752523, + "num_input_tokens_seen": 97067570, + "step": 4489, + "time_per_iteration": 2.723968029022217 + }, + { + "auxiliary_loss_clip": 0.01119531, + "auxiliary_loss_mlp": 0.01052127, + "balance_loss_clip": 1.04980826, + "balance_loss_mlp": 1.03392339, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.2809187533836663, + "language_loss": 0.90008396, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.92180055, + "num_input_tokens_seen": 97082180, + "step": 4490, + "time_per_iteration": 2.4590158462524414 + }, + { + "auxiliary_loss_clip": 0.0107387, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_clip": 1.04312134, + "balance_loss_mlp": 1.02961421, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 1.7832504646130782, + "language_loss": 0.73256683, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75378275, + "num_input_tokens_seen": 97103470, + "step": 4491, + "time_per_iteration": 2.7449419498443604 + }, + { + "auxiliary_loss_clip": 0.01150597, + "auxiliary_loss_mlp": 0.0104869, + "balance_loss_clip": 1.05503941, + "balance_loss_mlp": 1.03110683, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 2.4417173201487405, + "language_loss": 0.74878645, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.77077925, + "num_input_tokens_seen": 97118100, + "step": 4492, + "time_per_iteration": 2.4038875102996826 + }, + { + "auxiliary_loss_clip": 0.01121409, + "auxiliary_loss_mlp": 0.01042642, + "balance_loss_clip": 1.0472796, + "balance_loss_mlp": 1.02493954, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 1.8535730740628338, + "language_loss": 0.89377809, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.91541862, + "num_input_tokens_seen": 97136765, + "step": 4493, + "time_per_iteration": 2.52477765083313 + }, + { + "auxiliary_loss_clip": 0.01134418, + "auxiliary_loss_mlp": 0.01046603, + "balance_loss_clip": 1.0487771, + "balance_loss_mlp": 1.02993715, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.5779714562735545, + "language_loss": 0.71319473, + "learning_rate": 3.424707940835998e-06, + "loss": 0.7350049, + "num_input_tokens_seen": 97157470, + "step": 4494, + "time_per_iteration": 2.5777320861816406 + }, + { + "auxiliary_loss_clip": 0.01117448, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_clip": 1.05019164, + "balance_loss_mlp": 1.02630651, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 1.9819525885366265, + "language_loss": 0.86186522, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88346499, + "num_input_tokens_seen": 97176905, + "step": 4495, + "time_per_iteration": 2.568443536758423 + }, + { + "auxiliary_loss_clip": 0.01145188, + "auxiliary_loss_mlp": 0.01048214, + "balance_loss_clip": 1.05166864, + "balance_loss_mlp": 1.03080964, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.873537443484336, + "language_loss": 0.76412034, + "learning_rate": 3.424161168522959e-06, + "loss": 0.78605437, + "num_input_tokens_seen": 97196380, + "step": 4496, + "time_per_iteration": 2.4694643020629883 + }, + { + "auxiliary_loss_clip": 0.01068031, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.03335333, + "balance_loss_mlp": 1.02983439, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7042807521781806, + "language_loss": 0.50202435, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52302694, + "num_input_tokens_seen": 97260100, + "step": 4497, + "time_per_iteration": 3.1563494205474854 + }, + { + "auxiliary_loss_clip": 0.01100473, + "auxiliary_loss_mlp": 0.01045472, + "balance_loss_clip": 1.05001748, + "balance_loss_mlp": 1.02903318, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.8126718237103105, + "language_loss": 0.72449815, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74595761, + "num_input_tokens_seen": 97277935, + "step": 4498, + "time_per_iteration": 4.281804084777832 + }, + { + "auxiliary_loss_clip": 0.01042614, + "auxiliary_loss_mlp": 0.010111, + "balance_loss_clip": 1.03299713, + "balance_loss_mlp": 1.0086565, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7532017237726487, + "language_loss": 0.59192479, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61246192, + "num_input_tokens_seen": 97338845, + "step": 4499, + "time_per_iteration": 3.246152400970459 + }, + { + "auxiliary_loss_clip": 0.01123424, + "auxiliary_loss_mlp": 0.01038914, + "balance_loss_clip": 1.05033541, + "balance_loss_mlp": 1.02162886, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 5.129484216047558, + "language_loss": 0.73814934, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75977272, + "num_input_tokens_seen": 97356640, + "step": 4500, + "time_per_iteration": 2.6717517375946045 + }, + { + "auxiliary_loss_clip": 0.01112135, + "auxiliary_loss_mlp": 0.01045667, + "balance_loss_clip": 1.04338861, + "balance_loss_mlp": 1.02721381, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 3.4211074389894063, + "language_loss": 0.8098402, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.83141828, + "num_input_tokens_seen": 97372585, + "step": 4501, + "time_per_iteration": 2.5814003944396973 + }, + { + "auxiliary_loss_clip": 0.01103084, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_clip": 1.04814172, + "balance_loss_mlp": 1.03227496, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.9553018114234697, + "language_loss": 0.72522753, + "learning_rate": 3.422519555811735e-06, + "loss": 0.74677789, + "num_input_tokens_seen": 97393315, + "step": 4502, + "time_per_iteration": 2.660416841506958 + }, + { + "auxiliary_loss_clip": 0.01127535, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_clip": 1.05014634, + "balance_loss_mlp": 1.0246973, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.9015036429656986, + "language_loss": 0.68449241, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70620781, + "num_input_tokens_seen": 97417860, + "step": 4503, + "time_per_iteration": 2.7431435585021973 + }, + { + "auxiliary_loss_clip": 0.01102294, + "auxiliary_loss_mlp": 0.01051285, + "balance_loss_clip": 1.04865515, + "balance_loss_mlp": 1.03212857, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 1.7836974249976658, + "language_loss": 0.6814971, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.70303297, + "num_input_tokens_seen": 97436780, + "step": 4504, + "time_per_iteration": 2.62640380859375 + }, + { + "auxiliary_loss_clip": 0.01136893, + "auxiliary_loss_mlp": 0.01042095, + "balance_loss_clip": 1.05346811, + "balance_loss_mlp": 1.02546561, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.4911011941463803, + "language_loss": 0.75981969, + "learning_rate": 3.421698021097902e-06, + "loss": 0.78160954, + "num_input_tokens_seen": 97456190, + "step": 4505, + "time_per_iteration": 2.543025016784668 + }, + { + "auxiliary_loss_clip": 0.01148279, + "auxiliary_loss_mlp": 0.0105548, + "balance_loss_clip": 1.0496521, + "balance_loss_mlp": 1.03618002, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 2.5372051897156034, + "language_loss": 0.73569232, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75772989, + "num_input_tokens_seen": 97474545, + "step": 4506, + "time_per_iteration": 2.4327454566955566 + }, + { + "auxiliary_loss_clip": 0.01127923, + "auxiliary_loss_mlp": 0.01044174, + "balance_loss_clip": 1.04970121, + "balance_loss_mlp": 1.02557731, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 2.2054325227856757, + "language_loss": 0.80567062, + "learning_rate": 3.421150061716715e-06, + "loss": 0.82739156, + "num_input_tokens_seen": 97494520, + "step": 4507, + "time_per_iteration": 2.5437347888946533 + }, + { + "auxiliary_loss_clip": 0.01054629, + "auxiliary_loss_mlp": 0.01006694, + "balance_loss_clip": 1.03743076, + "balance_loss_mlp": 1.00414312, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7435266223892865, + "language_loss": 0.5089438, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52955705, + "num_input_tokens_seen": 97552455, + "step": 4508, + "time_per_iteration": 3.095015525817871 + }, + { + "auxiliary_loss_clip": 0.01073641, + "auxiliary_loss_mlp": 0.01042685, + "balance_loss_clip": 1.04260027, + "balance_loss_mlp": 1.0255425, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 2.0898667372092623, + "language_loss": 0.74821305, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76937628, + "num_input_tokens_seen": 97572650, + "step": 4509, + "time_per_iteration": 2.6667897701263428 + }, + { + "auxiliary_loss_clip": 0.01132418, + "auxiliary_loss_mlp": 0.01039907, + "balance_loss_clip": 1.05272746, + "balance_loss_mlp": 1.02437425, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 1.6744165616928035, + "language_loss": 0.71472347, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73644674, + "num_input_tokens_seen": 97591150, + "step": 4510, + "time_per_iteration": 2.4921998977661133 + }, + { + "auxiliary_loss_clip": 0.01133352, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.05302536, + "balance_loss_mlp": 1.02202153, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 2.486576424657511, + "language_loss": 0.69980347, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72152215, + "num_input_tokens_seen": 97607410, + "step": 4511, + "time_per_iteration": 2.5294697284698486 + }, + { + "auxiliary_loss_clip": 0.01122135, + "auxiliary_loss_mlp": 0.0104881, + "balance_loss_clip": 1.05103135, + "balance_loss_mlp": 1.0305357, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.2726206698896667, + "language_loss": 0.81147766, + "learning_rate": 3.419779220367979e-06, + "loss": 0.8331871, + "num_input_tokens_seen": 97626870, + "step": 4512, + "time_per_iteration": 2.715088367462158 + }, + { + "auxiliary_loss_clip": 0.0114517, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.05388916, + "balance_loss_mlp": 1.0183022, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.7438811994192547, + "language_loss": 0.80450547, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82629263, + "num_input_tokens_seen": 97646595, + "step": 4513, + "time_per_iteration": 2.6261167526245117 + }, + { + "auxiliary_loss_clip": 0.01122475, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_clip": 1.04905725, + "balance_loss_mlp": 1.02681255, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.7657922846274174, + "language_loss": 0.88257134, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90422446, + "num_input_tokens_seen": 97665485, + "step": 4514, + "time_per_iteration": 2.5456960201263428 + }, + { + "auxiliary_loss_clip": 0.01124225, + "auxiliary_loss_mlp": 0.01043168, + "balance_loss_clip": 1.05118155, + "balance_loss_mlp": 1.02600133, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 1.710221330160683, + "language_loss": 0.92091322, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94258714, + "num_input_tokens_seen": 97683800, + "step": 4515, + "time_per_iteration": 2.526195764541626 + }, + { + "auxiliary_loss_clip": 0.01101617, + "auxiliary_loss_mlp": 0.01055796, + "balance_loss_clip": 1.05009425, + "balance_loss_mlp": 1.03659093, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.309502919282556, + "language_loss": 0.7365675, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.75814164, + "num_input_tokens_seen": 97700505, + "step": 4516, + "time_per_iteration": 4.146208047866821 + }, + { + "auxiliary_loss_clip": 0.0113915, + "auxiliary_loss_mlp": 0.01041383, + "balance_loss_clip": 1.05573308, + "balance_loss_mlp": 1.02359724, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 2.400311415692359, + "language_loss": 0.76042163, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78222698, + "num_input_tokens_seen": 97717410, + "step": 4517, + "time_per_iteration": 2.583601236343384 + }, + { + "auxiliary_loss_clip": 0.01098994, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_clip": 1.04298592, + "balance_loss_mlp": 1.02549434, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.2808339102312583, + "language_loss": 0.76684088, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.78826863, + "num_input_tokens_seen": 97734545, + "step": 4518, + "time_per_iteration": 2.671778917312622 + }, + { + "auxiliary_loss_clip": 0.01117754, + "auxiliary_loss_mlp": 0.01044682, + "balance_loss_clip": 1.05130088, + "balance_loss_mlp": 1.02861297, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.8198245349977458, + "language_loss": 0.68644071, + "learning_rate": 3.41785778156811e-06, + "loss": 0.70806509, + "num_input_tokens_seen": 97754000, + "step": 4519, + "time_per_iteration": 2.680100917816162 + }, + { + "auxiliary_loss_clip": 0.01134753, + "auxiliary_loss_mlp": 0.01040637, + "balance_loss_clip": 1.05264187, + "balance_loss_mlp": 1.02480578, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 2.222165351318766, + "language_loss": 0.7591331, + "learning_rate": 3.417583075166451e-06, + "loss": 0.78088701, + "num_input_tokens_seen": 97772080, + "step": 4520, + "time_per_iteration": 4.1779539585113525 + }, + { + "auxiliary_loss_clip": 0.01135948, + "auxiliary_loss_mlp": 0.01050644, + "balance_loss_clip": 1.0507375, + "balance_loss_mlp": 1.03184438, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.127806789106889, + "language_loss": 0.76434344, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78620934, + "num_input_tokens_seen": 97789370, + "step": 4521, + "time_per_iteration": 2.5897436141967773 + }, + { + "auxiliary_loss_clip": 0.01118941, + "auxiliary_loss_mlp": 0.0105559, + "balance_loss_clip": 1.04558182, + "balance_loss_mlp": 1.03762519, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.335763377962478, + "language_loss": 0.75255781, + "learning_rate": 3.417033501108875e-06, + "loss": 0.7743032, + "num_input_tokens_seen": 97807385, + "step": 4522, + "time_per_iteration": 2.5622358322143555 + }, + { + "auxiliary_loss_clip": 0.0115215, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_clip": 1.05717504, + "balance_loss_mlp": 1.02878904, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 1.7803926954771776, + "language_loss": 0.72717988, + "learning_rate": 3.416758633473798e-06, + "loss": 0.74915689, + "num_input_tokens_seen": 97827930, + "step": 4523, + "time_per_iteration": 2.5034055709838867 + }, + { + "auxiliary_loss_clip": 0.01119386, + "auxiliary_loss_mlp": 0.01040289, + "balance_loss_clip": 1.0509212, + "balance_loss_mlp": 1.0230391, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.66853093519462, + "language_loss": 0.74450517, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.7661019, + "num_input_tokens_seen": 97847440, + "step": 4524, + "time_per_iteration": 2.5198357105255127 + }, + { + "auxiliary_loss_clip": 0.01151904, + "auxiliary_loss_mlp": 0.01042263, + "balance_loss_clip": 1.05832171, + "balance_loss_mlp": 1.02514482, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 1.6941977200200191, + "language_loss": 0.76546603, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78740776, + "num_input_tokens_seen": 97867620, + "step": 4525, + "time_per_iteration": 2.485917329788208 + }, + { + "auxiliary_loss_clip": 0.01137111, + "auxiliary_loss_mlp": 0.01052956, + "balance_loss_clip": 1.054708, + "balance_loss_mlp": 1.03648126, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.8291500509371383, + "language_loss": 0.81795967, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.83986032, + "num_input_tokens_seen": 97884345, + "step": 4526, + "time_per_iteration": 2.471776247024536 + }, + { + "auxiliary_loss_clip": 0.01153146, + "auxiliary_loss_mlp": 0.01049117, + "balance_loss_clip": 1.05487609, + "balance_loss_mlp": 1.03067505, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 2.0104540382399554, + "language_loss": 0.77418596, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79620862, + "num_input_tokens_seen": 97901500, + "step": 4527, + "time_per_iteration": 3.8019580841064453 + }, + { + "auxiliary_loss_clip": 0.01110651, + "auxiliary_loss_mlp": 0.00786226, + "balance_loss_clip": 1.05198526, + "balance_loss_mlp": 1.00102711, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.269569950856385, + "language_loss": 0.82063931, + "learning_rate": 3.415383489652503e-06, + "loss": 0.83960807, + "num_input_tokens_seen": 97917800, + "step": 4528, + "time_per_iteration": 2.5258939266204834 + }, + { + "auxiliary_loss_clip": 0.01113803, + "auxiliary_loss_mlp": 0.01050134, + "balance_loss_clip": 1.04667568, + "balance_loss_mlp": 1.03300381, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 2.160266538878257, + "language_loss": 0.7736699, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79530931, + "num_input_tokens_seen": 97937225, + "step": 4529, + "time_per_iteration": 2.559467077255249 + }, + { + "auxiliary_loss_clip": 0.0112548, + "auxiliary_loss_mlp": 0.01051279, + "balance_loss_clip": 1.05546951, + "balance_loss_mlp": 1.03480458, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.162193095538029, + "language_loss": 0.82145441, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84322202, + "num_input_tokens_seen": 97956845, + "step": 4530, + "time_per_iteration": 2.5052084922790527 + }, + { + "auxiliary_loss_clip": 0.01137814, + "auxiliary_loss_mlp": 0.01047651, + "balance_loss_clip": 1.05437505, + "balance_loss_mlp": 1.03017473, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.2814569430812774, + "language_loss": 0.91929722, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.94115186, + "num_input_tokens_seen": 97972465, + "step": 4531, + "time_per_iteration": 2.4496092796325684 + }, + { + "auxiliary_loss_clip": 0.0113859, + "auxiliary_loss_mlp": 0.01047348, + "balance_loss_clip": 1.05347157, + "balance_loss_mlp": 1.03003895, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 2.1920956146122723, + "language_loss": 0.76879293, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.79065233, + "num_input_tokens_seen": 97990770, + "step": 4532, + "time_per_iteration": 2.499540328979492 + }, + { + "auxiliary_loss_clip": 0.01114962, + "auxiliary_loss_mlp": 0.01036305, + "balance_loss_clip": 1.05397415, + "balance_loss_mlp": 1.02052188, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 2.2932280340818494, + "language_loss": 0.88970113, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91121387, + "num_input_tokens_seen": 98005775, + "step": 4533, + "time_per_iteration": 2.510505437850952 + }, + { + "auxiliary_loss_clip": 0.01122721, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.05163515, + "balance_loss_mlp": 1.02401316, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 1.8238724519017984, + "language_loss": 0.7148751, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73650515, + "num_input_tokens_seen": 98025750, + "step": 4534, + "time_per_iteration": 2.5454013347625732 + }, + { + "auxiliary_loss_clip": 0.01122215, + "auxiliary_loss_mlp": 0.01045086, + "balance_loss_clip": 1.05287981, + "balance_loss_mlp": 1.02702558, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.6466620257879068, + "language_loss": 0.91332006, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93499309, + "num_input_tokens_seen": 98044955, + "step": 4535, + "time_per_iteration": 2.5319464206695557 + }, + { + "auxiliary_loss_clip": 0.01128134, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.05347526, + "balance_loss_mlp": 1.03078043, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.69873795954919, + "language_loss": 0.73223555, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.75399613, + "num_input_tokens_seen": 98065860, + "step": 4536, + "time_per_iteration": 2.5671348571777344 + }, + { + "auxiliary_loss_clip": 0.01136985, + "auxiliary_loss_mlp": 0.0104296, + "balance_loss_clip": 1.05131149, + "balance_loss_mlp": 1.02591288, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.7604037236644212, + "language_loss": 0.72052824, + "learning_rate": 3.41290485034781e-06, + "loss": 0.74232769, + "num_input_tokens_seen": 98085450, + "step": 4537, + "time_per_iteration": 4.1642539501190186 + }, + { + "auxiliary_loss_clip": 0.0111963, + "auxiliary_loss_mlp": 0.01045976, + "balance_loss_clip": 1.04858875, + "balance_loss_mlp": 1.02791572, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.1600341425304816, + "language_loss": 0.78563428, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80729032, + "num_input_tokens_seen": 98099115, + "step": 4538, + "time_per_iteration": 2.4602839946746826 + }, + { + "auxiliary_loss_clip": 0.0113161, + "auxiliary_loss_mlp": 0.01048372, + "balance_loss_clip": 1.0511744, + "balance_loss_mlp": 1.0321238, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.453693846057514, + "language_loss": 0.90148616, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92328596, + "num_input_tokens_seen": 98118415, + "step": 4539, + "time_per_iteration": 2.504084825515747 + }, + { + "auxiliary_loss_clip": 0.01123097, + "auxiliary_loss_mlp": 0.01041382, + "balance_loss_clip": 1.05009592, + "balance_loss_mlp": 1.02344084, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 2.096992418916607, + "language_loss": 0.87833178, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.89997649, + "num_input_tokens_seen": 98136300, + "step": 4540, + "time_per_iteration": 2.4838449954986572 + }, + { + "auxiliary_loss_clip": 0.01134208, + "auxiliary_loss_mlp": 0.00782504, + "balance_loss_clip": 1.04820371, + "balance_loss_mlp": 1.00110161, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 1.9746631210453107, + "language_loss": 0.81865782, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.83782494, + "num_input_tokens_seen": 98154580, + "step": 4541, + "time_per_iteration": 2.4776899814605713 + }, + { + "auxiliary_loss_clip": 0.01124278, + "auxiliary_loss_mlp": 0.01045197, + "balance_loss_clip": 1.05289292, + "balance_loss_mlp": 1.0284245, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.8214078846147002, + "language_loss": 0.798329, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.82002378, + "num_input_tokens_seen": 98173115, + "step": 4542, + "time_per_iteration": 2.549328088760376 + }, + { + "auxiliary_loss_clip": 0.01131163, + "auxiliary_loss_mlp": 0.01040237, + "balance_loss_clip": 1.05807257, + "balance_loss_mlp": 1.0237627, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 1.8994141027401674, + "language_loss": 0.8963927, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91810662, + "num_input_tokens_seen": 98190260, + "step": 4543, + "time_per_iteration": 2.502227306365967 + }, + { + "auxiliary_loss_clip": 0.01119135, + "auxiliary_loss_mlp": 0.00785098, + "balance_loss_clip": 1.04954994, + "balance_loss_mlp": 1.00131464, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.2578947146485566, + "language_loss": 0.62925065, + "learning_rate": 3.410974019048255e-06, + "loss": 0.64829296, + "num_input_tokens_seen": 98207115, + "step": 4544, + "time_per_iteration": 2.4947197437286377 + }, + { + "auxiliary_loss_clip": 0.01123678, + "auxiliary_loss_mlp": 0.01044339, + "balance_loss_clip": 1.05624795, + "balance_loss_mlp": 1.02695811, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 1.5300113133442266, + "language_loss": 0.70089656, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72257674, + "num_input_tokens_seen": 98230610, + "step": 4545, + "time_per_iteration": 2.6393468379974365 + }, + { + "auxiliary_loss_clip": 0.01055459, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.03958738, + "balance_loss_mlp": 1.02665043, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7303026133324019, + "language_loss": 0.61658132, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63742542, + "num_input_tokens_seen": 98293585, + "step": 4546, + "time_per_iteration": 3.132059335708618 + }, + { + "auxiliary_loss_clip": 0.010571, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_clip": 1.04582357, + "balance_loss_mlp": 1.02950537, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 4.986310592423544, + "language_loss": 0.64890933, + "learning_rate": 3.410145717146488e-06, + "loss": 0.66995478, + "num_input_tokens_seen": 98311680, + "step": 4547, + "time_per_iteration": 2.698814630508423 + }, + { + "auxiliary_loss_clip": 0.01122853, + "auxiliary_loss_mlp": 0.00781465, + "balance_loss_clip": 1.05341172, + "balance_loss_mlp": 1.00121903, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 2.2541952549341615, + "language_loss": 0.77863193, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.79767513, + "num_input_tokens_seen": 98330770, + "step": 4548, + "time_per_iteration": 2.7435379028320312 + }, + { + "auxiliary_loss_clip": 0.0111891, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.052755, + "balance_loss_mlp": 1.02632141, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 2.1422246085632053, + "language_loss": 0.8259564, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84755391, + "num_input_tokens_seen": 98349860, + "step": 4549, + "time_per_iteration": 2.5222222805023193 + }, + { + "auxiliary_loss_clip": 0.01133903, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.05052137, + "balance_loss_mlp": 1.0247879, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 3.4626594952812426, + "language_loss": 0.70829046, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73005557, + "num_input_tokens_seen": 98367040, + "step": 4550, + "time_per_iteration": 2.4704928398132324 + }, + { + "auxiliary_loss_clip": 0.01106349, + "auxiliary_loss_mlp": 0.01037804, + "balance_loss_clip": 1.0518291, + "balance_loss_mlp": 1.02270031, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 2.4210222317817762, + "language_loss": 0.78730941, + "learning_rate": 3.409040566039563e-06, + "loss": 0.80875099, + "num_input_tokens_seen": 98384010, + "step": 4551, + "time_per_iteration": 2.537870407104492 + }, + { + "auxiliary_loss_clip": 0.0110697, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_clip": 1.0480442, + "balance_loss_mlp": 1.03114772, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.554064352964875, + "language_loss": 0.71064389, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.73220503, + "num_input_tokens_seen": 98399625, + "step": 4552, + "time_per_iteration": 2.5127055644989014 + }, + { + "auxiliary_loss_clip": 0.01124911, + "auxiliary_loss_mlp": 0.01034346, + "balance_loss_clip": 1.0553509, + "balance_loss_mlp": 1.01775217, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 1.9319082058805779, + "language_loss": 0.71803814, + "learning_rate": 3.408487669858431e-06, + "loss": 0.7396307, + "num_input_tokens_seen": 98417310, + "step": 4553, + "time_per_iteration": 2.496958017349243 + }, + { + "auxiliary_loss_clip": 0.0113269, + "auxiliary_loss_mlp": 0.0103429, + "balance_loss_clip": 1.0514369, + "balance_loss_mlp": 1.01779175, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.604880823787006, + "language_loss": 0.59057331, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61224312, + "num_input_tokens_seen": 98438670, + "step": 4554, + "time_per_iteration": 2.5189766883850098 + }, + { + "auxiliary_loss_clip": 0.01128745, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.05510974, + "balance_loss_mlp": 1.01807964, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 2.142063287553859, + "language_loss": 0.73575377, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.75739497, + "num_input_tokens_seen": 98456060, + "step": 4555, + "time_per_iteration": 2.484898090362549 + }, + { + "auxiliary_loss_clip": 0.01137334, + "auxiliary_loss_mlp": 0.01034703, + "balance_loss_clip": 1.05466986, + "balance_loss_mlp": 1.01868129, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 2.1401454121399475, + "language_loss": 0.77800584, + "learning_rate": 3.407657925038002e-06, + "loss": 0.79972625, + "num_input_tokens_seen": 98473765, + "step": 4556, + "time_per_iteration": 4.0241780281066895 + }, + { + "auxiliary_loss_clip": 0.01142103, + "auxiliary_loss_mlp": 0.01045153, + "balance_loss_clip": 1.04873681, + "balance_loss_mlp": 1.02683043, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 2.2779489337663614, + "language_loss": 0.82485747, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84673005, + "num_input_tokens_seen": 98490590, + "step": 4557, + "time_per_iteration": 2.443976640701294 + }, + { + "auxiliary_loss_clip": 0.01092481, + "auxiliary_loss_mlp": 0.01040465, + "balance_loss_clip": 1.0431422, + "balance_loss_mlp": 1.02413344, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.9818047891782304, + "language_loss": 0.72865164, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.74998111, + "num_input_tokens_seen": 98510590, + "step": 4558, + "time_per_iteration": 2.623702049255371 + }, + { + "auxiliary_loss_clip": 0.01122419, + "auxiliary_loss_mlp": 0.01046894, + "balance_loss_clip": 1.0503881, + "balance_loss_mlp": 1.030074, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.416985289787847, + "language_loss": 0.67920411, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70089722, + "num_input_tokens_seen": 98527875, + "step": 4559, + "time_per_iteration": 2.50895619392395 + }, + { + "auxiliary_loss_clip": 0.01122245, + "auxiliary_loss_mlp": 0.01049372, + "balance_loss_clip": 1.05014133, + "balance_loss_mlp": 1.03206289, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 1.8458778800754583, + "language_loss": 0.72000605, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74172223, + "num_input_tokens_seen": 98547575, + "step": 4560, + "time_per_iteration": 4.0747761726379395 + }, + { + "auxiliary_loss_clip": 0.0112484, + "auxiliary_loss_mlp": 0.0104254, + "balance_loss_clip": 1.05201924, + "balance_loss_mlp": 1.02637553, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.7819158008603377, + "language_loss": 0.81309813, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83477193, + "num_input_tokens_seen": 98566290, + "step": 4561, + "time_per_iteration": 2.622807741165161 + }, + { + "auxiliary_loss_clip": 0.01146458, + "auxiliary_loss_mlp": 0.01038124, + "balance_loss_clip": 1.05089366, + "balance_loss_mlp": 1.02197123, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 2.3677946335697526, + "language_loss": 0.75039524, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77224106, + "num_input_tokens_seen": 98586255, + "step": 4562, + "time_per_iteration": 2.4516754150390625 + }, + { + "auxiliary_loss_clip": 0.0114542, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.05269909, + "balance_loss_mlp": 1.0225544, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.715317124130069, + "language_loss": 0.74598938, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76783091, + "num_input_tokens_seen": 98606030, + "step": 4563, + "time_per_iteration": 2.46769642829895 + }, + { + "auxiliary_loss_clip": 0.01120178, + "auxiliary_loss_mlp": 0.01046945, + "balance_loss_clip": 1.04970407, + "balance_loss_mlp": 1.02831244, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.8354751412361483, + "language_loss": 0.62518585, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64685708, + "num_input_tokens_seen": 98625225, + "step": 4564, + "time_per_iteration": 2.5506794452667236 + }, + { + "auxiliary_loss_clip": 0.01124336, + "auxiliary_loss_mlp": 0.01045805, + "balance_loss_clip": 1.04957294, + "balance_loss_mlp": 1.02847242, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 1.666608871461366, + "language_loss": 0.7858398, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.80754125, + "num_input_tokens_seen": 98649470, + "step": 4565, + "time_per_iteration": 2.6952884197235107 + }, + { + "auxiliary_loss_clip": 0.01096352, + "auxiliary_loss_mlp": 0.01050278, + "balance_loss_clip": 1.04855859, + "balance_loss_mlp": 1.03367162, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 1.914952878818026, + "language_loss": 0.68432707, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70579338, + "num_input_tokens_seen": 98666915, + "step": 4566, + "time_per_iteration": 3.9421775341033936 + }, + { + "auxiliary_loss_clip": 0.01136611, + "auxiliary_loss_mlp": 0.01047682, + "balance_loss_clip": 1.05473137, + "balance_loss_mlp": 1.03201234, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 3.1594846816764504, + "language_loss": 0.61161125, + "learning_rate": 3.404611419371723e-06, + "loss": 0.6334542, + "num_input_tokens_seen": 98688240, + "step": 4567, + "time_per_iteration": 2.5377652645111084 + }, + { + "auxiliary_loss_clip": 0.01133337, + "auxiliary_loss_mlp": 0.01048211, + "balance_loss_clip": 1.05211627, + "balance_loss_mlp": 1.02907801, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 1.8905626697248807, + "language_loss": 0.82891339, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.85072887, + "num_input_tokens_seen": 98708245, + "step": 4568, + "time_per_iteration": 2.4763715267181396 + }, + { + "auxiliary_loss_clip": 0.01133195, + "auxiliary_loss_mlp": 0.01038745, + "balance_loss_clip": 1.05293894, + "balance_loss_mlp": 1.02190042, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 2.1799029784776884, + "language_loss": 0.6865437, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.7082631, + "num_input_tokens_seen": 98724575, + "step": 4569, + "time_per_iteration": 2.4585423469543457 + }, + { + "auxiliary_loss_clip": 0.01116824, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.04622579, + "balance_loss_mlp": 1.02699685, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.23928161448103, + "language_loss": 0.70749867, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.7291187, + "num_input_tokens_seen": 98740700, + "step": 4570, + "time_per_iteration": 2.477006435394287 + }, + { + "auxiliary_loss_clip": 0.0104763, + "auxiliary_loss_mlp": 0.01005942, + "balance_loss_clip": 1.0402019, + "balance_loss_mlp": 1.00368881, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7323882189223816, + "language_loss": 0.55771875, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57825446, + "num_input_tokens_seen": 98803030, + "step": 4571, + "time_per_iteration": 3.2479918003082275 + }, + { + "auxiliary_loss_clip": 0.01102069, + "auxiliary_loss_mlp": 0.01049457, + "balance_loss_clip": 1.05268312, + "balance_loss_mlp": 1.03171897, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 2.666561187789905, + "language_loss": 0.77841073, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.79992592, + "num_input_tokens_seen": 98820505, + "step": 4572, + "time_per_iteration": 2.577847957611084 + }, + { + "auxiliary_loss_clip": 0.01142557, + "auxiliary_loss_mlp": 0.01040335, + "balance_loss_clip": 1.05318379, + "balance_loss_mlp": 1.02530861, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.5997498975256006, + "language_loss": 0.81559581, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83742476, + "num_input_tokens_seen": 98842150, + "step": 4573, + "time_per_iteration": 2.5281972885131836 + }, + { + "auxiliary_loss_clip": 0.01133674, + "auxiliary_loss_mlp": 0.01042392, + "balance_loss_clip": 1.05294323, + "balance_loss_mlp": 1.02622676, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.7008351892654772, + "language_loss": 0.79133278, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81309342, + "num_input_tokens_seen": 98861050, + "step": 4574, + "time_per_iteration": 2.4905545711517334 + }, + { + "auxiliary_loss_clip": 0.01100187, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_clip": 1.04949212, + "balance_loss_mlp": 1.03509057, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 2.51634085873824, + "language_loss": 0.73825312, + "learning_rate": 3.402391730100936e-06, + "loss": 0.75976181, + "num_input_tokens_seen": 98879695, + "step": 4575, + "time_per_iteration": 2.615412712097168 + }, + { + "auxiliary_loss_clip": 0.01122015, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_clip": 1.04960752, + "balance_loss_mlp": 1.02658105, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 2.0050139957728845, + "language_loss": 0.71895039, + "learning_rate": 3.402114029526814e-06, + "loss": 0.74058783, + "num_input_tokens_seen": 98902035, + "step": 4576, + "time_per_iteration": 2.6705374717712402 + }, + { + "auxiliary_loss_clip": 0.01104415, + "auxiliary_loss_mlp": 0.0078734, + "balance_loss_clip": 1.05162287, + "balance_loss_mlp": 1.00115585, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.710766291522849, + "language_loss": 0.73126072, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75017828, + "num_input_tokens_seen": 98921835, + "step": 4577, + "time_per_iteration": 4.126567602157593 + }, + { + "auxiliary_loss_clip": 0.01128106, + "auxiliary_loss_mlp": 0.0103842, + "balance_loss_clip": 1.0555613, + "balance_loss_mlp": 1.02128983, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 1.8518022091312805, + "language_loss": 0.75935555, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78102076, + "num_input_tokens_seen": 98939610, + "step": 4578, + "time_per_iteration": 2.551079750061035 + }, + { + "auxiliary_loss_clip": 0.01117341, + "auxiliary_loss_mlp": 0.01044858, + "balance_loss_clip": 1.05181289, + "balance_loss_mlp": 1.02559388, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 1.4171559256735222, + "language_loss": 0.66593945, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68756139, + "num_input_tokens_seen": 98962250, + "step": 4579, + "time_per_iteration": 2.5931150913238525 + }, + { + "auxiliary_loss_clip": 0.0111181, + "auxiliary_loss_mlp": 0.01050723, + "balance_loss_clip": 1.04473305, + "balance_loss_mlp": 1.03185248, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 2.0666557064592626, + "language_loss": 0.8027041, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82432938, + "num_input_tokens_seen": 98981845, + "step": 4580, + "time_per_iteration": 2.563176155090332 + }, + { + "auxiliary_loss_clip": 0.01136254, + "auxiliary_loss_mlp": 0.01047131, + "balance_loss_clip": 1.05324221, + "balance_loss_mlp": 1.02798581, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.6670676417659729, + "language_loss": 0.67260385, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.69443768, + "num_input_tokens_seen": 99001855, + "step": 4581, + "time_per_iteration": 2.479998826980591 + }, + { + "auxiliary_loss_clip": 0.01129439, + "auxiliary_loss_mlp": 0.0105261, + "balance_loss_clip": 1.0538311, + "balance_loss_mlp": 1.03624272, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.9007871675479802, + "language_loss": 0.77954251, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80136299, + "num_input_tokens_seen": 99019880, + "step": 4582, + "time_per_iteration": 2.4750583171844482 + }, + { + "auxiliary_loss_clip": 0.01098257, + "auxiliary_loss_mlp": 0.01040111, + "balance_loss_clip": 1.04822493, + "balance_loss_mlp": 1.02488756, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 1.8540803821240703, + "language_loss": 0.84721595, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86859959, + "num_input_tokens_seen": 99037570, + "step": 4583, + "time_per_iteration": 2.546025514602661 + }, + { + "auxiliary_loss_clip": 0.01134455, + "auxiliary_loss_mlp": 0.01044156, + "balance_loss_clip": 1.05021167, + "balance_loss_mlp": 1.02758598, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 1.7300417808155322, + "language_loss": 0.67032307, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69210923, + "num_input_tokens_seen": 99056875, + "step": 4584, + "time_per_iteration": 2.4845807552337646 + }, + { + "auxiliary_loss_clip": 0.01083299, + "auxiliary_loss_mlp": 0.01049927, + "balance_loss_clip": 1.04260039, + "balance_loss_mlp": 1.03189015, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.8372099859479565, + "language_loss": 0.77067924, + "learning_rate": 3.399612333050327e-06, + "loss": 0.7920115, + "num_input_tokens_seen": 99074685, + "step": 4585, + "time_per_iteration": 2.562483787536621 + }, + { + "auxiliary_loss_clip": 0.01137884, + "auxiliary_loss_mlp": 0.00783719, + "balance_loss_clip": 1.0531081, + "balance_loss_mlp": 1.0012517, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.6416375740628975, + "language_loss": 0.71999574, + "learning_rate": 3.399334101267362e-06, + "loss": 0.7392118, + "num_input_tokens_seen": 99095300, + "step": 4586, + "time_per_iteration": 2.4986183643341064 + }, + { + "auxiliary_loss_clip": 0.0112386, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.05288982, + "balance_loss_mlp": 1.01742697, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.5795594853820492, + "language_loss": 0.8060357, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.8276087, + "num_input_tokens_seen": 99115965, + "step": 4587, + "time_per_iteration": 2.5193231105804443 + }, + { + "auxiliary_loss_clip": 0.01133133, + "auxiliary_loss_mlp": 0.01040954, + "balance_loss_clip": 1.05017495, + "balance_loss_mlp": 1.02478325, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 1.873898185488575, + "language_loss": 0.82962561, + "learning_rate": 3.398777478523316e-06, + "loss": 0.85136646, + "num_input_tokens_seen": 99134265, + "step": 4588, + "time_per_iteration": 2.4575297832489014 + }, + { + "auxiliary_loss_clip": 0.01106968, + "auxiliary_loss_mlp": 0.01040272, + "balance_loss_clip": 1.04538047, + "balance_loss_mlp": 1.02408314, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3634159881152321, + "language_loss": 0.75481868, + "learning_rate": 3.398499087583342e-06, + "loss": 0.77629107, + "num_input_tokens_seen": 99156185, + "step": 4589, + "time_per_iteration": 2.5684919357299805 + }, + { + "auxiliary_loss_clip": 0.0113132, + "auxiliary_loss_mlp": 0.01042407, + "balance_loss_clip": 1.04993153, + "balance_loss_mlp": 1.02603912, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.8112493198695145, + "language_loss": 0.88647556, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90821278, + "num_input_tokens_seen": 99176735, + "step": 4590, + "time_per_iteration": 2.5002198219299316 + }, + { + "auxiliary_loss_clip": 0.01128708, + "auxiliary_loss_mlp": 0.01045189, + "balance_loss_clip": 1.05126548, + "balance_loss_mlp": 1.0271883, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.5313785169392453, + "language_loss": 0.71363759, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73537654, + "num_input_tokens_seen": 99199765, + "step": 4591, + "time_per_iteration": 2.6638622283935547 + }, + { + "auxiliary_loss_clip": 0.01112057, + "auxiliary_loss_mlp": 0.01044858, + "balance_loss_clip": 1.05179119, + "balance_loss_mlp": 1.02769184, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 2.7655845442474556, + "language_loss": 0.8027451, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82431436, + "num_input_tokens_seen": 99218435, + "step": 4592, + "time_per_iteration": 2.6474905014038086 + }, + { + "auxiliary_loss_clip": 0.01058453, + "auxiliary_loss_mlp": 0.0075843, + "balance_loss_clip": 1.03170776, + "balance_loss_mlp": 1.0013417, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7113504303404093, + "language_loss": 0.61657578, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63474458, + "num_input_tokens_seen": 99276200, + "step": 4593, + "time_per_iteration": 3.030778646469116 + }, + { + "auxiliary_loss_clip": 0.01128494, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_clip": 1.05200768, + "balance_loss_mlp": 1.0271641, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.995740898227936, + "language_loss": 0.77427214, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79599297, + "num_input_tokens_seen": 99297625, + "step": 4594, + "time_per_iteration": 2.5354161262512207 + }, + { + "auxiliary_loss_clip": 0.01132294, + "auxiliary_loss_mlp": 0.01038631, + "balance_loss_clip": 1.05241799, + "balance_loss_mlp": 1.02240705, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.5581075148412704, + "language_loss": 0.91730261, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93901193, + "num_input_tokens_seen": 99315790, + "step": 4595, + "time_per_iteration": 2.454477310180664 + }, + { + "auxiliary_loss_clip": 0.01129185, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.05009699, + "balance_loss_mlp": 1.02895927, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 2.2467807229669523, + "language_loss": 0.69285357, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71461308, + "num_input_tokens_seen": 99334615, + "step": 4596, + "time_per_iteration": 3.9356775283813477 + }, + { + "auxiliary_loss_clip": 0.01120531, + "auxiliary_loss_mlp": 0.01044894, + "balance_loss_clip": 1.04689002, + "balance_loss_mlp": 1.02807355, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 2.4623247891528073, + "language_loss": 0.6410042, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.66265845, + "num_input_tokens_seen": 99356685, + "step": 4597, + "time_per_iteration": 2.602224826812744 + }, + { + "auxiliary_loss_clip": 0.01145778, + "auxiliary_loss_mlp": 0.01041938, + "balance_loss_clip": 1.05543184, + "balance_loss_mlp": 1.0266788, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 1.8239243888886876, + "language_loss": 0.86011648, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88199365, + "num_input_tokens_seen": 99374810, + "step": 4598, + "time_per_iteration": 2.428710460662842 + }, + { + "auxiliary_loss_clip": 0.01145807, + "auxiliary_loss_mlp": 0.01043043, + "balance_loss_clip": 1.05281496, + "balance_loss_mlp": 1.02580523, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 2.461639677766991, + "language_loss": 0.80137491, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82326341, + "num_input_tokens_seen": 99391290, + "step": 4599, + "time_per_iteration": 2.445615291595459 + }, + { + "auxiliary_loss_clip": 0.0112568, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_clip": 1.04876447, + "balance_loss_mlp": 1.03116322, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.7995976371848998, + "language_loss": 0.79034019, + "learning_rate": 3.395433289506639e-06, + "loss": 0.81206852, + "num_input_tokens_seen": 99409120, + "step": 4600, + "time_per_iteration": 4.034775495529175 + }, + { + "auxiliary_loss_clip": 0.01116405, + "auxiliary_loss_mlp": 0.01043975, + "balance_loss_clip": 1.05266619, + "balance_loss_mlp": 1.0270946, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 6.801870312517918, + "language_loss": 0.73060882, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75221252, + "num_input_tokens_seen": 99426180, + "step": 4601, + "time_per_iteration": 2.506972551345825 + }, + { + "auxiliary_loss_clip": 0.01132011, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_clip": 1.05069923, + "balance_loss_mlp": 1.02465069, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.7058509178884098, + "language_loss": 0.79824841, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.81998587, + "num_input_tokens_seen": 99447720, + "step": 4602, + "time_per_iteration": 2.5259463787078857 + }, + { + "auxiliary_loss_clip": 0.01128199, + "auxiliary_loss_mlp": 0.01049294, + "balance_loss_clip": 1.05057263, + "balance_loss_mlp": 1.03090024, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.040459993512615, + "language_loss": 0.77129602, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79307091, + "num_input_tokens_seen": 99464720, + "step": 4603, + "time_per_iteration": 2.4715592861175537 + }, + { + "auxiliary_loss_clip": 0.01114131, + "auxiliary_loss_mlp": 0.01044564, + "balance_loss_clip": 1.05075383, + "balance_loss_mlp": 1.02911472, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.497745737022551, + "language_loss": 0.8149426, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83652949, + "num_input_tokens_seen": 99482310, + "step": 4604, + "time_per_iteration": 2.464113235473633 + }, + { + "auxiliary_loss_clip": 0.01099421, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.04987955, + "balance_loss_mlp": 1.01933873, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 2.2879628765713336, + "language_loss": 0.69959104, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.7209456, + "num_input_tokens_seen": 99501255, + "step": 4605, + "time_per_iteration": 2.580632448196411 + }, + { + "auxiliary_loss_clip": 0.01059127, + "auxiliary_loss_mlp": 0.01017709, + "balance_loss_clip": 1.03403926, + "balance_loss_mlp": 1.01538396, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.7013589411998721, + "language_loss": 0.57186711, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59263551, + "num_input_tokens_seen": 99568925, + "step": 4606, + "time_per_iteration": 4.60093355178833 + }, + { + "auxiliary_loss_clip": 0.01121039, + "auxiliary_loss_mlp": 0.01044087, + "balance_loss_clip": 1.05049706, + "balance_loss_mlp": 1.02708781, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 3.5387757045167585, + "language_loss": 0.69800001, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71965122, + "num_input_tokens_seen": 99588455, + "step": 4607, + "time_per_iteration": 2.527561664581299 + }, + { + "auxiliary_loss_clip": 0.01124533, + "auxiliary_loss_mlp": 0.01037154, + "balance_loss_clip": 1.0506947, + "balance_loss_mlp": 1.02191877, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 1.7492427917798563, + "language_loss": 0.69714892, + "learning_rate": 3.393199595837555e-06, + "loss": 0.71876574, + "num_input_tokens_seen": 99609355, + "step": 4608, + "time_per_iteration": 2.521918535232544 + }, + { + "auxiliary_loss_clip": 0.01100608, + "auxiliary_loss_mlp": 0.010371, + "balance_loss_clip": 1.05262637, + "balance_loss_mlp": 1.02064943, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 2.3776090991564773, + "language_loss": 0.72975278, + "learning_rate": 3.392920146281499e-06, + "loss": 0.75112987, + "num_input_tokens_seen": 99628780, + "step": 4609, + "time_per_iteration": 2.6568756103515625 + }, + { + "auxiliary_loss_clip": 0.0110717, + "auxiliary_loss_mlp": 0.01052438, + "balance_loss_clip": 1.04435587, + "balance_loss_mlp": 1.03421068, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.796215716421013, + "language_loss": 0.83745444, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.85905051, + "num_input_tokens_seen": 99644545, + "step": 4610, + "time_per_iteration": 2.5145180225372314 + }, + { + "auxiliary_loss_clip": 0.01082462, + "auxiliary_loss_mlp": 0.00790375, + "balance_loss_clip": 1.04620504, + "balance_loss_mlp": 1.00140166, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 2.0267246972682327, + "language_loss": 0.68776685, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.70649529, + "num_input_tokens_seen": 99663125, + "step": 4611, + "time_per_iteration": 2.597388744354248 + }, + { + "auxiliary_loss_clip": 0.0114022, + "auxiliary_loss_mlp": 0.01039228, + "balance_loss_clip": 1.05208755, + "balance_loss_mlp": 1.02234828, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.0740634504169915, + "language_loss": 0.73446697, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75626147, + "num_input_tokens_seen": 99682645, + "step": 4612, + "time_per_iteration": 2.4528143405914307 + }, + { + "auxiliary_loss_clip": 0.01135492, + "auxiliary_loss_mlp": 0.00784292, + "balance_loss_clip": 1.05101657, + "balance_loss_mlp": 1.00139952, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 3.757315833414822, + "language_loss": 0.66138053, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68057847, + "num_input_tokens_seen": 99700520, + "step": 4613, + "time_per_iteration": 2.474560260772705 + }, + { + "auxiliary_loss_clip": 0.0109732, + "auxiliary_loss_mlp": 0.01047284, + "balance_loss_clip": 1.04651713, + "balance_loss_mlp": 1.02918804, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 2.167391545446536, + "language_loss": 0.79918683, + "learning_rate": 3.39152210641815e-06, + "loss": 0.82063287, + "num_input_tokens_seen": 99720355, + "step": 4614, + "time_per_iteration": 2.588061571121216 + }, + { + "auxiliary_loss_clip": 0.01128574, + "auxiliary_loss_mlp": 0.0104489, + "balance_loss_clip": 1.04717457, + "balance_loss_mlp": 1.02744925, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.888797261776476, + "language_loss": 0.80430937, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.82604402, + "num_input_tokens_seen": 99736090, + "step": 4615, + "time_per_iteration": 2.4642422199249268 + }, + { + "auxiliary_loss_clip": 0.011128, + "auxiliary_loss_mlp": 0.01045457, + "balance_loss_clip": 1.04493785, + "balance_loss_mlp": 1.02861333, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 3.021300545065197, + "language_loss": 0.63290226, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.65448487, + "num_input_tokens_seen": 99751805, + "step": 4616, + "time_per_iteration": 4.0049378871917725 + }, + { + "auxiliary_loss_clip": 0.0112674, + "auxiliary_loss_mlp": 0.01039924, + "balance_loss_clip": 1.04880023, + "balance_loss_mlp": 1.02288854, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 1.980902812426658, + "language_loss": 0.82858711, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.8502537, + "num_input_tokens_seen": 99770610, + "step": 4617, + "time_per_iteration": 2.480153799057007 + }, + { + "auxiliary_loss_clip": 0.01142484, + "auxiliary_loss_mlp": 0.0104714, + "balance_loss_clip": 1.04863441, + "balance_loss_mlp": 1.03074861, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 2.204724792630827, + "language_loss": 0.76551723, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.78741342, + "num_input_tokens_seen": 99787305, + "step": 4618, + "time_per_iteration": 2.423149824142456 + }, + { + "auxiliary_loss_clip": 0.01144871, + "auxiliary_loss_mlp": 0.01039968, + "balance_loss_clip": 1.05169868, + "balance_loss_mlp": 1.02480459, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.8725585336536428, + "language_loss": 0.85185897, + "learning_rate": 3.390122747388459e-06, + "loss": 0.87370729, + "num_input_tokens_seen": 99808940, + "step": 4619, + "time_per_iteration": 2.489283323287964 + }, + { + "auxiliary_loss_clip": 0.01119841, + "auxiliary_loss_mlp": 0.01039709, + "balance_loss_clip": 1.04943943, + "balance_loss_mlp": 1.02482009, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.573899721044231, + "language_loss": 0.77095163, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.79254717, + "num_input_tokens_seen": 99829575, + "step": 4620, + "time_per_iteration": 2.5246949195861816 + }, + { + "auxiliary_loss_clip": 0.01089901, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.04051375, + "balance_loss_mlp": 1.02304733, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 1.733870276345747, + "language_loss": 0.78285325, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80414546, + "num_input_tokens_seen": 99847575, + "step": 4621, + "time_per_iteration": 2.534444808959961 + }, + { + "auxiliary_loss_clip": 0.01116567, + "auxiliary_loss_mlp": 0.01049604, + "balance_loss_clip": 1.05201292, + "balance_loss_mlp": 1.03148437, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 2.3833559628853536, + "language_loss": 0.8749916, + "learning_rate": 3.389282499322611e-06, + "loss": 0.89665341, + "num_input_tokens_seen": 99864995, + "step": 4622, + "time_per_iteration": 2.5568079948425293 + }, + { + "auxiliary_loss_clip": 0.01093278, + "auxiliary_loss_mlp": 0.01046265, + "balance_loss_clip": 1.04541481, + "balance_loss_mlp": 1.02900982, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 2.7493054213375068, + "language_loss": 0.81165063, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83304608, + "num_input_tokens_seen": 99881540, + "step": 4623, + "time_per_iteration": 2.5402886867523193 + }, + { + "auxiliary_loss_clip": 0.01114986, + "auxiliary_loss_mlp": 0.01039291, + "balance_loss_clip": 1.05060101, + "balance_loss_mlp": 1.02291179, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 2.4588637917425866, + "language_loss": 0.81324136, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83478415, + "num_input_tokens_seen": 99899595, + "step": 4624, + "time_per_iteration": 2.5397424697875977 + }, + { + "auxiliary_loss_clip": 0.01109184, + "auxiliary_loss_mlp": 0.00786659, + "balance_loss_clip": 1.04686427, + "balance_loss_mlp": 1.0012331, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 2.1833873908009274, + "language_loss": 0.76952767, + "learning_rate": 3.388441777121191e-06, + "loss": 0.78848612, + "num_input_tokens_seen": 99913020, + "step": 4625, + "time_per_iteration": 2.456921100616455 + }, + { + "auxiliary_loss_clip": 0.01107141, + "auxiliary_loss_mlp": 0.01041313, + "balance_loss_clip": 1.04174566, + "balance_loss_mlp": 1.02402818, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 1.9995843358648404, + "language_loss": 0.69938731, + "learning_rate": 3.388161431073511e-06, + "loss": 0.72087187, + "num_input_tokens_seen": 99931405, + "step": 4626, + "time_per_iteration": 2.4939353466033936 + }, + { + "auxiliary_loss_clip": 0.01107278, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.04966128, + "balance_loss_mlp": 1.02241039, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 6.7011662407832295, + "language_loss": 0.92406356, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94554031, + "num_input_tokens_seen": 99948100, + "step": 4627, + "time_per_iteration": 2.538581609725952 + }, + { + "auxiliary_loss_clip": 0.01144003, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.04951346, + "balance_loss_mlp": 1.02265561, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 2.0209154626288557, + "language_loss": 0.85408396, + "learning_rate": 3.387600581071121e-06, + "loss": 0.8759073, + "num_input_tokens_seen": 99966470, + "step": 4628, + "time_per_iteration": 2.477536916732788 + }, + { + "auxiliary_loss_clip": 0.01115967, + "auxiliary_loss_mlp": 0.01039596, + "balance_loss_clip": 1.0505842, + "balance_loss_mlp": 1.02384233, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.5218450983538339, + "language_loss": 0.79432476, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81588042, + "num_input_tokens_seen": 99985930, + "step": 4629, + "time_per_iteration": 2.5386621952056885 + }, + { + "auxiliary_loss_clip": 0.01101772, + "auxiliary_loss_mlp": 0.01040958, + "balance_loss_clip": 1.0477041, + "balance_loss_mlp": 1.02530622, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.543228273219506, + "language_loss": 0.84623402, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86766136, + "num_input_tokens_seen": 100006235, + "step": 4630, + "time_per_iteration": 2.582383871078491 + }, + { + "auxiliary_loss_clip": 0.01122906, + "auxiliary_loss_mlp": 0.01036643, + "balance_loss_clip": 1.04912508, + "balance_loss_mlp": 1.01952505, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.102518601991071, + "language_loss": 0.80380046, + "learning_rate": 3.386758911459485e-06, + "loss": 0.82539594, + "num_input_tokens_seen": 100023655, + "step": 4631, + "time_per_iteration": 2.4948060512542725 + }, + { + "auxiliary_loss_clip": 0.01146537, + "auxiliary_loss_mlp": 0.01046836, + "balance_loss_clip": 1.05324388, + "balance_loss_mlp": 1.03118372, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 2.7828800654863963, + "language_loss": 0.71076572, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.73269951, + "num_input_tokens_seen": 100043280, + "step": 4632, + "time_per_iteration": 2.4709959030151367 + }, + { + "auxiliary_loss_clip": 0.01130864, + "auxiliary_loss_mlp": 0.01042448, + "balance_loss_clip": 1.05366516, + "balance_loss_mlp": 1.02727246, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.9065857999042173, + "language_loss": 0.82517612, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84690928, + "num_input_tokens_seen": 100057690, + "step": 4633, + "time_per_iteration": 2.444948434829712 + }, + { + "auxiliary_loss_clip": 0.01119995, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_clip": 1.0471344, + "balance_loss_mlp": 1.01775575, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.851239005727735, + "language_loss": 0.87660831, + "learning_rate": 3.385916768573529e-06, + "loss": 0.89815652, + "num_input_tokens_seen": 100075875, + "step": 4634, + "time_per_iteration": 2.5268325805664062 + }, + { + "auxiliary_loss_clip": 0.01113211, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.04533792, + "balance_loss_mlp": 1.02519917, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.532595937666824, + "language_loss": 0.76868355, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79024017, + "num_input_tokens_seen": 100092930, + "step": 4635, + "time_per_iteration": 3.9587883949279785 + }, + { + "auxiliary_loss_clip": 0.01144297, + "auxiliary_loss_mlp": 0.01044904, + "balance_loss_clip": 1.05046272, + "balance_loss_mlp": 1.02865624, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.6385098739165527, + "language_loss": 0.64980745, + "learning_rate": 3.385355077194637e-06, + "loss": 0.6716994, + "num_input_tokens_seen": 100110790, + "step": 4636, + "time_per_iteration": 2.4287521839141846 + }, + { + "auxiliary_loss_clip": 0.01131238, + "auxiliary_loss_mlp": 0.01043473, + "balance_loss_clip": 1.04719484, + "balance_loss_mlp": 1.02578223, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.564061249988706, + "language_loss": 0.8328526, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.85459971, + "num_input_tokens_seen": 100126970, + "step": 4637, + "time_per_iteration": 2.4357523918151855 + }, + { + "auxiliary_loss_clip": 0.01116044, + "auxiliary_loss_mlp": 0.01043958, + "balance_loss_clip": 1.04521275, + "balance_loss_mlp": 1.02813888, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.6292430854337083, + "language_loss": 0.75687122, + "learning_rate": 3.384793175684533e-06, + "loss": 0.77847123, + "num_input_tokens_seen": 100146720, + "step": 4638, + "time_per_iteration": 2.524545907974243 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01052661, + "balance_loss_clip": 1.04733574, + "balance_loss_mlp": 1.03576875, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.6305619100100244, + "language_loss": 0.71370625, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73554933, + "num_input_tokens_seen": 100165920, + "step": 4639, + "time_per_iteration": 3.9654793739318848 + }, + { + "auxiliary_loss_clip": 0.01133169, + "auxiliary_loss_mlp": 0.01038196, + "balance_loss_clip": 1.04952526, + "balance_loss_mlp": 1.02070761, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 1.9692956546377074, + "language_loss": 0.65699482, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67870843, + "num_input_tokens_seen": 100185525, + "step": 4640, + "time_per_iteration": 2.513566017150879 + }, + { + "auxiliary_loss_clip": 0.01130753, + "auxiliary_loss_mlp": 0.01036463, + "balance_loss_clip": 1.04893637, + "balance_loss_mlp": 1.02078736, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 1.9708281109487868, + "language_loss": 0.72023612, + "learning_rate": 3.383949929609804e-06, + "loss": 0.74190831, + "num_input_tokens_seen": 100204850, + "step": 4641, + "time_per_iteration": 2.48040509223938 + }, + { + "auxiliary_loss_clip": 0.01113271, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_clip": 1.05259573, + "balance_loss_mlp": 1.02856565, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.8491443116460733, + "language_loss": 0.74970049, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77130497, + "num_input_tokens_seen": 100224520, + "step": 4642, + "time_per_iteration": 2.5435776710510254 + }, + { + "auxiliary_loss_clip": 0.01110894, + "auxiliary_loss_mlp": 0.01042228, + "balance_loss_clip": 1.04491997, + "balance_loss_mlp": 1.02404857, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 1.8650418913430842, + "language_loss": 0.85444152, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.87597275, + "num_input_tokens_seen": 100243935, + "step": 4643, + "time_per_iteration": 2.545628547668457 + }, + { + "auxiliary_loss_clip": 0.01102171, + "auxiliary_loss_mlp": 0.01051617, + "balance_loss_clip": 1.04677057, + "balance_loss_mlp": 1.03434408, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 2.846148303259337, + "language_loss": 0.82776356, + "learning_rate": 3.383106211219407e-06, + "loss": 0.84930152, + "num_input_tokens_seen": 100262290, + "step": 4644, + "time_per_iteration": 2.5358786582946777 + }, + { + "auxiliary_loss_clip": 0.01133045, + "auxiliary_loss_mlp": 0.01041398, + "balance_loss_clip": 1.0490737, + "balance_loss_mlp": 1.0246135, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 2.1592508310023737, + "language_loss": 0.79000175, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81174612, + "num_input_tokens_seen": 100280015, + "step": 4645, + "time_per_iteration": 3.8955299854278564 + }, + { + "auxiliary_loss_clip": 0.01053944, + "auxiliary_loss_mlp": 0.01008701, + "balance_loss_clip": 1.03457499, + "balance_loss_mlp": 1.00595951, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7951207447880078, + "language_loss": 0.62275189, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64337832, + "num_input_tokens_seen": 100338935, + "step": 4646, + "time_per_iteration": 3.0608062744140625 + }, + { + "auxiliary_loss_clip": 0.01109692, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.04673898, + "balance_loss_mlp": 1.02198648, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.751292957044441, + "language_loss": 0.89321148, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91467953, + "num_input_tokens_seen": 100359905, + "step": 4647, + "time_per_iteration": 2.5482397079467773 + }, + { + "auxiliary_loss_clip": 0.01136656, + "auxiliary_loss_mlp": 0.01043144, + "balance_loss_clip": 1.05077136, + "balance_loss_mlp": 1.02615666, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.9298953944443635, + "language_loss": 0.87143588, + "learning_rate": 3.381980519149988e-06, + "loss": 0.8932339, + "num_input_tokens_seen": 100376955, + "step": 4648, + "time_per_iteration": 2.474304676055908 + }, + { + "auxiliary_loss_clip": 0.01132197, + "auxiliary_loss_mlp": 0.01046008, + "balance_loss_clip": 1.04854572, + "balance_loss_mlp": 1.02909184, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 2.6568409555231827, + "language_loss": 0.73272622, + "learning_rate": 3.38169896509385e-06, + "loss": 0.75450832, + "num_input_tokens_seen": 100397545, + "step": 4649, + "time_per_iteration": 2.5276575088500977 + }, + { + "auxiliary_loss_clip": 0.0111315, + "auxiliary_loss_mlp": 0.01048687, + "balance_loss_clip": 1.05029035, + "balance_loss_mlp": 1.03018618, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.3971232441641637, + "language_loss": 0.80566537, + "learning_rate": 3.381417358643549e-06, + "loss": 0.82728374, + "num_input_tokens_seen": 100415080, + "step": 4650, + "time_per_iteration": 2.52616810798645 + }, + { + "auxiliary_loss_clip": 0.010482, + "auxiliary_loss_mlp": 0.00756862, + "balance_loss_clip": 1.03999245, + "balance_loss_mlp": 1.00135958, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 0.820017300585622, + "language_loss": 0.58886671, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60691738, + "num_input_tokens_seen": 100471105, + "step": 4651, + "time_per_iteration": 3.12190580368042 + }, + { + "auxiliary_loss_clip": 0.01132885, + "auxiliary_loss_mlp": 0.01050607, + "balance_loss_clip": 1.0481689, + "balance_loss_mlp": 1.03210521, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 1.7024590244428688, + "language_loss": 0.74437219, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.7662071, + "num_input_tokens_seen": 100492520, + "step": 4652, + "time_per_iteration": 2.5161869525909424 + }, + { + "auxiliary_loss_clip": 0.01149781, + "auxiliary_loss_mlp": 0.01055486, + "balance_loss_clip": 1.0545845, + "balance_loss_mlp": 1.03832018, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.143883857455279, + "language_loss": 0.7951147, + "learning_rate": 3.380572225034461e-06, + "loss": 0.8171674, + "num_input_tokens_seen": 100512870, + "step": 4653, + "time_per_iteration": 2.5840649604797363 + }, + { + "auxiliary_loss_clip": 0.01119655, + "auxiliary_loss_mlp": 0.01050732, + "balance_loss_clip": 1.04922211, + "balance_loss_mlp": 1.03399444, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.1026062534847565, + "language_loss": 0.78877574, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81047958, + "num_input_tokens_seen": 100531655, + "step": 4654, + "time_per_iteration": 2.51896595954895 + }, + { + "auxiliary_loss_clip": 0.01097332, + "auxiliary_loss_mlp": 0.01048289, + "balance_loss_clip": 1.04603231, + "balance_loss_mlp": 1.03069353, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 1.982466488608322, + "language_loss": 0.80817497, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.82963115, + "num_input_tokens_seen": 100548005, + "step": 4655, + "time_per_iteration": 4.154110908508301 + }, + { + "auxiliary_loss_clip": 0.01114144, + "auxiliary_loss_mlp": 0.00784925, + "balance_loss_clip": 1.05090773, + "balance_loss_mlp": 1.0012486, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.8896352482484124, + "language_loss": 0.81195313, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.83094382, + "num_input_tokens_seen": 100567980, + "step": 4656, + "time_per_iteration": 2.5978853702545166 + }, + { + "auxiliary_loss_clip": 0.01118713, + "auxiliary_loss_mlp": 0.0104371, + "balance_loss_clip": 1.05161691, + "balance_loss_mlp": 1.02620983, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.832188551116309, + "language_loss": 0.83381027, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85543454, + "num_input_tokens_seen": 100588630, + "step": 4657, + "time_per_iteration": 2.5940229892730713 + }, + { + "auxiliary_loss_clip": 0.01112135, + "auxiliary_loss_mlp": 0.01056539, + "balance_loss_clip": 1.05023539, + "balance_loss_mlp": 1.03733444, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.8639637895043775, + "language_loss": 0.62855053, + "learning_rate": 3.379162622133105e-06, + "loss": 0.65023732, + "num_input_tokens_seen": 100608775, + "step": 4658, + "time_per_iteration": 2.661820888519287 + }, + { + "auxiliary_loss_clip": 0.01137636, + "auxiliary_loss_mlp": 0.01054423, + "balance_loss_clip": 1.05213141, + "balance_loss_mlp": 1.03667283, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 2.3766268649156332, + "language_loss": 0.78368592, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80560654, + "num_input_tokens_seen": 100627975, + "step": 4659, + "time_per_iteration": 2.4814720153808594 + }, + { + "auxiliary_loss_clip": 0.01110896, + "auxiliary_loss_mlp": 0.01055205, + "balance_loss_clip": 1.04933786, + "balance_loss_mlp": 1.03714526, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.9694124843387752, + "language_loss": 0.79295778, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81461883, + "num_input_tokens_seen": 100645430, + "step": 4660, + "time_per_iteration": 2.5432002544403076 + }, + { + "auxiliary_loss_clip": 0.01109865, + "auxiliary_loss_mlp": 0.01044848, + "balance_loss_clip": 1.05224729, + "balance_loss_mlp": 1.02795589, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.8771609251110888, + "language_loss": 0.80234158, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82388878, + "num_input_tokens_seen": 100663775, + "step": 4661, + "time_per_iteration": 2.520350933074951 + }, + { + "auxiliary_loss_clip": 0.01127663, + "auxiliary_loss_mlp": 0.01054716, + "balance_loss_clip": 1.05514896, + "balance_loss_mlp": 1.03745496, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.6170588014303677, + "language_loss": 0.79136485, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.81318867, + "num_input_tokens_seen": 100686085, + "step": 4662, + "time_per_iteration": 2.6298673152923584 + }, + { + "auxiliary_loss_clip": 0.0113277, + "auxiliary_loss_mlp": 0.01052779, + "balance_loss_clip": 1.05186021, + "balance_loss_mlp": 1.03339577, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 4.45898739779603, + "language_loss": 0.69775081, + "learning_rate": 3.377751711782227e-06, + "loss": 0.71960634, + "num_input_tokens_seen": 100705135, + "step": 4663, + "time_per_iteration": 2.5143275260925293 + }, + { + "auxiliary_loss_clip": 0.01131138, + "auxiliary_loss_mlp": 0.01056738, + "balance_loss_clip": 1.0575453, + "balance_loss_mlp": 1.03710413, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 2.036340444265047, + "language_loss": 0.78026485, + "learning_rate": 3.377469372935791e-06, + "loss": 0.80214369, + "num_input_tokens_seen": 100724960, + "step": 4664, + "time_per_iteration": 2.5427937507629395 + }, + { + "auxiliary_loss_clip": 0.01114374, + "auxiliary_loss_mlp": 0.01049844, + "balance_loss_clip": 1.05379903, + "balance_loss_mlp": 1.03262973, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.8616869837446322, + "language_loss": 0.79504192, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81668413, + "num_input_tokens_seen": 100741995, + "step": 4665, + "time_per_iteration": 2.5153205394744873 + }, + { + "auxiliary_loss_clip": 0.01135354, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.05207801, + "balance_loss_mlp": 1.02924442, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 1.8629748330222027, + "language_loss": 0.80692852, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.82874656, + "num_input_tokens_seen": 100758985, + "step": 4666, + "time_per_iteration": 2.4836957454681396 + }, + { + "auxiliary_loss_clip": 0.01115238, + "auxiliary_loss_mlp": 0.01062468, + "balance_loss_clip": 1.05488563, + "balance_loss_mlp": 1.04224968, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 1.8600892549793648, + "language_loss": 0.85060585, + "learning_rate": 3.376622043036658e-06, + "loss": 0.87238294, + "num_input_tokens_seen": 100777820, + "step": 4667, + "time_per_iteration": 2.527161121368408 + }, + { + "auxiliary_loss_clip": 0.01121293, + "auxiliary_loss_mlp": 0.00784979, + "balance_loss_clip": 1.0534966, + "balance_loss_mlp": 1.00122499, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.8612975053574992, + "language_loss": 0.79622555, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81528819, + "num_input_tokens_seen": 100798205, + "step": 4668, + "time_per_iteration": 2.5920772552490234 + }, + { + "auxiliary_loss_clip": 0.0108951, + "auxiliary_loss_mlp": 0.01043068, + "balance_loss_clip": 1.04854608, + "balance_loss_mlp": 1.02511525, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 1.5055741655218862, + "language_loss": 0.76323855, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78456432, + "num_input_tokens_seen": 100819800, + "step": 4669, + "time_per_iteration": 2.651315927505493 + }, + { + "auxiliary_loss_clip": 0.01136835, + "auxiliary_loss_mlp": 0.01041999, + "balance_loss_clip": 1.05407774, + "balance_loss_mlp": 1.02435637, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 2.097878360778284, + "language_loss": 0.78469396, + "learning_rate": 3.375774243322725e-06, + "loss": 0.80648232, + "num_input_tokens_seen": 100837880, + "step": 4670, + "time_per_iteration": 2.4903526306152344 + }, + { + "auxiliary_loss_clip": 0.0111021, + "auxiliary_loss_mlp": 0.01046325, + "balance_loss_clip": 1.05000687, + "balance_loss_mlp": 1.02747762, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 1.8891439978661255, + "language_loss": 0.79396176, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81552708, + "num_input_tokens_seen": 100856350, + "step": 4671, + "time_per_iteration": 2.5587165355682373 + }, + { + "auxiliary_loss_clip": 0.01129813, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_clip": 1.05251002, + "balance_loss_mlp": 1.02920067, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.9376097617409285, + "language_loss": 0.75170064, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77346992, + "num_input_tokens_seen": 100876135, + "step": 4672, + "time_per_iteration": 2.519448757171631 + }, + { + "auxiliary_loss_clip": 0.01126385, + "auxiliary_loss_mlp": 0.01045847, + "balance_loss_clip": 1.04919636, + "balance_loss_mlp": 1.02647507, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 2.8647196496421987, + "language_loss": 0.76273489, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.78445721, + "num_input_tokens_seen": 100894790, + "step": 4673, + "time_per_iteration": 2.5205047130584717 + }, + { + "auxiliary_loss_clip": 0.01130673, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.04907417, + "balance_loss_mlp": 1.01956928, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 1.7810056506864342, + "language_loss": 0.72668326, + "learning_rate": 3.374643113381237e-06, + "loss": 0.7483567, + "num_input_tokens_seen": 100915100, + "step": 4674, + "time_per_iteration": 2.4704642295837402 + }, + { + "auxiliary_loss_clip": 0.0114036, + "auxiliary_loss_mlp": 0.01042502, + "balance_loss_clip": 1.05322456, + "balance_loss_mlp": 1.02377415, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.8734750780699254, + "language_loss": 0.76908314, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79091179, + "num_input_tokens_seen": 100932795, + "step": 4675, + "time_per_iteration": 3.878265380859375 + }, + { + "auxiliary_loss_clip": 0.01150943, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_clip": 1.05433512, + "balance_loss_mlp": 1.02798867, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 1.9876681734791533, + "language_loss": 0.70529675, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72727191, + "num_input_tokens_seen": 100950505, + "step": 4676, + "time_per_iteration": 2.4506916999816895 + }, + { + "auxiliary_loss_clip": 0.01145406, + "auxiliary_loss_mlp": 0.01041124, + "balance_loss_clip": 1.05630648, + "balance_loss_mlp": 1.02451777, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 2.2132337771471047, + "language_loss": 0.70055455, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.72241986, + "num_input_tokens_seen": 100968790, + "step": 4677, + "time_per_iteration": 2.452057361602783 + }, + { + "auxiliary_loss_clip": 0.01130587, + "auxiliary_loss_mlp": 0.01050254, + "balance_loss_clip": 1.05278611, + "balance_loss_mlp": 1.03000093, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.662810755005313, + "language_loss": 0.63560879, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65741718, + "num_input_tokens_seen": 100990205, + "step": 4678, + "time_per_iteration": 3.965972423553467 + }, + { + "auxiliary_loss_clip": 0.01136611, + "auxiliary_loss_mlp": 0.01045845, + "balance_loss_clip": 1.05360699, + "balance_loss_mlp": 1.028512, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.465161243003789, + "language_loss": 0.70527971, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.72710425, + "num_input_tokens_seen": 101009815, + "step": 4679, + "time_per_iteration": 2.499288320541382 + }, + { + "auxiliary_loss_clip": 0.01135487, + "auxiliary_loss_mlp": 0.01042388, + "balance_loss_clip": 1.0522418, + "balance_loss_mlp": 1.02453065, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 2.080571515339768, + "language_loss": 0.74439251, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.76617122, + "num_input_tokens_seen": 101026780, + "step": 4680, + "time_per_iteration": 2.459660053253174 + }, + { + "auxiliary_loss_clip": 0.01149762, + "auxiliary_loss_mlp": 0.01037503, + "balance_loss_clip": 1.05498719, + "balance_loss_mlp": 1.02103996, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.575004447015879, + "language_loss": 0.77616161, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79803425, + "num_input_tokens_seen": 101046215, + "step": 4681, + "time_per_iteration": 2.4565589427948 + }, + { + "auxiliary_loss_clip": 0.01137595, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.05380416, + "balance_loss_mlp": 1.01939714, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 2.543162599330417, + "language_loss": 0.74032998, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76207721, + "num_input_tokens_seen": 101063365, + "step": 4682, + "time_per_iteration": 2.4367008209228516 + }, + { + "auxiliary_loss_clip": 0.01144883, + "auxiliary_loss_mlp": 0.01045178, + "balance_loss_clip": 1.05326009, + "balance_loss_mlp": 1.02844107, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.4433536278714387, + "language_loss": 0.80672789, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.82862848, + "num_input_tokens_seen": 101083835, + "step": 4683, + "time_per_iteration": 2.461761236190796 + }, + { + "auxiliary_loss_clip": 0.01091864, + "auxiliary_loss_mlp": 0.01044137, + "balance_loss_clip": 1.04887879, + "balance_loss_mlp": 1.02580249, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.810125807089418, + "language_loss": 0.7641713, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78553134, + "num_input_tokens_seen": 101101740, + "step": 4684, + "time_per_iteration": 4.029368162155151 + }, + { + "auxiliary_loss_clip": 0.01093572, + "auxiliary_loss_mlp": 0.0104347, + "balance_loss_clip": 1.04664159, + "balance_loss_mlp": 1.02599418, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.764378223697007, + "language_loss": 0.75883138, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.78020179, + "num_input_tokens_seen": 101120480, + "step": 4685, + "time_per_iteration": 2.5418171882629395 + }, + { + "auxiliary_loss_clip": 0.0111712, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.0508709, + "balance_loss_mlp": 1.0257144, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 2.1732197775821005, + "language_loss": 0.75554949, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.77715129, + "num_input_tokens_seen": 101142910, + "step": 4686, + "time_per_iteration": 2.556098222732544 + }, + { + "auxiliary_loss_clip": 0.01119214, + "auxiliary_loss_mlp": 0.01051562, + "balance_loss_clip": 1.05019534, + "balance_loss_mlp": 1.03234494, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 3.096100002668051, + "language_loss": 0.63178438, + "learning_rate": 3.370961184640025e-06, + "loss": 0.65349215, + "num_input_tokens_seen": 101160030, + "step": 4687, + "time_per_iteration": 2.516596555709839 + }, + { + "auxiliary_loss_clip": 0.01126367, + "auxiliary_loss_mlp": 0.01049931, + "balance_loss_clip": 1.05357707, + "balance_loss_mlp": 1.03354001, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 2.292592093279761, + "language_loss": 0.76486635, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.78662926, + "num_input_tokens_seen": 101177675, + "step": 4688, + "time_per_iteration": 2.535593032836914 + }, + { + "auxiliary_loss_clip": 0.01109503, + "auxiliary_loss_mlp": 0.01038888, + "balance_loss_clip": 1.04890966, + "balance_loss_mlp": 1.02248454, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 1.8689447268052177, + "language_loss": 0.78390372, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80538762, + "num_input_tokens_seen": 101192225, + "step": 4689, + "time_per_iteration": 2.502474308013916 + }, + { + "auxiliary_loss_clip": 0.01105273, + "auxiliary_loss_mlp": 0.01044671, + "balance_loss_clip": 1.04782403, + "balance_loss_mlp": 1.02688539, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 1.6481116740372945, + "language_loss": 0.77961665, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80111611, + "num_input_tokens_seen": 101210870, + "step": 4690, + "time_per_iteration": 2.5840353965759277 + }, + { + "auxiliary_loss_clip": 0.01144606, + "auxiliary_loss_mlp": 0.00784324, + "balance_loss_clip": 1.05132031, + "balance_loss_mlp": 1.0010159, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 3.0382823841264237, + "language_loss": 0.87835574, + "learning_rate": 3.369826514835332e-06, + "loss": 0.89764512, + "num_input_tokens_seen": 101229965, + "step": 4691, + "time_per_iteration": 2.4771363735198975 + }, + { + "auxiliary_loss_clip": 0.01120746, + "auxiliary_loss_mlp": 0.01047437, + "balance_loss_clip": 1.04872489, + "balance_loss_mlp": 1.02891147, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 1.8509404132532432, + "language_loss": 0.82021022, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.841892, + "num_input_tokens_seen": 101250980, + "step": 4692, + "time_per_iteration": 2.5359225273132324 + }, + { + "auxiliary_loss_clip": 0.01110314, + "auxiliary_loss_mlp": 0.01042129, + "balance_loss_clip": 1.05195844, + "balance_loss_mlp": 1.02472413, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.502822403426008, + "language_loss": 0.7441721, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.76569653, + "num_input_tokens_seen": 101273335, + "step": 4693, + "time_per_iteration": 2.618546962738037 + }, + { + "auxiliary_loss_clip": 0.01106964, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.04596066, + "balance_loss_mlp": 1.0204618, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.962232147650758, + "language_loss": 0.77898276, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.80042696, + "num_input_tokens_seen": 101292110, + "step": 4694, + "time_per_iteration": 2.5397536754608154 + }, + { + "auxiliary_loss_clip": 0.01130382, + "auxiliary_loss_mlp": 0.0103862, + "balance_loss_clip": 1.05010235, + "balance_loss_mlp": 1.02169204, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 1.8729146983287912, + "language_loss": 0.67065579, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.6923458, + "num_input_tokens_seen": 101312815, + "step": 4695, + "time_per_iteration": 4.062562942504883 + }, + { + "auxiliary_loss_clip": 0.01130593, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.05367959, + "balance_loss_mlp": 1.02729857, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.0802786889628764, + "language_loss": 0.76078826, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.78256273, + "num_input_tokens_seen": 101329045, + "step": 4696, + "time_per_iteration": 2.508561611175537 + }, + { + "auxiliary_loss_clip": 0.01108912, + "auxiliary_loss_mlp": 0.0104741, + "balance_loss_clip": 1.04891324, + "balance_loss_mlp": 1.03031516, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 1.4901017295592223, + "language_loss": 0.62172824, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64329147, + "num_input_tokens_seen": 101352715, + "step": 4697, + "time_per_iteration": 2.711167573928833 + }, + { + "auxiliary_loss_clip": 0.01098087, + "auxiliary_loss_mlp": 0.01040931, + "balance_loss_clip": 1.04685426, + "balance_loss_mlp": 1.02457523, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 3.1691346540621987, + "language_loss": 0.72865796, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75004816, + "num_input_tokens_seen": 101374640, + "step": 4698, + "time_per_iteration": 2.605137586593628 + }, + { + "auxiliary_loss_clip": 0.01140423, + "auxiliary_loss_mlp": 0.01039309, + "balance_loss_clip": 1.05034304, + "balance_loss_mlp": 1.02350199, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 1.7283266326616, + "language_loss": 0.74892926, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77072656, + "num_input_tokens_seen": 101393595, + "step": 4699, + "time_per_iteration": 2.4754910469055176 + }, + { + "auxiliary_loss_clip": 0.01131799, + "auxiliary_loss_mlp": 0.01037397, + "balance_loss_clip": 1.04820728, + "balance_loss_mlp": 1.01924145, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 2.734744029464466, + "language_loss": 0.79960042, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.82129228, + "num_input_tokens_seen": 101409265, + "step": 4700, + "time_per_iteration": 2.4302704334259033 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01045077, + "balance_loss_clip": 1.05543184, + "balance_loss_mlp": 1.0304265, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.849620469339159, + "language_loss": 0.8193633, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.84102821, + "num_input_tokens_seen": 101428365, + "step": 4701, + "time_per_iteration": 2.5496559143066406 + }, + { + "auxiliary_loss_clip": 0.01082047, + "auxiliary_loss_mlp": 0.01041644, + "balance_loss_clip": 1.04869938, + "balance_loss_mlp": 1.02491844, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 3.3404475795503266, + "language_loss": 0.73263657, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75387353, + "num_input_tokens_seen": 101447280, + "step": 4702, + "time_per_iteration": 2.685265302658081 + }, + { + "auxiliary_loss_clip": 0.01143939, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_clip": 1.05408192, + "balance_loss_mlp": 1.02395618, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 2.5052631674790558, + "language_loss": 0.78514814, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.80698842, + "num_input_tokens_seen": 101465435, + "step": 4703, + "time_per_iteration": 2.4469380378723145 + }, + { + "auxiliary_loss_clip": 0.01116922, + "auxiliary_loss_mlp": 0.01047293, + "balance_loss_clip": 1.04912889, + "balance_loss_mlp": 1.02897072, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.5957578446421932, + "language_loss": 0.693681, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71532309, + "num_input_tokens_seen": 101486355, + "step": 4704, + "time_per_iteration": 2.6150310039520264 + }, + { + "auxiliary_loss_clip": 0.01114769, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.05782866, + "balance_loss_mlp": 1.02281821, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 1.8957399017912804, + "language_loss": 0.70211756, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.72366732, + "num_input_tokens_seen": 101505875, + "step": 4705, + "time_per_iteration": 2.5603482723236084 + }, + { + "auxiliary_loss_clip": 0.01067297, + "auxiliary_loss_mlp": 0.01019127, + "balance_loss_clip": 1.04220009, + "balance_loss_mlp": 1.01687396, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7305994100961137, + "language_loss": 0.59289348, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61375773, + "num_input_tokens_seen": 101565045, + "step": 4706, + "time_per_iteration": 3.1143743991851807 + }, + { + "auxiliary_loss_clip": 0.01117231, + "auxiliary_loss_mlp": 0.01035934, + "balance_loss_clip": 1.04774272, + "balance_loss_mlp": 1.02049625, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 5.663946098811928, + "language_loss": 0.82167864, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84321034, + "num_input_tokens_seen": 101585825, + "step": 4707, + "time_per_iteration": 2.6096556186676025 + }, + { + "auxiliary_loss_clip": 0.01126154, + "auxiliary_loss_mlp": 0.01038911, + "balance_loss_clip": 1.05093169, + "balance_loss_mlp": 1.02045798, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.7272868755593698, + "language_loss": 0.80476582, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82641649, + "num_input_tokens_seen": 101606105, + "step": 4708, + "time_per_iteration": 2.561432361602783 + }, + { + "auxiliary_loss_clip": 0.01056644, + "auxiliary_loss_mlp": 0.01006767, + "balance_loss_clip": 1.04024601, + "balance_loss_mlp": 1.00445414, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.8910621399308054, + "language_loss": 0.62818146, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64881551, + "num_input_tokens_seen": 101656875, + "step": 4709, + "time_per_iteration": 2.9486217498779297 + }, + { + "auxiliary_loss_clip": 0.01115532, + "auxiliary_loss_mlp": 0.01041568, + "balance_loss_clip": 1.04977822, + "balance_loss_mlp": 1.02325726, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.3981808199421593, + "language_loss": 0.73889554, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76046652, + "num_input_tokens_seen": 101676225, + "step": 4710, + "time_per_iteration": 2.52921724319458 + }, + { + "auxiliary_loss_clip": 0.01108958, + "auxiliary_loss_mlp": 0.01050369, + "balance_loss_clip": 1.04603064, + "balance_loss_mlp": 1.03203511, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.8899162543581147, + "language_loss": 0.78744173, + "learning_rate": 3.364140713048579e-06, + "loss": 0.809035, + "num_input_tokens_seen": 101693710, + "step": 4711, + "time_per_iteration": 2.5442240238189697 + }, + { + "auxiliary_loss_clip": 0.01136007, + "auxiliary_loss_mlp": 0.00783958, + "balance_loss_clip": 1.0533514, + "balance_loss_mlp": 1.00111508, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 2.257726213058789, + "language_loss": 0.71462524, + "learning_rate": 3.363855879093996e-06, + "loss": 0.73382485, + "num_input_tokens_seen": 101714010, + "step": 4712, + "time_per_iteration": 2.559492588043213 + }, + { + "auxiliary_loss_clip": 0.0114702, + "auxiliary_loss_mlp": 0.01051605, + "balance_loss_clip": 1.05399764, + "balance_loss_mlp": 1.0333662, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 2.18021045607202, + "language_loss": 0.8214674, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.84345365, + "num_input_tokens_seen": 101732995, + "step": 4713, + "time_per_iteration": 2.465471029281616 + }, + { + "auxiliary_loss_clip": 0.01122245, + "auxiliary_loss_mlp": 0.01041057, + "balance_loss_clip": 1.05320024, + "balance_loss_mlp": 1.02306843, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.8929833886800194, + "language_loss": 0.75408965, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77572268, + "num_input_tokens_seen": 101751385, + "step": 4714, + "time_per_iteration": 4.006071090698242 + }, + { + "auxiliary_loss_clip": 0.01129788, + "auxiliary_loss_mlp": 0.01047805, + "balance_loss_clip": 1.0506928, + "balance_loss_mlp": 1.03084147, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.444259662798357, + "language_loss": 0.78057998, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80235589, + "num_input_tokens_seen": 101773825, + "step": 4715, + "time_per_iteration": 2.5615296363830566 + }, + { + "auxiliary_loss_clip": 0.01125785, + "auxiliary_loss_mlp": 0.01040633, + "balance_loss_clip": 1.05437052, + "balance_loss_mlp": 1.02355051, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 1.7411435921084057, + "language_loss": 0.7357648, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.757429, + "num_input_tokens_seen": 101791920, + "step": 4716, + "time_per_iteration": 2.553363084793091 + }, + { + "auxiliary_loss_clip": 0.01122043, + "auxiliary_loss_mlp": 0.01045083, + "balance_loss_clip": 1.05001223, + "balance_loss_mlp": 1.0252583, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 3.340575667614652, + "language_loss": 0.74719632, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76886761, + "num_input_tokens_seen": 101809515, + "step": 4717, + "time_per_iteration": 2.482887029647827 + }, + { + "auxiliary_loss_clip": 0.01115173, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_clip": 1.05068183, + "balance_loss_mlp": 1.02741575, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.5337518903833463, + "language_loss": 0.66829777, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.68989748, + "num_input_tokens_seen": 101827735, + "step": 4718, + "time_per_iteration": 4.110759496688843 + }, + { + "auxiliary_loss_clip": 0.01126238, + "auxiliary_loss_mlp": 0.01047968, + "balance_loss_clip": 1.05284226, + "balance_loss_mlp": 1.03011048, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.6550798930517263, + "language_loss": 0.7237494, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74549145, + "num_input_tokens_seen": 101845970, + "step": 4719, + "time_per_iteration": 2.564391613006592 + }, + { + "auxiliary_loss_clip": 0.0113343, + "auxiliary_loss_mlp": 0.0104226, + "balance_loss_clip": 1.052356, + "balance_loss_mlp": 1.02502227, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.804438066176265, + "language_loss": 0.80140263, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82315952, + "num_input_tokens_seen": 101865040, + "step": 4720, + "time_per_iteration": 2.4892666339874268 + }, + { + "auxiliary_loss_clip": 0.01133207, + "auxiliary_loss_mlp": 0.01041427, + "balance_loss_clip": 1.05013752, + "balance_loss_mlp": 1.0242846, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 2.321002403637757, + "language_loss": 0.79241866, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.814165, + "num_input_tokens_seen": 101883735, + "step": 4721, + "time_per_iteration": 2.4701225757598877 + }, + { + "auxiliary_loss_clip": 0.0109842, + "auxiliary_loss_mlp": 0.00786745, + "balance_loss_clip": 1.0532949, + "balance_loss_mlp": 1.0011878, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 2.0483238799894727, + "language_loss": 0.8235001, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84235179, + "num_input_tokens_seen": 101903025, + "step": 4722, + "time_per_iteration": 2.675503730773926 + }, + { + "auxiliary_loss_clip": 0.01148407, + "auxiliary_loss_mlp": 0.01040349, + "balance_loss_clip": 1.05629754, + "balance_loss_mlp": 1.02439833, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 3.0951736021014327, + "language_loss": 0.70232201, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72420961, + "num_input_tokens_seen": 101922255, + "step": 4723, + "time_per_iteration": 3.799989700317383 + }, + { + "auxiliary_loss_clip": 0.0111634, + "auxiliary_loss_mlp": 0.01040355, + "balance_loss_clip": 1.05102968, + "balance_loss_mlp": 1.02212787, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.6402508579690938, + "language_loss": 0.78674245, + "learning_rate": 3.360433840760998e-06, + "loss": 0.80830944, + "num_input_tokens_seen": 101943100, + "step": 4724, + "time_per_iteration": 2.5545156002044678 + }, + { + "auxiliary_loss_clip": 0.01122025, + "auxiliary_loss_mlp": 0.01052922, + "balance_loss_clip": 1.0537014, + "balance_loss_mlp": 1.03440881, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.5928254618980668, + "language_loss": 0.92423046, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94597995, + "num_input_tokens_seen": 101963160, + "step": 4725, + "time_per_iteration": 2.524562120437622 + }, + { + "auxiliary_loss_clip": 0.01135384, + "auxiliary_loss_mlp": 0.01041639, + "balance_loss_clip": 1.05200958, + "balance_loss_mlp": 1.02396071, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 1.8796811860745546, + "language_loss": 0.88531548, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90708572, + "num_input_tokens_seen": 101984300, + "step": 4726, + "time_per_iteration": 2.5242316722869873 + }, + { + "auxiliary_loss_clip": 0.01131423, + "auxiliary_loss_mlp": 0.01045559, + "balance_loss_clip": 1.0528183, + "balance_loss_mlp": 1.02816665, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 6.4518499869396795, + "language_loss": 0.78334594, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80511576, + "num_input_tokens_seen": 102005765, + "step": 4727, + "time_per_iteration": 2.7145323753356934 + }, + { + "auxiliary_loss_clip": 0.01134522, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.05531931, + "balance_loss_mlp": 1.02384567, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 9.676606814434765, + "language_loss": 0.6694653, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.69119668, + "num_input_tokens_seen": 102022755, + "step": 4728, + "time_per_iteration": 2.520010471343994 + }, + { + "auxiliary_loss_clip": 0.01112739, + "auxiliary_loss_mlp": 0.01046505, + "balance_loss_clip": 1.05271053, + "balance_loss_mlp": 1.02951801, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.861926054477741, + "language_loss": 0.76745307, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78904557, + "num_input_tokens_seen": 102041850, + "step": 4729, + "time_per_iteration": 2.5373995304107666 + }, + { + "auxiliary_loss_clip": 0.01126175, + "auxiliary_loss_mlp": 0.01047187, + "balance_loss_clip": 1.05918813, + "balance_loss_mlp": 1.03023553, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 2.0415258592009446, + "language_loss": 0.66555858, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68729222, + "num_input_tokens_seen": 102059500, + "step": 4730, + "time_per_iteration": 2.5568349361419678 + }, + { + "auxiliary_loss_clip": 0.01123504, + "auxiliary_loss_mlp": 0.01037902, + "balance_loss_clip": 1.05385828, + "balance_loss_mlp": 1.02059269, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 2.1042502242989447, + "language_loss": 0.75078213, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.77239615, + "num_input_tokens_seen": 102080460, + "step": 4731, + "time_per_iteration": 2.5625736713409424 + }, + { + "auxiliary_loss_clip": 0.01098984, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.05209374, + "balance_loss_mlp": 1.01824212, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.6562547321982466, + "language_loss": 0.8403132, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.86164951, + "num_input_tokens_seen": 102100950, + "step": 4732, + "time_per_iteration": 2.590465784072876 + }, + { + "auxiliary_loss_clip": 0.01135898, + "auxiliary_loss_mlp": 0.01050108, + "balance_loss_clip": 1.05391288, + "balance_loss_mlp": 1.03188086, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.7270887796649426, + "language_loss": 0.78927058, + "learning_rate": 3.357862435944109e-06, + "loss": 0.81113064, + "num_input_tokens_seen": 102119345, + "step": 4733, + "time_per_iteration": 2.4881794452667236 + }, + { + "auxiliary_loss_clip": 0.01148787, + "auxiliary_loss_mlp": 0.0105304, + "balance_loss_clip": 1.05322611, + "balance_loss_mlp": 1.03532529, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 3.6482913942515283, + "language_loss": 0.71634734, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73836565, + "num_input_tokens_seen": 102139050, + "step": 4734, + "time_per_iteration": 3.958815336227417 + }, + { + "auxiliary_loss_clip": 0.01119445, + "auxiliary_loss_mlp": 0.01032657, + "balance_loss_clip": 1.05053174, + "balance_loss_mlp": 1.01669538, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.9522043904414315, + "language_loss": 0.74345183, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.76497287, + "num_input_tokens_seen": 102157935, + "step": 4735, + "time_per_iteration": 2.4973764419555664 + }, + { + "auxiliary_loss_clip": 0.01120352, + "auxiliary_loss_mlp": 0.01043989, + "balance_loss_clip": 1.05155218, + "balance_loss_mlp": 1.02805066, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.698699636553964, + "language_loss": 0.79615903, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81780243, + "num_input_tokens_seen": 102175325, + "step": 4736, + "time_per_iteration": 2.4856910705566406 + }, + { + "auxiliary_loss_clip": 0.01147368, + "auxiliary_loss_mlp": 0.01044956, + "balance_loss_clip": 1.05506516, + "balance_loss_mlp": 1.02786183, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 3.0115942160591613, + "language_loss": 0.59592104, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.61784428, + "num_input_tokens_seen": 102196625, + "step": 4737, + "time_per_iteration": 2.5063226222991943 + }, + { + "auxiliary_loss_clip": 0.01127167, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.04930246, + "balance_loss_mlp": 1.02040267, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.7768783964784194, + "language_loss": 0.86418128, + "learning_rate": 3.356432075047052e-06, + "loss": 0.88581371, + "num_input_tokens_seen": 102214975, + "step": 4738, + "time_per_iteration": 2.474618434906006 + }, + { + "auxiliary_loss_clip": 0.01117747, + "auxiliary_loss_mlp": 0.01045624, + "balance_loss_clip": 1.05261612, + "balance_loss_mlp": 1.02773094, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 1.943504204706806, + "language_loss": 0.89570564, + "learning_rate": 3.356145848516118e-06, + "loss": 0.91733932, + "num_input_tokens_seen": 102231885, + "step": 4739, + "time_per_iteration": 2.520581007003784 + }, + { + "auxiliary_loss_clip": 0.01130344, + "auxiliary_loss_mlp": 0.01042045, + "balance_loss_clip": 1.05320024, + "balance_loss_mlp": 1.02534366, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.3890787569151906, + "language_loss": 0.72257668, + "learning_rate": 3.355859570559998e-06, + "loss": 0.7443006, + "num_input_tokens_seen": 102252725, + "step": 4740, + "time_per_iteration": 2.504361391067505 + }, + { + "auxiliary_loss_clip": 0.01125734, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.05663776, + "balance_loss_mlp": 1.02325892, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.6225057899080837, + "language_loss": 0.77764559, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.79929453, + "num_input_tokens_seen": 102271730, + "step": 4741, + "time_per_iteration": 2.5396804809570312 + }, + { + "auxiliary_loss_clip": 0.01104528, + "auxiliary_loss_mlp": 0.01044124, + "balance_loss_clip": 1.04864013, + "balance_loss_mlp": 1.02707708, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.6478522074288988, + "language_loss": 0.76076275, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78224927, + "num_input_tokens_seen": 102291325, + "step": 4742, + "time_per_iteration": 2.5219855308532715 + }, + { + "auxiliary_loss_clip": 0.01147526, + "auxiliary_loss_mlp": 0.01056153, + "balance_loss_clip": 1.05211568, + "balance_loss_mlp": 1.03715098, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 1.9346991474491533, + "language_loss": 0.57831913, + "learning_rate": 3.355000428249086e-06, + "loss": 0.60035586, + "num_input_tokens_seen": 102309000, + "step": 4743, + "time_per_iteration": 2.4156365394592285 + }, + { + "auxiliary_loss_clip": 0.01118052, + "auxiliary_loss_mlp": 0.0104851, + "balance_loss_clip": 1.05393934, + "balance_loss_mlp": 1.0314157, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 1.7354687479155946, + "language_loss": 0.74215698, + "learning_rate": 3.354713944700797e-06, + "loss": 0.76382262, + "num_input_tokens_seen": 102329240, + "step": 4744, + "time_per_iteration": 2.602308511734009 + }, + { + "auxiliary_loss_clip": 0.01126258, + "auxiliary_loss_mlp": 0.01044683, + "balance_loss_clip": 1.05150318, + "balance_loss_mlp": 1.02872062, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.629569552117162, + "language_loss": 0.77743012, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.7991395, + "num_input_tokens_seen": 102344440, + "step": 4745, + "time_per_iteration": 2.4399311542510986 + }, + { + "auxiliary_loss_clip": 0.0112181, + "auxiliary_loss_mlp": 0.01036304, + "balance_loss_clip": 1.05410111, + "balance_loss_mlp": 1.02098513, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.6870386257815515, + "language_loss": 0.82749844, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.84907955, + "num_input_tokens_seen": 102360985, + "step": 4746, + "time_per_iteration": 2.4357128143310547 + }, + { + "auxiliary_loss_clip": 0.01098607, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.04698753, + "balance_loss_mlp": 1.02214718, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.9212931557755966, + "language_loss": 0.79629934, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.8176837, + "num_input_tokens_seen": 102380320, + "step": 4747, + "time_per_iteration": 2.562934398651123 + }, + { + "auxiliary_loss_clip": 0.01054883, + "auxiliary_loss_mlp": 0.01004941, + "balance_loss_clip": 1.03515089, + "balance_loss_mlp": 1.00296199, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7716905317616896, + "language_loss": 0.60450137, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62509954, + "num_input_tokens_seen": 102439140, + "step": 4748, + "time_per_iteration": 3.019305467605591 + }, + { + "auxiliary_loss_clip": 0.01142133, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.05131805, + "balance_loss_mlp": 1.0289551, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.0631839883699445, + "language_loss": 0.80264103, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82452273, + "num_input_tokens_seen": 102450990, + "step": 4749, + "time_per_iteration": 2.3920602798461914 + }, + { + "auxiliary_loss_clip": 0.01131401, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.05109119, + "balance_loss_mlp": 1.02197838, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 2.244167398558475, + "language_loss": 0.70470554, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72640204, + "num_input_tokens_seen": 102471820, + "step": 4750, + "time_per_iteration": 2.5322515964508057 + }, + { + "auxiliary_loss_clip": 0.01129471, + "auxiliary_loss_mlp": 0.01037651, + "balance_loss_clip": 1.05258822, + "balance_loss_mlp": 1.02146816, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.661677259137255, + "language_loss": 0.81905782, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.84072906, + "num_input_tokens_seen": 102492625, + "step": 4751, + "time_per_iteration": 2.5957748889923096 + }, + { + "auxiliary_loss_clip": 0.01140567, + "auxiliary_loss_mlp": 0.01041321, + "balance_loss_clip": 1.05163825, + "balance_loss_mlp": 1.02532315, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 2.2767362992652314, + "language_loss": 0.80016303, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82198191, + "num_input_tokens_seen": 102514145, + "step": 4752, + "time_per_iteration": 2.5990102291107178 + }, + { + "auxiliary_loss_clip": 0.01127207, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.04738796, + "balance_loss_mlp": 1.02292156, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 1.8417343824067194, + "language_loss": 0.78816861, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.80983245, + "num_input_tokens_seen": 102532365, + "step": 4753, + "time_per_iteration": 3.9401700496673584 + }, + { + "auxiliary_loss_clip": 0.01145279, + "auxiliary_loss_mlp": 0.01041536, + "balance_loss_clip": 1.05187488, + "balance_loss_mlp": 1.02299893, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.226825388980367, + "language_loss": 0.89697003, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91883814, + "num_input_tokens_seen": 102548425, + "step": 4754, + "time_per_iteration": 2.4108669757843018 + }, + { + "auxiliary_loss_clip": 0.01128254, + "auxiliary_loss_mlp": 0.0103917, + "balance_loss_clip": 1.05035543, + "balance_loss_mlp": 1.02361333, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 1.8185460046264816, + "language_loss": 0.82566178, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84733593, + "num_input_tokens_seen": 102566370, + "step": 4755, + "time_per_iteration": 2.520503282546997 + }, + { + "auxiliary_loss_clip": 0.01097529, + "auxiliary_loss_mlp": 0.01039168, + "balance_loss_clip": 1.04795098, + "balance_loss_mlp": 1.02307487, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.5719674310513774, + "language_loss": 0.83567059, + "learning_rate": 3.351272138300922e-06, + "loss": 0.85703748, + "num_input_tokens_seen": 102588715, + "step": 4756, + "time_per_iteration": 2.685795545578003 + }, + { + "auxiliary_loss_clip": 0.01040035, + "auxiliary_loss_mlp": 0.01015628, + "balance_loss_clip": 1.03266096, + "balance_loss_mlp": 1.01248121, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8652309581988032, + "language_loss": 0.61002648, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63058305, + "num_input_tokens_seen": 102656715, + "step": 4757, + "time_per_iteration": 4.765835285186768 + }, + { + "auxiliary_loss_clip": 0.01143488, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.05406857, + "balance_loss_mlp": 1.01794124, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 2.7521329403429085, + "language_loss": 0.65885347, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.68062496, + "num_input_tokens_seen": 102676545, + "step": 4758, + "time_per_iteration": 2.468766450881958 + }, + { + "auxiliary_loss_clip": 0.01130151, + "auxiliary_loss_mlp": 0.01035913, + "balance_loss_clip": 1.05022049, + "balance_loss_mlp": 1.0202843, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.630567899268176, + "language_loss": 0.62656218, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.6482228, + "num_input_tokens_seen": 102702875, + "step": 4759, + "time_per_iteration": 2.6265957355499268 + }, + { + "auxiliary_loss_clip": 0.01128726, + "auxiliary_loss_mlp": 0.00783585, + "balance_loss_clip": 1.05170977, + "balance_loss_mlp": 1.00108814, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 4.108160721554859, + "language_loss": 0.74305868, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76218176, + "num_input_tokens_seen": 102723160, + "step": 4760, + "time_per_iteration": 2.4872629642486572 + }, + { + "auxiliary_loss_clip": 0.01122564, + "auxiliary_loss_mlp": 0.01038735, + "balance_loss_clip": 1.05658495, + "balance_loss_mlp": 1.02408433, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 2.0426853555077487, + "language_loss": 0.72756755, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.7491805, + "num_input_tokens_seen": 102743855, + "step": 4761, + "time_per_iteration": 2.5519723892211914 + }, + { + "auxiliary_loss_clip": 0.01082798, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.05150771, + "balance_loss_mlp": 1.02908242, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 1.925927054473653, + "language_loss": 0.74588108, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76716042, + "num_input_tokens_seen": 102761370, + "step": 4762, + "time_per_iteration": 2.669043779373169 + }, + { + "auxiliary_loss_clip": 0.0110536, + "auxiliary_loss_mlp": 0.01040049, + "balance_loss_clip": 1.0485462, + "balance_loss_mlp": 1.02362156, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.5669443240341974, + "language_loss": 0.76227421, + "learning_rate": 3.349261009210496e-06, + "loss": 0.7837283, + "num_input_tokens_seen": 102780885, + "step": 4763, + "time_per_iteration": 3.9172627925872803 + }, + { + "auxiliary_loss_clip": 0.01100091, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.04344273, + "balance_loss_mlp": 1.01990819, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 1.6438181998652437, + "language_loss": 0.77075732, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79212528, + "num_input_tokens_seen": 102801000, + "step": 4764, + "time_per_iteration": 2.652261734008789 + }, + { + "auxiliary_loss_clip": 0.01108834, + "auxiliary_loss_mlp": 0.01046351, + "balance_loss_clip": 1.04802525, + "balance_loss_mlp": 1.02826667, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 3.0575113102624214, + "language_loss": 0.71012604, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73167789, + "num_input_tokens_seen": 102820230, + "step": 4765, + "time_per_iteration": 2.5474822521209717 + }, + { + "auxiliary_loss_clip": 0.01124622, + "auxiliary_loss_mlp": 0.01042574, + "balance_loss_clip": 1.04916859, + "balance_loss_mlp": 1.02698112, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.6761669848970453, + "language_loss": 0.76096213, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.78263402, + "num_input_tokens_seen": 102842670, + "step": 4766, + "time_per_iteration": 2.5855534076690674 + }, + { + "auxiliary_loss_clip": 0.01128937, + "auxiliary_loss_mlp": 0.01034473, + "balance_loss_clip": 1.05172431, + "balance_loss_mlp": 1.01899922, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.685036928181708, + "language_loss": 0.77634877, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79798281, + "num_input_tokens_seen": 102864480, + "step": 4767, + "time_per_iteration": 2.538738250732422 + }, + { + "auxiliary_loss_clip": 0.01140516, + "auxiliary_loss_mlp": 0.01042424, + "balance_loss_clip": 1.052423, + "balance_loss_mlp": 1.0264976, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 2.982088232999748, + "language_loss": 0.65590131, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67773068, + "num_input_tokens_seen": 102883740, + "step": 4768, + "time_per_iteration": 2.4486589431762695 + }, + { + "auxiliary_loss_clip": 0.01121305, + "auxiliary_loss_mlp": 0.01039186, + "balance_loss_clip": 1.04931927, + "balance_loss_mlp": 1.02334285, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 6.977315858591216, + "language_loss": 0.70835733, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72996223, + "num_input_tokens_seen": 102902945, + "step": 4769, + "time_per_iteration": 2.4958040714263916 + }, + { + "auxiliary_loss_clip": 0.01077081, + "auxiliary_loss_mlp": 0.01036644, + "balance_loss_clip": 1.04417503, + "balance_loss_mlp": 1.02139688, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.9545812463965442, + "language_loss": 0.74899805, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.77013528, + "num_input_tokens_seen": 102922405, + "step": 4770, + "time_per_iteration": 2.5615177154541016 + }, + { + "auxiliary_loss_clip": 0.01098313, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.04974902, + "balance_loss_mlp": 1.0230366, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 2.2938496568946407, + "language_loss": 0.67567962, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69705337, + "num_input_tokens_seen": 102938980, + "step": 4771, + "time_per_iteration": 2.6029672622680664 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.01004377, + "balance_loss_clip": 1.03381515, + "balance_loss_mlp": 1.00194478, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7774003861560476, + "language_loss": 0.56886888, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58950043, + "num_input_tokens_seen": 103000405, + "step": 4772, + "time_per_iteration": 3.0108587741851807 + }, + { + "auxiliary_loss_clip": 0.01069752, + "auxiliary_loss_mlp": 0.0078621, + "balance_loss_clip": 1.04419577, + "balance_loss_mlp": 1.00109982, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.38599858075589, + "language_loss": 0.83050615, + "learning_rate": 3.346383619630856e-06, + "loss": 0.84906578, + "num_input_tokens_seen": 103017970, + "step": 4773, + "time_per_iteration": 4.114518404006958 + }, + { + "auxiliary_loss_clip": 0.01143127, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.05136728, + "balance_loss_mlp": 1.02368546, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.366983384336656, + "language_loss": 0.77579474, + "learning_rate": 3.34609559969027e-06, + "loss": 0.7976256, + "num_input_tokens_seen": 103036385, + "step": 4774, + "time_per_iteration": 2.4934189319610596 + }, + { + "auxiliary_loss_clip": 0.01121335, + "auxiliary_loss_mlp": 0.01041236, + "balance_loss_clip": 1.05144048, + "balance_loss_mlp": 1.02410531, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 1.9904716703436114, + "language_loss": 0.73336858, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75499427, + "num_input_tokens_seen": 103052170, + "step": 4775, + "time_per_iteration": 2.487929344177246 + }, + { + "auxiliary_loss_clip": 0.01135496, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.05263376, + "balance_loss_mlp": 1.02596927, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 1.6533781806412953, + "language_loss": 0.88240218, + "learning_rate": 3.34551940668778e-06, + "loss": 0.90417767, + "num_input_tokens_seen": 103070510, + "step": 4776, + "time_per_iteration": 2.4664478302001953 + }, + { + "auxiliary_loss_clip": 0.01132476, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.05287671, + "balance_loss_mlp": 1.02223158, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 1.8000657122538914, + "language_loss": 0.74313831, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76483881, + "num_input_tokens_seen": 103089590, + "step": 4777, + "time_per_iteration": 2.466658115386963 + }, + { + "auxiliary_loss_clip": 0.01128646, + "auxiliary_loss_mlp": 0.01043603, + "balance_loss_clip": 1.0546031, + "balance_loss_mlp": 1.02650845, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 2.011253129020381, + "language_loss": 0.80420911, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82593155, + "num_input_tokens_seen": 103109080, + "step": 4778, + "time_per_iteration": 2.5699803829193115 + }, + { + "auxiliary_loss_clip": 0.01119343, + "auxiliary_loss_mlp": 0.01041556, + "balance_loss_clip": 1.0526123, + "balance_loss_mlp": 1.02567744, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.7443713557861051, + "language_loss": 0.73684156, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.75845051, + "num_input_tokens_seen": 103127755, + "step": 4779, + "time_per_iteration": 2.5384716987609863 + }, + { + "auxiliary_loss_clip": 0.01121982, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.04879045, + "balance_loss_mlp": 1.02106607, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.7014154328058757, + "language_loss": 0.76308841, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78468728, + "num_input_tokens_seen": 103147035, + "step": 4780, + "time_per_iteration": 2.512096643447876 + }, + { + "auxiliary_loss_clip": 0.01105205, + "auxiliary_loss_mlp": 0.01043203, + "balance_loss_clip": 1.04653716, + "balance_loss_mlp": 1.02779508, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.6425782797988189, + "language_loss": 0.81476462, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83624876, + "num_input_tokens_seen": 103165410, + "step": 4781, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.01105853, + "auxiliary_loss_mlp": 0.01045498, + "balance_loss_clip": 1.05105186, + "balance_loss_mlp": 1.027843, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 1.9969883818757652, + "language_loss": 0.86481273, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88632625, + "num_input_tokens_seen": 103183710, + "step": 4782, + "time_per_iteration": 2.5548150539398193 + }, + { + "auxiliary_loss_clip": 0.01115951, + "auxiliary_loss_mlp": 0.0104356, + "balance_loss_clip": 1.05796611, + "balance_loss_mlp": 1.02700806, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.550173008211044, + "language_loss": 0.70957899, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73117411, + "num_input_tokens_seen": 103203790, + "step": 4783, + "time_per_iteration": 2.563427448272705 + }, + { + "auxiliary_loss_clip": 0.01122414, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.05558693, + "balance_loss_mlp": 1.02677691, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 3.6603954048293863, + "language_loss": 0.76650929, + "learning_rate": 3.343212594663047e-06, + "loss": 0.7881676, + "num_input_tokens_seen": 103223925, + "step": 4784, + "time_per_iteration": 2.604412794113159 + }, + { + "auxiliary_loss_clip": 0.01096899, + "auxiliary_loss_mlp": 0.01045294, + "balance_loss_clip": 1.04844701, + "balance_loss_mlp": 1.02807999, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.4587977133808494, + "language_loss": 0.75671315, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.77813506, + "num_input_tokens_seen": 103244760, + "step": 4785, + "time_per_iteration": 2.5892655849456787 + }, + { + "auxiliary_loss_clip": 0.01143078, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.05381989, + "balance_loss_mlp": 1.02981377, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 2.175168620997094, + "language_loss": 0.8283385, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85022289, + "num_input_tokens_seen": 103261995, + "step": 4786, + "time_per_iteration": 2.506667375564575 + }, + { + "auxiliary_loss_clip": 0.01113817, + "auxiliary_loss_mlp": 0.00781289, + "balance_loss_clip": 1.05460477, + "balance_loss_mlp": 1.00112236, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.696991205301572, + "language_loss": 0.79786599, + "learning_rate": 3.342346699429516e-06, + "loss": 0.8168171, + "num_input_tokens_seen": 103279780, + "step": 4787, + "time_per_iteration": 2.566420555114746 + }, + { + "auxiliary_loss_clip": 0.01122936, + "auxiliary_loss_mlp": 0.01038638, + "balance_loss_clip": 1.05285311, + "balance_loss_mlp": 1.02266383, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 1.9498206571401262, + "language_loss": 0.83889163, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.86050737, + "num_input_tokens_seen": 103300580, + "step": 4788, + "time_per_iteration": 2.5552937984466553 + }, + { + "auxiliary_loss_clip": 0.01105593, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.05260992, + "balance_loss_mlp": 1.02755356, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 2.000481495676167, + "language_loss": 0.73262024, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.7541132, + "num_input_tokens_seen": 103320430, + "step": 4789, + "time_per_iteration": 2.6397194862365723 + }, + { + "auxiliary_loss_clip": 0.01123408, + "auxiliary_loss_mlp": 0.01036014, + "balance_loss_clip": 1.05094337, + "balance_loss_mlp": 1.02068365, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.9439009498244022, + "language_loss": 0.83785504, + "learning_rate": 3.341480346078704e-06, + "loss": 0.85944927, + "num_input_tokens_seen": 103337695, + "step": 4790, + "time_per_iteration": 2.522048234939575 + }, + { + "auxiliary_loss_clip": 0.01134739, + "auxiliary_loss_mlp": 0.0104271, + "balance_loss_clip": 1.05337906, + "balance_loss_mlp": 1.0264734, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 4.353417884558258, + "language_loss": 0.77739936, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.79917389, + "num_input_tokens_seen": 103357010, + "step": 4791, + "time_per_iteration": 2.4900786876678467 + }, + { + "auxiliary_loss_clip": 0.01119504, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.05195284, + "balance_loss_mlp": 1.02133572, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.9568255247838051, + "language_loss": 0.70581293, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72737837, + "num_input_tokens_seen": 103375600, + "step": 4792, + "time_per_iteration": 2.492445945739746 + }, + { + "auxiliary_loss_clip": 0.01106381, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.05640626, + "balance_loss_mlp": 1.02433944, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 1.6554171522671337, + "language_loss": 0.79138565, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81285071, + "num_input_tokens_seen": 103395225, + "step": 4793, + "time_per_iteration": 4.034681558609009 + }, + { + "auxiliary_loss_clip": 0.01116592, + "auxiliary_loss_mlp": 0.01041816, + "balance_loss_clip": 1.05056763, + "balance_loss_mlp": 1.02658141, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.7107654886499148, + "language_loss": 0.78135103, + "learning_rate": 3.340324496161797e-06, + "loss": 0.80293506, + "num_input_tokens_seen": 103417245, + "step": 4794, + "time_per_iteration": 2.722047805786133 + }, + { + "auxiliary_loss_clip": 0.01131496, + "auxiliary_loss_mlp": 0.01040273, + "balance_loss_clip": 1.05280292, + "balance_loss_mlp": 1.02417982, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.1949723885368706, + "language_loss": 0.82609022, + "learning_rate": 3.340035406592074e-06, + "loss": 0.84780788, + "num_input_tokens_seen": 103435500, + "step": 4795, + "time_per_iteration": 2.4708197116851807 + }, + { + "auxiliary_loss_clip": 0.01127157, + "auxiliary_loss_mlp": 0.01043528, + "balance_loss_clip": 1.05205095, + "balance_loss_mlp": 1.02818537, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 2.0693187301824905, + "language_loss": 0.7457546, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76746142, + "num_input_tokens_seen": 103451040, + "step": 4796, + "time_per_iteration": 2.4846253395080566 + }, + { + "auxiliary_loss_clip": 0.01135254, + "auxiliary_loss_mlp": 0.01043323, + "balance_loss_clip": 1.05174446, + "balance_loss_mlp": 1.02599025, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 2.3831956724258396, + "language_loss": 0.72541428, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.74720007, + "num_input_tokens_seen": 103471330, + "step": 4797, + "time_per_iteration": 3.97004771232605 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.00782782, + "balance_loss_clip": 1.04506636, + "balance_loss_mlp": 1.00108111, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 2.238210948054506, + "language_loss": 0.74463379, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76349723, + "num_input_tokens_seen": 103488060, + "step": 4798, + "time_per_iteration": 2.5230796337127686 + }, + { + "auxiliary_loss_clip": 0.01134079, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.05182219, + "balance_loss_mlp": 1.0238868, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 3.0327269240578323, + "language_loss": 0.65094638, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67270172, + "num_input_tokens_seen": 103503600, + "step": 4799, + "time_per_iteration": 2.488895893096924 + }, + { + "auxiliary_loss_clip": 0.01144461, + "auxiliary_loss_mlp": 0.01045955, + "balance_loss_clip": 1.05283999, + "balance_loss_mlp": 1.02939725, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 1.7528918357068024, + "language_loss": 0.82170713, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84361124, + "num_input_tokens_seen": 103524195, + "step": 4800, + "time_per_iteration": 2.4612839221954346 + }, + { + "auxiliary_loss_clip": 0.01105772, + "auxiliary_loss_mlp": 0.0103508, + "balance_loss_clip": 1.04799843, + "balance_loss_mlp": 1.01889086, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 1.8533471686588996, + "language_loss": 0.90985703, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93126559, + "num_input_tokens_seen": 103545235, + "step": 4801, + "time_per_iteration": 2.603377342224121 + }, + { + "auxiliary_loss_clip": 0.01117227, + "auxiliary_loss_mlp": 0.00782297, + "balance_loss_clip": 1.05168033, + "balance_loss_mlp": 1.00111902, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.1458889200363087, + "language_loss": 0.73755848, + "learning_rate": 3.33801035741839e-06, + "loss": 0.75655365, + "num_input_tokens_seen": 103563305, + "step": 4802, + "time_per_iteration": 4.014875888824463 + }, + { + "auxiliary_loss_clip": 0.01048069, + "auxiliary_loss_mlp": 0.01002585, + "balance_loss_clip": 1.0420481, + "balance_loss_mlp": 1.00026083, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7783505109873209, + "language_loss": 0.62928104, + "learning_rate": 3.337720861641558e-06, + "loss": 0.64978755, + "num_input_tokens_seen": 103625025, + "step": 4803, + "time_per_iteration": 3.217595100402832 + }, + { + "auxiliary_loss_clip": 0.01084954, + "auxiliary_loss_mlp": 0.01046762, + "balance_loss_clip": 1.04110301, + "balance_loss_mlp": 1.03024542, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 2.060307673611002, + "language_loss": 0.70906973, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.73038685, + "num_input_tokens_seen": 103644235, + "step": 4804, + "time_per_iteration": 2.583979368209839 + }, + { + "auxiliary_loss_clip": 0.01133796, + "auxiliary_loss_mlp": 0.01040446, + "balance_loss_clip": 1.050843, + "balance_loss_mlp": 1.02307773, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 2.1904684910540295, + "language_loss": 0.68525028, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70699275, + "num_input_tokens_seen": 103664700, + "step": 4805, + "time_per_iteration": 2.5151124000549316 + }, + { + "auxiliary_loss_clip": 0.01131753, + "auxiliary_loss_mlp": 0.01039227, + "balance_loss_clip": 1.05166888, + "balance_loss_mlp": 1.02371836, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.4421936611777555, + "language_loss": 0.69274002, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71444988, + "num_input_tokens_seen": 103686595, + "step": 4806, + "time_per_iteration": 2.5850207805633545 + }, + { + "auxiliary_loss_clip": 0.0112023, + "auxiliary_loss_mlp": 0.01043561, + "balance_loss_clip": 1.05076087, + "balance_loss_mlp": 1.02732515, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.471218307824567, + "language_loss": 0.7161501, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73778808, + "num_input_tokens_seen": 103707525, + "step": 4807, + "time_per_iteration": 2.5974714756011963 + }, + { + "auxiliary_loss_clip": 0.01102955, + "auxiliary_loss_mlp": 0.01044007, + "balance_loss_clip": 1.0501169, + "balance_loss_mlp": 1.02750838, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 2.7991070869335983, + "language_loss": 0.81511104, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83658063, + "num_input_tokens_seen": 103727905, + "step": 4808, + "time_per_iteration": 2.5591371059417725 + }, + { + "auxiliary_loss_clip": 0.01096557, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.04742563, + "balance_loss_mlp": 1.02893448, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.6488847917417147, + "language_loss": 0.78230035, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80372596, + "num_input_tokens_seen": 103748335, + "step": 4809, + "time_per_iteration": 2.5567855834960938 + }, + { + "auxiliary_loss_clip": 0.01090504, + "auxiliary_loss_mlp": 0.01037455, + "balance_loss_clip": 1.04437995, + "balance_loss_mlp": 1.01974046, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 2.063212455278179, + "language_loss": 0.78765237, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80893195, + "num_input_tokens_seen": 103767020, + "step": 4810, + "time_per_iteration": 2.5972511768341064 + }, + { + "auxiliary_loss_clip": 0.01089728, + "auxiliary_loss_mlp": 0.01036672, + "balance_loss_clip": 1.04500842, + "balance_loss_mlp": 1.02100158, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 1.736455151846456, + "language_loss": 0.76945746, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79072148, + "num_input_tokens_seen": 103786355, + "step": 4811, + "time_per_iteration": 2.599724054336548 + }, + { + "auxiliary_loss_clip": 0.01128962, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.05075622, + "balance_loss_mlp": 1.02189898, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.4212977166898288, + "language_loss": 0.7745232, + "learning_rate": 3.335113118275117e-06, + "loss": 0.7962023, + "num_input_tokens_seen": 103809345, + "step": 4812, + "time_per_iteration": 2.5693674087524414 + }, + { + "auxiliary_loss_clip": 0.01037096, + "auxiliary_loss_mlp": 0.01014239, + "balance_loss_clip": 1.04011536, + "balance_loss_mlp": 1.01189065, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 16.587701476696434, + "language_loss": 0.60255468, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62306798, + "num_input_tokens_seen": 103871180, + "step": 4813, + "time_per_iteration": 4.880128383636475 + }, + { + "auxiliary_loss_clip": 0.01093823, + "auxiliary_loss_mlp": 0.01042954, + "balance_loss_clip": 1.04276848, + "balance_loss_mlp": 1.0259192, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 2.1077049810291264, + "language_loss": 0.82445306, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.84582078, + "num_input_tokens_seen": 103889040, + "step": 4814, + "time_per_iteration": 2.520932197570801 + }, + { + "auxiliary_loss_clip": 0.01099939, + "auxiliary_loss_mlp": 0.01046301, + "balance_loss_clip": 1.05116582, + "balance_loss_mlp": 1.02944458, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.7581045967598903, + "language_loss": 0.72869807, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.75016046, + "num_input_tokens_seen": 103910380, + "step": 4815, + "time_per_iteration": 2.619260311126709 + }, + { + "auxiliary_loss_clip": 0.0112668, + "auxiliary_loss_mlp": 0.01044772, + "balance_loss_clip": 1.04986882, + "balance_loss_mlp": 1.03023446, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.3979514427504436, + "language_loss": 0.70541441, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.72712892, + "num_input_tokens_seen": 103929955, + "step": 4816, + "time_per_iteration": 2.5002400875091553 + }, + { + "auxiliary_loss_clip": 0.01115644, + "auxiliary_loss_mlp": 0.01044205, + "balance_loss_clip": 1.04828322, + "balance_loss_mlp": 1.02634788, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 2.6001060250125105, + "language_loss": 0.74757648, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76917505, + "num_input_tokens_seen": 103948020, + "step": 4817, + "time_per_iteration": 2.510300636291504 + }, + { + "auxiliary_loss_clip": 0.0110766, + "auxiliary_loss_mlp": 0.01052064, + "balance_loss_clip": 1.05050659, + "balance_loss_mlp": 1.033813, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.7417486312408113, + "language_loss": 0.76436722, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78596449, + "num_input_tokens_seen": 103968740, + "step": 4818, + "time_per_iteration": 2.5904479026794434 + }, + { + "auxiliary_loss_clip": 0.0107283, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.04798031, + "balance_loss_mlp": 1.02594066, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 1.8920103139112943, + "language_loss": 0.79891205, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.82007378, + "num_input_tokens_seen": 103986005, + "step": 4819, + "time_per_iteration": 2.5630381107330322 + }, + { + "auxiliary_loss_clip": 0.0110712, + "auxiliary_loss_mlp": 0.01047327, + "balance_loss_clip": 1.04854059, + "balance_loss_mlp": 1.02893353, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 3.682263215134981, + "language_loss": 0.78180104, + "learning_rate": 3.332791681244776e-06, + "loss": 0.8033455, + "num_input_tokens_seen": 104005070, + "step": 4820, + "time_per_iteration": 2.5164361000061035 + }, + { + "auxiliary_loss_clip": 0.01094977, + "auxiliary_loss_mlp": 0.01034934, + "balance_loss_clip": 1.04786754, + "balance_loss_mlp": 1.01865017, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 2.2972582665857195, + "language_loss": 0.72342265, + "learning_rate": 3.332501274072231e-06, + "loss": 0.74472177, + "num_input_tokens_seen": 104022945, + "step": 4821, + "time_per_iteration": 2.5373499393463135 + }, + { + "auxiliary_loss_clip": 0.01127573, + "auxiliary_loss_mlp": 0.0104311, + "balance_loss_clip": 1.04789639, + "balance_loss_mlp": 1.02625346, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.8816397573159898, + "language_loss": 0.72115433, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74286115, + "num_input_tokens_seen": 104042080, + "step": 4822, + "time_per_iteration": 2.473517656326294 + }, + { + "auxiliary_loss_clip": 0.01126388, + "auxiliary_loss_mlp": 0.01052503, + "balance_loss_clip": 1.05187523, + "balance_loss_mlp": 1.03607595, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.9058388328374403, + "language_loss": 0.66373122, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68552017, + "num_input_tokens_seen": 104060975, + "step": 4823, + "time_per_iteration": 2.4517405033111572 + }, + { + "auxiliary_loss_clip": 0.0110554, + "auxiliary_loss_mlp": 0.01040856, + "balance_loss_clip": 1.04289114, + "balance_loss_mlp": 1.02417862, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 2.1903929896801992, + "language_loss": 0.81161952, + "learning_rate": 3.331629749427164e-06, + "loss": 0.83308351, + "num_input_tokens_seen": 104081395, + "step": 4824, + "time_per_iteration": 2.529228687286377 + }, + { + "auxiliary_loss_clip": 0.01138948, + "auxiliary_loss_mlp": 0.0104125, + "balance_loss_clip": 1.04822695, + "balance_loss_mlp": 1.02409554, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 1.9319108946488073, + "language_loss": 0.72674584, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74854785, + "num_input_tokens_seen": 104099995, + "step": 4825, + "time_per_iteration": 2.4421896934509277 + }, + { + "auxiliary_loss_clip": 0.01141589, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.05100441, + "balance_loss_mlp": 1.02425575, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 2.3287621317175415, + "language_loss": 0.73527092, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75709897, + "num_input_tokens_seen": 104118930, + "step": 4826, + "time_per_iteration": 2.4068751335144043 + }, + { + "auxiliary_loss_clip": 0.01126625, + "auxiliary_loss_mlp": 0.01044862, + "balance_loss_clip": 1.04784441, + "balance_loss_mlp": 1.02897096, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 1.9233932713936588, + "language_loss": 0.68406665, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70578152, + "num_input_tokens_seen": 104136940, + "step": 4827, + "time_per_iteration": 2.487623691558838 + }, + { + "auxiliary_loss_clip": 0.0112314, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.04851866, + "balance_loss_mlp": 1.02313089, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 2.494018316776123, + "language_loss": 0.8051393, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82677865, + "num_input_tokens_seen": 104154280, + "step": 4828, + "time_per_iteration": 2.4578020572662354 + }, + { + "auxiliary_loss_clip": 0.01139604, + "auxiliary_loss_mlp": 0.01051011, + "balance_loss_clip": 1.05170333, + "balance_loss_mlp": 1.03432143, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 1.7135347397735163, + "language_loss": 0.80475914, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82666528, + "num_input_tokens_seen": 104172605, + "step": 4829, + "time_per_iteration": 2.4454095363616943 + }, + { + "auxiliary_loss_clip": 0.01114922, + "auxiliary_loss_mlp": 0.01038025, + "balance_loss_clip": 1.04720306, + "balance_loss_mlp": 1.02113271, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.5889144488467382, + "language_loss": 0.8275075, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84903699, + "num_input_tokens_seen": 104194120, + "step": 4830, + "time_per_iteration": 2.5465304851531982 + }, + { + "auxiliary_loss_clip": 0.01129387, + "auxiliary_loss_mlp": 0.01045456, + "balance_loss_clip": 1.04841876, + "balance_loss_mlp": 1.02842057, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.700188529982485, + "language_loss": 0.79278153, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.8145299, + "num_input_tokens_seen": 104210875, + "step": 4831, + "time_per_iteration": 2.445624828338623 + }, + { + "auxiliary_loss_clip": 0.01136866, + "auxiliary_loss_mlp": 0.01041554, + "balance_loss_clip": 1.05023921, + "balance_loss_mlp": 1.0259378, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 2.1074930692742377, + "language_loss": 0.74057204, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76235628, + "num_input_tokens_seen": 104229875, + "step": 4832, + "time_per_iteration": 3.8942177295684814 + }, + { + "auxiliary_loss_clip": 0.01112583, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.04670799, + "balance_loss_mlp": 1.02056909, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 3.3160738720024083, + "language_loss": 0.76098388, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78246224, + "num_input_tokens_seen": 104250405, + "step": 4833, + "time_per_iteration": 2.5182948112487793 + }, + { + "auxiliary_loss_clip": 0.01106355, + "auxiliary_loss_mlp": 0.01039785, + "balance_loss_clip": 1.04557252, + "balance_loss_mlp": 1.02340531, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 1.907266960634592, + "language_loss": 0.64915669, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.67061806, + "num_input_tokens_seen": 104269185, + "step": 4834, + "time_per_iteration": 2.503587007522583 + }, + { + "auxiliary_loss_clip": 0.01114257, + "auxiliary_loss_mlp": 0.01031492, + "balance_loss_clip": 1.04922247, + "balance_loss_mlp": 1.01663291, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.8018781225442075, + "language_loss": 0.71662682, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.73808432, + "num_input_tokens_seen": 104289400, + "step": 4835, + "time_per_iteration": 2.5421128273010254 + }, + { + "auxiliary_loss_clip": 0.01113644, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.04617012, + "balance_loss_mlp": 1.02547634, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 2.0236927534045477, + "language_loss": 0.79532743, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81686491, + "num_input_tokens_seen": 104310485, + "step": 4836, + "time_per_iteration": 4.050030946731567 + }, + { + "auxiliary_loss_clip": 0.01099854, + "auxiliary_loss_mlp": 0.01044445, + "balance_loss_clip": 1.04770064, + "balance_loss_mlp": 1.02748179, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.6787447669451474, + "language_loss": 0.80997348, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.83141649, + "num_input_tokens_seen": 104327330, + "step": 4837, + "time_per_iteration": 2.5063161849975586 + }, + { + "auxiliary_loss_clip": 0.01113575, + "auxiliary_loss_mlp": 0.01039132, + "balance_loss_clip": 1.04923201, + "balance_loss_mlp": 1.02290225, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 1.8746832868158456, + "language_loss": 0.67045921, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69198626, + "num_input_tokens_seen": 104350350, + "step": 4838, + "time_per_iteration": 2.629210948944092 + }, + { + "auxiliary_loss_clip": 0.01140196, + "auxiliary_loss_mlp": 0.00782953, + "balance_loss_clip": 1.04977286, + "balance_loss_mlp": 1.00106931, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 2.541143218580308, + "language_loss": 0.71331626, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73254776, + "num_input_tokens_seen": 104369995, + "step": 4839, + "time_per_iteration": 2.4615299701690674 + }, + { + "auxiliary_loss_clip": 0.0113799, + "auxiliary_loss_mlp": 0.01034158, + "balance_loss_clip": 1.049371, + "balance_loss_mlp": 1.01846409, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.927836683979755, + "language_loss": 0.75441599, + "learning_rate": 3.326973949928776e-06, + "loss": 0.77613753, + "num_input_tokens_seen": 104392285, + "step": 4840, + "time_per_iteration": 2.5549495220184326 + }, + { + "auxiliary_loss_clip": 0.0109211, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.04500151, + "balance_loss_mlp": 1.02433729, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 2.6142766401422293, + "language_loss": 0.60248876, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62380922, + "num_input_tokens_seen": 104412640, + "step": 4841, + "time_per_iteration": 2.615814208984375 + }, + { + "auxiliary_loss_clip": 0.01115082, + "auxiliary_loss_mlp": 0.01034026, + "balance_loss_clip": 1.04896331, + "balance_loss_mlp": 1.01829076, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.3488702295632393, + "language_loss": 0.71326852, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73475957, + "num_input_tokens_seen": 104435245, + "step": 4842, + "time_per_iteration": 3.9942164421081543 + }, + { + "auxiliary_loss_clip": 0.01126496, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.04916, + "balance_loss_mlp": 1.02064204, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.5760957646468257, + "language_loss": 0.73306429, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.75468761, + "num_input_tokens_seen": 104455395, + "step": 4843, + "time_per_iteration": 2.490506649017334 + }, + { + "auxiliary_loss_clip": 0.01083551, + "auxiliary_loss_mlp": 0.0103264, + "balance_loss_clip": 1.04697728, + "balance_loss_mlp": 1.01756024, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.8936248457041605, + "language_loss": 0.57813001, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.59929192, + "num_input_tokens_seen": 104473350, + "step": 4844, + "time_per_iteration": 2.5771665573120117 + }, + { + "auxiliary_loss_clip": 0.01131553, + "auxiliary_loss_mlp": 0.01034777, + "balance_loss_clip": 1.05326498, + "balance_loss_mlp": 1.01810002, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 2.3787695429860407, + "language_loss": 0.87142587, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.89308923, + "num_input_tokens_seen": 104492265, + "step": 4845, + "time_per_iteration": 2.4992616176605225 + }, + { + "auxiliary_loss_clip": 0.0111242, + "auxiliary_loss_mlp": 0.01049014, + "balance_loss_clip": 1.04898, + "balance_loss_mlp": 1.03195465, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.8387356921154305, + "language_loss": 0.66999382, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.69160813, + "num_input_tokens_seen": 104510755, + "step": 4846, + "time_per_iteration": 2.5231876373291016 + }, + { + "auxiliary_loss_clip": 0.01116819, + "auxiliary_loss_mlp": 0.01038481, + "balance_loss_clip": 1.05062962, + "balance_loss_mlp": 1.02326941, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 2.536998498232052, + "language_loss": 0.7027728, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72432578, + "num_input_tokens_seen": 104530830, + "step": 4847, + "time_per_iteration": 2.5323495864868164 + }, + { + "auxiliary_loss_clip": 0.01126184, + "auxiliary_loss_mlp": 0.01029002, + "balance_loss_clip": 1.04966331, + "balance_loss_mlp": 1.01366019, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 1.5457974863742043, + "language_loss": 0.74233812, + "learning_rate": 3.324641216731237e-06, + "loss": 0.76388997, + "num_input_tokens_seen": 104550115, + "step": 4848, + "time_per_iteration": 2.5135881900787354 + }, + { + "auxiliary_loss_clip": 0.01122849, + "auxiliary_loss_mlp": 0.01043129, + "balance_loss_clip": 1.04726768, + "balance_loss_mlp": 1.0251286, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 2.090339534028263, + "language_loss": 0.77099168, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.79265153, + "num_input_tokens_seen": 104566255, + "step": 4849, + "time_per_iteration": 2.4460017681121826 + }, + { + "auxiliary_loss_clip": 0.01121819, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.04729319, + "balance_loss_mlp": 1.02246618, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 2.093973199424043, + "language_loss": 0.78863108, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.81023502, + "num_input_tokens_seen": 104585235, + "step": 4850, + "time_per_iteration": 2.486607313156128 + }, + { + "auxiliary_loss_clip": 0.0111156, + "auxiliary_loss_mlp": 0.0103888, + "balance_loss_clip": 1.05170584, + "balance_loss_mlp": 1.02200007, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.672924677387861, + "language_loss": 0.75963449, + "learning_rate": 3.323765612674296e-06, + "loss": 0.7811389, + "num_input_tokens_seen": 104605315, + "step": 4851, + "time_per_iteration": 2.523473024368286 + }, + { + "auxiliary_loss_clip": 0.01127222, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.0515641, + "balance_loss_mlp": 1.0226295, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 4.256424811820336, + "language_loss": 0.77225101, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.7938953, + "num_input_tokens_seen": 104626055, + "step": 4852, + "time_per_iteration": 2.5752317905426025 + }, + { + "auxiliary_loss_clip": 0.01117093, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.05047274, + "balance_loss_mlp": 1.02598381, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.5757505430436476, + "language_loss": 0.77909875, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80068469, + "num_input_tokens_seen": 104646005, + "step": 4853, + "time_per_iteration": 4.002268552780151 + }, + { + "auxiliary_loss_clip": 0.0110501, + "auxiliary_loss_mlp": 0.01036204, + "balance_loss_clip": 1.04953122, + "balance_loss_mlp": 1.02061129, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 4.960292308967521, + "language_loss": 0.88109446, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90250653, + "num_input_tokens_seen": 104661620, + "step": 4854, + "time_per_iteration": 2.5325496196746826 + }, + { + "auxiliary_loss_clip": 0.01123909, + "auxiliary_loss_mlp": 0.01053202, + "balance_loss_clip": 1.05121708, + "balance_loss_mlp": 1.03503478, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.743692940823312, + "language_loss": 0.86358202, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88535309, + "num_input_tokens_seen": 104681445, + "step": 4855, + "time_per_iteration": 2.5216047763824463 + }, + { + "auxiliary_loss_clip": 0.0105798, + "auxiliary_loss_mlp": 0.01008439, + "balance_loss_clip": 1.03568828, + "balance_loss_mlp": 1.00637662, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.7903929805451134, + "language_loss": 0.60126364, + "learning_rate": 3.322305268780566e-06, + "loss": 0.62192786, + "num_input_tokens_seen": 104747945, + "step": 4856, + "time_per_iteration": 3.1656625270843506 + }, + { + "auxiliary_loss_clip": 0.01111783, + "auxiliary_loss_mlp": 0.00781706, + "balance_loss_clip": 1.04655349, + "balance_loss_mlp": 1.00116897, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 1.8917868381054697, + "language_loss": 0.68239766, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70133257, + "num_input_tokens_seen": 104766225, + "step": 4857, + "time_per_iteration": 2.494046688079834 + }, + { + "auxiliary_loss_clip": 0.01126586, + "auxiliary_loss_mlp": 0.00782476, + "balance_loss_clip": 1.05077481, + "balance_loss_mlp": 1.00108457, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 1.9637266298539398, + "language_loss": 0.83726764, + "learning_rate": 3.321720780151895e-06, + "loss": 0.85635829, + "num_input_tokens_seen": 104785345, + "step": 4858, + "time_per_iteration": 2.5648765563964844 + }, + { + "auxiliary_loss_clip": 0.01142598, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.05600202, + "balance_loss_mlp": 1.01780772, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 1.6727491080593728, + "language_loss": 0.77251756, + "learning_rate": 3.321428460652342e-06, + "loss": 0.7942729, + "num_input_tokens_seen": 104804560, + "step": 4859, + "time_per_iteration": 2.4443717002868652 + }, + { + "auxiliary_loss_clip": 0.01101468, + "auxiliary_loss_mlp": 0.01040547, + "balance_loss_clip": 1.05248225, + "balance_loss_mlp": 1.02432227, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 3.845046451768076, + "language_loss": 0.6858964, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.70731652, + "num_input_tokens_seen": 104821105, + "step": 4860, + "time_per_iteration": 2.571566581726074 + }, + { + "auxiliary_loss_clip": 0.01114261, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.05207658, + "balance_loss_mlp": 1.02207375, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 3.5465817141681812, + "language_loss": 0.75525177, + "learning_rate": 3.320843671338222e-06, + "loss": 0.77675456, + "num_input_tokens_seen": 104841440, + "step": 4861, + "time_per_iteration": 2.6060914993286133 + }, + { + "auxiliary_loss_clip": 0.01126232, + "auxiliary_loss_mlp": 0.01042776, + "balance_loss_clip": 1.0504179, + "balance_loss_mlp": 1.02842927, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.8321559416370927, + "language_loss": 0.91037238, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93206251, + "num_input_tokens_seen": 104858210, + "step": 4862, + "time_per_iteration": 2.460106611251831 + }, + { + "auxiliary_loss_clip": 0.01129232, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.05131221, + "balance_loss_mlp": 1.02107441, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 3.1881009091585852, + "language_loss": 0.73659074, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75823748, + "num_input_tokens_seen": 104875620, + "step": 4863, + "time_per_iteration": 2.4666574001312256 + }, + { + "auxiliary_loss_clip": 0.01064032, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.04730761, + "balance_loss_mlp": 1.02157319, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.738559788714886, + "language_loss": 0.78157198, + "learning_rate": 3.319966111745842e-06, + "loss": 0.80256903, + "num_input_tokens_seen": 104894600, + "step": 4864, + "time_per_iteration": 2.6542041301727295 + }, + { + "auxiliary_loss_clip": 0.01105545, + "auxiliary_loss_mlp": 0.0104964, + "balance_loss_clip": 1.04990602, + "balance_loss_mlp": 1.03284335, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 1.7671509395714429, + "language_loss": 0.81947166, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8410235, + "num_input_tokens_seen": 104914530, + "step": 4865, + "time_per_iteration": 2.578437089920044 + }, + { + "auxiliary_loss_clip": 0.01094788, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_clip": 1.0541873, + "balance_loss_mlp": 1.02944517, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 2.0218755656684495, + "language_loss": 0.85194874, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87335217, + "num_input_tokens_seen": 104933460, + "step": 4866, + "time_per_iteration": 2.6533138751983643 + }, + { + "auxiliary_loss_clip": 0.01111045, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.05025387, + "balance_loss_mlp": 1.01905346, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.648733412428773, + "language_loss": 0.75666106, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77810574, + "num_input_tokens_seen": 104954495, + "step": 4867, + "time_per_iteration": 2.6539433002471924 + }, + { + "auxiliary_loss_clip": 0.01081126, + "auxiliary_loss_mlp": 0.01042352, + "balance_loss_clip": 1.05032003, + "balance_loss_mlp": 1.02674794, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 1.829956545130437, + "language_loss": 0.73432803, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75556278, + "num_input_tokens_seen": 104971915, + "step": 4868, + "time_per_iteration": 2.6227734088897705 + }, + { + "auxiliary_loss_clip": 0.01088966, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.04895627, + "balance_loss_mlp": 1.01753187, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 2.60269550552886, + "language_loss": 0.7451337, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.7663582, + "num_input_tokens_seen": 104991335, + "step": 4869, + "time_per_iteration": 2.6129181385040283 + }, + { + "auxiliary_loss_clip": 0.01118101, + "auxiliary_loss_mlp": 0.01038282, + "balance_loss_clip": 1.05385661, + "balance_loss_mlp": 1.02210569, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.7511230275184198, + "language_loss": 0.76726985, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78883374, + "num_input_tokens_seen": 105012015, + "step": 4870, + "time_per_iteration": 2.625030994415283 + }, + { + "auxiliary_loss_clip": 0.01132303, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.05332804, + "balance_loss_mlp": 1.02481747, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.2580069929036486, + "language_loss": 0.67607778, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.69782054, + "num_input_tokens_seen": 105031460, + "step": 4871, + "time_per_iteration": 4.034849166870117 + }, + { + "auxiliary_loss_clip": 0.01112902, + "auxiliary_loss_mlp": 0.01040924, + "balance_loss_clip": 1.04977632, + "balance_loss_mlp": 1.0251286, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 2.006026087012026, + "language_loss": 0.76990646, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79144466, + "num_input_tokens_seen": 105052965, + "step": 4872, + "time_per_iteration": 2.6047616004943848 + }, + { + "auxiliary_loss_clip": 0.01073906, + "auxiliary_loss_mlp": 0.01040275, + "balance_loss_clip": 1.04492354, + "balance_loss_mlp": 1.02308488, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 2.544581538243191, + "language_loss": 0.72597516, + "learning_rate": 3.317330731292164e-06, + "loss": 0.74711692, + "num_input_tokens_seen": 105071840, + "step": 4873, + "time_per_iteration": 2.6394052505493164 + }, + { + "auxiliary_loss_clip": 0.01128966, + "auxiliary_loss_mlp": 0.01038583, + "balance_loss_clip": 1.05145311, + "balance_loss_mlp": 1.02242994, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 2.468756443946029, + "language_loss": 0.78470927, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80638474, + "num_input_tokens_seen": 105089445, + "step": 4874, + "time_per_iteration": 2.4838223457336426 + }, + { + "auxiliary_loss_clip": 0.01085405, + "auxiliary_loss_mlp": 0.0104625, + "balance_loss_clip": 1.04723167, + "balance_loss_mlp": 1.02981126, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 2.126463613913593, + "language_loss": 0.78022081, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.80153739, + "num_input_tokens_seen": 105106210, + "step": 4875, + "time_per_iteration": 2.5618503093719482 + }, + { + "auxiliary_loss_clip": 0.01140309, + "auxiliary_loss_mlp": 0.01035115, + "balance_loss_clip": 1.06047225, + "balance_loss_mlp": 1.01936173, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.906377232930604, + "language_loss": 0.69123077, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71298504, + "num_input_tokens_seen": 105124200, + "step": 4876, + "time_per_iteration": 4.311360597610474 + }, + { + "auxiliary_loss_clip": 0.01122454, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.05123806, + "balance_loss_mlp": 1.02327919, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 1.9485912160527656, + "language_loss": 0.8205328, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84214413, + "num_input_tokens_seen": 105140400, + "step": 4877, + "time_per_iteration": 2.4917733669281006 + }, + { + "auxiliary_loss_clip": 0.01137325, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.05659628, + "balance_loss_mlp": 1.01993275, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 2.1983827353213314, + "language_loss": 0.67810416, + "learning_rate": 3.315864882155911e-06, + "loss": 0.69982606, + "num_input_tokens_seen": 105157535, + "step": 4878, + "time_per_iteration": 2.4812378883361816 + }, + { + "auxiliary_loss_clip": 0.01100377, + "auxiliary_loss_mlp": 0.010441, + "balance_loss_clip": 1.04931271, + "balance_loss_mlp": 1.02811384, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.8133259350136886, + "language_loss": 0.73365462, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.75509948, + "num_input_tokens_seen": 105175185, + "step": 4879, + "time_per_iteration": 2.593031644821167 + }, + { + "auxiliary_loss_clip": 0.01101946, + "auxiliary_loss_mlp": 0.00784911, + "balance_loss_clip": 1.05323744, + "balance_loss_mlp": 1.00113606, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 1.9302466585788347, + "language_loss": 0.66204059, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68090916, + "num_input_tokens_seen": 105194540, + "step": 4880, + "time_per_iteration": 2.694868803024292 + }, + { + "auxiliary_loss_clip": 0.01129987, + "auxiliary_loss_mlp": 0.01047979, + "balance_loss_clip": 1.05186439, + "balance_loss_mlp": 1.03183818, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 2.759945499201512, + "language_loss": 0.70258057, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72436023, + "num_input_tokens_seen": 105213215, + "step": 4881, + "time_per_iteration": 3.999300003051758 + }, + { + "auxiliary_loss_clip": 0.01112324, + "auxiliary_loss_mlp": 0.0078374, + "balance_loss_clip": 1.05070138, + "balance_loss_mlp": 1.00109494, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 1.6829515189988633, + "language_loss": 0.83591914, + "learning_rate": 3.314691304621127e-06, + "loss": 0.85487974, + "num_input_tokens_seen": 105231585, + "step": 4882, + "time_per_iteration": 2.614964008331299 + }, + { + "auxiliary_loss_clip": 0.01143403, + "auxiliary_loss_mlp": 0.01037266, + "balance_loss_clip": 1.05310953, + "balance_loss_mlp": 1.02127957, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.073704907404153, + "language_loss": 0.70994467, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73175138, + "num_input_tokens_seen": 105250120, + "step": 4883, + "time_per_iteration": 2.4499592781066895 + }, + { + "auxiliary_loss_clip": 0.01118702, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.05222452, + "balance_loss_mlp": 1.01684725, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 2.7473927867642187, + "language_loss": 0.92747271, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94898993, + "num_input_tokens_seen": 105266065, + "step": 4884, + "time_per_iteration": 2.5543956756591797 + }, + { + "auxiliary_loss_clip": 0.01134181, + "auxiliary_loss_mlp": 0.0103484, + "balance_loss_clip": 1.05472362, + "balance_loss_mlp": 1.01897287, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.7913018581404567, + "language_loss": 0.73524451, + "learning_rate": 3.313810597972234e-06, + "loss": 0.7569347, + "num_input_tokens_seen": 105282155, + "step": 4885, + "time_per_iteration": 2.512094259262085 + }, + { + "auxiliary_loss_clip": 0.01121121, + "auxiliary_loss_mlp": 0.01041597, + "balance_loss_clip": 1.04996467, + "balance_loss_mlp": 1.02593851, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 2.0106316750509845, + "language_loss": 0.85339111, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.8750183, + "num_input_tokens_seen": 105299225, + "step": 4886, + "time_per_iteration": 2.4919302463531494 + }, + { + "auxiliary_loss_clip": 0.01109929, + "auxiliary_loss_mlp": 0.01038702, + "balance_loss_clip": 1.05070102, + "balance_loss_mlp": 1.02369344, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.5131159670304206, + "language_loss": 0.76441324, + "learning_rate": 3.313223211088603e-06, + "loss": 0.78589952, + "num_input_tokens_seen": 105315710, + "step": 4887, + "time_per_iteration": 2.5398313999176025 + }, + { + "auxiliary_loss_clip": 0.01115328, + "auxiliary_loss_mlp": 0.01040597, + "balance_loss_clip": 1.05157828, + "balance_loss_mlp": 1.0254637, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.1736677973634677, + "language_loss": 0.79582429, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.81738353, + "num_input_tokens_seen": 105333505, + "step": 4888, + "time_per_iteration": 2.488696336746216 + }, + { + "auxiliary_loss_clip": 0.01112598, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.05385447, + "balance_loss_mlp": 1.01596451, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.509251250405519, + "language_loss": 0.55265868, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57408953, + "num_input_tokens_seen": 105355605, + "step": 4889, + "time_per_iteration": 2.671818256378174 + }, + { + "auxiliary_loss_clip": 0.01130867, + "auxiliary_loss_mlp": 0.01039257, + "balance_loss_clip": 1.05180359, + "balance_loss_mlp": 1.02271092, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.7673040436348997, + "language_loss": 0.8437345, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86543572, + "num_input_tokens_seen": 105374225, + "step": 4890, + "time_per_iteration": 2.4897377490997314 + }, + { + "auxiliary_loss_clip": 0.01133083, + "auxiliary_loss_mlp": 0.01044829, + "balance_loss_clip": 1.05383873, + "balance_loss_mlp": 1.02864015, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.803297201186066, + "language_loss": 0.72510952, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74688864, + "num_input_tokens_seen": 105391565, + "step": 4891, + "time_per_iteration": 2.457214593887329 + }, + { + "auxiliary_loss_clip": 0.0114118, + "auxiliary_loss_mlp": 0.01045923, + "balance_loss_clip": 1.05265176, + "balance_loss_mlp": 1.02919781, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.7205830235924893, + "language_loss": 0.77098846, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79285955, + "num_input_tokens_seen": 105409840, + "step": 4892, + "time_per_iteration": 4.051323413848877 + }, + { + "auxiliary_loss_clip": 0.01142089, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.05429006, + "balance_loss_mlp": 1.01740801, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 1.7466639334457992, + "language_loss": 0.78238732, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80414236, + "num_input_tokens_seen": 105428645, + "step": 4893, + "time_per_iteration": 2.4855878353118896 + }, + { + "auxiliary_loss_clip": 0.01102842, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_clip": 1.05109644, + "balance_loss_mlp": 1.02968454, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 1.8656272069983155, + "language_loss": 0.8457576, + "learning_rate": 3.311165788957864e-06, + "loss": 0.86724174, + "num_input_tokens_seen": 105447480, + "step": 4894, + "time_per_iteration": 2.6135852336883545 + }, + { + "auxiliary_loss_clip": 0.01129184, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.05313098, + "balance_loss_mlp": 1.02189565, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 3.047087756983715, + "language_loss": 0.90175736, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92342234, + "num_input_tokens_seen": 105464600, + "step": 4895, + "time_per_iteration": 2.465033769607544 + }, + { + "auxiliary_loss_clip": 0.0113548, + "auxiliary_loss_mlp": 0.01039972, + "balance_loss_clip": 1.05544984, + "balance_loss_mlp": 1.02362823, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 1.7683211633952802, + "language_loss": 0.86655068, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88830519, + "num_input_tokens_seen": 105481510, + "step": 4896, + "time_per_iteration": 2.5047900676727295 + }, + { + "auxiliary_loss_clip": 0.01137095, + "auxiliary_loss_mlp": 0.0104999, + "balance_loss_clip": 1.05705428, + "balance_loss_mlp": 1.03388476, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 1.7594095511348797, + "language_loss": 0.73146152, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75333238, + "num_input_tokens_seen": 105501390, + "step": 4897, + "time_per_iteration": 2.5250165462493896 + }, + { + "auxiliary_loss_clip": 0.01131867, + "auxiliary_loss_mlp": 0.01045449, + "balance_loss_clip": 1.05132294, + "balance_loss_mlp": 1.0275197, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 3.5478772409961157, + "language_loss": 0.73740661, + "learning_rate": 3.309989025093813e-06, + "loss": 0.75917983, + "num_input_tokens_seen": 105519600, + "step": 4898, + "time_per_iteration": 2.4996182918548584 + }, + { + "auxiliary_loss_clip": 0.01138316, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_clip": 1.05955517, + "balance_loss_mlp": 1.02873719, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 4.072670668562585, + "language_loss": 0.70631963, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72817433, + "num_input_tokens_seen": 105535970, + "step": 4899, + "time_per_iteration": 2.4705679416656494 + }, + { + "auxiliary_loss_clip": 0.01121848, + "auxiliary_loss_mlp": 0.00783552, + "balance_loss_clip": 1.05331111, + "balance_loss_mlp": 1.00124907, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 2.172117798762618, + "language_loss": 0.78896081, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.80801481, + "num_input_tokens_seen": 105556735, + "step": 4900, + "time_per_iteration": 2.6206047534942627 + }, + { + "auxiliary_loss_clip": 0.01108429, + "auxiliary_loss_mlp": 0.01056414, + "balance_loss_clip": 1.04804826, + "balance_loss_mlp": 1.03849721, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 2.1394498667313173, + "language_loss": 0.80321443, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.82486284, + "num_input_tokens_seen": 105574875, + "step": 4901, + "time_per_iteration": 2.4822640419006348 + }, + { + "auxiliary_loss_clip": 0.01114596, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.05467892, + "balance_loss_mlp": 1.02208424, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 2.0603962824731807, + "language_loss": 0.57584333, + "learning_rate": 3.308811466431157e-06, + "loss": 0.59735608, + "num_input_tokens_seen": 105594225, + "step": 4902, + "time_per_iteration": 2.538872003555298 + }, + { + "auxiliary_loss_clip": 0.01125443, + "auxiliary_loss_mlp": 0.01041476, + "balance_loss_clip": 1.05840254, + "balance_loss_mlp": 1.02639616, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.6830089947629372, + "language_loss": 0.75498402, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77665329, + "num_input_tokens_seen": 105614000, + "step": 4903, + "time_per_iteration": 2.5262579917907715 + }, + { + "auxiliary_loss_clip": 0.01116208, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.05199671, + "balance_loss_mlp": 1.0324229, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 2.54442230871748, + "language_loss": 0.62196308, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64362991, + "num_input_tokens_seen": 105634575, + "step": 4904, + "time_per_iteration": 2.579015016555786 + }, + { + "auxiliary_loss_clip": 0.01136785, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.05851102, + "balance_loss_mlp": 1.02395725, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 2.021063772526578, + "language_loss": 0.73030567, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75206918, + "num_input_tokens_seen": 105654385, + "step": 4905, + "time_per_iteration": 2.5292017459869385 + }, + { + "auxiliary_loss_clip": 0.01113546, + "auxiliary_loss_mlp": 0.01044213, + "balance_loss_clip": 1.05541599, + "balance_loss_mlp": 1.02817917, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.6151901407115297, + "language_loss": 0.81677127, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.83834887, + "num_input_tokens_seen": 105673570, + "step": 4906, + "time_per_iteration": 2.601285457611084 + }, + { + "auxiliary_loss_clip": 0.0109742, + "auxiliary_loss_mlp": 0.0104041, + "balance_loss_clip": 1.05141366, + "balance_loss_mlp": 1.02502596, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 2.270367554686498, + "language_loss": 0.87221634, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89359462, + "num_input_tokens_seen": 105691940, + "step": 4907, + "time_per_iteration": 2.65531325340271 + }, + { + "auxiliary_loss_clip": 0.01148743, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.05802155, + "balance_loss_mlp": 1.02151489, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 3.4708107245392137, + "language_loss": 0.82494473, + "learning_rate": 3.307043639752782e-06, + "loss": 0.84681237, + "num_input_tokens_seen": 105709825, + "step": 4908, + "time_per_iteration": 2.561534881591797 + }, + { + "auxiliary_loss_clip": 0.01082305, + "auxiliary_loss_mlp": 0.0100093, + "balance_loss_clip": 1.0497371, + "balance_loss_mlp": 0.99856931, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7509166525638034, + "language_loss": 0.57221925, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59305161, + "num_input_tokens_seen": 105766880, + "step": 4909, + "time_per_iteration": 2.9440619945526123 + }, + { + "auxiliary_loss_clip": 0.01137047, + "auxiliary_loss_mlp": 0.00782189, + "balance_loss_clip": 1.06159139, + "balance_loss_mlp": 1.0011816, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.800004902160168, + "language_loss": 0.86499, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88418239, + "num_input_tokens_seen": 105786875, + "step": 4910, + "time_per_iteration": 4.07201623916626 + }, + { + "auxiliary_loss_clip": 0.01132509, + "auxiliary_loss_mlp": 0.01039092, + "balance_loss_clip": 1.05907941, + "balance_loss_mlp": 1.02420306, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.8646124195841292, + "language_loss": 0.73092586, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.75264186, + "num_input_tokens_seen": 105805315, + "step": 4911, + "time_per_iteration": 2.5234766006469727 + }, + { + "auxiliary_loss_clip": 0.01133754, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.05869555, + "balance_loss_mlp": 1.02172327, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 15.360312661787733, + "language_loss": 0.89622402, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.91792738, + "num_input_tokens_seen": 105825125, + "step": 4912, + "time_per_iteration": 2.5307540893554688 + }, + { + "auxiliary_loss_clip": 0.01118586, + "auxiliary_loss_mlp": 0.0105202, + "balance_loss_clip": 1.05493915, + "balance_loss_mlp": 1.03579569, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.424326751503138, + "language_loss": 0.83186817, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85357428, + "num_input_tokens_seen": 105846085, + "step": 4913, + "time_per_iteration": 2.533139228820801 + }, + { + "auxiliary_loss_clip": 0.01144019, + "auxiliary_loss_mlp": 0.01042555, + "balance_loss_clip": 1.05666482, + "balance_loss_mlp": 1.02770114, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 3.081021017315, + "language_loss": 0.77154565, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79341137, + "num_input_tokens_seen": 105865400, + "step": 4914, + "time_per_iteration": 2.4828670024871826 + }, + { + "auxiliary_loss_clip": 0.01124165, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.05734003, + "balance_loss_mlp": 1.02141702, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 2.184066960932399, + "language_loss": 0.80592132, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.82753766, + "num_input_tokens_seen": 105887920, + "step": 4915, + "time_per_iteration": 4.433633804321289 + }, + { + "auxiliary_loss_clip": 0.01074975, + "auxiliary_loss_mlp": 0.01039774, + "balance_loss_clip": 1.05408144, + "balance_loss_mlp": 1.02387142, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 2.324563611330108, + "language_loss": 0.84946179, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.87060928, + "num_input_tokens_seen": 105904035, + "step": 4916, + "time_per_iteration": 2.708312749862671 + }, + { + "auxiliary_loss_clip": 0.01125638, + "auxiliary_loss_mlp": 0.01035392, + "balance_loss_clip": 1.05019462, + "balance_loss_mlp": 1.02000856, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 1.886323556084085, + "language_loss": 0.6994499, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.72106022, + "num_input_tokens_seen": 105922685, + "step": 4917, + "time_per_iteration": 2.5064175128936768 + }, + { + "auxiliary_loss_clip": 0.01122876, + "auxiliary_loss_mlp": 0.01035305, + "balance_loss_clip": 1.05497932, + "balance_loss_mlp": 1.02020097, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 2.2806425051766017, + "language_loss": 0.91494513, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93652695, + "num_input_tokens_seen": 105940425, + "step": 4918, + "time_per_iteration": 2.5135231018066406 + }, + { + "auxiliary_loss_clip": 0.01146376, + "auxiliary_loss_mlp": 0.01038133, + "balance_loss_clip": 1.05847204, + "balance_loss_mlp": 1.02174783, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 1.8565684578647308, + "language_loss": 0.72791159, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74975669, + "num_input_tokens_seen": 105960550, + "step": 4919, + "time_per_iteration": 2.501917600631714 + }, + { + "auxiliary_loss_clip": 0.01122454, + "auxiliary_loss_mlp": 0.01040648, + "balance_loss_clip": 1.05565286, + "balance_loss_mlp": 1.02414966, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 2.6390503844280815, + "language_loss": 0.75659204, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.77822304, + "num_input_tokens_seen": 105978820, + "step": 4920, + "time_per_iteration": 3.901801347732544 + }, + { + "auxiliary_loss_clip": 0.0112201, + "auxiliary_loss_mlp": 0.01049117, + "balance_loss_clip": 1.0595361, + "balance_loss_mlp": 1.03239167, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.1804394967884795, + "language_loss": 0.68740439, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.70911562, + "num_input_tokens_seen": 105997545, + "step": 4921, + "time_per_iteration": 2.553957223892212 + }, + { + "auxiliary_loss_clip": 0.01121902, + "auxiliary_loss_mlp": 0.01045014, + "balance_loss_clip": 1.0558877, + "balance_loss_mlp": 1.02759767, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 2.2517291696674406, + "language_loss": 0.74341512, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76508433, + "num_input_tokens_seen": 106015320, + "step": 4922, + "time_per_iteration": 2.515455961227417 + }, + { + "auxiliary_loss_clip": 0.01150343, + "auxiliary_loss_mlp": 0.00782967, + "balance_loss_clip": 1.05725193, + "balance_loss_mlp": 1.00141418, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 1.8334289295295982, + "language_loss": 0.77117062, + "learning_rate": 3.302616272134737e-06, + "loss": 0.79050374, + "num_input_tokens_seen": 106034555, + "step": 4923, + "time_per_iteration": 2.516998767852783 + }, + { + "auxiliary_loss_clip": 0.01121949, + "auxiliary_loss_mlp": 0.01040244, + "balance_loss_clip": 1.05916333, + "balance_loss_mlp": 1.02387702, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.7281891957094733, + "language_loss": 0.861929, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88355088, + "num_input_tokens_seen": 106054200, + "step": 4924, + "time_per_iteration": 2.577796459197998 + }, + { + "auxiliary_loss_clip": 0.01134018, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.05690122, + "balance_loss_mlp": 1.01727152, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.5117820914246285, + "language_loss": 0.82148015, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84315813, + "num_input_tokens_seen": 106074700, + "step": 4925, + "time_per_iteration": 2.5155739784240723 + }, + { + "auxiliary_loss_clip": 0.01084028, + "auxiliary_loss_mlp": 0.01046836, + "balance_loss_clip": 1.04677904, + "balance_loss_mlp": 1.02890694, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 3.413614519671832, + "language_loss": 0.86160898, + "learning_rate": 3.301729463727452e-06, + "loss": 0.88291764, + "num_input_tokens_seen": 106091415, + "step": 4926, + "time_per_iteration": 2.585233688354492 + }, + { + "auxiliary_loss_clip": 0.01106073, + "auxiliary_loss_mlp": 0.01033993, + "balance_loss_clip": 1.05133653, + "balance_loss_mlp": 1.01825094, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 2.4477322642980646, + "language_loss": 0.85755616, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.87895685, + "num_input_tokens_seen": 106109135, + "step": 4927, + "time_per_iteration": 2.5394859313964844 + }, + { + "auxiliary_loss_clip": 0.01131353, + "auxiliary_loss_mlp": 0.01037697, + "balance_loss_clip": 1.05577612, + "balance_loss_mlp": 1.02216434, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.7850855052162793, + "language_loss": 0.80668628, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.82837677, + "num_input_tokens_seen": 106125750, + "step": 4928, + "time_per_iteration": 2.4810211658477783 + }, + { + "auxiliary_loss_clip": 0.01123913, + "auxiliary_loss_mlp": 0.01041107, + "balance_loss_clip": 1.05243766, + "balance_loss_mlp": 1.02221203, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 3.9282979829179823, + "language_loss": 0.72298855, + "learning_rate": 3.300842211064773e-06, + "loss": 0.74463868, + "num_input_tokens_seen": 106142835, + "step": 4929, + "time_per_iteration": 2.5745275020599365 + }, + { + "auxiliary_loss_clip": 0.01120538, + "auxiliary_loss_mlp": 0.0105047, + "balance_loss_clip": 1.05179131, + "balance_loss_mlp": 1.03162277, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.197475162794807, + "language_loss": 0.71473622, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.73644626, + "num_input_tokens_seen": 106160680, + "step": 4930, + "time_per_iteration": 2.514488935470581 + }, + { + "auxiliary_loss_clip": 0.01043833, + "auxiliary_loss_mlp": 0.01000245, + "balance_loss_clip": 1.04532003, + "balance_loss_mlp": 0.99806327, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.808480369693753, + "language_loss": 0.60648525, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62692595, + "num_input_tokens_seen": 106224415, + "step": 4931, + "time_per_iteration": 4.6511616706848145 + }, + { + "auxiliary_loss_clip": 0.01043817, + "auxiliary_loss_mlp": 0.0101206, + "balance_loss_clip": 1.0568006, + "balance_loss_mlp": 1.00962818, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7600378523537604, + "language_loss": 0.52361619, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54417497, + "num_input_tokens_seen": 106279140, + "step": 4932, + "time_per_iteration": 3.131101369857788 + }, + { + "auxiliary_loss_clip": 0.01129719, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.05232465, + "balance_loss_mlp": 1.02383482, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 1.6732319745860373, + "language_loss": 0.81595957, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83765614, + "num_input_tokens_seen": 106298190, + "step": 4933, + "time_per_iteration": 2.6850743293762207 + }, + { + "auxiliary_loss_clip": 0.01099218, + "auxiliary_loss_mlp": 0.01039454, + "balance_loss_clip": 1.05097115, + "balance_loss_mlp": 1.02246654, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 2.2959950742046087, + "language_loss": 0.75198424, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77337092, + "num_input_tokens_seen": 106319065, + "step": 4934, + "time_per_iteration": 2.605940818786621 + }, + { + "auxiliary_loss_clip": 0.0112284, + "auxiliary_loss_mlp": 0.01047519, + "balance_loss_clip": 1.05056453, + "balance_loss_mlp": 1.03060317, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 2.7918343307260054, + "language_loss": 0.6225099, + "learning_rate": 3.299066374184594e-06, + "loss": 0.6442135, + "num_input_tokens_seen": 106338040, + "step": 4935, + "time_per_iteration": 2.5050413608551025 + }, + { + "auxiliary_loss_clip": 0.01133255, + "auxiliary_loss_mlp": 0.01040918, + "balance_loss_clip": 1.05640364, + "balance_loss_mlp": 1.02463388, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.591892522969879, + "language_loss": 0.79916948, + "learning_rate": 3.2987702288932e-06, + "loss": 0.82091123, + "num_input_tokens_seen": 106358900, + "step": 4936, + "time_per_iteration": 2.570347785949707 + }, + { + "auxiliary_loss_clip": 0.01100309, + "auxiliary_loss_mlp": 0.01044156, + "balance_loss_clip": 1.05266249, + "balance_loss_mlp": 1.02743053, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.7403581471184237, + "language_loss": 0.74262518, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76406986, + "num_input_tokens_seen": 106381805, + "step": 4937, + "time_per_iteration": 2.7178421020507812 + }, + { + "auxiliary_loss_clip": 0.01093472, + "auxiliary_loss_mlp": 0.01040379, + "balance_loss_clip": 1.05285072, + "balance_loss_mlp": 1.02434528, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.7903683951136902, + "language_loss": 0.78495473, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80629325, + "num_input_tokens_seen": 106402365, + "step": 4938, + "time_per_iteration": 2.5946848392486572 + }, + { + "auxiliary_loss_clip": 0.01117617, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.05243003, + "balance_loss_mlp": 1.03058076, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 1.9106553872095404, + "language_loss": 0.76864159, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79030776, + "num_input_tokens_seen": 106419800, + "step": 4939, + "time_per_iteration": 2.492949962615967 + }, + { + "auxiliary_loss_clip": 0.01110174, + "auxiliary_loss_mlp": 0.01038549, + "balance_loss_clip": 1.04796088, + "balance_loss_mlp": 1.02203822, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.5509859638622931, + "language_loss": 0.77940232, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80088949, + "num_input_tokens_seen": 106440300, + "step": 4940, + "time_per_iteration": 2.581454277038574 + }, + { + "auxiliary_loss_clip": 0.01120552, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.05505061, + "balance_loss_mlp": 1.01959157, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 2.0533067390987934, + "language_loss": 0.75469947, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77628618, + "num_input_tokens_seen": 106460035, + "step": 4941, + "time_per_iteration": 2.548290491104126 + }, + { + "auxiliary_loss_clip": 0.01137466, + "auxiliary_loss_mlp": 0.01051414, + "balance_loss_clip": 1.05586314, + "balance_loss_mlp": 1.03355634, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.9684661994336725, + "language_loss": 0.7375046, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.75939333, + "num_input_tokens_seen": 106481095, + "step": 4942, + "time_per_iteration": 2.6438302993774414 + }, + { + "auxiliary_loss_clip": 0.01109088, + "auxiliary_loss_mlp": 0.01041705, + "balance_loss_clip": 1.05219448, + "balance_loss_mlp": 1.02381134, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 1.874846852725113, + "language_loss": 0.7062549, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72776282, + "num_input_tokens_seen": 106501590, + "step": 4943, + "time_per_iteration": 2.5863354206085205 + }, + { + "auxiliary_loss_clip": 0.01123152, + "auxiliary_loss_mlp": 0.01041701, + "balance_loss_clip": 1.05334139, + "balance_loss_mlp": 1.02409339, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 2.1762859806327066, + "language_loss": 0.79800946, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.81965804, + "num_input_tokens_seen": 106519430, + "step": 4944, + "time_per_iteration": 2.5134217739105225 + }, + { + "auxiliary_loss_clip": 0.01118326, + "auxiliary_loss_mlp": 0.01039441, + "balance_loss_clip": 1.05307078, + "balance_loss_mlp": 1.02410436, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.889554617786966, + "language_loss": 0.8320154, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85359311, + "num_input_tokens_seen": 106535870, + "step": 4945, + "time_per_iteration": 2.525954246520996 + }, + { + "auxiliary_loss_clip": 0.01094821, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.05245841, + "balance_loss_mlp": 1.02166533, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 1.864065288377578, + "language_loss": 0.66733563, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.68865794, + "num_input_tokens_seen": 106553560, + "step": 4946, + "time_per_iteration": 2.5783579349517822 + }, + { + "auxiliary_loss_clip": 0.01133505, + "auxiliary_loss_mlp": 0.00785663, + "balance_loss_clip": 1.05758595, + "balance_loss_mlp": 1.00136709, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 2.062051021734464, + "language_loss": 0.74071503, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75990671, + "num_input_tokens_seen": 106574115, + "step": 4947, + "time_per_iteration": 2.56404709815979 + }, + { + "auxiliary_loss_clip": 0.01113551, + "auxiliary_loss_mlp": 0.01043416, + "balance_loss_clip": 1.05551219, + "balance_loss_mlp": 1.02661955, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.02114431735241, + "language_loss": 0.73119235, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75276196, + "num_input_tokens_seen": 106593070, + "step": 4948, + "time_per_iteration": 2.6593070030212402 + }, + { + "auxiliary_loss_clip": 0.01142288, + "auxiliary_loss_mlp": 0.01036221, + "balance_loss_clip": 1.05497801, + "balance_loss_mlp": 1.02013969, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 2.0666306547312554, + "language_loss": 0.84150136, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.8632865, + "num_input_tokens_seen": 106610695, + "step": 4949, + "time_per_iteration": 2.4879186153411865 + }, + { + "auxiliary_loss_clip": 0.01130567, + "auxiliary_loss_mlp": 0.01039735, + "balance_loss_clip": 1.05477548, + "balance_loss_mlp": 1.02309358, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 2.343101251324142, + "language_loss": 0.70896703, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73067003, + "num_input_tokens_seen": 106631300, + "step": 4950, + "time_per_iteration": 4.054612398147583 + }, + { + "auxiliary_loss_clip": 0.01098371, + "auxiliary_loss_mlp": 0.0104364, + "balance_loss_clip": 1.05559313, + "balance_loss_mlp": 1.02867913, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 10.91381767376052, + "language_loss": 0.82437718, + "learning_rate": 3.294322145875789e-06, + "loss": 0.8457973, + "num_input_tokens_seen": 106650065, + "step": 4951, + "time_per_iteration": 2.610652208328247 + }, + { + "auxiliary_loss_clip": 0.01121467, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.05419064, + "balance_loss_mlp": 1.01909149, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.593105186649093, + "language_loss": 0.74127668, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.76285052, + "num_input_tokens_seen": 106668230, + "step": 4952, + "time_per_iteration": 2.564084529876709 + }, + { + "auxiliary_loss_clip": 0.01067196, + "auxiliary_loss_mlp": 0.01048677, + "balance_loss_clip": 1.04695296, + "balance_loss_mlp": 1.03043795, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.6862202181335038, + "language_loss": 0.83753574, + "learning_rate": 3.293728232937228e-06, + "loss": 0.85869443, + "num_input_tokens_seen": 106687785, + "step": 4953, + "time_per_iteration": 2.666611433029175 + }, + { + "auxiliary_loss_clip": 0.01120756, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_clip": 1.05179453, + "balance_loss_mlp": 1.02517641, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 1.926287199441711, + "language_loss": 0.73555064, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.75717318, + "num_input_tokens_seen": 106706875, + "step": 4954, + "time_per_iteration": 3.990048885345459 + }, + { + "auxiliary_loss_clip": 0.01142161, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.05559826, + "balance_loss_mlp": 1.02240181, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.9209458557384194, + "language_loss": 0.75191277, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77371037, + "num_input_tokens_seen": 106725105, + "step": 4955, + "time_per_iteration": 2.467973232269287 + }, + { + "auxiliary_loss_clip": 0.01104598, + "auxiliary_loss_mlp": 0.01038073, + "balance_loss_clip": 1.05961323, + "balance_loss_mlp": 1.02101409, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 1.6720325507035843, + "language_loss": 0.72271222, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74413902, + "num_input_tokens_seen": 106744780, + "step": 4956, + "time_per_iteration": 2.5937607288360596 + }, + { + "auxiliary_loss_clip": 0.01138209, + "auxiliary_loss_mlp": 0.01043124, + "balance_loss_clip": 1.05819845, + "balance_loss_mlp": 1.0259521, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 2.3330054060063317, + "language_loss": 0.78931618, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81112945, + "num_input_tokens_seen": 106764670, + "step": 4957, + "time_per_iteration": 2.518470525741577 + }, + { + "auxiliary_loss_clip": 0.01137047, + "auxiliary_loss_mlp": 0.01042829, + "balance_loss_clip": 1.056422, + "balance_loss_mlp": 1.02550757, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.6763930700918932, + "language_loss": 0.70319176, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.72499049, + "num_input_tokens_seen": 106783695, + "step": 4958, + "time_per_iteration": 2.5174496173858643 + }, + { + "auxiliary_loss_clip": 0.01107028, + "auxiliary_loss_mlp": 0.01042687, + "balance_loss_clip": 1.05392003, + "balance_loss_mlp": 1.02617645, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.606248745094292, + "language_loss": 0.79100907, + "learning_rate": 3.291945317082743e-06, + "loss": 0.81250626, + "num_input_tokens_seen": 106803150, + "step": 4959, + "time_per_iteration": 3.9788200855255127 + }, + { + "auxiliary_loss_clip": 0.01132724, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.05461717, + "balance_loss_mlp": 1.02844596, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.80700611366441, + "language_loss": 0.79098022, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81275076, + "num_input_tokens_seen": 106820705, + "step": 4960, + "time_per_iteration": 2.505568027496338 + }, + { + "auxiliary_loss_clip": 0.01110101, + "auxiliary_loss_mlp": 0.01048877, + "balance_loss_clip": 1.05027461, + "balance_loss_mlp": 1.03105545, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.258873591243938, + "language_loss": 0.73824745, + "learning_rate": 3.291350619752129e-06, + "loss": 0.75983727, + "num_input_tokens_seen": 106837335, + "step": 4961, + "time_per_iteration": 2.533430814743042 + }, + { + "auxiliary_loss_clip": 0.01132416, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.054124, + "balance_loss_mlp": 1.02456379, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 1.8473149291047697, + "language_loss": 0.6222868, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64401257, + "num_input_tokens_seen": 106856250, + "step": 4962, + "time_per_iteration": 2.522216320037842 + }, + { + "auxiliary_loss_clip": 0.01128806, + "auxiliary_loss_mlp": 0.01043236, + "balance_loss_clip": 1.05492544, + "balance_loss_mlp": 1.02605808, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 1.6967897448012068, + "language_loss": 0.82880533, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85052574, + "num_input_tokens_seen": 106873370, + "step": 4963, + "time_per_iteration": 2.4800546169281006 + }, + { + "auxiliary_loss_clip": 0.01119765, + "auxiliary_loss_mlp": 0.01036559, + "balance_loss_clip": 1.06395853, + "balance_loss_mlp": 1.01951218, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.1257372507610346, + "language_loss": 0.6593107, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68087393, + "num_input_tokens_seen": 106890330, + "step": 4964, + "time_per_iteration": 2.548062562942505 + }, + { + "auxiliary_loss_clip": 0.01129863, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.05492449, + "balance_loss_mlp": 1.01941633, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.665309057445533, + "language_loss": 0.70848197, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.73012161, + "num_input_tokens_seen": 106909190, + "step": 4965, + "time_per_iteration": 2.4998910427093506 + }, + { + "auxiliary_loss_clip": 0.01147775, + "auxiliary_loss_mlp": 0.01047534, + "balance_loss_clip": 1.05967426, + "balance_loss_mlp": 1.03104758, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 2.848381585556099, + "language_loss": 0.66269052, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68464357, + "num_input_tokens_seen": 106927825, + "step": 4966, + "time_per_iteration": 2.4800376892089844 + }, + { + "auxiliary_loss_clip": 0.01149647, + "auxiliary_loss_mlp": 0.01041394, + "balance_loss_clip": 1.06053281, + "balance_loss_mlp": 1.02518129, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 3.0842556771092746, + "language_loss": 0.74845415, + "learning_rate": 3.289565352885785e-06, + "loss": 0.77036452, + "num_input_tokens_seen": 106943155, + "step": 4967, + "time_per_iteration": 2.450439453125 + }, + { + "auxiliary_loss_clip": 0.01114109, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.04908574, + "balance_loss_mlp": 1.01873338, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 2.021950613073649, + "language_loss": 0.70891815, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73040181, + "num_input_tokens_seen": 106960295, + "step": 4968, + "time_per_iteration": 2.5106887817382812 + }, + { + "auxiliary_loss_clip": 0.01126502, + "auxiliary_loss_mlp": 0.01037249, + "balance_loss_clip": 1.05267835, + "balance_loss_mlp": 1.02020216, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 1.9609844201370306, + "language_loss": 0.77070284, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.7923404, + "num_input_tokens_seen": 106982870, + "step": 4969, + "time_per_iteration": 2.6108295917510986 + }, + { + "auxiliary_loss_clip": 0.01145075, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.05837893, + "balance_loss_mlp": 1.01875174, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.6044721575705965, + "language_loss": 0.69941515, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.72120231, + "num_input_tokens_seen": 107002405, + "step": 4970, + "time_per_iteration": 4.081451654434204 + }, + { + "auxiliary_loss_clip": 0.01137915, + "auxiliary_loss_mlp": 0.01043218, + "balance_loss_clip": 1.0575943, + "balance_loss_mlp": 1.02563453, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.5384467254782295, + "language_loss": 0.84881711, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.87062842, + "num_input_tokens_seen": 107017310, + "step": 4971, + "time_per_iteration": 2.485564708709717 + }, + { + "auxiliary_loss_clip": 0.01123047, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_clip": 1.06143594, + "balance_loss_mlp": 1.02689803, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 1.9214182449837542, + "language_loss": 0.79449034, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81616682, + "num_input_tokens_seen": 107034645, + "step": 4972, + "time_per_iteration": 2.5333757400512695 + }, + { + "auxiliary_loss_clip": 0.01147109, + "auxiliary_loss_mlp": 0.01045257, + "balance_loss_clip": 1.05861425, + "balance_loss_mlp": 1.02949786, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 2.1434223802020367, + "language_loss": 0.84997213, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87189579, + "num_input_tokens_seen": 107051125, + "step": 4973, + "time_per_iteration": 2.443027973175049 + }, + { + "auxiliary_loss_clip": 0.01122386, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.06173813, + "balance_loss_mlp": 1.0166018, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.6150100874578144, + "language_loss": 0.77333313, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79488158, + "num_input_tokens_seen": 107068815, + "step": 4974, + "time_per_iteration": 2.5293774604797363 + }, + { + "auxiliary_loss_clip": 0.01121502, + "auxiliary_loss_mlp": 0.00786548, + "balance_loss_clip": 1.05695498, + "balance_loss_mlp": 1.00153184, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 1.7739622107020443, + "language_loss": 0.7260325, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74511302, + "num_input_tokens_seen": 107090420, + "step": 4975, + "time_per_iteration": 2.5993430614471436 + }, + { + "auxiliary_loss_clip": 0.0113619, + "auxiliary_loss_mlp": 0.01039036, + "balance_loss_clip": 1.05903935, + "balance_loss_mlp": 1.02231133, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 2.5420153162618435, + "language_loss": 0.76048207, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78223431, + "num_input_tokens_seen": 107107255, + "step": 4976, + "time_per_iteration": 2.4885683059692383 + }, + { + "auxiliary_loss_clip": 0.01136053, + "auxiliary_loss_mlp": 0.01039021, + "balance_loss_clip": 1.06105363, + "balance_loss_mlp": 1.02348781, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 1.982886602269771, + "language_loss": 0.86125427, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88300496, + "num_input_tokens_seen": 107123840, + "step": 4977, + "time_per_iteration": 2.4720234870910645 + }, + { + "auxiliary_loss_clip": 0.01123703, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.05908251, + "balance_loss_mlp": 1.02122927, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.5286640426527234, + "language_loss": 0.68037611, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.70198178, + "num_input_tokens_seen": 107143475, + "step": 4978, + "time_per_iteration": 2.5462496280670166 + }, + { + "auxiliary_loss_clip": 0.01130407, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.06295967, + "balance_loss_mlp": 1.02073538, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 2.1765838867893437, + "language_loss": 0.75985569, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78153431, + "num_input_tokens_seen": 107161725, + "step": 4979, + "time_per_iteration": 2.582728624343872 + }, + { + "auxiliary_loss_clip": 0.0109722, + "auxiliary_loss_mlp": 0.01041347, + "balance_loss_clip": 1.05209947, + "balance_loss_mlp": 1.02297711, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 2.0122350575259116, + "language_loss": 0.68343484, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70482051, + "num_input_tokens_seen": 107183935, + "step": 4980, + "time_per_iteration": 2.6814537048339844 + }, + { + "auxiliary_loss_clip": 0.01134407, + "auxiliary_loss_mlp": 0.00781973, + "balance_loss_clip": 1.06102216, + "balance_loss_mlp": 1.00136149, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 1.8992728605396887, + "language_loss": 0.73635769, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75552142, + "num_input_tokens_seen": 107204285, + "step": 4981, + "time_per_iteration": 2.5647003650665283 + }, + { + "auxiliary_loss_clip": 0.01136177, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.05614996, + "balance_loss_mlp": 1.02519822, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.5956080340507488, + "language_loss": 0.86342192, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.88519502, + "num_input_tokens_seen": 107225265, + "step": 4982, + "time_per_iteration": 2.541883945465088 + }, + { + "auxiliary_loss_clip": 0.01126393, + "auxiliary_loss_mlp": 0.01038622, + "balance_loss_clip": 1.05909848, + "balance_loss_mlp": 1.02096736, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.204413345364479, + "language_loss": 0.86647058, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88812077, + "num_input_tokens_seen": 107241335, + "step": 4983, + "time_per_iteration": 2.5100297927856445 + }, + { + "auxiliary_loss_clip": 0.01133084, + "auxiliary_loss_mlp": 0.01045263, + "balance_loss_clip": 1.06221056, + "balance_loss_mlp": 1.03014708, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 5.890108731246762, + "language_loss": 0.78740788, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80919135, + "num_input_tokens_seen": 107259375, + "step": 4984, + "time_per_iteration": 2.5218610763549805 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01044362, + "balance_loss_clip": 1.05584943, + "balance_loss_mlp": 1.02710032, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.566139882987968, + "language_loss": 0.785339, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80693269, + "num_input_tokens_seen": 107279890, + "step": 4985, + "time_per_iteration": 2.567540168762207 + }, + { + "auxiliary_loss_clip": 0.01089914, + "auxiliary_loss_mlp": 0.01048175, + "balance_loss_clip": 1.04959798, + "balance_loss_mlp": 1.02739644, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 2.3244265454579867, + "language_loss": 0.71584213, + "learning_rate": 3.283900405580837e-06, + "loss": 0.73722297, + "num_input_tokens_seen": 107303430, + "step": 4986, + "time_per_iteration": 2.8877146244049072 + }, + { + "auxiliary_loss_clip": 0.01125841, + "auxiliary_loss_mlp": 0.01045131, + "balance_loss_clip": 1.05688918, + "balance_loss_mlp": 1.02810764, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.8874987982024303, + "language_loss": 0.73165309, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75336283, + "num_input_tokens_seen": 107323700, + "step": 4987, + "time_per_iteration": 2.5437183380126953 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01037105, + "balance_loss_clip": 1.05704677, + "balance_loss_mlp": 1.02135777, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 1.8351170257411404, + "language_loss": 0.80324125, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82478476, + "num_input_tokens_seen": 107341965, + "step": 4988, + "time_per_iteration": 2.5278968811035156 + }, + { + "auxiliary_loss_clip": 0.01122112, + "auxiliary_loss_mlp": 0.00786149, + "balance_loss_clip": 1.05702245, + "balance_loss_mlp": 1.00156856, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.5456565594062701, + "language_loss": 0.71115863, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.73024124, + "num_input_tokens_seen": 107362615, + "step": 4989, + "time_per_iteration": 4.011227369308472 + }, + { + "auxiliary_loss_clip": 0.01113487, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.05949342, + "balance_loss_mlp": 1.02442586, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 3.278778856449789, + "language_loss": 0.85305679, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87460244, + "num_input_tokens_seen": 107378980, + "step": 4990, + "time_per_iteration": 2.5400776863098145 + }, + { + "auxiliary_loss_clip": 0.0113408, + "auxiliary_loss_mlp": 0.01038659, + "balance_loss_clip": 1.05505097, + "balance_loss_mlp": 1.02140963, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.9897237003878483, + "language_loss": 0.67445773, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69618511, + "num_input_tokens_seen": 107397640, + "step": 4991, + "time_per_iteration": 2.5387203693389893 + }, + { + "auxiliary_loss_clip": 0.01125057, + "auxiliary_loss_mlp": 0.01042118, + "balance_loss_clip": 1.0530746, + "balance_loss_mlp": 1.02421272, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.7634110843125672, + "language_loss": 0.7862677, + "learning_rate": 3.28210781975363e-06, + "loss": 0.80793941, + "num_input_tokens_seen": 107416020, + "step": 4992, + "time_per_iteration": 2.542387008666992 + }, + { + "auxiliary_loss_clip": 0.01148139, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.05960035, + "balance_loss_mlp": 1.0241766, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 1.950938802125223, + "language_loss": 0.82716286, + "learning_rate": 3.281808885221193e-06, + "loss": 0.8490479, + "num_input_tokens_seen": 107436340, + "step": 4993, + "time_per_iteration": 2.4785430431365967 + }, + { + "auxiliary_loss_clip": 0.01099046, + "auxiliary_loss_mlp": 0.0104645, + "balance_loss_clip": 1.05123329, + "balance_loss_mlp": 1.02841353, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.5353419402784514, + "language_loss": 0.8578639, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.87931883, + "num_input_tokens_seen": 107454585, + "step": 4994, + "time_per_iteration": 4.13312292098999 + }, + { + "auxiliary_loss_clip": 0.0112127, + "auxiliary_loss_mlp": 0.01039177, + "balance_loss_clip": 1.06013143, + "balance_loss_mlp": 1.02266669, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.4716717949839349, + "language_loss": 0.81260693, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83421141, + "num_input_tokens_seen": 107477180, + "step": 4995, + "time_per_iteration": 2.6301522254943848 + }, + { + "auxiliary_loss_clip": 0.01123367, + "auxiliary_loss_mlp": 0.01035795, + "balance_loss_clip": 1.05849135, + "balance_loss_mlp": 1.0194273, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.7282821427637758, + "language_loss": 0.67212546, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.693717, + "num_input_tokens_seen": 107500250, + "step": 4996, + "time_per_iteration": 2.8982691764831543 + }, + { + "auxiliary_loss_clip": 0.0112029, + "auxiliary_loss_mlp": 0.01044767, + "balance_loss_clip": 1.05721498, + "balance_loss_mlp": 1.02738667, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 1.8363905904445856, + "language_loss": 0.75529122, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77694178, + "num_input_tokens_seen": 107520070, + "step": 4997, + "time_per_iteration": 2.550759792327881 + }, + { + "auxiliary_loss_clip": 0.01133813, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.05674899, + "balance_loss_mlp": 1.02780032, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 1.7511977096057445, + "language_loss": 0.77793038, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.79970551, + "num_input_tokens_seen": 107539285, + "step": 4998, + "time_per_iteration": 3.9070305824279785 + }, + { + "auxiliary_loss_clip": 0.01142743, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.05817401, + "balance_loss_mlp": 1.02374125, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 1.5967445224971404, + "language_loss": 0.73333162, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75515229, + "num_input_tokens_seen": 107560260, + "step": 4999, + "time_per_iteration": 2.506221294403076 + }, + { + "auxiliary_loss_clip": 0.01135418, + "auxiliary_loss_mlp": 0.01040767, + "balance_loss_clip": 1.0559299, + "balance_loss_mlp": 1.02449512, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.7928543650658098, + "language_loss": 0.7589972, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.78075904, + "num_input_tokens_seen": 107579260, + "step": 5000, + "time_per_iteration": 2.502995729446411 + }, + { + "auxiliary_loss_clip": 0.01142071, + "auxiliary_loss_mlp": 0.01038572, + "balance_loss_clip": 1.05827391, + "balance_loss_mlp": 1.0240519, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.819663982048067, + "language_loss": 0.81675756, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.83856404, + "num_input_tokens_seen": 107595245, + "step": 5001, + "time_per_iteration": 2.435581684112549 + }, + { + "auxiliary_loss_clip": 0.01133143, + "auxiliary_loss_mlp": 0.0104136, + "balance_loss_clip": 1.05782902, + "balance_loss_mlp": 1.02469444, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.6427402600757715, + "language_loss": 0.80509317, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82683825, + "num_input_tokens_seen": 107613985, + "step": 5002, + "time_per_iteration": 2.518935441970825 + }, + { + "auxiliary_loss_clip": 0.01091482, + "auxiliary_loss_mlp": 0.01034866, + "balance_loss_clip": 1.05029738, + "balance_loss_mlp": 1.01728225, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 2.4717496260048573, + "language_loss": 0.71293283, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73419631, + "num_input_tokens_seen": 107631435, + "step": 5003, + "time_per_iteration": 2.5837275981903076 + }, + { + "auxiliary_loss_clip": 0.01113334, + "auxiliary_loss_mlp": 0.01038067, + "balance_loss_clip": 1.05488706, + "balance_loss_mlp": 1.02197361, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 2.992318961391801, + "language_loss": 0.70653176, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72804582, + "num_input_tokens_seen": 107650530, + "step": 5004, + "time_per_iteration": 2.624202013015747 + }, + { + "auxiliary_loss_clip": 0.011262, + "auxiliary_loss_mlp": 0.01044292, + "balance_loss_clip": 1.05965042, + "balance_loss_mlp": 1.02706623, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 7.256215401622421, + "language_loss": 0.82206637, + "learning_rate": 3.278217882782715e-06, + "loss": 0.84377128, + "num_input_tokens_seen": 107662240, + "step": 5005, + "time_per_iteration": 2.523477792739868 + }, + { + "auxiliary_loss_clip": 0.01132263, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.05683994, + "balance_loss_mlp": 1.0231142, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.697683155811907, + "language_loss": 0.74853957, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77024394, + "num_input_tokens_seen": 107680330, + "step": 5006, + "time_per_iteration": 2.536095380783081 + }, + { + "auxiliary_loss_clip": 0.0110415, + "auxiliary_loss_mlp": 0.00783997, + "balance_loss_clip": 1.05360126, + "balance_loss_mlp": 1.00105214, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 2.182386548291203, + "language_loss": 0.71268368, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.73156512, + "num_input_tokens_seen": 107700020, + "step": 5007, + "time_per_iteration": 2.6218464374542236 + }, + { + "auxiliary_loss_clip": 0.01134157, + "auxiliary_loss_mlp": 0.01042491, + "balance_loss_clip": 1.05815494, + "balance_loss_mlp": 1.02589703, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 6.5067735711862955, + "language_loss": 0.76097739, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78274387, + "num_input_tokens_seen": 107718575, + "step": 5008, + "time_per_iteration": 2.568333387374878 + }, + { + "auxiliary_loss_clip": 0.01131549, + "auxiliary_loss_mlp": 0.01041335, + "balance_loss_clip": 1.05814803, + "balance_loss_mlp": 1.02536106, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 2.1354437868587683, + "language_loss": 0.8479501, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.86967891, + "num_input_tokens_seen": 107738635, + "step": 5009, + "time_per_iteration": 2.5192816257476807 + }, + { + "auxiliary_loss_clip": 0.01139134, + "auxiliary_loss_mlp": 0.01040584, + "balance_loss_clip": 1.05787778, + "balance_loss_mlp": 1.02301288, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 1.9879401424114558, + "language_loss": 0.83957767, + "learning_rate": 3.276719570659604e-06, + "loss": 0.86137486, + "num_input_tokens_seen": 107753415, + "step": 5010, + "time_per_iteration": 3.9933972358703613 + }, + { + "auxiliary_loss_clip": 0.01114823, + "auxiliary_loss_mlp": 0.01036905, + "balance_loss_clip": 1.05758071, + "balance_loss_mlp": 1.02196813, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 2.1780215544345363, + "language_loss": 0.85694796, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87846518, + "num_input_tokens_seen": 107773840, + "step": 5011, + "time_per_iteration": 2.652860403060913 + }, + { + "auxiliary_loss_clip": 0.01124944, + "auxiliary_loss_mlp": 0.01040065, + "balance_loss_clip": 1.05242848, + "balance_loss_mlp": 1.02378082, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 3.3731650291550297, + "language_loss": 0.72245634, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74410647, + "num_input_tokens_seen": 107792020, + "step": 5012, + "time_per_iteration": 2.53423810005188 + }, + { + "auxiliary_loss_clip": 0.01132016, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.05607915, + "balance_loss_mlp": 1.02269459, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 2.1944255934073444, + "language_loss": 0.8730678, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89477742, + "num_input_tokens_seen": 107809595, + "step": 5013, + "time_per_iteration": 2.491495132446289 + }, + { + "auxiliary_loss_clip": 0.01118454, + "auxiliary_loss_mlp": 0.01049695, + "balance_loss_clip": 1.05158067, + "balance_loss_mlp": 1.02957201, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 2.115739226313608, + "language_loss": 0.83054185, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.8522234, + "num_input_tokens_seen": 107827230, + "step": 5014, + "time_per_iteration": 2.508753776550293 + }, + { + "auxiliary_loss_clip": 0.01097185, + "auxiliary_loss_mlp": 0.01042979, + "balance_loss_clip": 1.04959106, + "balance_loss_mlp": 1.02525282, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.783425646449566, + "language_loss": 0.68094373, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70234543, + "num_input_tokens_seen": 107847195, + "step": 5015, + "time_per_iteration": 2.5981457233428955 + }, + { + "auxiliary_loss_clip": 0.01119179, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.05487692, + "balance_loss_mlp": 1.02211833, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.470639729683146, + "language_loss": 0.7488085, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.77038753, + "num_input_tokens_seen": 107866420, + "step": 5016, + "time_per_iteration": 2.546409845352173 + }, + { + "auxiliary_loss_clip": 0.01135553, + "auxiliary_loss_mlp": 0.0103808, + "balance_loss_clip": 1.05493116, + "balance_loss_mlp": 1.0217483, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.77754689840188, + "language_loss": 0.65936214, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.68109852, + "num_input_tokens_seen": 107889090, + "step": 5017, + "time_per_iteration": 2.5850985050201416 + }, + { + "auxiliary_loss_clip": 0.01099601, + "auxiliary_loss_mlp": 0.01051561, + "balance_loss_clip": 1.04908776, + "balance_loss_mlp": 1.03332222, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 1.973044499829905, + "language_loss": 0.68557447, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.70708609, + "num_input_tokens_seen": 107907520, + "step": 5018, + "time_per_iteration": 2.562055826187134 + }, + { + "auxiliary_loss_clip": 0.01140511, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.05577362, + "balance_loss_mlp": 1.02420747, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 2.044624696068382, + "language_loss": 0.7864821, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.80827558, + "num_input_tokens_seen": 107925650, + "step": 5019, + "time_per_iteration": 2.480684995651245 + }, + { + "auxiliary_loss_clip": 0.01117654, + "auxiliary_loss_mlp": 0.01039587, + "balance_loss_clip": 1.05588567, + "balance_loss_mlp": 1.02412558, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 1.941384344201344, + "language_loss": 0.69336784, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.71494025, + "num_input_tokens_seen": 107943975, + "step": 5020, + "time_per_iteration": 2.5467729568481445 + }, + { + "auxiliary_loss_clip": 0.01146749, + "auxiliary_loss_mlp": 0.01046248, + "balance_loss_clip": 1.05653572, + "balance_loss_mlp": 1.03039277, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 2.3457049882461076, + "language_loss": 0.78330362, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.8052336, + "num_input_tokens_seen": 107962950, + "step": 5021, + "time_per_iteration": 2.451472759246826 + }, + { + "auxiliary_loss_clip": 0.01137496, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.05748773, + "balance_loss_mlp": 1.01867533, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 3.4918897241193187, + "language_loss": 0.75589716, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.77761626, + "num_input_tokens_seen": 107979700, + "step": 5022, + "time_per_iteration": 2.4732327461242676 + }, + { + "auxiliary_loss_clip": 0.01144902, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.05561817, + "balance_loss_mlp": 1.02847183, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.8120332379099136, + "language_loss": 0.70014268, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72203732, + "num_input_tokens_seen": 107996645, + "step": 5023, + "time_per_iteration": 2.4427666664123535 + }, + { + "auxiliary_loss_clip": 0.01120111, + "auxiliary_loss_mlp": 0.01038928, + "balance_loss_clip": 1.05300665, + "balance_loss_mlp": 1.02369881, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 1.9439205172321672, + "language_loss": 0.71319807, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73478842, + "num_input_tokens_seen": 108015020, + "step": 5024, + "time_per_iteration": 2.5380663871765137 + }, + { + "auxiliary_loss_clip": 0.01131593, + "auxiliary_loss_mlp": 0.01045607, + "balance_loss_clip": 1.05726814, + "balance_loss_mlp": 1.0295496, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 1.9779033029976305, + "language_loss": 0.73967552, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76144755, + "num_input_tokens_seen": 108036430, + "step": 5025, + "time_per_iteration": 2.5425143241882324 + }, + { + "auxiliary_loss_clip": 0.01135413, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.06039357, + "balance_loss_mlp": 1.02425396, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.673071983247995, + "language_loss": 0.67476475, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.69651419, + "num_input_tokens_seen": 108054250, + "step": 5026, + "time_per_iteration": 2.515976905822754 + }, + { + "auxiliary_loss_clip": 0.01139802, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.06050229, + "balance_loss_mlp": 1.02567399, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.7865114520386165, + "language_loss": 0.85242146, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87423861, + "num_input_tokens_seen": 108071495, + "step": 5027, + "time_per_iteration": 2.4935495853424072 + }, + { + "auxiliary_loss_clip": 0.01109961, + "auxiliary_loss_mlp": 0.01039131, + "balance_loss_clip": 1.05623066, + "balance_loss_mlp": 1.02468324, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.7428408708249299, + "language_loss": 0.78536165, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80685258, + "num_input_tokens_seen": 108092135, + "step": 5028, + "time_per_iteration": 2.6468374729156494 + }, + { + "auxiliary_loss_clip": 0.01122101, + "auxiliary_loss_mlp": 0.01044369, + "balance_loss_clip": 1.05598557, + "balance_loss_mlp": 1.02796626, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 1.7866048752993167, + "language_loss": 0.76946819, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.79113287, + "num_input_tokens_seen": 108112945, + "step": 5029, + "time_per_iteration": 4.12734580039978 + }, + { + "auxiliary_loss_clip": 0.01121239, + "auxiliary_loss_mlp": 0.01041156, + "balance_loss_clip": 1.05922985, + "balance_loss_mlp": 1.02286959, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 1.9168695020559234, + "language_loss": 0.8195563, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84118021, + "num_input_tokens_seen": 108130325, + "step": 5030, + "time_per_iteration": 2.5722339153289795 + }, + { + "auxiliary_loss_clip": 0.01100071, + "auxiliary_loss_mlp": 0.00783988, + "balance_loss_clip": 1.05360782, + "balance_loss_mlp": 1.00109506, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 2.2561233120129844, + "language_loss": 0.69801509, + "learning_rate": 3.270413459468905e-06, + "loss": 0.7168557, + "num_input_tokens_seen": 108150300, + "step": 5031, + "time_per_iteration": 2.6142220497131348 + }, + { + "auxiliary_loss_clip": 0.0112832, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.05359077, + "balance_loss_mlp": 1.01895952, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.84189498172347, + "language_loss": 0.82234907, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84398389, + "num_input_tokens_seen": 108170330, + "step": 5032, + "time_per_iteration": 2.561858892440796 + }, + { + "auxiliary_loss_clip": 0.01110836, + "auxiliary_loss_mlp": 0.01046585, + "balance_loss_clip": 1.05763698, + "balance_loss_mlp": 1.02845299, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.4983171391414736, + "language_loss": 0.7360369, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75761104, + "num_input_tokens_seen": 108191265, + "step": 5033, + "time_per_iteration": 4.170456171035767 + }, + { + "auxiliary_loss_clip": 0.01127692, + "auxiliary_loss_mlp": 0.01050548, + "balance_loss_clip": 1.05368447, + "balance_loss_mlp": 1.0341208, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.8455334515532242, + "language_loss": 0.74151069, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76329303, + "num_input_tokens_seen": 108211615, + "step": 5034, + "time_per_iteration": 2.541646957397461 + }, + { + "auxiliary_loss_clip": 0.01143959, + "auxiliary_loss_mlp": 0.01037667, + "balance_loss_clip": 1.05591726, + "balance_loss_mlp": 1.02191997, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.9980714283479244, + "language_loss": 0.72090364, + "learning_rate": 3.269209883493352e-06, + "loss": 0.74271989, + "num_input_tokens_seen": 108231080, + "step": 5035, + "time_per_iteration": 2.5137174129486084 + }, + { + "auxiliary_loss_clip": 0.01131857, + "auxiliary_loss_mlp": 0.01039271, + "balance_loss_clip": 1.0553534, + "balance_loss_mlp": 1.02409613, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 4.041370907008333, + "language_loss": 0.87276149, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89447272, + "num_input_tokens_seen": 108251125, + "step": 5036, + "time_per_iteration": 2.542963743209839 + }, + { + "auxiliary_loss_clip": 0.01103217, + "auxiliary_loss_mlp": 0.01052365, + "balance_loss_clip": 1.05301523, + "balance_loss_mlp": 1.03539002, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 3.493931593031666, + "language_loss": 0.77930915, + "learning_rate": 3.268607806688536e-06, + "loss": 0.80086493, + "num_input_tokens_seen": 108272545, + "step": 5037, + "time_per_iteration": 2.5849204063415527 + }, + { + "auxiliary_loss_clip": 0.0110596, + "auxiliary_loss_mlp": 0.01047031, + "balance_loss_clip": 1.05051231, + "balance_loss_mlp": 1.03077078, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 2.1857777586079172, + "language_loss": 0.77475286, + "learning_rate": 3.268306696121816e-06, + "loss": 0.79628277, + "num_input_tokens_seen": 108289725, + "step": 5038, + "time_per_iteration": 3.9045298099517822 + }, + { + "auxiliary_loss_clip": 0.01115244, + "auxiliary_loss_mlp": 0.01038793, + "balance_loss_clip": 1.05191803, + "balance_loss_mlp": 1.02353406, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 2.1623867904844154, + "language_loss": 0.74191272, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76345319, + "num_input_tokens_seen": 108310690, + "step": 5039, + "time_per_iteration": 2.5845446586608887 + }, + { + "auxiliary_loss_clip": 0.01140779, + "auxiliary_loss_mlp": 0.00781367, + "balance_loss_clip": 1.05779099, + "balance_loss_mlp": 1.00105047, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 3.3980887848151626, + "language_loss": 0.79757607, + "learning_rate": 3.267704330716847e-06, + "loss": 0.81679755, + "num_input_tokens_seen": 108328905, + "step": 5040, + "time_per_iteration": 2.486142158508301 + }, + { + "auxiliary_loss_clip": 0.01118491, + "auxiliary_loss_mlp": 0.01039666, + "balance_loss_clip": 1.05538201, + "balance_loss_mlp": 1.02536702, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 1.7671581779573677, + "language_loss": 0.81914872, + "learning_rate": 3.267403075901438e-06, + "loss": 0.84073031, + "num_input_tokens_seen": 108346680, + "step": 5041, + "time_per_iteration": 2.5261166095733643 + }, + { + "auxiliary_loss_clip": 0.01057732, + "auxiliary_loss_mlp": 0.01014529, + "balance_loss_clip": 1.0612359, + "balance_loss_mlp": 1.0123955, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7593613637900173, + "language_loss": 0.59476614, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61548877, + "num_input_tokens_seen": 108413885, + "step": 5042, + "time_per_iteration": 3.265795946121216 + }, + { + "auxiliary_loss_clip": 0.01145256, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.05733037, + "balance_loss_mlp": 1.01940036, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.9000584320723672, + "language_loss": 0.7161634, + "learning_rate": 3.266800422101892e-06, + "loss": 0.7379663, + "num_input_tokens_seen": 108433640, + "step": 5043, + "time_per_iteration": 2.4901206493377686 + }, + { + "auxiliary_loss_clip": 0.01103338, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.05656958, + "balance_loss_mlp": 1.01807129, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 2.251522636386573, + "language_loss": 0.69442993, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71579695, + "num_input_tokens_seen": 108452640, + "step": 5044, + "time_per_iteration": 2.768709659576416 + }, + { + "auxiliary_loss_clip": 0.01128551, + "auxiliary_loss_mlp": 0.01037612, + "balance_loss_clip": 1.05367005, + "balance_loss_mlp": 1.0227108, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.4103706846542494, + "language_loss": 0.77197337, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79363501, + "num_input_tokens_seen": 108472470, + "step": 5045, + "time_per_iteration": 2.523869276046753 + }, + { + "auxiliary_loss_clip": 0.01143129, + "auxiliary_loss_mlp": 0.00781466, + "balance_loss_clip": 1.05567098, + "balance_loss_mlp": 1.0011946, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.784447723620555, + "language_loss": 0.72678381, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74602973, + "num_input_tokens_seen": 108493025, + "step": 5046, + "time_per_iteration": 2.5271549224853516 + }, + { + "auxiliary_loss_clip": 0.01133473, + "auxiliary_loss_mlp": 0.01042124, + "balance_loss_clip": 1.05516839, + "balance_loss_mlp": 1.02464771, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 1.864186590158815, + "language_loss": 0.81309628, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.83485222, + "num_input_tokens_seen": 108513480, + "step": 5047, + "time_per_iteration": 2.5774149894714355 + }, + { + "auxiliary_loss_clip": 0.01086491, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.04979205, + "balance_loss_mlp": 1.02566099, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 1.8075754870084524, + "language_loss": 0.72196597, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74323922, + "num_input_tokens_seen": 108533155, + "step": 5048, + "time_per_iteration": 2.617940902709961 + }, + { + "auxiliary_loss_clip": 0.01117987, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.05201578, + "balance_loss_mlp": 1.01796746, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.8558826637269972, + "language_loss": 0.75798571, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.7794888, + "num_input_tokens_seen": 108551900, + "step": 5049, + "time_per_iteration": 3.9789671897888184 + }, + { + "auxiliary_loss_clip": 0.01132438, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.05292892, + "balance_loss_mlp": 1.02147782, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.675631995672188, + "language_loss": 0.81940734, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.841097, + "num_input_tokens_seen": 108574005, + "step": 5050, + "time_per_iteration": 2.5596823692321777 + }, + { + "auxiliary_loss_clip": 0.01109837, + "auxiliary_loss_mlp": 0.0103528, + "balance_loss_clip": 1.05621529, + "balance_loss_mlp": 1.0190798, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.4854168457996075, + "language_loss": 0.74072337, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.76217461, + "num_input_tokens_seen": 108592715, + "step": 5051, + "time_per_iteration": 2.5674448013305664 + }, + { + "auxiliary_loss_clip": 0.01084399, + "auxiliary_loss_mlp": 0.00782697, + "balance_loss_clip": 1.05008769, + "balance_loss_mlp": 1.00128198, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 2.719840997835246, + "language_loss": 0.76597196, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78464293, + "num_input_tokens_seen": 108611770, + "step": 5052, + "time_per_iteration": 2.651369333267212 + }, + { + "auxiliary_loss_clip": 0.01141717, + "auxiliary_loss_mlp": 0.01040965, + "balance_loss_clip": 1.05248964, + "balance_loss_mlp": 1.02534819, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 2.21569492933456, + "language_loss": 0.83082098, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.85264778, + "num_input_tokens_seen": 108629070, + "step": 5053, + "time_per_iteration": 2.4361753463745117 + }, + { + "auxiliary_loss_clip": 0.01118661, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.05408764, + "balance_loss_mlp": 1.02212584, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.610607853654712, + "language_loss": 0.70958942, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.73115551, + "num_input_tokens_seen": 108646315, + "step": 5054, + "time_per_iteration": 2.499926805496216 + }, + { + "auxiliary_loss_clip": 0.01142869, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.05506802, + "balance_loss_mlp": 1.02053857, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 2.6972851663876423, + "language_loss": 0.69100988, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71280015, + "num_input_tokens_seen": 108665920, + "step": 5055, + "time_per_iteration": 2.5071253776550293 + }, + { + "auxiliary_loss_clip": 0.01114533, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.05337167, + "balance_loss_mlp": 1.01817727, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 1.8565786523364922, + "language_loss": 0.67713094, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69861621, + "num_input_tokens_seen": 108683485, + "step": 5056, + "time_per_iteration": 2.4995193481445312 + }, + { + "auxiliary_loss_clip": 0.01118565, + "auxiliary_loss_mlp": 0.01042678, + "balance_loss_clip": 1.05512905, + "balance_loss_mlp": 1.02747941, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.7868255508711128, + "language_loss": 0.82542413, + "learning_rate": 3.262576470461507e-06, + "loss": 0.8470366, + "num_input_tokens_seen": 108702700, + "step": 5057, + "time_per_iteration": 2.5442488193511963 + }, + { + "auxiliary_loss_clip": 0.01117874, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.05300403, + "balance_loss_mlp": 1.02146947, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 1.8066048672061255, + "language_loss": 0.89103329, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91257966, + "num_input_tokens_seen": 108721860, + "step": 5058, + "time_per_iteration": 2.569431781768799 + }, + { + "auxiliary_loss_clip": 0.01109669, + "auxiliary_loss_mlp": 0.01040721, + "balance_loss_clip": 1.05381107, + "balance_loss_mlp": 1.02540267, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 2.5787668619650197, + "language_loss": 0.70935762, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73086154, + "num_input_tokens_seen": 108743215, + "step": 5059, + "time_per_iteration": 2.647433042526245 + }, + { + "auxiliary_loss_clip": 0.01086504, + "auxiliary_loss_mlp": 0.01038456, + "balance_loss_clip": 1.05043912, + "balance_loss_mlp": 1.02345955, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 1.8499266457192531, + "language_loss": 0.72701967, + "learning_rate": 3.26167011603268e-06, + "loss": 0.74826926, + "num_input_tokens_seen": 108765505, + "step": 5060, + "time_per_iteration": 2.6931867599487305 + }, + { + "auxiliary_loss_clip": 0.01143944, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.05701256, + "balance_loss_mlp": 1.01915741, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 1.9506690022788238, + "language_loss": 0.77066076, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.7924338, + "num_input_tokens_seen": 108783370, + "step": 5061, + "time_per_iteration": 2.529414415359497 + }, + { + "auxiliary_loss_clip": 0.01111114, + "auxiliary_loss_mlp": 0.01037938, + "balance_loss_clip": 1.05734324, + "balance_loss_mlp": 1.02115297, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 1.9530396490690856, + "language_loss": 0.82296997, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84446049, + "num_input_tokens_seen": 108797430, + "step": 5062, + "time_per_iteration": 2.561800479888916 + }, + { + "auxiliary_loss_clip": 0.01140142, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.05534554, + "balance_loss_mlp": 1.01834416, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 2.219053806675268, + "language_loss": 0.74287426, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76460195, + "num_input_tokens_seen": 108816945, + "step": 5063, + "time_per_iteration": 2.5682411193847656 + }, + { + "auxiliary_loss_clip": 0.01131072, + "auxiliary_loss_mlp": 0.00780263, + "balance_loss_clip": 1.0565263, + "balance_loss_mlp": 1.00122643, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.7241822824969524, + "language_loss": 0.83933657, + "learning_rate": 3.26046097371721e-06, + "loss": 0.85844994, + "num_input_tokens_seen": 108836615, + "step": 5064, + "time_per_iteration": 2.522736072540283 + }, + { + "auxiliary_loss_clip": 0.01130343, + "auxiliary_loss_mlp": 0.0103665, + "balance_loss_clip": 1.0551213, + "balance_loss_mlp": 1.02068782, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 2.0162331486239835, + "language_loss": 0.75736737, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.7790373, + "num_input_tokens_seen": 108855165, + "step": 5065, + "time_per_iteration": 2.517958879470825 + }, + { + "auxiliary_loss_clip": 0.01114983, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.05178571, + "balance_loss_mlp": 1.02412522, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 1.718899778209846, + "language_loss": 0.62171882, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.6432721, + "num_input_tokens_seen": 108874690, + "step": 5066, + "time_per_iteration": 2.6095807552337646 + }, + { + "auxiliary_loss_clip": 0.01117546, + "auxiliary_loss_mlp": 0.01044326, + "balance_loss_clip": 1.05439019, + "balance_loss_mlp": 1.0287807, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 2.1112641330266095, + "language_loss": 0.8269105, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.84852922, + "num_input_tokens_seen": 108893140, + "step": 5067, + "time_per_iteration": 2.6081326007843018 + }, + { + "auxiliary_loss_clip": 0.01141892, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.05718517, + "balance_loss_mlp": 1.02771771, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 2.047044979584788, + "language_loss": 0.62959135, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65142703, + "num_input_tokens_seen": 108911880, + "step": 5068, + "time_per_iteration": 3.9248571395874023 + }, + { + "auxiliary_loss_clip": 0.01129866, + "auxiliary_loss_mlp": 0.01030902, + "balance_loss_clip": 1.05264246, + "balance_loss_mlp": 1.01639414, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.917800216245386, + "language_loss": 0.74988461, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77149224, + "num_input_tokens_seen": 108930440, + "step": 5069, + "time_per_iteration": 2.5417733192443848 + }, + { + "auxiliary_loss_clip": 0.01104884, + "auxiliary_loss_mlp": 0.01039885, + "balance_loss_clip": 1.04883718, + "balance_loss_mlp": 1.02572286, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 1.93757029928957, + "language_loss": 0.75703311, + "learning_rate": 3.258645826569261e-06, + "loss": 0.77848077, + "num_input_tokens_seen": 108949125, + "step": 5070, + "time_per_iteration": 2.58518385887146 + }, + { + "auxiliary_loss_clip": 0.01146013, + "auxiliary_loss_mlp": 0.00781644, + "balance_loss_clip": 1.05667448, + "balance_loss_mlp": 1.00133896, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 2.2013248258812803, + "language_loss": 0.81484491, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83412153, + "num_input_tokens_seen": 108972190, + "step": 5071, + "time_per_iteration": 2.5710575580596924 + }, + { + "auxiliary_loss_clip": 0.01112397, + "auxiliary_loss_mlp": 0.01045133, + "balance_loss_clip": 1.04804695, + "balance_loss_mlp": 1.02907562, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 3.1237251951813647, + "language_loss": 0.7585088, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78008413, + "num_input_tokens_seen": 108990325, + "step": 5072, + "time_per_iteration": 4.041898965835571 + }, + { + "auxiliary_loss_clip": 0.01101221, + "auxiliary_loss_mlp": 0.01044457, + "balance_loss_clip": 1.05139589, + "balance_loss_mlp": 1.02942443, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 2.1204968721690327, + "language_loss": 0.71016407, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73162091, + "num_input_tokens_seen": 109009505, + "step": 5073, + "time_per_iteration": 2.5829737186431885 + }, + { + "auxiliary_loss_clip": 0.01136567, + "auxiliary_loss_mlp": 0.01044856, + "balance_loss_clip": 1.05713964, + "balance_loss_mlp": 1.02954912, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 1.977857283374623, + "language_loss": 0.76337117, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78518534, + "num_input_tokens_seen": 109026350, + "step": 5074, + "time_per_iteration": 2.50238037109375 + }, + { + "auxiliary_loss_clip": 0.01121654, + "auxiliary_loss_mlp": 0.01034636, + "balance_loss_clip": 1.05776596, + "balance_loss_mlp": 1.02090907, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 1.9159701827923457, + "language_loss": 0.74641883, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76798177, + "num_input_tokens_seen": 109044165, + "step": 5075, + "time_per_iteration": 2.5479896068573 + }, + { + "auxiliary_loss_clip": 0.01147322, + "auxiliary_loss_mlp": 0.01041741, + "balance_loss_clip": 1.05823541, + "balance_loss_mlp": 1.02496803, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.1250731202487287, + "language_loss": 0.75481671, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77670741, + "num_input_tokens_seen": 109060665, + "step": 5076, + "time_per_iteration": 2.4252262115478516 + }, + { + "auxiliary_loss_clip": 0.01120088, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.0527184, + "balance_loss_mlp": 1.02191114, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.6846155769401503, + "language_loss": 0.79278588, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81435061, + "num_input_tokens_seen": 109080035, + "step": 5077, + "time_per_iteration": 3.8912713527679443 + }, + { + "auxiliary_loss_clip": 0.0109037, + "auxiliary_loss_mlp": 0.01033116, + "balance_loss_clip": 1.04720581, + "balance_loss_mlp": 1.01935887, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.7835296169071, + "language_loss": 0.7440933, + "learning_rate": 3.256222958034259e-06, + "loss": 0.76532805, + "num_input_tokens_seen": 109097385, + "step": 5078, + "time_per_iteration": 2.570638656616211 + }, + { + "auxiliary_loss_clip": 0.01088873, + "auxiliary_loss_mlp": 0.01051579, + "balance_loss_clip": 1.04932523, + "balance_loss_mlp": 1.03686237, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 1.933068936555459, + "language_loss": 0.66907895, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69048345, + "num_input_tokens_seen": 109115495, + "step": 5079, + "time_per_iteration": 2.568248748779297 + }, + { + "auxiliary_loss_clip": 0.01132675, + "auxiliary_loss_mlp": 0.01036413, + "balance_loss_clip": 1.05622458, + "balance_loss_mlp": 1.02216184, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 2.0401174292994333, + "language_loss": 0.79910672, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82079756, + "num_input_tokens_seen": 109134235, + "step": 5080, + "time_per_iteration": 2.511793851852417 + }, + { + "auxiliary_loss_clip": 0.01131185, + "auxiliary_loss_mlp": 0.00779378, + "balance_loss_clip": 1.05747521, + "balance_loss_mlp": 1.0011816, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 4.304472071709128, + "language_loss": 0.81077564, + "learning_rate": 3.255313596022074e-06, + "loss": 0.82988131, + "num_input_tokens_seen": 109152760, + "step": 5081, + "time_per_iteration": 2.54135799407959 + }, + { + "auxiliary_loss_clip": 0.01127147, + "auxiliary_loss_mlp": 0.01034423, + "balance_loss_clip": 1.05702984, + "balance_loss_mlp": 1.02046394, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.7309207583968464, + "language_loss": 0.71882302, + "learning_rate": 3.255010380132783e-06, + "loss": 0.7404387, + "num_input_tokens_seen": 109173925, + "step": 5082, + "time_per_iteration": 2.567359685897827 + }, + { + "auxiliary_loss_clip": 0.01135084, + "auxiliary_loss_mlp": 0.01038301, + "balance_loss_clip": 1.0575459, + "balance_loss_mlp": 1.02207661, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 2.1275672286371887, + "language_loss": 0.73162067, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75335449, + "num_input_tokens_seen": 109192510, + "step": 5083, + "time_per_iteration": 2.524033546447754 + }, + { + "auxiliary_loss_clip": 0.01113284, + "auxiliary_loss_mlp": 0.00781888, + "balance_loss_clip": 1.04981613, + "balance_loss_mlp": 1.00123239, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 2.927213069075764, + "language_loss": 0.71373236, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73268414, + "num_input_tokens_seen": 109210885, + "step": 5084, + "time_per_iteration": 2.5053670406341553 + }, + { + "auxiliary_loss_clip": 0.01104633, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.05042779, + "balance_loss_mlp": 1.02024913, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 1.7651507492759992, + "language_loss": 0.78232807, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80374169, + "num_input_tokens_seen": 109229180, + "step": 5085, + "time_per_iteration": 2.538999557495117 + }, + { + "auxiliary_loss_clip": 0.01140534, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.05726326, + "balance_loss_mlp": 1.02387619, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.709894459403338, + "language_loss": 0.78561962, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80740958, + "num_input_tokens_seen": 109249510, + "step": 5086, + "time_per_iteration": 2.471074104309082 + }, + { + "auxiliary_loss_clip": 0.01109505, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.0500803, + "balance_loss_mlp": 1.02517891, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.78539679690062, + "language_loss": 0.76274019, + "learning_rate": 3.253493587064563e-06, + "loss": 0.78424478, + "num_input_tokens_seen": 109268200, + "step": 5087, + "time_per_iteration": 2.532719850540161 + }, + { + "auxiliary_loss_clip": 0.01134687, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.05700874, + "balance_loss_mlp": 1.02323604, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 5.242221501793272, + "language_loss": 0.72628522, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74802279, + "num_input_tokens_seen": 109288370, + "step": 5088, + "time_per_iteration": 4.017417669296265 + }, + { + "auxiliary_loss_clip": 0.01132819, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.05225229, + "balance_loss_mlp": 1.02100945, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.4003061097098404, + "language_loss": 0.79304457, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81473815, + "num_input_tokens_seen": 109306730, + "step": 5089, + "time_per_iteration": 2.457759380340576 + }, + { + "auxiliary_loss_clip": 0.01116422, + "auxiliary_loss_mlp": 0.01040467, + "balance_loss_clip": 1.05603838, + "balance_loss_mlp": 1.02536345, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 1.7528157029264764, + "language_loss": 0.77042949, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79199839, + "num_input_tokens_seen": 109327360, + "step": 5090, + "time_per_iteration": 2.5675647258758545 + }, + { + "auxiliary_loss_clip": 0.01118979, + "auxiliary_loss_mlp": 0.0104518, + "balance_loss_clip": 1.04915905, + "balance_loss_mlp": 1.02982628, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.6621663268137383, + "language_loss": 0.7616905, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78333211, + "num_input_tokens_seen": 109348135, + "step": 5091, + "time_per_iteration": 2.5646815299987793 + }, + { + "auxiliary_loss_clip": 0.01076123, + "auxiliary_loss_mlp": 0.01044042, + "balance_loss_clip": 1.04591775, + "balance_loss_mlp": 1.02711451, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 2.231746759378873, + "language_loss": 0.71508455, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.73628622, + "num_input_tokens_seen": 109366220, + "step": 5092, + "time_per_iteration": 2.6544251441955566 + }, + { + "auxiliary_loss_clip": 0.01121152, + "auxiliary_loss_mlp": 0.0103683, + "balance_loss_clip": 1.05600834, + "balance_loss_mlp": 1.02211404, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 2.163692458993665, + "language_loss": 0.82822222, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84980202, + "num_input_tokens_seen": 109385260, + "step": 5093, + "time_per_iteration": 2.556835174560547 + }, + { + "auxiliary_loss_clip": 0.01141306, + "auxiliary_loss_mlp": 0.00781013, + "balance_loss_clip": 1.05606055, + "balance_loss_mlp": 1.00142479, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 5.193188174587396, + "language_loss": 0.74802822, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.76725137, + "num_input_tokens_seen": 109405025, + "step": 5094, + "time_per_iteration": 2.4944944381713867 + }, + { + "auxiliary_loss_clip": 0.01119104, + "auxiliary_loss_mlp": 0.01041209, + "balance_loss_clip": 1.05617189, + "balance_loss_mlp": 1.02657008, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 2.141784712811641, + "language_loss": 0.76072621, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78232932, + "num_input_tokens_seen": 109422465, + "step": 5095, + "time_per_iteration": 2.5131962299346924 + }, + { + "auxiliary_loss_clip": 0.01128573, + "auxiliary_loss_mlp": 0.0103834, + "balance_loss_clip": 1.05529809, + "balance_loss_mlp": 1.02362967, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 1.7840550688043488, + "language_loss": 0.80358768, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82525682, + "num_input_tokens_seen": 109440575, + "step": 5096, + "time_per_iteration": 2.5110480785369873 + }, + { + "auxiliary_loss_clip": 0.01132542, + "auxiliary_loss_mlp": 0.01033715, + "balance_loss_clip": 1.05603218, + "balance_loss_mlp": 1.01885581, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 4.183337106600484, + "language_loss": 0.81841826, + "learning_rate": 3.250456437422258e-06, + "loss": 0.84008086, + "num_input_tokens_seen": 109459050, + "step": 5097, + "time_per_iteration": 2.4834277629852295 + }, + { + "auxiliary_loss_clip": 0.01143989, + "auxiliary_loss_mlp": 0.01042891, + "balance_loss_clip": 1.05662131, + "balance_loss_mlp": 1.02639222, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 1.8738859583061023, + "language_loss": 0.78072852, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80259734, + "num_input_tokens_seen": 109475860, + "step": 5098, + "time_per_iteration": 2.4665591716766357 + }, + { + "auxiliary_loss_clip": 0.01096666, + "auxiliary_loss_mlp": 0.01036134, + "balance_loss_clip": 1.05209255, + "balance_loss_mlp": 1.02062535, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 1.845353430512984, + "language_loss": 0.84155512, + "learning_rate": 3.249848438115917e-06, + "loss": 0.86288315, + "num_input_tokens_seen": 109494760, + "step": 5099, + "time_per_iteration": 2.6156251430511475 + }, + { + "auxiliary_loss_clip": 0.01141661, + "auxiliary_loss_mlp": 0.01039916, + "balance_loss_clip": 1.05242848, + "balance_loss_mlp": 1.02414465, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 1.7941689340460316, + "language_loss": 0.85682571, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87864155, + "num_input_tokens_seen": 109516480, + "step": 5100, + "time_per_iteration": 2.504977226257324 + }, + { + "auxiliary_loss_clip": 0.01105983, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.04923749, + "balance_loss_mlp": 1.02044463, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 1.8582449816974989, + "language_loss": 0.7894733, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81090456, + "num_input_tokens_seen": 109534615, + "step": 5101, + "time_per_iteration": 2.563063144683838 + }, + { + "auxiliary_loss_clip": 0.0110722, + "auxiliary_loss_mlp": 0.0104441, + "balance_loss_clip": 1.05270672, + "balance_loss_mlp": 1.02758992, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.5876468896158402, + "language_loss": 0.80241668, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.82393295, + "num_input_tokens_seen": 109554040, + "step": 5102, + "time_per_iteration": 2.580643892288208 + }, + { + "auxiliary_loss_clip": 0.01144182, + "auxiliary_loss_mlp": 0.01039542, + "balance_loss_clip": 1.05606771, + "balance_loss_mlp": 1.02267337, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.7616269854190234, + "language_loss": 0.887766, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.90960324, + "num_input_tokens_seen": 109574345, + "step": 5103, + "time_per_iteration": 2.469564199447632 + }, + { + "auxiliary_loss_clip": 0.01123175, + "auxiliary_loss_mlp": 0.01040954, + "balance_loss_clip": 1.05080175, + "balance_loss_mlp": 1.02515876, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 2.0325687815245215, + "language_loss": 0.73921734, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.76085865, + "num_input_tokens_seen": 109593670, + "step": 5104, + "time_per_iteration": 2.5024070739746094 + }, + { + "auxiliary_loss_clip": 0.01127735, + "auxiliary_loss_mlp": 0.00783342, + "balance_loss_clip": 1.04882574, + "balance_loss_mlp": 1.00125456, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 2.3878301105015853, + "language_loss": 0.72680801, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.74591875, + "num_input_tokens_seen": 109613385, + "step": 5105, + "time_per_iteration": 2.521101474761963 + }, + { + "auxiliary_loss_clip": 0.01118303, + "auxiliary_loss_mlp": 0.01043279, + "balance_loss_clip": 1.05451417, + "balance_loss_mlp": 1.02688813, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 2.275574554175822, + "language_loss": 0.87342972, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89504558, + "num_input_tokens_seen": 109632395, + "step": 5106, + "time_per_iteration": 2.5579476356506348 + }, + { + "auxiliary_loss_clip": 0.01107326, + "auxiliary_loss_mlp": 0.01043031, + "balance_loss_clip": 1.04770076, + "balance_loss_mlp": 1.02647305, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.2294358517744364, + "language_loss": 0.71379006, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73529363, + "num_input_tokens_seen": 109651380, + "step": 5107, + "time_per_iteration": 4.1104700565338135 + }, + { + "auxiliary_loss_clip": 0.01102202, + "auxiliary_loss_mlp": 0.01047332, + "balance_loss_clip": 1.04904795, + "balance_loss_mlp": 1.03121507, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 3.5394191172341283, + "language_loss": 0.7232306, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74472594, + "num_input_tokens_seen": 109670240, + "step": 5108, + "time_per_iteration": 2.55212140083313 + }, + { + "auxiliary_loss_clip": 0.01113785, + "auxiliary_loss_mlp": 0.01036175, + "balance_loss_clip": 1.05032456, + "balance_loss_mlp": 1.02100527, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.4665183526858805, + "language_loss": 0.85680264, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87830222, + "num_input_tokens_seen": 109690810, + "step": 5109, + "time_per_iteration": 2.5734658241271973 + }, + { + "auxiliary_loss_clip": 0.01113492, + "auxiliary_loss_mlp": 0.01031097, + "balance_loss_clip": 1.04996991, + "balance_loss_mlp": 1.01624346, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.618378153646493, + "language_loss": 0.67498088, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69642681, + "num_input_tokens_seen": 109711145, + "step": 5110, + "time_per_iteration": 2.5573527812957764 + }, + { + "auxiliary_loss_clip": 0.01128505, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.05420232, + "balance_loss_mlp": 1.01726711, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.5727404948000283, + "language_loss": 0.7699858, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79158211, + "num_input_tokens_seen": 109731425, + "step": 5111, + "time_per_iteration": 4.040214538574219 + }, + { + "auxiliary_loss_clip": 0.01140621, + "auxiliary_loss_mlp": 0.01040229, + "balance_loss_clip": 1.0539844, + "balance_loss_mlp": 1.02488697, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 2.125196065928698, + "language_loss": 0.66911364, + "learning_rate": 3.245891825796765e-06, + "loss": 0.6909222, + "num_input_tokens_seen": 109752720, + "step": 5112, + "time_per_iteration": 2.4935200214385986 + }, + { + "auxiliary_loss_clip": 0.01130006, + "auxiliary_loss_mlp": 0.0104509, + "balance_loss_clip": 1.05059981, + "balance_loss_mlp": 1.02787638, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 3.073921599479014, + "language_loss": 0.7953949, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81714588, + "num_input_tokens_seen": 109772840, + "step": 5113, + "time_per_iteration": 2.5462183952331543 + }, + { + "auxiliary_loss_clip": 0.01102693, + "auxiliary_loss_mlp": 0.00781734, + "balance_loss_clip": 1.0496695, + "balance_loss_mlp": 1.00142324, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 2.2465382290398006, + "language_loss": 0.77220488, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.79104918, + "num_input_tokens_seen": 109790150, + "step": 5114, + "time_per_iteration": 2.512878179550171 + }, + { + "auxiliary_loss_clip": 0.01104236, + "auxiliary_loss_mlp": 0.01038641, + "balance_loss_clip": 1.05013204, + "balance_loss_mlp": 1.02174914, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 2.1214697340658635, + "language_loss": 0.62410432, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64553308, + "num_input_tokens_seen": 109807985, + "step": 5115, + "time_per_iteration": 2.55886173248291 + }, + { + "auxiliary_loss_clip": 0.01131711, + "auxiliary_loss_mlp": 0.01042619, + "balance_loss_clip": 1.05182683, + "balance_loss_mlp": 1.02726471, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 2.5767300165578506, + "language_loss": 0.82896519, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.85070848, + "num_input_tokens_seen": 109825920, + "step": 5116, + "time_per_iteration": 3.907179355621338 + }, + { + "auxiliary_loss_clip": 0.01115384, + "auxiliary_loss_mlp": 0.01046192, + "balance_loss_clip": 1.05213714, + "balance_loss_mlp": 1.0316124, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 1.8648325355435806, + "language_loss": 0.75876522, + "learning_rate": 3.244367924446952e-06, + "loss": 0.78038096, + "num_input_tokens_seen": 109846220, + "step": 5117, + "time_per_iteration": 2.5352745056152344 + }, + { + "auxiliary_loss_clip": 0.01098831, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.04953313, + "balance_loss_mlp": 1.02144611, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 2.2844928903812933, + "language_loss": 0.72652686, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.74790823, + "num_input_tokens_seen": 109863870, + "step": 5118, + "time_per_iteration": 2.549959659576416 + }, + { + "auxiliary_loss_clip": 0.01101794, + "auxiliary_loss_mlp": 0.01036308, + "balance_loss_clip": 1.06026673, + "balance_loss_mlp": 1.021824, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.5439726732067731, + "language_loss": 0.74229574, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76367676, + "num_input_tokens_seen": 109883500, + "step": 5119, + "time_per_iteration": 2.6307730674743652 + }, + { + "auxiliary_loss_clip": 0.011339, + "auxiliary_loss_mlp": 0.01047483, + "balance_loss_clip": 1.054739, + "balance_loss_mlp": 1.03084159, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 1.9454952408712227, + "language_loss": 0.80236638, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82418025, + "num_input_tokens_seen": 109904620, + "step": 5120, + "time_per_iteration": 2.5477256774902344 + }, + { + "auxiliary_loss_clip": 0.01126696, + "auxiliary_loss_mlp": 0.01043711, + "balance_loss_clip": 1.04906511, + "balance_loss_mlp": 1.02859569, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.9477779203567231, + "language_loss": 0.79833102, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82003504, + "num_input_tokens_seen": 109922275, + "step": 5121, + "time_per_iteration": 2.4793715476989746 + }, + { + "auxiliary_loss_clip": 0.0110423, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.04844105, + "balance_loss_mlp": 1.02169192, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.5212884881614406, + "language_loss": 0.82484031, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84624696, + "num_input_tokens_seen": 109944265, + "step": 5122, + "time_per_iteration": 2.5760557651519775 + }, + { + "auxiliary_loss_clip": 0.01080713, + "auxiliary_loss_mlp": 0.01003819, + "balance_loss_clip": 1.05644965, + "balance_loss_mlp": 1.00125623, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7398789407320314, + "language_loss": 0.58649033, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60733563, + "num_input_tokens_seen": 110014160, + "step": 5123, + "time_per_iteration": 3.229128837585449 + }, + { + "auxiliary_loss_clip": 0.01133153, + "auxiliary_loss_mlp": 0.00782842, + "balance_loss_clip": 1.05212271, + "balance_loss_mlp": 1.0012455, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.5839273918306398, + "language_loss": 0.83523554, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85439545, + "num_input_tokens_seen": 110034865, + "step": 5124, + "time_per_iteration": 2.551213264465332 + }, + { + "auxiliary_loss_clip": 0.01145138, + "auxiliary_loss_mlp": 0.01040296, + "balance_loss_clip": 1.05590212, + "balance_loss_mlp": 1.02530515, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 2.394051036292262, + "language_loss": 0.78778982, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.8096441, + "num_input_tokens_seen": 110052930, + "step": 5125, + "time_per_iteration": 2.4749436378479004 + }, + { + "auxiliary_loss_clip": 0.0112579, + "auxiliary_loss_mlp": 0.01037012, + "balance_loss_clip": 1.04812789, + "balance_loss_mlp": 1.01975083, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 1.9327302132671218, + "language_loss": 0.6435219, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66514993, + "num_input_tokens_seen": 110071765, + "step": 5126, + "time_per_iteration": 2.5480079650878906 + }, + { + "auxiliary_loss_clip": 0.01098614, + "auxiliary_loss_mlp": 0.01039921, + "balance_loss_clip": 1.05433607, + "balance_loss_mlp": 1.02534199, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.92220507833291, + "language_loss": 0.86575812, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88714349, + "num_input_tokens_seen": 110092660, + "step": 5127, + "time_per_iteration": 2.624798536300659 + }, + { + "auxiliary_loss_clip": 0.01092336, + "auxiliary_loss_mlp": 0.01040524, + "balance_loss_clip": 1.05183887, + "balance_loss_mlp": 1.02480054, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 1.6898779463421434, + "language_loss": 0.6906271, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.71195567, + "num_input_tokens_seen": 110114960, + "step": 5128, + "time_per_iteration": 4.140835762023926 + }, + { + "auxiliary_loss_clip": 0.01131934, + "auxiliary_loss_mlp": 0.00781689, + "balance_loss_clip": 1.05465269, + "balance_loss_mlp": 1.00136042, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.9030789182264167, + "language_loss": 0.7168026, + "learning_rate": 3.240705750931993e-06, + "loss": 0.73593891, + "num_input_tokens_seen": 110135750, + "step": 5129, + "time_per_iteration": 2.5457231998443604 + }, + { + "auxiliary_loss_clip": 0.01059939, + "auxiliary_loss_mlp": 0.0100528, + "balance_loss_clip": 1.05543542, + "balance_loss_mlp": 1.00289619, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.8302572921656167, + "language_loss": 0.59298277, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61363494, + "num_input_tokens_seen": 110189480, + "step": 5130, + "time_per_iteration": 3.1037847995758057 + }, + { + "auxiliary_loss_clip": 0.01116823, + "auxiliary_loss_mlp": 0.01043044, + "balance_loss_clip": 1.04864383, + "balance_loss_mlp": 1.02659321, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.285743757214614, + "language_loss": 0.73119473, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75279337, + "num_input_tokens_seen": 110206445, + "step": 5131, + "time_per_iteration": 2.523310661315918 + }, + { + "auxiliary_loss_clip": 0.01101805, + "auxiliary_loss_mlp": 0.01036033, + "balance_loss_clip": 1.04905081, + "balance_loss_mlp": 1.02159715, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.5194434562257506, + "language_loss": 0.70841736, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.72979569, + "num_input_tokens_seen": 110226845, + "step": 5132, + "time_per_iteration": 2.57914662361145 + }, + { + "auxiliary_loss_clip": 0.01133945, + "auxiliary_loss_mlp": 0.00780619, + "balance_loss_clip": 1.05116487, + "balance_loss_mlp": 1.00123227, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.7292271945257454, + "language_loss": 0.89851677, + "learning_rate": 3.239483519913136e-06, + "loss": 0.91766238, + "num_input_tokens_seen": 110244095, + "step": 5133, + "time_per_iteration": 2.458472967147827 + }, + { + "auxiliary_loss_clip": 0.01120592, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.05036473, + "balance_loss_mlp": 1.02378654, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 2.009936258239435, + "language_loss": 0.67273134, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69433141, + "num_input_tokens_seen": 110264240, + "step": 5134, + "time_per_iteration": 2.6432418823242188 + }, + { + "auxiliary_loss_clip": 0.01123498, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_clip": 1.05052233, + "balance_loss_mlp": 1.0244329, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 7.998719071712069, + "language_loss": 0.82759106, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.84923005, + "num_input_tokens_seen": 110282450, + "step": 5135, + "time_per_iteration": 2.4566009044647217 + }, + { + "auxiliary_loss_clip": 0.01048053, + "auxiliary_loss_mlp": 0.01002909, + "balance_loss_clip": 1.04289699, + "balance_loss_mlp": 1.00058401, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.708843025669265, + "language_loss": 0.55276668, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57327628, + "num_input_tokens_seen": 110343715, + "step": 5136, + "time_per_iteration": 3.1589629650115967 + }, + { + "auxiliary_loss_clip": 0.01119375, + "auxiliary_loss_mlp": 0.00782135, + "balance_loss_clip": 1.05216694, + "balance_loss_mlp": 1.00132084, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 2.3065394672453707, + "language_loss": 0.75911617, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.77813125, + "num_input_tokens_seen": 110368430, + "step": 5137, + "time_per_iteration": 2.9246597290039062 + }, + { + "auxiliary_loss_clip": 0.01104466, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.05095923, + "balance_loss_mlp": 1.0225563, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 1.770826237134852, + "language_loss": 0.79749167, + "learning_rate": 3.237954673696424e-06, + "loss": 0.8188982, + "num_input_tokens_seen": 110386735, + "step": 5138, + "time_per_iteration": 2.5772576332092285 + }, + { + "auxiliary_loss_clip": 0.01081056, + "auxiliary_loss_mlp": 0.01043138, + "balance_loss_clip": 1.04332209, + "balance_loss_mlp": 1.02630007, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.5742100256544234, + "language_loss": 0.81295967, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83420157, + "num_input_tokens_seen": 110406820, + "step": 5139, + "time_per_iteration": 2.644608736038208 + }, + { + "auxiliary_loss_clip": 0.0112875, + "auxiliary_loss_mlp": 0.01039968, + "balance_loss_clip": 1.05177629, + "balance_loss_mlp": 1.02310562, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 1.9143077747804182, + "language_loss": 0.77024555, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.7919327, + "num_input_tokens_seen": 110424225, + "step": 5140, + "time_per_iteration": 2.4947619438171387 + }, + { + "auxiliary_loss_clip": 0.01099882, + "auxiliary_loss_mlp": 0.01047169, + "balance_loss_clip": 1.0467608, + "balance_loss_mlp": 1.03267288, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 1.8411156323868845, + "language_loss": 0.78311276, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80458319, + "num_input_tokens_seen": 110443310, + "step": 5141, + "time_per_iteration": 2.663501262664795 + }, + { + "auxiliary_loss_clip": 0.01119153, + "auxiliary_loss_mlp": 0.01044351, + "balance_loss_clip": 1.04953039, + "balance_loss_mlp": 1.02875829, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.2882401073206124, + "language_loss": 0.87182426, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89345932, + "num_input_tokens_seen": 110460215, + "step": 5142, + "time_per_iteration": 2.566622495651245 + }, + { + "auxiliary_loss_clip": 0.01126844, + "auxiliary_loss_mlp": 0.01042189, + "balance_loss_clip": 1.05001855, + "balance_loss_mlp": 1.02733529, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 2.0283773350455063, + "language_loss": 0.78823173, + "learning_rate": 3.23642465389567e-06, + "loss": 0.8099221, + "num_input_tokens_seen": 110479385, + "step": 5143, + "time_per_iteration": 2.4807255268096924 + }, + { + "auxiliary_loss_clip": 0.01107152, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.05109298, + "balance_loss_mlp": 1.02284026, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 2.453763399531267, + "language_loss": 0.72192633, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74337804, + "num_input_tokens_seen": 110499885, + "step": 5144, + "time_per_iteration": 2.6134145259857178 + }, + { + "auxiliary_loss_clip": 0.011269, + "auxiliary_loss_mlp": 0.01046011, + "balance_loss_clip": 1.04687572, + "balance_loss_mlp": 1.0301739, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.8234430680043456, + "language_loss": 0.74124628, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76297534, + "num_input_tokens_seen": 110519690, + "step": 5145, + "time_per_iteration": 2.559006690979004 + }, + { + "auxiliary_loss_clip": 0.0111245, + "auxiliary_loss_mlp": 0.0104614, + "balance_loss_clip": 1.04624748, + "balance_loss_mlp": 1.0300945, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 2.484139522350709, + "language_loss": 0.76182795, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78341389, + "num_input_tokens_seen": 110540520, + "step": 5146, + "time_per_iteration": 4.050955295562744 + }, + { + "auxiliary_loss_clip": 0.01111689, + "auxiliary_loss_mlp": 0.01038923, + "balance_loss_clip": 1.04663515, + "balance_loss_mlp": 1.02411103, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 1.9624883036438114, + "language_loss": 0.66953266, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.69103879, + "num_input_tokens_seen": 110557950, + "step": 5147, + "time_per_iteration": 2.4924488067626953 + }, + { + "auxiliary_loss_clip": 0.01132133, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.05335236, + "balance_loss_mlp": 1.02926624, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 1.8438720312170398, + "language_loss": 0.75010717, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77186579, + "num_input_tokens_seen": 110578215, + "step": 5148, + "time_per_iteration": 2.527482509613037 + }, + { + "auxiliary_loss_clip": 0.01133152, + "auxiliary_loss_mlp": 0.01045202, + "balance_loss_clip": 1.05177188, + "balance_loss_mlp": 1.02877474, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.419724025912444, + "language_loss": 0.72492409, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74670768, + "num_input_tokens_seen": 110592990, + "step": 5149, + "time_per_iteration": 2.420407772064209 + }, + { + "auxiliary_loss_clip": 0.01093213, + "auxiliary_loss_mlp": 0.01045185, + "balance_loss_clip": 1.04455614, + "balance_loss_mlp": 1.02903199, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 2.0007162499113926, + "language_loss": 0.84622693, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.86761093, + "num_input_tokens_seen": 110612130, + "step": 5150, + "time_per_iteration": 4.11989951133728 + }, + { + "auxiliary_loss_clip": 0.01086986, + "auxiliary_loss_mlp": 0.01042025, + "balance_loss_clip": 1.04472816, + "balance_loss_mlp": 1.02564597, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.688162041685363, + "language_loss": 0.78541464, + "learning_rate": 3.233974184780424e-06, + "loss": 0.80670476, + "num_input_tokens_seen": 110632045, + "step": 5151, + "time_per_iteration": 2.6048383712768555 + }, + { + "auxiliary_loss_clip": 0.01129339, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.050789, + "balance_loss_mlp": 1.02139711, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 2.588604475544768, + "language_loss": 0.66817331, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.68984497, + "num_input_tokens_seen": 110649340, + "step": 5152, + "time_per_iteration": 2.4779043197631836 + }, + { + "auxiliary_loss_clip": 0.01078451, + "auxiliary_loss_mlp": 0.01039533, + "balance_loss_clip": 1.04517543, + "balance_loss_mlp": 1.02434015, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 2.0833644409069287, + "language_loss": 0.82941657, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.85059643, + "num_input_tokens_seen": 110668450, + "step": 5153, + "time_per_iteration": 2.6633095741271973 + }, + { + "auxiliary_loss_clip": 0.01113227, + "auxiliary_loss_mlp": 0.00781916, + "balance_loss_clip": 1.04937875, + "balance_loss_mlp": 1.00120103, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 2.101875793387893, + "language_loss": 0.74079341, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.75974488, + "num_input_tokens_seen": 110689410, + "step": 5154, + "time_per_iteration": 2.5522618293762207 + }, + { + "auxiliary_loss_clip": 0.01126313, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.04769337, + "balance_loss_mlp": 1.01981866, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 2.482212208109933, + "language_loss": 0.76381683, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78543216, + "num_input_tokens_seen": 110707350, + "step": 5155, + "time_per_iteration": 3.825594186782837 + }, + { + "auxiliary_loss_clip": 0.01121606, + "auxiliary_loss_mlp": 0.01040221, + "balance_loss_clip": 1.05014658, + "balance_loss_mlp": 1.02445006, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 1.9723968940181043, + "language_loss": 0.79064322, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81226146, + "num_input_tokens_seen": 110724910, + "step": 5156, + "time_per_iteration": 2.4973485469818115 + }, + { + "auxiliary_loss_clip": 0.01125201, + "auxiliary_loss_mlp": 0.01038754, + "balance_loss_clip": 1.05083084, + "balance_loss_mlp": 1.02238607, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 2.782149719060854, + "language_loss": 0.75111294, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77275252, + "num_input_tokens_seen": 110744010, + "step": 5157, + "time_per_iteration": 2.484600782394409 + }, + { + "auxiliary_loss_clip": 0.01101509, + "auxiliary_loss_mlp": 0.01039258, + "balance_loss_clip": 1.04476762, + "balance_loss_mlp": 1.0251199, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.5754109449481783, + "language_loss": 0.69385666, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71526432, + "num_input_tokens_seen": 110765835, + "step": 5158, + "time_per_iteration": 2.626389980316162 + }, + { + "auxiliary_loss_clip": 0.01092445, + "auxiliary_loss_mlp": 0.01034144, + "balance_loss_clip": 1.04795706, + "balance_loss_mlp": 1.02074432, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 3.0960477766755687, + "language_loss": 0.84770107, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.868967, + "num_input_tokens_seen": 110784655, + "step": 5159, + "time_per_iteration": 2.595781087875366 + }, + { + "auxiliary_loss_clip": 0.01115797, + "auxiliary_loss_mlp": 0.01038133, + "balance_loss_clip": 1.05052257, + "balance_loss_mlp": 1.02277923, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.789090585311885, + "language_loss": 0.85115296, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87269229, + "num_input_tokens_seen": 110802545, + "step": 5160, + "time_per_iteration": 2.504664182662964 + }, + { + "auxiliary_loss_clip": 0.01125857, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.04931235, + "balance_loss_mlp": 1.01905036, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 2.160522655730799, + "language_loss": 0.7555033, + "learning_rate": 3.230906887766584e-06, + "loss": 0.77710402, + "num_input_tokens_seen": 110820265, + "step": 5161, + "time_per_iteration": 2.4976279735565186 + }, + { + "auxiliary_loss_clip": 0.01124474, + "auxiliary_loss_mlp": 0.01038374, + "balance_loss_clip": 1.04632616, + "balance_loss_mlp": 1.02286458, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 2.576313905315535, + "language_loss": 0.81803036, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83965886, + "num_input_tokens_seen": 110836195, + "step": 5162, + "time_per_iteration": 2.456782341003418 + }, + { + "auxiliary_loss_clip": 0.01123891, + "auxiliary_loss_mlp": 0.01033744, + "balance_loss_clip": 1.04901814, + "balance_loss_mlp": 1.01995766, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.628608893089152, + "language_loss": 0.82845479, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85003114, + "num_input_tokens_seen": 110856420, + "step": 5163, + "time_per_iteration": 2.50003981590271 + }, + { + "auxiliary_loss_clip": 0.01142038, + "auxiliary_loss_mlp": 0.0103928, + "balance_loss_clip": 1.05260885, + "balance_loss_mlp": 1.02411079, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 1.8554032410720878, + "language_loss": 0.76335883, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78517199, + "num_input_tokens_seen": 110876650, + "step": 5164, + "time_per_iteration": 2.465836524963379 + }, + { + "auxiliary_loss_clip": 0.01097561, + "auxiliary_loss_mlp": 0.01037521, + "balance_loss_clip": 1.04754126, + "balance_loss_mlp": 1.02200007, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.9299268495662243, + "language_loss": 0.74786913, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76921999, + "num_input_tokens_seen": 110894445, + "step": 5165, + "time_per_iteration": 2.5959877967834473 + }, + { + "auxiliary_loss_clip": 0.01095483, + "auxiliary_loss_mlp": 0.01042691, + "balance_loss_clip": 1.04560757, + "balance_loss_mlp": 1.02541757, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 1.768697592525312, + "language_loss": 0.75976205, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78114378, + "num_input_tokens_seen": 110912855, + "step": 5166, + "time_per_iteration": 2.51818585395813 + }, + { + "auxiliary_loss_clip": 0.01115678, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.04819989, + "balance_loss_mlp": 1.02133214, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.3667649841250724, + "language_loss": 0.73287851, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75440371, + "num_input_tokens_seen": 110928025, + "step": 5167, + "time_per_iteration": 3.771998167037964 + }, + { + "auxiliary_loss_clip": 0.01031405, + "auxiliary_loss_mlp": 0.01017379, + "balance_loss_clip": 1.04856133, + "balance_loss_mlp": 1.01418424, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.732217813777549, + "language_loss": 0.52945924, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.54994714, + "num_input_tokens_seen": 110992215, + "step": 5168, + "time_per_iteration": 3.1841177940368652 + }, + { + "auxiliary_loss_clip": 0.01130452, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.05261219, + "balance_loss_mlp": 1.02494836, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 1.842022612319205, + "language_loss": 0.79164398, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.81336486, + "num_input_tokens_seen": 111010400, + "step": 5169, + "time_per_iteration": 2.455259084701538 + }, + { + "auxiliary_loss_clip": 0.0111592, + "auxiliary_loss_mlp": 0.01035603, + "balance_loss_clip": 1.04642844, + "balance_loss_mlp": 1.02047563, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.9591343263648038, + "language_loss": 0.6402728, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66178799, + "num_input_tokens_seen": 111033960, + "step": 5170, + "time_per_iteration": 2.626868963241577 + }, + { + "auxiliary_loss_clip": 0.01103396, + "auxiliary_loss_mlp": 0.00782607, + "balance_loss_clip": 1.05138588, + "balance_loss_mlp": 1.00115371, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.682369576930934, + "language_loss": 0.77950132, + "learning_rate": 3.22783492314295e-06, + "loss": 0.7983613, + "num_input_tokens_seen": 111053265, + "step": 5171, + "time_per_iteration": 2.6215431690216064 + }, + { + "auxiliary_loss_clip": 0.01095552, + "auxiliary_loss_mlp": 0.01043675, + "balance_loss_clip": 1.04885554, + "balance_loss_mlp": 1.02828526, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.846622707437311, + "language_loss": 0.83824837, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.8596406, + "num_input_tokens_seen": 111071130, + "step": 5172, + "time_per_iteration": 2.5753631591796875 + }, + { + "auxiliary_loss_clip": 0.01090676, + "auxiliary_loss_mlp": 0.01043729, + "balance_loss_clip": 1.04933715, + "balance_loss_mlp": 1.02802932, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 2.334861825094694, + "language_loss": 0.84311533, + "learning_rate": 3.227219971129842e-06, + "loss": 0.8644594, + "num_input_tokens_seen": 111089560, + "step": 5173, + "time_per_iteration": 2.552278518676758 + }, + { + "auxiliary_loss_clip": 0.01138939, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.05425847, + "balance_loss_mlp": 1.01842499, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.7835202298681365, + "language_loss": 0.83670306, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85842252, + "num_input_tokens_seen": 111109960, + "step": 5174, + "time_per_iteration": 2.4967808723449707 + }, + { + "auxiliary_loss_clip": 0.01117755, + "auxiliary_loss_mlp": 0.01040089, + "balance_loss_clip": 1.05115902, + "balance_loss_mlp": 1.02485394, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.092756647676537, + "language_loss": 0.85419917, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.87577754, + "num_input_tokens_seen": 111127960, + "step": 5175, + "time_per_iteration": 2.504456043243408 + }, + { + "auxiliary_loss_clip": 0.01089278, + "auxiliary_loss_mlp": 0.01038188, + "balance_loss_clip": 1.04955673, + "balance_loss_mlp": 1.02166557, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.675302205420281, + "language_loss": 0.83324796, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85452259, + "num_input_tokens_seen": 111146730, + "step": 5176, + "time_per_iteration": 2.718400239944458 + }, + { + "auxiliary_loss_clip": 0.01124678, + "auxiliary_loss_mlp": 0.0104051, + "balance_loss_clip": 1.0476377, + "balance_loss_mlp": 1.02398765, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 2.1569944451417737, + "language_loss": 0.80910617, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.83075804, + "num_input_tokens_seen": 111166295, + "step": 5177, + "time_per_iteration": 2.506613254547119 + }, + { + "auxiliary_loss_clip": 0.01128726, + "auxiliary_loss_mlp": 0.00782328, + "balance_loss_clip": 1.0520618, + "balance_loss_mlp": 1.0010891, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 2.0762474007336404, + "language_loss": 0.80961424, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82872474, + "num_input_tokens_seen": 111185665, + "step": 5178, + "time_per_iteration": 2.541377067565918 + }, + { + "auxiliary_loss_clip": 0.01112631, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.05060089, + "balance_loss_mlp": 1.02326119, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 2.567045124483449, + "language_loss": 0.80922449, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83073306, + "num_input_tokens_seen": 111201615, + "step": 5179, + "time_per_iteration": 2.4997074604034424 + }, + { + "auxiliary_loss_clip": 0.01109527, + "auxiliary_loss_mlp": 0.01044148, + "balance_loss_clip": 1.05334222, + "balance_loss_mlp": 1.02854323, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 2.006331487296696, + "language_loss": 0.77949452, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.80103129, + "num_input_tokens_seen": 111220515, + "step": 5180, + "time_per_iteration": 2.529757022857666 + }, + { + "auxiliary_loss_clip": 0.01104027, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.05229104, + "balance_loss_mlp": 1.01761055, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.812828632693618, + "language_loss": 0.83300453, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85437369, + "num_input_tokens_seen": 111240395, + "step": 5181, + "time_per_iteration": 2.6177375316619873 + }, + { + "auxiliary_loss_clip": 0.01106795, + "auxiliary_loss_mlp": 0.01037755, + "balance_loss_clip": 1.04926801, + "balance_loss_mlp": 1.02390885, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.6926856544478817, + "language_loss": 0.7408421, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76228762, + "num_input_tokens_seen": 111261100, + "step": 5182, + "time_per_iteration": 2.6368768215179443 + }, + { + "auxiliary_loss_clip": 0.01092102, + "auxiliary_loss_mlp": 0.00783013, + "balance_loss_clip": 1.04831266, + "balance_loss_mlp": 1.00119698, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 2.213648337210038, + "language_loss": 0.70399749, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72274864, + "num_input_tokens_seen": 111281320, + "step": 5183, + "time_per_iteration": 2.6421685218811035 + }, + { + "auxiliary_loss_clip": 0.01058573, + "auxiliary_loss_mlp": 0.01003007, + "balance_loss_clip": 1.05990922, + "balance_loss_mlp": 1.00067008, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.9493089004733539, + "language_loss": 0.59623802, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61685383, + "num_input_tokens_seen": 111341405, + "step": 5184, + "time_per_iteration": 3.149334669113159 + }, + { + "auxiliary_loss_clip": 0.01111114, + "auxiliary_loss_mlp": 0.01041561, + "balance_loss_clip": 1.0454154, + "balance_loss_mlp": 1.02666545, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.3541382891880995, + "language_loss": 0.69745481, + "learning_rate": 3.223526353268311e-06, + "loss": 0.71898162, + "num_input_tokens_seen": 111358975, + "step": 5185, + "time_per_iteration": 2.5310938358306885 + }, + { + "auxiliary_loss_clip": 0.0111928, + "auxiliary_loss_mlp": 0.01045762, + "balance_loss_clip": 1.05443048, + "balance_loss_mlp": 1.03009796, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.2712919694528466, + "language_loss": 0.63600314, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.65765357, + "num_input_tokens_seen": 111375845, + "step": 5186, + "time_per_iteration": 3.9727957248687744 + }, + { + "auxiliary_loss_clip": 0.01124692, + "auxiliary_loss_mlp": 0.01046801, + "balance_loss_clip": 1.05276299, + "balance_loss_mlp": 1.03026664, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.106063367769215, + "language_loss": 0.86003959, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88175458, + "num_input_tokens_seen": 111394150, + "step": 5187, + "time_per_iteration": 2.591404676437378 + }, + { + "auxiliary_loss_clip": 0.01142362, + "auxiliary_loss_mlp": 0.00782026, + "balance_loss_clip": 1.05357075, + "balance_loss_mlp": 1.00136328, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.5885814019806106, + "language_loss": 0.63292646, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.65217036, + "num_input_tokens_seen": 111418355, + "step": 5188, + "time_per_iteration": 2.5834174156188965 + }, + { + "auxiliary_loss_clip": 0.01085289, + "auxiliary_loss_mlp": 0.01043657, + "balance_loss_clip": 1.04990172, + "balance_loss_mlp": 1.02762318, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.053086439997663, + "language_loss": 0.83641499, + "learning_rate": 3.222293661638346e-06, + "loss": 0.8577044, + "num_input_tokens_seen": 111435445, + "step": 5189, + "time_per_iteration": 4.1596457958221436 + }, + { + "auxiliary_loss_clip": 0.01056769, + "auxiliary_loss_mlp": 0.01036247, + "balance_loss_clip": 1.04529166, + "balance_loss_mlp": 1.02091718, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.7216267445386655, + "language_loss": 0.78722036, + "learning_rate": 3.22198537282789e-06, + "loss": 0.80815053, + "num_input_tokens_seen": 111453430, + "step": 5190, + "time_per_iteration": 2.750845193862915 + }, + { + "auxiliary_loss_clip": 0.01082456, + "auxiliary_loss_mlp": 0.01062699, + "balance_loss_clip": 1.03973961, + "balance_loss_mlp": 1.04487753, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.6249184550268774, + "language_loss": 0.75201374, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77346534, + "num_input_tokens_seen": 111475325, + "step": 5191, + "time_per_iteration": 2.836386203765869 + }, + { + "auxiliary_loss_clip": 0.01072314, + "auxiliary_loss_mlp": 0.00756214, + "balance_loss_clip": 1.05020308, + "balance_loss_mlp": 1.00107694, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8516624889867118, + "language_loss": 0.63908172, + "learning_rate": 3.221368656205247e-06, + "loss": 0.65736699, + "num_input_tokens_seen": 111533960, + "step": 5192, + "time_per_iteration": 3.163095712661743 + }, + { + "auxiliary_loss_clip": 0.01129519, + "auxiliary_loss_mlp": 0.01048784, + "balance_loss_clip": 1.05069816, + "balance_loss_mlp": 1.03233337, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 1.8955512758204671, + "language_loss": 0.80155838, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82334143, + "num_input_tokens_seen": 111554055, + "step": 5193, + "time_per_iteration": 2.491612672805786 + }, + { + "auxiliary_loss_clip": 0.01114067, + "auxiliary_loss_mlp": 0.01059441, + "balance_loss_clip": 1.04782021, + "balance_loss_mlp": 1.04086852, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 1.8186773010193924, + "language_loss": 0.72110116, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74283624, + "num_input_tokens_seen": 111574305, + "step": 5194, + "time_per_iteration": 3.9375462532043457 + }, + { + "auxiliary_loss_clip": 0.01140766, + "auxiliary_loss_mlp": 0.01041019, + "balance_loss_clip": 1.05372739, + "balance_loss_mlp": 1.02644002, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.5071814510085906, + "language_loss": 0.76778316, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78960103, + "num_input_tokens_seen": 111595680, + "step": 5195, + "time_per_iteration": 2.518568754196167 + }, + { + "auxiliary_loss_clip": 0.01140487, + "auxiliary_loss_mlp": 0.01046946, + "balance_loss_clip": 1.05083752, + "balance_loss_mlp": 1.03173542, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.397016832758078, + "language_loss": 0.77752, + "learning_rate": 3.220134667280476e-06, + "loss": 0.79939437, + "num_input_tokens_seen": 111618135, + "step": 5196, + "time_per_iteration": 2.5255489349365234 + }, + { + "auxiliary_loss_clip": 0.01047095, + "auxiliary_loss_mlp": 0.00757229, + "balance_loss_clip": 1.03595304, + "balance_loss_mlp": 1.0010258, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7725696125749915, + "language_loss": 0.54797041, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56601357, + "num_input_tokens_seen": 111682220, + "step": 5197, + "time_per_iteration": 3.123342990875244 + }, + { + "auxiliary_loss_clip": 0.01140624, + "auxiliary_loss_mlp": 0.01039869, + "balance_loss_clip": 1.05443501, + "balance_loss_mlp": 1.02501535, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.9175377647145528, + "language_loss": 0.66428083, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68608576, + "num_input_tokens_seen": 111700815, + "step": 5198, + "time_per_iteration": 2.4849894046783447 + }, + { + "auxiliary_loss_clip": 0.01098885, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_clip": 1.04352951, + "balance_loss_mlp": 1.01838851, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.1677330398973784, + "language_loss": 0.69772017, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71906263, + "num_input_tokens_seen": 111718195, + "step": 5199, + "time_per_iteration": 2.569240093231201 + }, + { + "auxiliary_loss_clip": 0.01129236, + "auxiliary_loss_mlp": 0.01049872, + "balance_loss_clip": 1.05041647, + "balance_loss_mlp": 1.03427315, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.7945075255643033, + "language_loss": 0.78712511, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.80891621, + "num_input_tokens_seen": 111734440, + "step": 5200, + "time_per_iteration": 2.4714949131011963 + }, + { + "auxiliary_loss_clip": 0.01126931, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.05110753, + "balance_loss_mlp": 1.01886344, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 2.14314414480676, + "language_loss": 0.83191812, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.85352957, + "num_input_tokens_seen": 111751960, + "step": 5201, + "time_per_iteration": 2.4868972301483154 + }, + { + "auxiliary_loss_clip": 0.01141208, + "auxiliary_loss_mlp": 0.01046661, + "balance_loss_clip": 1.0514555, + "balance_loss_mlp": 1.02986479, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 2.5753073724420883, + "language_loss": 0.69368237, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71556103, + "num_input_tokens_seen": 111769585, + "step": 5202, + "time_per_iteration": 2.460057020187378 + }, + { + "auxiliary_loss_clip": 0.01139886, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.05090082, + "balance_loss_mlp": 1.02318215, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.7153959036728261, + "language_loss": 0.8388676, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86063683, + "num_input_tokens_seen": 111787880, + "step": 5203, + "time_per_iteration": 2.432884454727173 + }, + { + "auxiliary_loss_clip": 0.01086375, + "auxiliary_loss_mlp": 0.01037307, + "balance_loss_clip": 1.04771876, + "balance_loss_mlp": 1.02104735, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.4118685908345983, + "language_loss": 0.60782844, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62906528, + "num_input_tokens_seen": 111805950, + "step": 5204, + "time_per_iteration": 2.6214888095855713 + }, + { + "auxiliary_loss_clip": 0.01103853, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_clip": 1.0457294, + "balance_loss_mlp": 1.02403998, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.7100116401660939, + "language_loss": 0.66176552, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68319058, + "num_input_tokens_seen": 111826135, + "step": 5205, + "time_per_iteration": 2.5098321437835693 + }, + { + "auxiliary_loss_clip": 0.01130453, + "auxiliary_loss_mlp": 0.01046815, + "balance_loss_clip": 1.0511874, + "balance_loss_mlp": 1.03001857, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.5199178983294297, + "language_loss": 0.7657035, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.78747618, + "num_input_tokens_seen": 111844700, + "step": 5206, + "time_per_iteration": 2.510495662689209 + }, + { + "auxiliary_loss_clip": 0.01135973, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.04880953, + "balance_loss_mlp": 1.01985621, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 2.491584880780973, + "language_loss": 0.82974517, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85145473, + "num_input_tokens_seen": 111861585, + "step": 5207, + "time_per_iteration": 3.93369722366333 + }, + { + "auxiliary_loss_clip": 0.01120346, + "auxiliary_loss_mlp": 0.01039699, + "balance_loss_clip": 1.04747868, + "balance_loss_mlp": 1.02575195, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.6039576994771263, + "language_loss": 0.71476293, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73636335, + "num_input_tokens_seen": 111882950, + "step": 5208, + "time_per_iteration": 2.490936756134033 + }, + { + "auxiliary_loss_clip": 0.01118473, + "auxiliary_loss_mlp": 0.01040178, + "balance_loss_clip": 1.05038571, + "balance_loss_mlp": 1.02454972, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.8741178873878968, + "language_loss": 0.74641889, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76800537, + "num_input_tokens_seen": 111901640, + "step": 5209, + "time_per_iteration": 2.5451736450195312 + }, + { + "auxiliary_loss_clip": 0.01135715, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.04741895, + "balance_loss_mlp": 1.02400744, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.968932311514938, + "language_loss": 0.77358615, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79533148, + "num_input_tokens_seen": 111919615, + "step": 5210, + "time_per_iteration": 2.457426071166992 + }, + { + "auxiliary_loss_clip": 0.01123053, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.04778457, + "balance_loss_mlp": 1.02229047, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 2.438368142165657, + "language_loss": 0.798513, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.82010937, + "num_input_tokens_seen": 111938485, + "step": 5211, + "time_per_iteration": 2.4908857345581055 + }, + { + "auxiliary_loss_clip": 0.01126876, + "auxiliary_loss_mlp": 0.01036478, + "balance_loss_clip": 1.04957604, + "balance_loss_mlp": 1.02189267, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.8261062804707953, + "language_loss": 0.7976895, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.81932294, + "num_input_tokens_seen": 111956425, + "step": 5212, + "time_per_iteration": 2.479125738143921 + }, + { + "auxiliary_loss_clip": 0.01122564, + "auxiliary_loss_mlp": 0.01044682, + "balance_loss_clip": 1.0504272, + "balance_loss_mlp": 1.02893412, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.088647875224006, + "language_loss": 0.70849049, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73016298, + "num_input_tokens_seen": 111975915, + "step": 5213, + "time_per_iteration": 2.5473852157592773 + }, + { + "auxiliary_loss_clip": 0.01129015, + "auxiliary_loss_mlp": 0.0103821, + "balance_loss_clip": 1.05188131, + "balance_loss_mlp": 1.02331507, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 2.3965166702537113, + "language_loss": 0.76778394, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.78945619, + "num_input_tokens_seen": 111995055, + "step": 5214, + "time_per_iteration": 2.4856252670288086 + }, + { + "auxiliary_loss_clip": 0.01098999, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.05444169, + "balance_loss_mlp": 1.01823163, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.854502457553366, + "language_loss": 0.82593393, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.8472507, + "num_input_tokens_seen": 112015830, + "step": 5215, + "time_per_iteration": 2.6369128227233887 + }, + { + "auxiliary_loss_clip": 0.01131791, + "auxiliary_loss_mlp": 0.01036808, + "balance_loss_clip": 1.05410039, + "balance_loss_mlp": 1.02050018, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.0586126414126804, + "language_loss": 0.79520077, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81688678, + "num_input_tokens_seen": 112035065, + "step": 5216, + "time_per_iteration": 2.491506338119507 + }, + { + "auxiliary_loss_clip": 0.01118371, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.04910016, + "balance_loss_mlp": 1.0272305, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 1.7776744925658972, + "language_loss": 0.6825161, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70414245, + "num_input_tokens_seen": 112058405, + "step": 5217, + "time_per_iteration": 2.5712010860443115 + }, + { + "auxiliary_loss_clip": 0.01118743, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.04805231, + "balance_loss_mlp": 1.01932263, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 2.234996213851214, + "language_loss": 0.80430073, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.82582837, + "num_input_tokens_seen": 112076420, + "step": 5218, + "time_per_iteration": 2.5080108642578125 + }, + { + "auxiliary_loss_clip": 0.01136048, + "auxiliary_loss_mlp": 0.01040895, + "balance_loss_clip": 1.04834473, + "balance_loss_mlp": 1.0253855, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.6719294391462722, + "language_loss": 0.69259137, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.71436077, + "num_input_tokens_seen": 112090775, + "step": 5219, + "time_per_iteration": 2.4446041584014893 + }, + { + "auxiliary_loss_clip": 0.01113587, + "auxiliary_loss_mlp": 0.01041753, + "balance_loss_clip": 1.04724324, + "balance_loss_mlp": 1.02815139, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 2.4677666867004744, + "language_loss": 0.80625975, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.82781315, + "num_input_tokens_seen": 112110980, + "step": 5220, + "time_per_iteration": 2.525376319885254 + }, + { + "auxiliary_loss_clip": 0.01129518, + "auxiliary_loss_mlp": 0.0104092, + "balance_loss_clip": 1.05212688, + "balance_loss_mlp": 1.02639449, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 2.2969763491066084, + "language_loss": 0.73587006, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75757444, + "num_input_tokens_seen": 112129020, + "step": 5221, + "time_per_iteration": 2.4607207775115967 + }, + { + "auxiliary_loss_clip": 0.01103607, + "auxiliary_loss_mlp": 0.01041782, + "balance_loss_clip": 1.04439974, + "balance_loss_mlp": 1.02798331, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.8873096010916348, + "language_loss": 0.81765693, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.83911079, + "num_input_tokens_seen": 112147865, + "step": 5222, + "time_per_iteration": 2.4777238368988037 + }, + { + "auxiliary_loss_clip": 0.01130575, + "auxiliary_loss_mlp": 0.01045236, + "balance_loss_clip": 1.05046844, + "balance_loss_mlp": 1.02894008, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 2.182248085852534, + "language_loss": 0.70017093, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.72192907, + "num_input_tokens_seen": 112166745, + "step": 5223, + "time_per_iteration": 2.4742045402526855 + }, + { + "auxiliary_loss_clip": 0.01119579, + "auxiliary_loss_mlp": 0.0077962, + "balance_loss_clip": 1.0455457, + "balance_loss_mlp": 1.00089729, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.80997831394962, + "language_loss": 0.80459893, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82359093, + "num_input_tokens_seen": 112185895, + "step": 5224, + "time_per_iteration": 2.491170644760132 + }, + { + "auxiliary_loss_clip": 0.01135006, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_clip": 1.05349839, + "balance_loss_mlp": 1.02793026, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 2.0947435369499514, + "language_loss": 0.58003628, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.60182261, + "num_input_tokens_seen": 112204465, + "step": 5225, + "time_per_iteration": 3.9693922996520996 + }, + { + "auxiliary_loss_clip": 0.01089011, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.04651952, + "balance_loss_mlp": 1.01844096, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 4.770406635087709, + "language_loss": 0.81437987, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.8355909, + "num_input_tokens_seen": 112221635, + "step": 5226, + "time_per_iteration": 2.5623185634613037 + }, + { + "auxiliary_loss_clip": 0.01123869, + "auxiliary_loss_mlp": 0.01059275, + "balance_loss_clip": 1.04913986, + "balance_loss_mlp": 1.04239511, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 2.0399408602140547, + "language_loss": 0.74003577, + "learning_rate": 3.210546210126141e-06, + "loss": 0.76186728, + "num_input_tokens_seen": 112241240, + "step": 5227, + "time_per_iteration": 2.476296901702881 + }, + { + "auxiliary_loss_clip": 0.01127554, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.05576169, + "balance_loss_mlp": 1.02374077, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 1.7898344751699178, + "language_loss": 0.67656451, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69823122, + "num_input_tokens_seen": 112262350, + "step": 5228, + "time_per_iteration": 2.555593252182007 + }, + { + "auxiliary_loss_clip": 0.01120049, + "auxiliary_loss_mlp": 0.01042355, + "balance_loss_clip": 1.05355692, + "balance_loss_mlp": 1.02775836, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 4.97389299116263, + "language_loss": 0.79579401, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.81741798, + "num_input_tokens_seen": 112283710, + "step": 5229, + "time_per_iteration": 4.10670018196106 + }, + { + "auxiliary_loss_clip": 0.01116857, + "auxiliary_loss_mlp": 0.01039474, + "balance_loss_clip": 1.05268872, + "balance_loss_mlp": 1.02406001, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.8802362248581246, + "language_loss": 0.69701904, + "learning_rate": 3.209615948222611e-06, + "loss": 0.71858233, + "num_input_tokens_seen": 112304285, + "step": 5230, + "time_per_iteration": 2.5474958419799805 + }, + { + "auxiliary_loss_clip": 0.01098336, + "auxiliary_loss_mlp": 0.01051671, + "balance_loss_clip": 1.04786515, + "balance_loss_mlp": 1.0338614, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.6034393120755899, + "language_loss": 0.7984916, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81999171, + "num_input_tokens_seen": 112325110, + "step": 5231, + "time_per_iteration": 2.6087214946746826 + }, + { + "auxiliary_loss_clip": 0.01111994, + "auxiliary_loss_mlp": 0.01045187, + "balance_loss_clip": 1.05378985, + "balance_loss_mlp": 1.02842617, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.3162248610192173, + "language_loss": 0.84417546, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.86574727, + "num_input_tokens_seen": 112339855, + "step": 5232, + "time_per_iteration": 2.47249174118042 + }, + { + "auxiliary_loss_clip": 0.01091908, + "auxiliary_loss_mlp": 0.01051363, + "balance_loss_clip": 1.0502491, + "balance_loss_mlp": 1.03462636, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 2.432715650288494, + "language_loss": 0.79868788, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82012057, + "num_input_tokens_seen": 112358480, + "step": 5233, + "time_per_iteration": 2.564643144607544 + }, + { + "auxiliary_loss_clip": 0.01100618, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.05252242, + "balance_loss_mlp": 1.02260685, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 2.0053758767767227, + "language_loss": 0.70982111, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.73119867, + "num_input_tokens_seen": 112382350, + "step": 5234, + "time_per_iteration": 4.203598737716675 + }, + { + "auxiliary_loss_clip": 0.01109427, + "auxiliary_loss_mlp": 0.01034753, + "balance_loss_clip": 1.0567373, + "balance_loss_mlp": 1.01942241, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 1.8682498516686021, + "language_loss": 0.72300887, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74445069, + "num_input_tokens_seen": 112400260, + "step": 5235, + "time_per_iteration": 2.6011931896209717 + }, + { + "auxiliary_loss_clip": 0.01131163, + "auxiliary_loss_mlp": 0.01037732, + "balance_loss_clip": 1.0527463, + "balance_loss_mlp": 1.0224967, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 1.886300528060783, + "language_loss": 0.78927588, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.81096482, + "num_input_tokens_seen": 112419400, + "step": 5236, + "time_per_iteration": 2.499642848968506 + }, + { + "auxiliary_loss_clip": 0.0114631, + "auxiliary_loss_mlp": 0.01043093, + "balance_loss_clip": 1.0551194, + "balance_loss_mlp": 1.02717829, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.8121852318633422, + "language_loss": 0.75783575, + "learning_rate": 3.207443732256881e-06, + "loss": 0.77972972, + "num_input_tokens_seen": 112440825, + "step": 5237, + "time_per_iteration": 2.5472609996795654 + }, + { + "auxiliary_loss_clip": 0.01139135, + "auxiliary_loss_mlp": 0.0103758, + "balance_loss_clip": 1.05438292, + "balance_loss_mlp": 1.02369189, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 1.675001358045065, + "language_loss": 0.7969752, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.81874239, + "num_input_tokens_seen": 112459180, + "step": 5238, + "time_per_iteration": 2.457195997238159 + }, + { + "auxiliary_loss_clip": 0.01077173, + "auxiliary_loss_mlp": 0.01004089, + "balance_loss_clip": 1.05452979, + "balance_loss_mlp": 1.00208616, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8288476006676414, + "language_loss": 0.67883801, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69965065, + "num_input_tokens_seen": 112516680, + "step": 5239, + "time_per_iteration": 3.0473079681396484 + }, + { + "auxiliary_loss_clip": 0.01126222, + "auxiliary_loss_mlp": 0.01045094, + "balance_loss_clip": 1.05546224, + "balance_loss_mlp": 1.02805269, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.2698871718892026, + "language_loss": 0.82739884, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.84911203, + "num_input_tokens_seen": 112535895, + "step": 5240, + "time_per_iteration": 2.5270168781280518 + }, + { + "auxiliary_loss_clip": 0.0111785, + "auxiliary_loss_mlp": 0.00780588, + "balance_loss_clip": 1.05244029, + "balance_loss_mlp": 1.00077105, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 1.8681294554168846, + "language_loss": 0.81018007, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.82916439, + "num_input_tokens_seen": 112557490, + "step": 5241, + "time_per_iteration": 2.5866286754608154 + }, + { + "auxiliary_loss_clip": 0.0114327, + "auxiliary_loss_mlp": 0.01037715, + "balance_loss_clip": 1.05728698, + "balance_loss_mlp": 1.02252209, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.5730258559615644, + "language_loss": 0.74266911, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76447898, + "num_input_tokens_seen": 112577075, + "step": 5242, + "time_per_iteration": 2.542231798171997 + }, + { + "auxiliary_loss_clip": 0.01112567, + "auxiliary_loss_mlp": 0.01037052, + "balance_loss_clip": 1.05220342, + "balance_loss_mlp": 1.02044654, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.7776855237248352, + "language_loss": 0.73664129, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.75813746, + "num_input_tokens_seen": 112597620, + "step": 5243, + "time_per_iteration": 2.7269649505615234 + }, + { + "auxiliary_loss_clip": 0.01128281, + "auxiliary_loss_mlp": 0.01036715, + "balance_loss_clip": 1.0511775, + "balance_loss_mlp": 1.02130151, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 1.79589879268775, + "language_loss": 0.64364439, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66529435, + "num_input_tokens_seen": 112617150, + "step": 5244, + "time_per_iteration": 2.488039255142212 + }, + { + "auxiliary_loss_clip": 0.01098592, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.05632365, + "balance_loss_mlp": 1.02279663, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.3742755477230806, + "language_loss": 0.91480917, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.9361763, + "num_input_tokens_seen": 112631090, + "step": 5245, + "time_per_iteration": 2.547523021697998 + }, + { + "auxiliary_loss_clip": 0.01132527, + "auxiliary_loss_mlp": 0.01044668, + "balance_loss_clip": 1.05373764, + "balance_loss_mlp": 1.02896833, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 5.820451391113406, + "language_loss": 0.7515651, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77333707, + "num_input_tokens_seen": 112651220, + "step": 5246, + "time_per_iteration": 4.060669422149658 + }, + { + "auxiliary_loss_clip": 0.01142205, + "auxiliary_loss_mlp": 0.01045198, + "balance_loss_clip": 1.05280232, + "balance_loss_mlp": 1.02964115, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 2.5514214640819497, + "language_loss": 0.61954713, + "learning_rate": 3.204336675750321e-06, + "loss": 0.64142114, + "num_input_tokens_seen": 112671560, + "step": 5247, + "time_per_iteration": 2.559779167175293 + }, + { + "auxiliary_loss_clip": 0.01133828, + "auxiliary_loss_mlp": 0.01044447, + "balance_loss_clip": 1.05371976, + "balance_loss_mlp": 1.02852058, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.7144854182140477, + "language_loss": 0.82360661, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84538937, + "num_input_tokens_seen": 112689790, + "step": 5248, + "time_per_iteration": 2.4572055339813232 + }, + { + "auxiliary_loss_clip": 0.01126056, + "auxiliary_loss_mlp": 0.0105374, + "balance_loss_clip": 1.05624914, + "balance_loss_mlp": 1.0364784, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.8154356722262894, + "language_loss": 0.8466965, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.86849445, + "num_input_tokens_seen": 112708265, + "step": 5249, + "time_per_iteration": 2.568847417831421 + }, + { + "auxiliary_loss_clip": 0.01107757, + "auxiliary_loss_mlp": 0.01041237, + "balance_loss_clip": 1.05147612, + "balance_loss_mlp": 1.02428567, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 2.195067765010073, + "language_loss": 0.85315847, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87464845, + "num_input_tokens_seen": 112727820, + "step": 5250, + "time_per_iteration": 2.659715175628662 + }, + { + "auxiliary_loss_clip": 0.01114916, + "auxiliary_loss_mlp": 0.01045464, + "balance_loss_clip": 1.05235004, + "balance_loss_mlp": 1.02914381, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 2.5862825399854588, + "language_loss": 0.68911678, + "learning_rate": 3.203092573767835e-06, + "loss": 0.7107206, + "num_input_tokens_seen": 112743140, + "step": 5251, + "time_per_iteration": 2.4930593967437744 + }, + { + "auxiliary_loss_clip": 0.011435, + "auxiliary_loss_mlp": 0.01040875, + "balance_loss_clip": 1.05508494, + "balance_loss_mlp": 1.02500784, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.7334616136730836, + "language_loss": 0.7864325, + "learning_rate": 3.202781434189246e-06, + "loss": 0.80827618, + "num_input_tokens_seen": 112764705, + "step": 5252, + "time_per_iteration": 2.490216016769409 + }, + { + "auxiliary_loss_clip": 0.01126171, + "auxiliary_loss_mlp": 0.01055761, + "balance_loss_clip": 1.05255318, + "balance_loss_mlp": 1.0389998, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 3.451544918427502, + "language_loss": 0.74489146, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76671082, + "num_input_tokens_seen": 112785310, + "step": 5253, + "time_per_iteration": 2.503704071044922 + }, + { + "auxiliary_loss_clip": 0.01121121, + "auxiliary_loss_mlp": 0.0104337, + "balance_loss_clip": 1.05165768, + "balance_loss_mlp": 1.02750373, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.705059649930483, + "language_loss": 0.7320044, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75364935, + "num_input_tokens_seen": 112802905, + "step": 5254, + "time_per_iteration": 2.534691572189331 + }, + { + "auxiliary_loss_clip": 0.01133569, + "auxiliary_loss_mlp": 0.01046049, + "balance_loss_clip": 1.05446506, + "balance_loss_mlp": 1.03015828, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 1.9565639327686417, + "language_loss": 0.78048837, + "learning_rate": 3.201847741843128e-06, + "loss": 0.8022846, + "num_input_tokens_seen": 112820305, + "step": 5255, + "time_per_iteration": 2.4966232776641846 + }, + { + "auxiliary_loss_clip": 0.01119021, + "auxiliary_loss_mlp": 0.01045531, + "balance_loss_clip": 1.05314565, + "balance_loss_mlp": 1.02751863, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 2.3036514804706685, + "language_loss": 0.78084278, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80248833, + "num_input_tokens_seen": 112841185, + "step": 5256, + "time_per_iteration": 2.5363409519195557 + }, + { + "auxiliary_loss_clip": 0.01099804, + "auxiliary_loss_mlp": 0.01039736, + "balance_loss_clip": 1.05314732, + "balance_loss_mlp": 1.02556825, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.672812730317152, + "language_loss": 0.71466124, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73605669, + "num_input_tokens_seen": 112860570, + "step": 5257, + "time_per_iteration": 2.5410585403442383 + }, + { + "auxiliary_loss_clip": 0.01135156, + "auxiliary_loss_mlp": 0.01047996, + "balance_loss_clip": 1.05620933, + "balance_loss_mlp": 1.03122282, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 2.1047387886626616, + "language_loss": 0.76859891, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.79043043, + "num_input_tokens_seen": 112877975, + "step": 5258, + "time_per_iteration": 2.474299669265747 + }, + { + "auxiliary_loss_clip": 0.01113718, + "auxiliary_loss_mlp": 0.01050776, + "balance_loss_clip": 1.05033219, + "balance_loss_mlp": 1.03390765, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 1.9933524372554126, + "language_loss": 0.7268554, + "learning_rate": 3.200602180731467e-06, + "loss": 0.74850029, + "num_input_tokens_seen": 112896170, + "step": 5259, + "time_per_iteration": 2.523953676223755 + }, + { + "auxiliary_loss_clip": 0.01116718, + "auxiliary_loss_mlp": 0.00783969, + "balance_loss_clip": 1.05047965, + "balance_loss_mlp": 1.00075936, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.645947371351178, + "language_loss": 0.66361862, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68262547, + "num_input_tokens_seen": 112916180, + "step": 5260, + "time_per_iteration": 2.6086971759796143 + }, + { + "auxiliary_loss_clip": 0.01127293, + "auxiliary_loss_mlp": 0.0103496, + "balance_loss_clip": 1.04691112, + "balance_loss_mlp": 1.0202136, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 1.7065999697895482, + "language_loss": 0.72180748, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74343002, + "num_input_tokens_seen": 112936745, + "step": 5261, + "time_per_iteration": 2.5352537631988525 + }, + { + "auxiliary_loss_clip": 0.01060163, + "auxiliary_loss_mlp": 0.01006561, + "balance_loss_clip": 1.03806257, + "balance_loss_mlp": 1.00435603, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7498756901119485, + "language_loss": 0.50658351, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52725077, + "num_input_tokens_seen": 112994845, + "step": 5262, + "time_per_iteration": 3.087620258331299 + }, + { + "auxiliary_loss_clip": 0.01130084, + "auxiliary_loss_mlp": 0.01039673, + "balance_loss_clip": 1.05485964, + "balance_loss_mlp": 1.02427077, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.5750647319569877, + "language_loss": 0.85263449, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87433207, + "num_input_tokens_seen": 113015125, + "step": 5263, + "time_per_iteration": 2.52726674079895 + }, + { + "auxiliary_loss_clip": 0.01108951, + "auxiliary_loss_mlp": 0.01041037, + "balance_loss_clip": 1.0483377, + "balance_loss_mlp": 1.02684498, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.5938407021577001, + "language_loss": 0.82030118, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.84180105, + "num_input_tokens_seen": 113035535, + "step": 5264, + "time_per_iteration": 2.5591118335723877 + }, + { + "auxiliary_loss_clip": 0.01119773, + "auxiliary_loss_mlp": 0.01038879, + "balance_loss_clip": 1.0485158, + "balance_loss_mlp": 1.02276182, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 4.878884896262556, + "language_loss": 0.79262143, + "learning_rate": 3.19873247349167e-06, + "loss": 0.81420791, + "num_input_tokens_seen": 113052720, + "step": 5265, + "time_per_iteration": 3.9543471336364746 + }, + { + "auxiliary_loss_clip": 0.01131748, + "auxiliary_loss_mlp": 0.01040903, + "balance_loss_clip": 1.05298448, + "balance_loss_mlp": 1.02506018, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.5823757242675567, + "language_loss": 0.74853837, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77026486, + "num_input_tokens_seen": 113071435, + "step": 5266, + "time_per_iteration": 2.501350164413452 + }, + { + "auxiliary_loss_clip": 0.01106434, + "auxiliary_loss_mlp": 0.01046717, + "balance_loss_clip": 1.04667687, + "balance_loss_mlp": 1.03148198, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.117613217418389, + "language_loss": 0.79154789, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81307936, + "num_input_tokens_seen": 113088645, + "step": 5267, + "time_per_iteration": 2.54248309135437 + }, + { + "auxiliary_loss_clip": 0.01046442, + "auxiliary_loss_mlp": 0.01007541, + "balance_loss_clip": 1.03391862, + "balance_loss_mlp": 1.00529945, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7432668923117276, + "language_loss": 0.57814384, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59868366, + "num_input_tokens_seen": 113152775, + "step": 5268, + "time_per_iteration": 4.621933937072754 + }, + { + "auxiliary_loss_clip": 0.01140842, + "auxiliary_loss_mlp": 0.01036777, + "balance_loss_clip": 1.05080199, + "balance_loss_mlp": 1.02155399, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 2.1163869059988247, + "language_loss": 0.73118317, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75295937, + "num_input_tokens_seen": 113171410, + "step": 5269, + "time_per_iteration": 2.4289438724517822 + }, + { + "auxiliary_loss_clip": 0.01103942, + "auxiliary_loss_mlp": 0.01042505, + "balance_loss_clip": 1.04698968, + "balance_loss_mlp": 1.0274961, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 2.9931089530007937, + "language_loss": 0.79988164, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82134616, + "num_input_tokens_seen": 113189965, + "step": 5270, + "time_per_iteration": 2.553274393081665 + }, + { + "auxiliary_loss_clip": 0.01144412, + "auxiliary_loss_mlp": 0.01048177, + "balance_loss_clip": 1.052827, + "balance_loss_mlp": 1.0315237, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 2.3359029182687063, + "language_loss": 0.79959822, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.82152408, + "num_input_tokens_seen": 113206355, + "step": 5271, + "time_per_iteration": 2.4182355403900146 + }, + { + "auxiliary_loss_clip": 0.01142335, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.05280888, + "balance_loss_mlp": 1.02449691, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 3.731459851317033, + "language_loss": 0.7318992, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75373387, + "num_input_tokens_seen": 113225440, + "step": 5272, + "time_per_iteration": 2.460787773132324 + }, + { + "auxiliary_loss_clip": 0.01117631, + "auxiliary_loss_mlp": 0.01046613, + "balance_loss_clip": 1.04574823, + "balance_loss_mlp": 1.02923226, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 2.0518635808385177, + "language_loss": 0.69451576, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71615815, + "num_input_tokens_seen": 113248840, + "step": 5273, + "time_per_iteration": 4.071844100952148 + }, + { + "auxiliary_loss_clip": 0.01126377, + "auxiliary_loss_mlp": 0.00782267, + "balance_loss_clip": 1.04919434, + "balance_loss_mlp": 1.00097799, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 2.0083055655079214, + "language_loss": 0.67868996, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69777644, + "num_input_tokens_seen": 113269630, + "step": 5274, + "time_per_iteration": 2.5519871711730957 + }, + { + "auxiliary_loss_clip": 0.01093507, + "auxiliary_loss_mlp": 0.01055055, + "balance_loss_clip": 1.0451026, + "balance_loss_mlp": 1.03903961, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.5636621916055105, + "language_loss": 0.80773079, + "learning_rate": 3.195612659536081e-06, + "loss": 0.82921636, + "num_input_tokens_seen": 113291200, + "step": 5275, + "time_per_iteration": 2.571871757507324 + }, + { + "auxiliary_loss_clip": 0.01130641, + "auxiliary_loss_mlp": 0.010473, + "balance_loss_clip": 1.04967737, + "balance_loss_mlp": 1.03121829, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 1.9299398789526672, + "language_loss": 0.72453231, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.74631172, + "num_input_tokens_seen": 113310170, + "step": 5276, + "time_per_iteration": 2.504868268966675 + }, + { + "auxiliary_loss_clip": 0.01120632, + "auxiliary_loss_mlp": 0.01043854, + "balance_loss_clip": 1.05429041, + "balance_loss_mlp": 1.02876234, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.4154782680890996, + "language_loss": 0.77802134, + "learning_rate": 3.194988152313236e-06, + "loss": 0.79966623, + "num_input_tokens_seen": 113331140, + "step": 5277, + "time_per_iteration": 2.5861475467681885 + }, + { + "auxiliary_loss_clip": 0.01108604, + "auxiliary_loss_mlp": 0.01050137, + "balance_loss_clip": 1.04424453, + "balance_loss_mlp": 1.03173125, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 2.040004716016829, + "language_loss": 0.79001838, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.81160581, + "num_input_tokens_seen": 113350030, + "step": 5278, + "time_per_iteration": 2.5426366329193115 + }, + { + "auxiliary_loss_clip": 0.01051899, + "auxiliary_loss_mlp": 0.01010445, + "balance_loss_clip": 1.03251839, + "balance_loss_mlp": 1.00859761, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8823554886150695, + "language_loss": 0.6283707, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64899415, + "num_input_tokens_seen": 113395820, + "step": 5279, + "time_per_iteration": 2.8451507091522217 + }, + { + "auxiliary_loss_clip": 0.01142109, + "auxiliary_loss_mlp": 0.01048969, + "balance_loss_clip": 1.04913878, + "balance_loss_mlp": 1.03182673, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.4818788931082991, + "language_loss": 0.81138289, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83329368, + "num_input_tokens_seen": 113416835, + "step": 5280, + "time_per_iteration": 2.537949323654175 + }, + { + "auxiliary_loss_clip": 0.01105682, + "auxiliary_loss_mlp": 0.01044159, + "balance_loss_clip": 1.05000734, + "balance_loss_mlp": 1.02916288, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.5463903727852186, + "language_loss": 0.78310311, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80460149, + "num_input_tokens_seen": 113440850, + "step": 5281, + "time_per_iteration": 2.6527018547058105 + }, + { + "auxiliary_loss_clip": 0.01118711, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.04982591, + "balance_loss_mlp": 1.02399063, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.6328730439554606, + "language_loss": 0.78556871, + "learning_rate": 3.193426091467179e-06, + "loss": 0.80716032, + "num_input_tokens_seen": 113461000, + "step": 5282, + "time_per_iteration": 2.6080400943756104 + }, + { + "auxiliary_loss_clip": 0.01118913, + "auxiliary_loss_mlp": 0.0105073, + "balance_loss_clip": 1.04867291, + "balance_loss_mlp": 1.03323007, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 2.745938439734389, + "language_loss": 0.67442429, + "learning_rate": 3.193113543486061e-06, + "loss": 0.69612074, + "num_input_tokens_seen": 113480820, + "step": 5283, + "time_per_iteration": 2.558712959289551 + }, + { + "auxiliary_loss_clip": 0.01051046, + "auxiliary_loss_mlp": 0.01003609, + "balance_loss_clip": 1.03248775, + "balance_loss_mlp": 1.00145149, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7433165745674162, + "language_loss": 0.52782667, + "learning_rate": 3.192800950261958e-06, + "loss": 0.54837322, + "num_input_tokens_seen": 113536910, + "step": 5284, + "time_per_iteration": 3.0272154808044434 + }, + { + "auxiliary_loss_clip": 0.01124733, + "auxiliary_loss_mlp": 0.01039408, + "balance_loss_clip": 1.05568326, + "balance_loss_mlp": 1.02398229, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 2.872521940663371, + "language_loss": 0.70653641, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72817779, + "num_input_tokens_seen": 113555480, + "step": 5285, + "time_per_iteration": 4.049103498458862 + }, + { + "auxiliary_loss_clip": 0.01061664, + "auxiliary_loss_mlp": 0.01005707, + "balance_loss_clip": 1.03026795, + "balance_loss_mlp": 1.00376368, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8159386852344692, + "language_loss": 0.60515189, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62582564, + "num_input_tokens_seen": 113616790, + "step": 5286, + "time_per_iteration": 3.072540044784546 + }, + { + "auxiliary_loss_clip": 0.01144068, + "auxiliary_loss_mlp": 0.01042155, + "balance_loss_clip": 1.05252397, + "balance_loss_mlp": 1.02636003, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 3.1312974701275595, + "language_loss": 0.72243357, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74429584, + "num_input_tokens_seen": 113635320, + "step": 5287, + "time_per_iteration": 2.418241262435913 + }, + { + "auxiliary_loss_clip": 0.01128961, + "auxiliary_loss_mlp": 0.01046391, + "balance_loss_clip": 1.04939508, + "balance_loss_mlp": 1.03011918, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 1.806220229453088, + "language_loss": 0.75412786, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77588135, + "num_input_tokens_seen": 113654000, + "step": 5288, + "time_per_iteration": 2.465444564819336 + }, + { + "auxiliary_loss_clip": 0.01122518, + "auxiliary_loss_mlp": 0.01031438, + "balance_loss_clip": 1.04568601, + "balance_loss_mlp": 1.01727629, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 2.2481514603444435, + "language_loss": 0.87723416, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89877367, + "num_input_tokens_seen": 113672375, + "step": 5289, + "time_per_iteration": 2.4748568534851074 + }, + { + "auxiliary_loss_clip": 0.01122341, + "auxiliary_loss_mlp": 0.01037071, + "balance_loss_clip": 1.05119038, + "balance_loss_mlp": 1.02312303, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.6750281670439218, + "language_loss": 0.67684311, + "learning_rate": 3.190924441478572e-06, + "loss": 0.69843721, + "num_input_tokens_seen": 113692385, + "step": 5290, + "time_per_iteration": 2.4812214374542236 + }, + { + "auxiliary_loss_clip": 0.01121171, + "auxiliary_loss_mlp": 0.01040588, + "balance_loss_clip": 1.05109143, + "balance_loss_mlp": 1.02445889, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 6.571396638807908, + "language_loss": 0.80126858, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82288623, + "num_input_tokens_seen": 113712145, + "step": 5291, + "time_per_iteration": 2.551312208175659 + }, + { + "auxiliary_loss_clip": 0.01106192, + "auxiliary_loss_mlp": 0.0103581, + "balance_loss_clip": 1.05359995, + "balance_loss_mlp": 1.01941872, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.16830053369126, + "language_loss": 0.79885483, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82027483, + "num_input_tokens_seen": 113731435, + "step": 5292, + "time_per_iteration": 2.5864369869232178 + }, + { + "auxiliary_loss_clip": 0.0112368, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.04988909, + "balance_loss_mlp": 1.0241977, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 5.248850206829431, + "language_loss": 0.74999416, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.77160835, + "num_input_tokens_seen": 113750825, + "step": 5293, + "time_per_iteration": 2.480736255645752 + }, + { + "auxiliary_loss_clip": 0.01126978, + "auxiliary_loss_mlp": 0.01041013, + "balance_loss_clip": 1.05145717, + "balance_loss_mlp": 1.02681565, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 1.8317906105263215, + "language_loss": 0.73980188, + "learning_rate": 3.189672532265379e-06, + "loss": 0.76148182, + "num_input_tokens_seen": 113770010, + "step": 5294, + "time_per_iteration": 2.538567543029785 + }, + { + "auxiliary_loss_clip": 0.0114086, + "auxiliary_loss_mlp": 0.01037038, + "balance_loss_clip": 1.05066276, + "balance_loss_mlp": 1.02061141, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 1.919384085246052, + "language_loss": 0.75885653, + "learning_rate": 3.189359442151152e-06, + "loss": 0.7806356, + "num_input_tokens_seen": 113788640, + "step": 5295, + "time_per_iteration": 2.4314026832580566 + }, + { + "auxiliary_loss_clip": 0.01107058, + "auxiliary_loss_mlp": 0.0104214, + "balance_loss_clip": 1.04746819, + "balance_loss_mlp": 1.0273937, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 2.0886315005492726, + "language_loss": 0.6954909, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71698284, + "num_input_tokens_seen": 113809515, + "step": 5296, + "time_per_iteration": 2.5742979049682617 + }, + { + "auxiliary_loss_clip": 0.011098, + "auxiliary_loss_mlp": 0.01039153, + "balance_loss_clip": 1.04700637, + "balance_loss_mlp": 1.02468109, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 1.7409091871432758, + "language_loss": 0.77751648, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79900599, + "num_input_tokens_seen": 113829770, + "step": 5297, + "time_per_iteration": 2.549226760864258 + }, + { + "auxiliary_loss_clip": 0.01106975, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.05122471, + "balance_loss_mlp": 1.01842904, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.8847355621094188, + "language_loss": 0.78938782, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.8107999, + "num_input_tokens_seen": 113849320, + "step": 5298, + "time_per_iteration": 2.613403081893921 + }, + { + "auxiliary_loss_clip": 0.01124064, + "auxiliary_loss_mlp": 0.01040089, + "balance_loss_clip": 1.05288148, + "balance_loss_mlp": 1.02498507, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 1.7790871301643887, + "language_loss": 0.73886502, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.76050657, + "num_input_tokens_seen": 113867860, + "step": 5299, + "time_per_iteration": 2.5224485397338867 + }, + { + "auxiliary_loss_clip": 0.01126373, + "auxiliary_loss_mlp": 0.01043964, + "balance_loss_clip": 1.05224586, + "balance_loss_mlp": 1.02878845, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 2.1265009746094243, + "language_loss": 0.77888072, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80058408, + "num_input_tokens_seen": 113886375, + "step": 5300, + "time_per_iteration": 2.555837869644165 + }, + { + "auxiliary_loss_clip": 0.01116482, + "auxiliary_loss_mlp": 0.01043627, + "balance_loss_clip": 1.04796171, + "balance_loss_mlp": 1.02762938, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 2.0841616160989402, + "language_loss": 0.84071642, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86231756, + "num_input_tokens_seen": 113904065, + "step": 5301, + "time_per_iteration": 2.510014295578003 + }, + { + "auxiliary_loss_clip": 0.01128888, + "auxiliary_loss_mlp": 0.0104953, + "balance_loss_clip": 1.05342484, + "balance_loss_mlp": 1.03350806, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.9656491123820845, + "language_loss": 0.76910394, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79088807, + "num_input_tokens_seen": 113918415, + "step": 5302, + "time_per_iteration": 2.4724295139312744 + }, + { + "auxiliary_loss_clip": 0.01134275, + "auxiliary_loss_mlp": 0.01039277, + "balance_loss_clip": 1.04963911, + "balance_loss_mlp": 1.02355361, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 2.5735430417678353, + "language_loss": 0.79568577, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81742132, + "num_input_tokens_seen": 113938135, + "step": 5303, + "time_per_iteration": 2.4360549449920654 + }, + { + "auxiliary_loss_clip": 0.0112903, + "auxiliary_loss_mlp": 0.0104356, + "balance_loss_clip": 1.05018961, + "balance_loss_mlp": 1.02765727, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 2.9618643886603446, + "language_loss": 0.7344985, + "learning_rate": 3.186539603020047e-06, + "loss": 0.75622439, + "num_input_tokens_seen": 113957125, + "step": 5304, + "time_per_iteration": 3.972581148147583 + }, + { + "auxiliary_loss_clip": 0.01102171, + "auxiliary_loss_mlp": 0.01044889, + "balance_loss_clip": 1.04310489, + "balance_loss_mlp": 1.03026199, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 2.54638497515436, + "language_loss": 0.72254932, + "learning_rate": 3.186226062434068e-06, + "loss": 0.74401993, + "num_input_tokens_seen": 113974875, + "step": 5305, + "time_per_iteration": 2.53428316116333 + }, + { + "auxiliary_loss_clip": 0.01118216, + "auxiliary_loss_mlp": 0.010376, + "balance_loss_clip": 1.05061948, + "balance_loss_mlp": 1.02365208, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.7224522843347434, + "language_loss": 0.64287508, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66443318, + "num_input_tokens_seen": 113994450, + "step": 5306, + "time_per_iteration": 2.529944658279419 + }, + { + "auxiliary_loss_clip": 0.01109889, + "auxiliary_loss_mlp": 0.01043996, + "balance_loss_clip": 1.04534149, + "balance_loss_mlp": 1.02790272, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.2712442311868073, + "language_loss": 0.79666364, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81820256, + "num_input_tokens_seen": 114013945, + "step": 5307, + "time_per_iteration": 2.5461342334747314 + }, + { + "auxiliary_loss_clip": 0.01105804, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.04526329, + "balance_loss_mlp": 1.02266324, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.8811783220019103, + "language_loss": 0.77811795, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.79956251, + "num_input_tokens_seen": 114031375, + "step": 5308, + "time_per_iteration": 3.987574815750122 + }, + { + "auxiliary_loss_clip": 0.011363, + "auxiliary_loss_mlp": 0.01047337, + "balance_loss_clip": 1.05149412, + "balance_loss_mlp": 1.02995634, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 3.2037520095572662, + "language_loss": 0.7379998, + "learning_rate": 3.184971450390961e-06, + "loss": 0.75983614, + "num_input_tokens_seen": 114048465, + "step": 5309, + "time_per_iteration": 2.431325912475586 + }, + { + "auxiliary_loss_clip": 0.01129924, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.050982, + "balance_loss_mlp": 1.02038956, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.9898531536942032, + "language_loss": 0.82747173, + "learning_rate": 3.184657685014856e-06, + "loss": 0.84911841, + "num_input_tokens_seen": 114068415, + "step": 5310, + "time_per_iteration": 2.5343902111053467 + }, + { + "auxiliary_loss_clip": 0.01110583, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.0455122, + "balance_loss_mlp": 1.02680206, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.6541973098855978, + "language_loss": 0.78222322, + "learning_rate": 3.184343874716412e-06, + "loss": 0.80373758, + "num_input_tokens_seen": 114088565, + "step": 5311, + "time_per_iteration": 2.550447463989258 + }, + { + "auxiliary_loss_clip": 0.01104296, + "auxiliary_loss_mlp": 0.01040723, + "balance_loss_clip": 1.04681635, + "balance_loss_mlp": 1.02529764, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 1.7340318074376297, + "language_loss": 0.84085345, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86230361, + "num_input_tokens_seen": 114107160, + "step": 5312, + "time_per_iteration": 4.062623500823975 + }, + { + "auxiliary_loss_clip": 0.01093328, + "auxiliary_loss_mlp": 0.01044814, + "balance_loss_clip": 1.04039001, + "balance_loss_mlp": 1.02813709, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.2702169752039527, + "language_loss": 0.78756225, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.80894369, + "num_input_tokens_seen": 114123420, + "step": 5313, + "time_per_iteration": 2.4988462924957275 + }, + { + "auxiliary_loss_clip": 0.01122868, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.04611504, + "balance_loss_mlp": 1.02009249, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 4.631468394994663, + "language_loss": 0.86190283, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88348472, + "num_input_tokens_seen": 114139230, + "step": 5314, + "time_per_iteration": 2.480231523513794 + }, + { + "auxiliary_loss_clip": 0.01108753, + "auxiliary_loss_mlp": 0.01047257, + "balance_loss_clip": 1.04282105, + "balance_loss_mlp": 1.03011417, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.8837555798197296, + "language_loss": 0.79463303, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.8161931, + "num_input_tokens_seen": 114159290, + "step": 5315, + "time_per_iteration": 2.5135254859924316 + }, + { + "auxiliary_loss_clip": 0.01102282, + "auxiliary_loss_mlp": 0.01057827, + "balance_loss_clip": 1.04486299, + "balance_loss_mlp": 1.0396595, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 1.8832577888083821, + "language_loss": 0.67057073, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69217181, + "num_input_tokens_seen": 114177655, + "step": 5316, + "time_per_iteration": 2.5129430294036865 + }, + { + "auxiliary_loss_clip": 0.01125002, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.04573786, + "balance_loss_mlp": 1.02174842, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.5238800145017006, + "language_loss": 0.69560027, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71720809, + "num_input_tokens_seen": 114200880, + "step": 5317, + "time_per_iteration": 2.537358045578003 + }, + { + "auxiliary_loss_clip": 0.01040934, + "auxiliary_loss_mlp": 0.01009964, + "balance_loss_clip": 1.0287962, + "balance_loss_mlp": 1.0078423, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7283289367749232, + "language_loss": 0.53019428, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55070329, + "num_input_tokens_seen": 114267145, + "step": 5318, + "time_per_iteration": 3.2147791385650635 + }, + { + "auxiliary_loss_clip": 0.01134161, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.04828262, + "balance_loss_mlp": 1.01944041, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 1.712453921811079, + "language_loss": 0.84305215, + "learning_rate": 3.181831776553012e-06, + "loss": 0.86473536, + "num_input_tokens_seen": 114284630, + "step": 5319, + "time_per_iteration": 2.4042136669158936 + }, + { + "auxiliary_loss_clip": 0.01121857, + "auxiliary_loss_mlp": 0.0104621, + "balance_loss_clip": 1.04529774, + "balance_loss_mlp": 1.03095102, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.696251239905187, + "language_loss": 0.63657242, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65825307, + "num_input_tokens_seen": 114305830, + "step": 5320, + "time_per_iteration": 2.5952744483947754 + }, + { + "auxiliary_loss_clip": 0.01115992, + "auxiliary_loss_mlp": 0.0103872, + "balance_loss_clip": 1.04595184, + "balance_loss_mlp": 1.02363968, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 2.167329266021357, + "language_loss": 0.71018636, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.73173356, + "num_input_tokens_seen": 114325165, + "step": 5321, + "time_per_iteration": 2.513094186782837 + }, + { + "auxiliary_loss_clip": 0.01144553, + "auxiliary_loss_mlp": 0.00784792, + "balance_loss_clip": 1.05009937, + "balance_loss_mlp": 1.0010612, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 3.221166465089937, + "language_loss": 0.86655414, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88584757, + "num_input_tokens_seen": 114341310, + "step": 5322, + "time_per_iteration": 2.421614408493042 + }, + { + "auxiliary_loss_clip": 0.01112903, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.04470229, + "balance_loss_mlp": 1.02125955, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.7930337650672876, + "language_loss": 0.83325678, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.85474825, + "num_input_tokens_seen": 114360355, + "step": 5323, + "time_per_iteration": 2.5173556804656982 + }, + { + "auxiliary_loss_clip": 0.01121872, + "auxiliary_loss_mlp": 0.01041005, + "balance_loss_clip": 1.04548001, + "balance_loss_mlp": 1.02378523, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.8346153857877345, + "language_loss": 0.77789664, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.7995255, + "num_input_tokens_seen": 114379220, + "step": 5324, + "time_per_iteration": 2.459472417831421 + }, + { + "auxiliary_loss_clip": 0.01113818, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.04484558, + "balance_loss_mlp": 1.01845694, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 1.983779379337225, + "language_loss": 0.80192733, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82341278, + "num_input_tokens_seen": 114396365, + "step": 5325, + "time_per_iteration": 4.045304298400879 + }, + { + "auxiliary_loss_clip": 0.01129501, + "auxiliary_loss_mlp": 0.01039172, + "balance_loss_clip": 1.050524, + "balance_loss_mlp": 1.02403855, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.736390708375897, + "language_loss": 0.74666297, + "learning_rate": 3.179631337655037e-06, + "loss": 0.76834965, + "num_input_tokens_seen": 114416780, + "step": 5326, + "time_per_iteration": 2.562483549118042 + }, + { + "auxiliary_loss_clip": 0.01104057, + "auxiliary_loss_mlp": 0.01042903, + "balance_loss_clip": 1.05033422, + "balance_loss_mlp": 1.02723932, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.4400051594583476, + "language_loss": 0.80624443, + "learning_rate": 3.179316810218701e-06, + "loss": 0.82771409, + "num_input_tokens_seen": 114437405, + "step": 5327, + "time_per_iteration": 2.599337577819824 + }, + { + "auxiliary_loss_clip": 0.01111596, + "auxiliary_loss_mlp": 0.01037696, + "balance_loss_clip": 1.04938126, + "balance_loss_mlp": 1.0211494, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.5435550180142905, + "language_loss": 0.77624124, + "learning_rate": 3.179002238062554e-06, + "loss": 0.79773414, + "num_input_tokens_seen": 114458505, + "step": 5328, + "time_per_iteration": 2.5652999877929688 + }, + { + "auxiliary_loss_clip": 0.01086962, + "auxiliary_loss_mlp": 0.01045409, + "balance_loss_clip": 1.04420435, + "balance_loss_mlp": 1.02780163, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 1.6504376248101604, + "language_loss": 0.73581302, + "learning_rate": 3.178687621198524e-06, + "loss": 0.7571367, + "num_input_tokens_seen": 114479050, + "step": 5329, + "time_per_iteration": 2.702845335006714 + }, + { + "auxiliary_loss_clip": 0.01106095, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.04315615, + "balance_loss_mlp": 1.02292013, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 1.7926095986959258, + "language_loss": 0.71063137, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73206353, + "num_input_tokens_seen": 114497415, + "step": 5330, + "time_per_iteration": 2.481147289276123 + }, + { + "auxiliary_loss_clip": 0.0109143, + "auxiliary_loss_mlp": 0.01049974, + "balance_loss_clip": 1.04992497, + "balance_loss_mlp": 1.03113937, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 2.1001273445005344, + "language_loss": 0.80067098, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82208502, + "num_input_tokens_seen": 114518785, + "step": 5331, + "time_per_iteration": 2.63063383102417 + }, + { + "auxiliary_loss_clip": 0.01048169, + "auxiliary_loss_mlp": 0.01003139, + "balance_loss_clip": 1.03246236, + "balance_loss_mlp": 1.00093365, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8369983684880477, + "language_loss": 0.57789326, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59840631, + "num_input_tokens_seen": 114577710, + "step": 5332, + "time_per_iteration": 3.0018510818481445 + }, + { + "auxiliary_loss_clip": 0.01100391, + "auxiliary_loss_mlp": 0.01036337, + "balance_loss_clip": 1.04440463, + "balance_loss_mlp": 1.01998186, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.5797599501058752, + "language_loss": 0.73453116, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75589848, + "num_input_tokens_seen": 114598640, + "step": 5333, + "time_per_iteration": 2.6368095874786377 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.04551446, + "balance_loss_mlp": 1.02462947, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.945408000914704, + "language_loss": 0.70600331, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.72757196, + "num_input_tokens_seen": 114618780, + "step": 5334, + "time_per_iteration": 2.5221118927001953 + }, + { + "auxiliary_loss_clip": 0.01098426, + "auxiliary_loss_mlp": 0.01045204, + "balance_loss_clip": 1.04505801, + "balance_loss_mlp": 1.02963495, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 2.131180250535317, + "language_loss": 0.77541596, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.79685223, + "num_input_tokens_seen": 114637525, + "step": 5335, + "time_per_iteration": 2.575470447540283 + }, + { + "auxiliary_loss_clip": 0.01124111, + "auxiliary_loss_mlp": 0.01039927, + "balance_loss_clip": 1.04628325, + "balance_loss_mlp": 1.02427483, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.8334723266734076, + "language_loss": 0.68285608, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.7044965, + "num_input_tokens_seen": 114659705, + "step": 5336, + "time_per_iteration": 2.5942394733428955 + }, + { + "auxiliary_loss_clip": 0.01101184, + "auxiliary_loss_mlp": 0.01047624, + "balance_loss_clip": 1.04210877, + "balance_loss_mlp": 1.03148293, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 1.7067556399877686, + "language_loss": 0.78791803, + "learning_rate": 3.176169078234487e-06, + "loss": 0.80940604, + "num_input_tokens_seen": 114678340, + "step": 5337, + "time_per_iteration": 2.5364885330200195 + }, + { + "auxiliary_loss_clip": 0.01120955, + "auxiliary_loss_mlp": 0.01036246, + "balance_loss_clip": 1.04706466, + "balance_loss_mlp": 1.0216732, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.5527144695833517, + "language_loss": 0.74222279, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.7637949, + "num_input_tokens_seen": 114696980, + "step": 5338, + "time_per_iteration": 2.480372190475464 + }, + { + "auxiliary_loss_clip": 0.01117908, + "auxiliary_loss_mlp": 0.01043046, + "balance_loss_clip": 1.04485667, + "balance_loss_mlp": 1.02647555, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 2.0718795631878875, + "language_loss": 0.63065195, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65226144, + "num_input_tokens_seen": 114717330, + "step": 5339, + "time_per_iteration": 2.5682785511016846 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.01044884, + "balance_loss_clip": 1.04685783, + "balance_loss_mlp": 1.02923179, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 2.2662782670227335, + "language_loss": 0.81725895, + "learning_rate": 3.175223888387192e-06, + "loss": 0.83906853, + "num_input_tokens_seen": 114736320, + "step": 5340, + "time_per_iteration": 2.4318807125091553 + }, + { + "auxiliary_loss_clip": 0.01111171, + "auxiliary_loss_mlp": 0.01046793, + "balance_loss_clip": 1.04847586, + "balance_loss_mlp": 1.03182054, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 2.316609454484498, + "language_loss": 0.75965226, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78123194, + "num_input_tokens_seen": 114754575, + "step": 5341, + "time_per_iteration": 2.511115074157715 + }, + { + "auxiliary_loss_clip": 0.01098859, + "auxiliary_loss_mlp": 0.01040689, + "balance_loss_clip": 1.04583573, + "balance_loss_mlp": 1.02533507, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.5953244432690352, + "language_loss": 0.79154396, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.81293941, + "num_input_tokens_seen": 114773590, + "step": 5342, + "time_per_iteration": 2.5468525886535645 + }, + { + "auxiliary_loss_clip": 0.01114844, + "auxiliary_loss_mlp": 0.01041719, + "balance_loss_clip": 1.04862618, + "balance_loss_mlp": 1.02606714, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 3.172903206843297, + "language_loss": 0.74369884, + "learning_rate": 3.174278297458438e-06, + "loss": 0.76526439, + "num_input_tokens_seen": 114790775, + "step": 5343, + "time_per_iteration": 4.026190757751465 + }, + { + "auxiliary_loss_clip": 0.01078357, + "auxiliary_loss_mlp": 0.01040157, + "balance_loss_clip": 1.04163933, + "balance_loss_mlp": 1.02434957, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.555672423616714, + "language_loss": 0.82494378, + "learning_rate": 3.173963011408748e-06, + "loss": 0.84612882, + "num_input_tokens_seen": 114809835, + "step": 5344, + "time_per_iteration": 2.632603406906128 + }, + { + "auxiliary_loss_clip": 0.01087639, + "auxiliary_loss_mlp": 0.01041016, + "balance_loss_clip": 1.04532456, + "balance_loss_mlp": 1.02498245, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 2.0830930968418886, + "language_loss": 0.79644275, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81772935, + "num_input_tokens_seen": 114826505, + "step": 5345, + "time_per_iteration": 2.534531354904175 + }, + { + "auxiliary_loss_clip": 0.01115153, + "auxiliary_loss_mlp": 0.01041882, + "balance_loss_clip": 1.04884374, + "balance_loss_mlp": 1.02644467, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.7639949136939168, + "language_loss": 0.83484411, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85641444, + "num_input_tokens_seen": 114846140, + "step": 5346, + "time_per_iteration": 2.5514278411865234 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01043467, + "balance_loss_clip": 1.04604709, + "balance_loss_mlp": 1.02712369, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.4437603352690738, + "language_loss": 0.81499493, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.8365159, + "num_input_tokens_seen": 114866660, + "step": 5347, + "time_per_iteration": 2.5539159774780273 + }, + { + "auxiliary_loss_clip": 0.01126928, + "auxiliary_loss_mlp": 0.01046409, + "balance_loss_clip": 1.04852057, + "balance_loss_mlp": 1.02975583, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 2.1572836916944027, + "language_loss": 0.80456769, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.8263011, + "num_input_tokens_seen": 114882820, + "step": 5348, + "time_per_iteration": 3.9618000984191895 + }, + { + "auxiliary_loss_clip": 0.01113481, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_clip": 1.04611802, + "balance_loss_mlp": 1.03263116, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 3.529794773554568, + "language_loss": 0.85386765, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87548232, + "num_input_tokens_seen": 114900745, + "step": 5349, + "time_per_iteration": 2.4819095134735107 + }, + { + "auxiliary_loss_clip": 0.01111011, + "auxiliary_loss_mlp": 0.01044217, + "balance_loss_clip": 1.04562485, + "balance_loss_mlp": 1.02779019, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 2.0111157119222978, + "language_loss": 0.80523777, + "learning_rate": 3.172070360676475e-06, + "loss": 0.82679003, + "num_input_tokens_seen": 114917940, + "step": 5350, + "time_per_iteration": 2.47371768951416 + }, + { + "auxiliary_loss_clip": 0.01126033, + "auxiliary_loss_mlp": 0.01045418, + "balance_loss_clip": 1.04810119, + "balance_loss_mlp": 1.03061199, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 1.6017359001454086, + "language_loss": 0.80310869, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82482326, + "num_input_tokens_seen": 114937735, + "step": 5351, + "time_per_iteration": 2.519287586212158 + }, + { + "auxiliary_loss_clip": 0.01102747, + "auxiliary_loss_mlp": 0.010443, + "balance_loss_clip": 1.04720604, + "balance_loss_mlp": 1.02742028, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.756093255278612, + "language_loss": 0.75823855, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.7797091, + "num_input_tokens_seen": 114956630, + "step": 5352, + "time_per_iteration": 3.9395790100097656 + }, + { + "auxiliary_loss_clip": 0.01094864, + "auxiliary_loss_mlp": 0.01038786, + "balance_loss_clip": 1.04836535, + "balance_loss_mlp": 1.02320552, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 2.136113172971974, + "language_loss": 0.82058108, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.84191763, + "num_input_tokens_seen": 114976470, + "step": 5353, + "time_per_iteration": 2.5754904747009277 + }, + { + "auxiliary_loss_clip": 0.01076851, + "auxiliary_loss_mlp": 0.01042554, + "balance_loss_clip": 1.05128193, + "balance_loss_mlp": 1.02623463, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.6814220487689482, + "language_loss": 0.73219156, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75338566, + "num_input_tokens_seen": 114996710, + "step": 5354, + "time_per_iteration": 2.6649961471557617 + }, + { + "auxiliary_loss_clip": 0.01104613, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.04570317, + "balance_loss_mlp": 1.02088737, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 1.5166006581001563, + "language_loss": 0.83415073, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.85555232, + "num_input_tokens_seen": 115015775, + "step": 5355, + "time_per_iteration": 2.572265148162842 + }, + { + "auxiliary_loss_clip": 0.0114378, + "auxiliary_loss_mlp": 0.01046632, + "balance_loss_clip": 1.0528512, + "balance_loss_mlp": 1.03135538, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 7.438718935014444, + "language_loss": 0.70997167, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73187578, + "num_input_tokens_seen": 115034265, + "step": 5356, + "time_per_iteration": 2.5090065002441406 + }, + { + "auxiliary_loss_clip": 0.01098727, + "auxiliary_loss_mlp": 0.01039426, + "balance_loss_clip": 1.04850602, + "balance_loss_mlp": 1.02283263, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.3896052094432783, + "language_loss": 0.67985451, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70123601, + "num_input_tokens_seen": 115051945, + "step": 5357, + "time_per_iteration": 2.6076676845550537 + }, + { + "auxiliary_loss_clip": 0.0104677, + "auxiliary_loss_mlp": 0.01005849, + "balance_loss_clip": 1.03606248, + "balance_loss_mlp": 1.00391805, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7714144910543194, + "language_loss": 0.58290935, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60343552, + "num_input_tokens_seen": 115119090, + "step": 5358, + "time_per_iteration": 3.1825907230377197 + }, + { + "auxiliary_loss_clip": 0.0107677, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.043854, + "balance_loss_mlp": 1.02483535, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 1.763938637584604, + "language_loss": 0.8333801, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.8545565, + "num_input_tokens_seen": 115137755, + "step": 5359, + "time_per_iteration": 2.6130638122558594 + }, + { + "auxiliary_loss_clip": 0.01129395, + "auxiliary_loss_mlp": 0.01039235, + "balance_loss_clip": 1.04691803, + "balance_loss_mlp": 1.02333236, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 1.694197731692819, + "language_loss": 0.79299319, + "learning_rate": 3.168912388464595e-06, + "loss": 0.8146795, + "num_input_tokens_seen": 115158150, + "step": 5360, + "time_per_iteration": 2.5095784664154053 + }, + { + "auxiliary_loss_clip": 0.01052473, + "auxiliary_loss_mlp": 0.01006009, + "balance_loss_clip": 1.03104556, + "balance_loss_mlp": 1.00416148, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.6663739856327913, + "language_loss": 0.56954741, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59013224, + "num_input_tokens_seen": 115212755, + "step": 5361, + "time_per_iteration": 2.924516439437866 + }, + { + "auxiliary_loss_clip": 0.0108375, + "auxiliary_loss_mlp": 0.01048548, + "balance_loss_clip": 1.04708934, + "balance_loss_mlp": 1.03224051, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.8846134784204012, + "language_loss": 0.70928991, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73061287, + "num_input_tokens_seen": 115233090, + "step": 5362, + "time_per_iteration": 2.6588339805603027 + }, + { + "auxiliary_loss_clip": 0.01126062, + "auxiliary_loss_mlp": 0.01054746, + "balance_loss_clip": 1.04985929, + "balance_loss_mlp": 1.03847408, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.6287539920552208, + "language_loss": 0.7394371, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76124519, + "num_input_tokens_seen": 115252645, + "step": 5363, + "time_per_iteration": 2.5328097343444824 + }, + { + "auxiliary_loss_clip": 0.01131661, + "auxiliary_loss_mlp": 0.01043105, + "balance_loss_clip": 1.04939806, + "balance_loss_mlp": 1.02699935, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.844337809916728, + "language_loss": 0.76561439, + "learning_rate": 3.167647957801365e-06, + "loss": 0.78736198, + "num_input_tokens_seen": 115269085, + "step": 5364, + "time_per_iteration": 2.474036693572998 + }, + { + "auxiliary_loss_clip": 0.0111835, + "auxiliary_loss_mlp": 0.0104142, + "balance_loss_clip": 1.04880404, + "balance_loss_mlp": 1.02531457, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 2.628963693181413, + "language_loss": 0.7710048, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79260254, + "num_input_tokens_seen": 115286470, + "step": 5365, + "time_per_iteration": 4.065187215805054 + }, + { + "auxiliary_loss_clip": 0.01123726, + "auxiliary_loss_mlp": 0.01047524, + "balance_loss_clip": 1.05426264, + "balance_loss_mlp": 1.03144264, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 1.5455265113845613, + "language_loss": 0.7643286, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.78604108, + "num_input_tokens_seen": 115307000, + "step": 5366, + "time_per_iteration": 2.5273971557617188 + }, + { + "auxiliary_loss_clip": 0.01113812, + "auxiliary_loss_mlp": 0.01044943, + "balance_loss_clip": 1.04700208, + "balance_loss_mlp": 1.02830148, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 1.720968121787774, + "language_loss": 0.71855348, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74014103, + "num_input_tokens_seen": 115325925, + "step": 5367, + "time_per_iteration": 2.519792079925537 + }, + { + "auxiliary_loss_clip": 0.01137895, + "auxiliary_loss_mlp": 0.01038652, + "balance_loss_clip": 1.05015719, + "balance_loss_mlp": 1.02373922, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 1.7451443472591242, + "language_loss": 0.74201798, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76378345, + "num_input_tokens_seen": 115343705, + "step": 5368, + "time_per_iteration": 2.421349048614502 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01040308, + "balance_loss_clip": 1.04455769, + "balance_loss_mlp": 1.02385712, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.6096830399032465, + "language_loss": 0.78396177, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.80542797, + "num_input_tokens_seen": 115364170, + "step": 5369, + "time_per_iteration": 2.5618388652801514 + }, + { + "auxiliary_loss_clip": 0.0109971, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.04912496, + "balance_loss_mlp": 1.01480496, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 1.8144271231839324, + "language_loss": 0.83425212, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85554719, + "num_input_tokens_seen": 115382495, + "step": 5370, + "time_per_iteration": 2.5219979286193848 + }, + { + "auxiliary_loss_clip": 0.01142632, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.05380094, + "balance_loss_mlp": 1.01973283, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 2.4722462624235537, + "language_loss": 0.82622141, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.84800363, + "num_input_tokens_seen": 115399450, + "step": 5371, + "time_per_iteration": 2.464062452316284 + }, + { + "auxiliary_loss_clip": 0.0113236, + "auxiliary_loss_mlp": 0.00783211, + "balance_loss_clip": 1.05144024, + "balance_loss_mlp": 1.00093663, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 2.577511400960966, + "language_loss": 0.8865, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90565574, + "num_input_tokens_seen": 115417700, + "step": 5372, + "time_per_iteration": 2.5020875930786133 + }, + { + "auxiliary_loss_clip": 0.01140189, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_clip": 1.05239308, + "balance_loss_mlp": 1.02764726, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 2.5496197788411075, + "language_loss": 0.73262334, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75445729, + "num_input_tokens_seen": 115435840, + "step": 5373, + "time_per_iteration": 2.492798328399658 + }, + { + "auxiliary_loss_clip": 0.01113072, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.04836226, + "balance_loss_mlp": 1.02187204, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.1462760589300696, + "language_loss": 0.81722033, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83872062, + "num_input_tokens_seen": 115454210, + "step": 5374, + "time_per_iteration": 2.506256341934204 + }, + { + "auxiliary_loss_clip": 0.01097165, + "auxiliary_loss_mlp": 0.01041665, + "balance_loss_clip": 1.04554021, + "balance_loss_mlp": 1.02566671, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.3296439378649265, + "language_loss": 0.87647474, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89786303, + "num_input_tokens_seen": 115471785, + "step": 5375, + "time_per_iteration": 2.5953102111816406 + }, + { + "auxiliary_loss_clip": 0.01139774, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_clip": 1.04800296, + "balance_loss_mlp": 1.01762652, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 1.8060279205276333, + "language_loss": 0.75948846, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.7812295, + "num_input_tokens_seen": 115491405, + "step": 5376, + "time_per_iteration": 2.4378252029418945 + }, + { + "auxiliary_loss_clip": 0.01103947, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.05016375, + "balance_loss_mlp": 1.01856232, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 4.231736958418262, + "language_loss": 0.66710299, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.68848091, + "num_input_tokens_seen": 115511555, + "step": 5377, + "time_per_iteration": 2.5659918785095215 + }, + { + "auxiliary_loss_clip": 0.01103386, + "auxiliary_loss_mlp": 0.01045429, + "balance_loss_clip": 1.04495299, + "balance_loss_mlp": 1.02761865, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.4720416759019057, + "language_loss": 0.72495741, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74644554, + "num_input_tokens_seen": 115532860, + "step": 5378, + "time_per_iteration": 2.6075024604797363 + }, + { + "auxiliary_loss_clip": 0.01123078, + "auxiliary_loss_mlp": 0.01036819, + "balance_loss_clip": 1.05046177, + "balance_loss_mlp": 1.02133393, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 1.9720458620670462, + "language_loss": 0.82280582, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84440482, + "num_input_tokens_seen": 115553850, + "step": 5379, + "time_per_iteration": 2.535287380218506 + }, + { + "auxiliary_loss_clip": 0.01130106, + "auxiliary_loss_mlp": 0.01037435, + "balance_loss_clip": 1.04993939, + "balance_loss_mlp": 1.0231185, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.5920480940052821, + "language_loss": 0.7865864, + "learning_rate": 3.162583158454388e-06, + "loss": 0.80826181, + "num_input_tokens_seen": 115575530, + "step": 5380, + "time_per_iteration": 2.5527987480163574 + }, + { + "auxiliary_loss_clip": 0.01125476, + "auxiliary_loss_mlp": 0.01039936, + "balance_loss_clip": 1.05193686, + "balance_loss_mlp": 1.02518368, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.6522910308429974, + "language_loss": 0.77100253, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79265666, + "num_input_tokens_seen": 115594885, + "step": 5381, + "time_per_iteration": 2.516846179962158 + }, + { + "auxiliary_loss_clip": 0.01126347, + "auxiliary_loss_mlp": 0.01036576, + "balance_loss_clip": 1.05309629, + "balance_loss_mlp": 1.02248502, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 3.609964956908401, + "language_loss": 0.71723956, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.73886877, + "num_input_tokens_seen": 115614080, + "step": 5382, + "time_per_iteration": 2.5747761726379395 + }, + { + "auxiliary_loss_clip": 0.01113629, + "auxiliary_loss_mlp": 0.01047192, + "balance_loss_clip": 1.04613018, + "balance_loss_mlp": 1.03053808, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.5144537420298083, + "language_loss": 0.70563275, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72724092, + "num_input_tokens_seen": 115632820, + "step": 5383, + "time_per_iteration": 4.125161170959473 + }, + { + "auxiliary_loss_clip": 0.01124196, + "auxiliary_loss_mlp": 0.01036344, + "balance_loss_clip": 1.04986572, + "balance_loss_mlp": 1.02271295, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 1.9634935546092445, + "language_loss": 0.78495097, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80655634, + "num_input_tokens_seen": 115652860, + "step": 5384, + "time_per_iteration": 2.5239977836608887 + }, + { + "auxiliary_loss_clip": 0.01084039, + "auxiliary_loss_mlp": 0.01045046, + "balance_loss_clip": 1.04706693, + "balance_loss_mlp": 1.02736712, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 2.053105809552431, + "language_loss": 0.74494511, + "learning_rate": 3.16099809186998e-06, + "loss": 0.76623595, + "num_input_tokens_seen": 115670940, + "step": 5385, + "time_per_iteration": 2.576805591583252 + }, + { + "auxiliary_loss_clip": 0.01111188, + "auxiliary_loss_mlp": 0.0104082, + "balance_loss_clip": 1.05148637, + "balance_loss_mlp": 1.02560949, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.7972389932365356, + "language_loss": 0.71890157, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74042165, + "num_input_tokens_seen": 115691155, + "step": 5386, + "time_per_iteration": 2.5882527828216553 + }, + { + "auxiliary_loss_clip": 0.01139828, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.04939723, + "balance_loss_mlp": 1.02126241, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 1.768037853973843, + "language_loss": 0.93882626, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96059597, + "num_input_tokens_seen": 115710340, + "step": 5387, + "time_per_iteration": 3.913449764251709 + }, + { + "auxiliary_loss_clip": 0.01130806, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.05094552, + "balance_loss_mlp": 1.02821946, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 3.089780691097366, + "language_loss": 0.77423018, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79598308, + "num_input_tokens_seen": 115726745, + "step": 5388, + "time_per_iteration": 2.4711782932281494 + }, + { + "auxiliary_loss_clip": 0.011099, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.04337728, + "balance_loss_mlp": 1.01734304, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 1.898547134026297, + "language_loss": 0.7183044, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.73973227, + "num_input_tokens_seen": 115749385, + "step": 5389, + "time_per_iteration": 2.6195099353790283 + }, + { + "auxiliary_loss_clip": 0.01102094, + "auxiliary_loss_mlp": 0.01040275, + "balance_loss_clip": 1.05105197, + "balance_loss_mlp": 1.02474236, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 1.7434812081849433, + "language_loss": 0.80980599, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83122969, + "num_input_tokens_seen": 115768105, + "step": 5390, + "time_per_iteration": 2.5432393550872803 + }, + { + "auxiliary_loss_clip": 0.0111407, + "auxiliary_loss_mlp": 0.01043852, + "balance_loss_clip": 1.05053949, + "balance_loss_mlp": 1.02756834, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 1.8998834338634285, + "language_loss": 0.72219217, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.74377137, + "num_input_tokens_seen": 115787340, + "step": 5391, + "time_per_iteration": 3.98945689201355 + }, + { + "auxiliary_loss_clip": 0.01110751, + "auxiliary_loss_mlp": 0.01040137, + "balance_loss_clip": 1.04536223, + "balance_loss_mlp": 1.02558124, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.5534688630375808, + "language_loss": 0.77007425, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79158312, + "num_input_tokens_seen": 115805565, + "step": 5392, + "time_per_iteration": 2.4820613861083984 + }, + { + "auxiliary_loss_clip": 0.01110891, + "auxiliary_loss_mlp": 0.01044459, + "balance_loss_clip": 1.04519081, + "balance_loss_mlp": 1.02703059, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 1.9744922840584656, + "language_loss": 0.62659508, + "learning_rate": 3.158459696652067e-06, + "loss": 0.64814854, + "num_input_tokens_seen": 115826725, + "step": 5393, + "time_per_iteration": 2.5749473571777344 + }, + { + "auxiliary_loss_clip": 0.01119877, + "auxiliary_loss_mlp": 0.01037691, + "balance_loss_clip": 1.04814935, + "balance_loss_mlp": 1.02237225, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.6638043622219085, + "language_loss": 0.82505369, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84662938, + "num_input_tokens_seen": 115846955, + "step": 5394, + "time_per_iteration": 2.490767002105713 + }, + { + "auxiliary_loss_clip": 0.01108542, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.0459578, + "balance_loss_mlp": 1.02558804, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 2.2947191925419737, + "language_loss": 0.81636441, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83784193, + "num_input_tokens_seen": 115865975, + "step": 5395, + "time_per_iteration": 2.532390832901001 + }, + { + "auxiliary_loss_clip": 0.01127165, + "auxiliary_loss_mlp": 0.01039706, + "balance_loss_clip": 1.0538578, + "balance_loss_mlp": 1.0257349, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 2.3975203783126586, + "language_loss": 0.83546144, + "learning_rate": 3.157507073287417e-06, + "loss": 0.85713017, + "num_input_tokens_seen": 115884950, + "step": 5396, + "time_per_iteration": 2.491819381713867 + }, + { + "auxiliary_loss_clip": 0.01105369, + "auxiliary_loss_mlp": 0.01046915, + "balance_loss_clip": 1.05090189, + "balance_loss_mlp": 1.0300231, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 2.298240730508893, + "language_loss": 0.7634424, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.78496528, + "num_input_tokens_seen": 115904170, + "step": 5397, + "time_per_iteration": 2.5344350337982178 + }, + { + "auxiliary_loss_clip": 0.01106413, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.05317855, + "balance_loss_mlp": 1.01973176, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.6680060704373583, + "language_loss": 0.67431164, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.69571817, + "num_input_tokens_seen": 115919255, + "step": 5398, + "time_per_iteration": 2.5329957008361816 + }, + { + "auxiliary_loss_clip": 0.01107446, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.04783463, + "balance_loss_mlp": 1.01728845, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.5517129039420006, + "language_loss": 0.72650838, + "learning_rate": 3.156554054887718e-06, + "loss": 0.74790025, + "num_input_tokens_seen": 115938535, + "step": 5399, + "time_per_iteration": 2.50925612449646 + }, + { + "auxiliary_loss_clip": 0.0109894, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.04484582, + "balance_loss_mlp": 1.0164578, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.287135072147827, + "language_loss": 0.71207154, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73337257, + "num_input_tokens_seen": 115955005, + "step": 5400, + "time_per_iteration": 2.540844440460205 + }, + { + "auxiliary_loss_clip": 0.01125588, + "auxiliary_loss_mlp": 0.01034241, + "balance_loss_clip": 1.04657531, + "balance_loss_mlp": 1.01973271, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 2.174457894694914, + "language_loss": 0.79844362, + "learning_rate": 3.155918489984614e-06, + "loss": 0.82004189, + "num_input_tokens_seen": 115975305, + "step": 5401, + "time_per_iteration": 2.5497329235076904 + }, + { + "auxiliary_loss_clip": 0.01106659, + "auxiliary_loss_mlp": 0.01042067, + "balance_loss_clip": 1.04112053, + "balance_loss_mlp": 1.02523494, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.5931840023413868, + "language_loss": 0.87344986, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.8949371, + "num_input_tokens_seen": 115994810, + "step": 5402, + "time_per_iteration": 2.505237102508545 + }, + { + "auxiliary_loss_clip": 0.01081711, + "auxiliary_loss_mlp": 0.01041273, + "balance_loss_clip": 1.04151118, + "balance_loss_mlp": 1.02559757, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 1.905434293807769, + "language_loss": 0.84690225, + "learning_rate": 3.155282749751332e-06, + "loss": 0.86813205, + "num_input_tokens_seen": 116011095, + "step": 5403, + "time_per_iteration": 2.547393321990967 + }, + { + "auxiliary_loss_clip": 0.01105441, + "auxiliary_loss_mlp": 0.01043071, + "balance_loss_clip": 1.04681909, + "balance_loss_mlp": 1.02907538, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 1.9556548320796157, + "language_loss": 0.88041073, + "learning_rate": 3.154964813916007e-06, + "loss": 0.90189582, + "num_input_tokens_seen": 116028805, + "step": 5404, + "time_per_iteration": 4.022004842758179 + }, + { + "auxiliary_loss_clip": 0.01122027, + "auxiliary_loss_mlp": 0.01040438, + "balance_loss_clip": 1.04869258, + "balance_loss_mlp": 1.02533412, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.6566937690653452, + "language_loss": 0.7300899, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.75171459, + "num_input_tokens_seen": 116047765, + "step": 5405, + "time_per_iteration": 2.5170795917510986 + }, + { + "auxiliary_loss_clip": 0.0109429, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.04889917, + "balance_loss_mlp": 1.02453148, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.8089304333941418, + "language_loss": 0.83628666, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85762066, + "num_input_tokens_seen": 116068385, + "step": 5406, + "time_per_iteration": 2.550564765930176 + }, + { + "auxiliary_loss_clip": 0.01135552, + "auxiliary_loss_mlp": 0.01031648, + "balance_loss_clip": 1.05093169, + "balance_loss_mlp": 1.01773667, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 1.8022789541792457, + "language_loss": 0.8760891, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.89776105, + "num_input_tokens_seen": 116085350, + "step": 5407, + "time_per_iteration": 2.4099559783935547 + }, + { + "auxiliary_loss_clip": 0.01110287, + "auxiliary_loss_mlp": 0.01036223, + "balance_loss_clip": 1.04759574, + "balance_loss_mlp": 1.02184689, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.619391783119665, + "language_loss": 0.69695795, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71842313, + "num_input_tokens_seen": 116107560, + "step": 5408, + "time_per_iteration": 2.565514087677002 + }, + { + "auxiliary_loss_clip": 0.01127562, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.04823315, + "balance_loss_mlp": 1.01569414, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 2.0603202386193815, + "language_loss": 0.77735174, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79893374, + "num_input_tokens_seen": 116125980, + "step": 5409, + "time_per_iteration": 2.4476706981658936 + }, + { + "auxiliary_loss_clip": 0.0108093, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.04172325, + "balance_loss_mlp": 1.02930462, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 1.8059773094538853, + "language_loss": 0.83123541, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.8524797, + "num_input_tokens_seen": 116146530, + "step": 5410, + "time_per_iteration": 2.6641342639923096 + }, + { + "auxiliary_loss_clip": 0.01083021, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.04586673, + "balance_loss_mlp": 1.01775289, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.747322650445127, + "language_loss": 0.71510112, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73624855, + "num_input_tokens_seen": 116165695, + "step": 5411, + "time_per_iteration": 2.572136878967285 + }, + { + "auxiliary_loss_clip": 0.01081677, + "auxiliary_loss_mlp": 0.01036275, + "balance_loss_clip": 1.04388463, + "balance_loss_mlp": 1.02239919, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.5880251373465157, + "language_loss": 0.83022779, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85140729, + "num_input_tokens_seen": 116185375, + "step": 5412, + "time_per_iteration": 2.61547589302063 + }, + { + "auxiliary_loss_clip": 0.01105882, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.04807436, + "balance_loss_mlp": 1.0182941, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 1.8029687619269694, + "language_loss": 0.80759823, + "learning_rate": 3.152101422008203e-06, + "loss": 0.82898879, + "num_input_tokens_seen": 116204335, + "step": 5413, + "time_per_iteration": 2.574319362640381 + }, + { + "auxiliary_loss_clip": 0.01112166, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.04856229, + "balance_loss_mlp": 1.02012539, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 1.6493632998607108, + "language_loss": 0.76942307, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79089856, + "num_input_tokens_seen": 116222840, + "step": 5414, + "time_per_iteration": 2.5355844497680664 + }, + { + "auxiliary_loss_clip": 0.01041697, + "auxiliary_loss_mlp": 0.01001085, + "balance_loss_clip": 1.03747892, + "balance_loss_mlp": 0.99920195, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9050773800754249, + "language_loss": 0.63981289, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66024071, + "num_input_tokens_seen": 116274940, + "step": 5415, + "time_per_iteration": 3.0410914421081543 + }, + { + "auxiliary_loss_clip": 0.01088787, + "auxiliary_loss_mlp": 0.0103488, + "balance_loss_clip": 1.04057062, + "balance_loss_mlp": 1.01971626, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 1.8257082070795754, + "language_loss": 0.74202549, + "learning_rate": 3.151146171224075e-06, + "loss": 0.76326215, + "num_input_tokens_seen": 116297300, + "step": 5416, + "time_per_iteration": 2.586961030960083 + }, + { + "auxiliary_loss_clip": 0.01067767, + "auxiliary_loss_mlp": 0.01000587, + "balance_loss_clip": 1.03656769, + "balance_loss_mlp": 0.99878687, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7789692600483143, + "language_loss": 0.57973826, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.60042185, + "num_input_tokens_seen": 116362370, + "step": 5417, + "time_per_iteration": 3.118800640106201 + }, + { + "auxiliary_loss_clip": 0.01043865, + "auxiliary_loss_mlp": 0.01004853, + "balance_loss_clip": 1.02999306, + "balance_loss_mlp": 1.0031842, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.8239013058608944, + "language_loss": 0.63702649, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65751368, + "num_input_tokens_seen": 116430365, + "step": 5418, + "time_per_iteration": 3.177943706512451 + }, + { + "auxiliary_loss_clip": 0.01106172, + "auxiliary_loss_mlp": 0.01040902, + "balance_loss_clip": 1.0487051, + "balance_loss_mlp": 1.02718723, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 2.012828530715378, + "language_loss": 0.69330418, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71477497, + "num_input_tokens_seen": 116447525, + "step": 5419, + "time_per_iteration": 2.483708381652832 + }, + { + "auxiliary_loss_clip": 0.01123486, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.04810953, + "balance_loss_mlp": 1.02127481, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 1.6523877723239215, + "language_loss": 0.7698313, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79142493, + "num_input_tokens_seen": 116466310, + "step": 5420, + "time_per_iteration": 2.477461576461792 + }, + { + "auxiliary_loss_clip": 0.01121207, + "auxiliary_loss_mlp": 0.00781426, + "balance_loss_clip": 1.04386175, + "balance_loss_mlp": 1.00074863, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.5991360748963201, + "language_loss": 0.8044796, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82350594, + "num_input_tokens_seen": 116487825, + "step": 5421, + "time_per_iteration": 2.532094955444336 + }, + { + "auxiliary_loss_clip": 0.01130067, + "auxiliary_loss_mlp": 0.01041005, + "balance_loss_clip": 1.0473634, + "balance_loss_mlp": 1.027385, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.7609665086939514, + "language_loss": 0.75326329, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77497399, + "num_input_tokens_seen": 116509950, + "step": 5422, + "time_per_iteration": 3.9323930740356445 + }, + { + "auxiliary_loss_clip": 0.01101899, + "auxiliary_loss_mlp": 0.00780974, + "balance_loss_clip": 1.0448575, + "balance_loss_mlp": 1.0006423, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 2.022394791492307, + "language_loss": 0.62747419, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64630294, + "num_input_tokens_seen": 116527695, + "step": 5423, + "time_per_iteration": 2.5020103454589844 + }, + { + "auxiliary_loss_clip": 0.01098948, + "auxiliary_loss_mlp": 0.01034228, + "balance_loss_clip": 1.04308474, + "balance_loss_mlp": 1.02134776, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 1.8982981488009232, + "language_loss": 0.74697089, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76830262, + "num_input_tokens_seen": 116547800, + "step": 5424, + "time_per_iteration": 2.542133331298828 + }, + { + "auxiliary_loss_clip": 0.01105197, + "auxiliary_loss_mlp": 0.01043916, + "balance_loss_clip": 1.04782546, + "balance_loss_mlp": 1.03056407, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.6434254504556571, + "language_loss": 0.76759732, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.78908855, + "num_input_tokens_seen": 116568460, + "step": 5425, + "time_per_iteration": 2.537121534347534 + }, + { + "auxiliary_loss_clip": 0.01102636, + "auxiliary_loss_mlp": 0.01039785, + "balance_loss_clip": 1.04394603, + "balance_loss_mlp": 1.02407312, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 2.253777073831091, + "language_loss": 0.77752185, + "learning_rate": 3.147959166423428e-06, + "loss": 0.79894602, + "num_input_tokens_seen": 116588705, + "step": 5426, + "time_per_iteration": 4.1702492237091064 + }, + { + "auxiliary_loss_clip": 0.0108766, + "auxiliary_loss_mlp": 0.01041082, + "balance_loss_clip": 1.04084086, + "balance_loss_mlp": 1.02564406, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.7107847200284207, + "language_loss": 0.73981774, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76110512, + "num_input_tokens_seen": 116608845, + "step": 5427, + "time_per_iteration": 2.579777717590332 + }, + { + "auxiliary_loss_clip": 0.01100225, + "auxiliary_loss_mlp": 0.01043497, + "balance_loss_clip": 1.04066706, + "balance_loss_mlp": 1.02796423, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 1.7548353675742183, + "language_loss": 0.79220378, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.81364101, + "num_input_tokens_seen": 116628145, + "step": 5428, + "time_per_iteration": 2.5324480533599854 + }, + { + "auxiliary_loss_clip": 0.01120183, + "auxiliary_loss_mlp": 0.01041848, + "balance_loss_clip": 1.04467607, + "balance_loss_mlp": 1.02763796, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.7272322601061223, + "language_loss": 0.71120369, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73282397, + "num_input_tokens_seen": 116646920, + "step": 5429, + "time_per_iteration": 2.5104901790618896 + }, + { + "auxiliary_loss_clip": 0.01097708, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.04505134, + "balance_loss_mlp": 1.02701831, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.7791515111532457, + "language_loss": 0.78662241, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80800807, + "num_input_tokens_seen": 116665100, + "step": 5430, + "time_per_iteration": 2.5422914028167725 + }, + { + "auxiliary_loss_clip": 0.01080745, + "auxiliary_loss_mlp": 0.01041534, + "balance_loss_clip": 1.04404902, + "balance_loss_mlp": 1.02544022, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 2.8166784672055893, + "language_loss": 0.84815907, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86938184, + "num_input_tokens_seen": 116682205, + "step": 5431, + "time_per_iteration": 3.9664835929870605 + }, + { + "auxiliary_loss_clip": 0.01114605, + "auxiliary_loss_mlp": 0.01036354, + "balance_loss_clip": 1.04489589, + "balance_loss_mlp": 1.02264476, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.8042005797531016, + "language_loss": 0.70335257, + "learning_rate": 3.146044873294678e-06, + "loss": 0.7248621, + "num_input_tokens_seen": 116702575, + "step": 5432, + "time_per_iteration": 2.5217292308807373 + }, + { + "auxiliary_loss_clip": 0.01072665, + "auxiliary_loss_mlp": 0.01041146, + "balance_loss_clip": 1.03668272, + "balance_loss_mlp": 1.02531505, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 2.142631341593269, + "language_loss": 0.84231758, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86345565, + "num_input_tokens_seen": 116720885, + "step": 5433, + "time_per_iteration": 2.5617218017578125 + }, + { + "auxiliary_loss_clip": 0.01108012, + "auxiliary_loss_mlp": 0.01035293, + "balance_loss_clip": 1.04525948, + "balance_loss_mlp": 1.02097011, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.5437507789117946, + "language_loss": 0.85745323, + "learning_rate": 3.145406427790931e-06, + "loss": 0.87888634, + "num_input_tokens_seen": 116740395, + "step": 5434, + "time_per_iteration": 2.552183151245117 + }, + { + "auxiliary_loss_clip": 0.01114222, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.04400873, + "balance_loss_mlp": 1.02144635, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.768757625344028, + "language_loss": 0.87635303, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.89786589, + "num_input_tokens_seen": 116758870, + "step": 5435, + "time_per_iteration": 2.577071189880371 + }, + { + "auxiliary_loss_clip": 0.01130827, + "auxiliary_loss_mlp": 0.01035897, + "balance_loss_clip": 1.04541337, + "balance_loss_mlp": 1.02150846, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.6067393419160094, + "language_loss": 0.7665047, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78817195, + "num_input_tokens_seen": 116773440, + "step": 5436, + "time_per_iteration": 2.4970548152923584 + }, + { + "auxiliary_loss_clip": 0.01130715, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.04587507, + "balance_loss_mlp": 1.02255082, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.6464166192593142, + "language_loss": 0.72062027, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74229705, + "num_input_tokens_seen": 116794375, + "step": 5437, + "time_per_iteration": 2.5020394325256348 + }, + { + "auxiliary_loss_clip": 0.01095539, + "auxiliary_loss_mlp": 0.010395, + "balance_loss_clip": 1.04142356, + "balance_loss_mlp": 1.02236986, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 2.50078509405046, + "language_loss": 0.64050484, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66185522, + "num_input_tokens_seen": 116815095, + "step": 5438, + "time_per_iteration": 2.571873426437378 + }, + { + "auxiliary_loss_clip": 0.01121044, + "auxiliary_loss_mlp": 0.01036202, + "balance_loss_clip": 1.04631555, + "balance_loss_mlp": 1.02132499, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 1.7368302551754082, + "language_loss": 0.74516946, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76674193, + "num_input_tokens_seen": 116836630, + "step": 5439, + "time_per_iteration": 2.513540029525757 + }, + { + "auxiliary_loss_clip": 0.01124106, + "auxiliary_loss_mlp": 0.0104563, + "balance_loss_clip": 1.0472995, + "balance_loss_mlp": 1.02998352, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 3.2372799494722133, + "language_loss": 0.74852884, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77022618, + "num_input_tokens_seen": 116856880, + "step": 5440, + "time_per_iteration": 2.5001354217529297 + }, + { + "auxiliary_loss_clip": 0.0111722, + "auxiliary_loss_mlp": 0.00782167, + "balance_loss_clip": 1.04304874, + "balance_loss_mlp": 1.00064611, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 2.267290124326973, + "language_loss": 0.84743023, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86642408, + "num_input_tokens_seen": 116873770, + "step": 5441, + "time_per_iteration": 2.45809268951416 + }, + { + "auxiliary_loss_clip": 0.01120067, + "auxiliary_loss_mlp": 0.01039155, + "balance_loss_clip": 1.04215693, + "balance_loss_mlp": 1.02316904, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 2.769747688980541, + "language_loss": 0.87022132, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.89181352, + "num_input_tokens_seen": 116891225, + "step": 5442, + "time_per_iteration": 2.467672824859619 + }, + { + "auxiliary_loss_clip": 0.01100535, + "auxiliary_loss_mlp": 0.01038985, + "balance_loss_clip": 1.04306793, + "balance_loss_mlp": 1.02212906, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.715257145920322, + "language_loss": 0.77237958, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79377478, + "num_input_tokens_seen": 116912300, + "step": 5443, + "time_per_iteration": 2.536341667175293 + }, + { + "auxiliary_loss_clip": 0.01100473, + "auxiliary_loss_mlp": 0.00781536, + "balance_loss_clip": 1.0420897, + "balance_loss_mlp": 1.0007689, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.2933732838386556, + "language_loss": 0.81823575, + "learning_rate": 3.142211596174343e-06, + "loss": 0.8370558, + "num_input_tokens_seen": 116929425, + "step": 5444, + "time_per_iteration": 4.0678558349609375 + }, + { + "auxiliary_loss_clip": 0.01093276, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.04503214, + "balance_loss_mlp": 1.0202111, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.935792722386716, + "language_loss": 0.59515667, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61645114, + "num_input_tokens_seen": 116948255, + "step": 5445, + "time_per_iteration": 2.572235345840454 + }, + { + "auxiliary_loss_clip": 0.01124209, + "auxiliary_loss_mlp": 0.01040581, + "balance_loss_clip": 1.04692125, + "balance_loss_mlp": 1.02482748, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.2394273807442318, + "language_loss": 0.8816328, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90328068, + "num_input_tokens_seen": 116964905, + "step": 5446, + "time_per_iteration": 2.4502675533294678 + }, + { + "auxiliary_loss_clip": 0.01125946, + "auxiliary_loss_mlp": 0.01046824, + "balance_loss_clip": 1.04981172, + "balance_loss_mlp": 1.02877593, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.7104985350036466, + "language_loss": 0.78822827, + "learning_rate": 3.141252301538802e-06, + "loss": 0.80995595, + "num_input_tokens_seen": 116983650, + "step": 5447, + "time_per_iteration": 2.541890859603882 + }, + { + "auxiliary_loss_clip": 0.01102751, + "auxiliary_loss_mlp": 0.00781992, + "balance_loss_clip": 1.04093623, + "balance_loss_mlp": 1.00071549, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 2.0036361230753683, + "language_loss": 0.73656672, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75541419, + "num_input_tokens_seen": 117003265, + "step": 5448, + "time_per_iteration": 2.5162792205810547 + }, + { + "auxiliary_loss_clip": 0.01136184, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.0481503, + "balance_loss_mlp": 1.02788198, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.5348042700183082, + "language_loss": 0.66954124, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69134533, + "num_input_tokens_seen": 117025370, + "step": 5449, + "time_per_iteration": 2.5066559314727783 + }, + { + "auxiliary_loss_clip": 0.01103021, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.04637539, + "balance_loss_mlp": 1.01896024, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 2.868431640543321, + "language_loss": 0.65153503, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.6729027, + "num_input_tokens_seen": 117044350, + "step": 5450, + "time_per_iteration": 2.591703176498413 + }, + { + "auxiliary_loss_clip": 0.01123805, + "auxiliary_loss_mlp": 0.01039851, + "balance_loss_clip": 1.0458833, + "balance_loss_mlp": 1.02425241, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.6944958834967123, + "language_loss": 0.77758372, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.79922032, + "num_input_tokens_seen": 117064450, + "step": 5451, + "time_per_iteration": 2.497929096221924 + }, + { + "auxiliary_loss_clip": 0.01126536, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_clip": 1.04825628, + "balance_loss_mlp": 1.02794421, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.502025321580886, + "language_loss": 0.70854491, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.73025918, + "num_input_tokens_seen": 117083060, + "step": 5452, + "time_per_iteration": 2.509033203125 + }, + { + "auxiliary_loss_clip": 0.0111143, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.04558897, + "balance_loss_mlp": 1.01740348, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.6870512034607215, + "language_loss": 0.7888118, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.8102507, + "num_input_tokens_seen": 117101860, + "step": 5453, + "time_per_iteration": 2.5867934226989746 + }, + { + "auxiliary_loss_clip": 0.01127422, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.04765081, + "balance_loss_mlp": 1.02030993, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 2.230713239493001, + "language_loss": 0.74825585, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.76988494, + "num_input_tokens_seen": 117123100, + "step": 5454, + "time_per_iteration": 2.531569242477417 + }, + { + "auxiliary_loss_clip": 0.01072493, + "auxiliary_loss_mlp": 0.01041221, + "balance_loss_clip": 1.03616595, + "balance_loss_mlp": 1.02659452, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 2.5587043001341447, + "language_loss": 0.77179229, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79292941, + "num_input_tokens_seen": 117140515, + "step": 5455, + "time_per_iteration": 2.5463650226593018 + }, + { + "auxiliary_loss_clip": 0.01129712, + "auxiliary_loss_mlp": 0.01041213, + "balance_loss_clip": 1.04924989, + "balance_loss_mlp": 1.02472687, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.9159332067716939, + "language_loss": 0.73754728, + "learning_rate": 3.138372082016768e-06, + "loss": 0.7592566, + "num_input_tokens_seen": 117161485, + "step": 5456, + "time_per_iteration": 2.5561535358428955 + }, + { + "auxiliary_loss_clip": 0.01135891, + "auxiliary_loss_mlp": 0.01046682, + "balance_loss_clip": 1.04747605, + "balance_loss_mlp": 1.03108978, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.460548394132377, + "language_loss": 0.78235495, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80418068, + "num_input_tokens_seen": 117181870, + "step": 5457, + "time_per_iteration": 2.455064535140991 + }, + { + "auxiliary_loss_clip": 0.01101581, + "auxiliary_loss_mlp": 0.01037511, + "balance_loss_clip": 1.04396522, + "balance_loss_mlp": 1.02190661, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 16.41518667571018, + "language_loss": 0.78627396, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.80766487, + "num_input_tokens_seen": 117201380, + "step": 5458, + "time_per_iteration": 2.5489425659179688 + }, + { + "auxiliary_loss_clip": 0.01117415, + "auxiliary_loss_mlp": 0.0103627, + "balance_loss_clip": 1.04709339, + "balance_loss_mlp": 1.02041531, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 3.029285785283382, + "language_loss": 0.73193669, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75347358, + "num_input_tokens_seen": 117221040, + "step": 5459, + "time_per_iteration": 2.4761054515838623 + }, + { + "auxiliary_loss_clip": 0.01117802, + "auxiliary_loss_mlp": 0.01037808, + "balance_loss_clip": 1.04963064, + "balance_loss_mlp": 1.02291322, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.902554340043946, + "language_loss": 0.84196281, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86351883, + "num_input_tokens_seen": 117241395, + "step": 5460, + "time_per_iteration": 2.588724136352539 + }, + { + "auxiliary_loss_clip": 0.01134046, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.04524517, + "balance_loss_mlp": 1.02228856, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 2.235294551163021, + "language_loss": 0.77023679, + "learning_rate": 3.136770448642288e-06, + "loss": 0.79194689, + "num_input_tokens_seen": 117259340, + "step": 5461, + "time_per_iteration": 3.9339139461517334 + }, + { + "auxiliary_loss_clip": 0.01121332, + "auxiliary_loss_mlp": 0.01039941, + "balance_loss_clip": 1.04692483, + "balance_loss_mlp": 1.02173805, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 2.0234548357765068, + "language_loss": 0.63132745, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65294015, + "num_input_tokens_seen": 117282375, + "step": 5462, + "time_per_iteration": 2.616425037384033 + }, + { + "auxiliary_loss_clip": 0.01132631, + "auxiliary_loss_mlp": 0.00781374, + "balance_loss_clip": 1.04667759, + "balance_loss_mlp": 1.00084615, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 1.422586502568173, + "language_loss": 0.78194827, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80108833, + "num_input_tokens_seen": 117303830, + "step": 5463, + "time_per_iteration": 2.48748517036438 + }, + { + "auxiliary_loss_clip": 0.01108177, + "auxiliary_loss_mlp": 0.01040229, + "balance_loss_clip": 1.0474726, + "balance_loss_mlp": 1.02477908, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 1.989351193098673, + "language_loss": 0.69998038, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72146446, + "num_input_tokens_seen": 117320665, + "step": 5464, + "time_per_iteration": 2.5224530696868896 + }, + { + "auxiliary_loss_clip": 0.01126103, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.0501157, + "balance_loss_mlp": 1.02466178, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.9545567016089878, + "language_loss": 0.72402906, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74569166, + "num_input_tokens_seen": 117339795, + "step": 5465, + "time_per_iteration": 3.9870707988739014 + }, + { + "auxiliary_loss_clip": 0.01111557, + "auxiliary_loss_mlp": 0.0105117, + "balance_loss_clip": 1.0462482, + "balance_loss_mlp": 1.03433776, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.4985781565752558, + "language_loss": 0.83127987, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.85290718, + "num_input_tokens_seen": 117359525, + "step": 5466, + "time_per_iteration": 2.520693302154541 + }, + { + "auxiliary_loss_clip": 0.01112753, + "auxiliary_loss_mlp": 0.01039314, + "balance_loss_clip": 1.04511201, + "balance_loss_mlp": 1.02483594, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 2.237333026206424, + "language_loss": 0.79485863, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81637931, + "num_input_tokens_seen": 117380320, + "step": 5467, + "time_per_iteration": 2.5544135570526123 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01037303, + "balance_loss_clip": 1.04531229, + "balance_loss_mlp": 1.02205598, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 2.011739714165676, + "language_loss": 0.74705774, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76854956, + "num_input_tokens_seen": 117400695, + "step": 5468, + "time_per_iteration": 2.564411163330078 + }, + { + "auxiliary_loss_clip": 0.01113058, + "auxiliary_loss_mlp": 0.01040435, + "balance_loss_clip": 1.04461217, + "balance_loss_mlp": 1.02293539, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.9581726521699507, + "language_loss": 0.78557682, + "learning_rate": 3.134205594339942e-06, + "loss": 0.80711174, + "num_input_tokens_seen": 117418800, + "step": 5469, + "time_per_iteration": 2.49562668800354 + }, + { + "auxiliary_loss_clip": 0.01104448, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.04597199, + "balance_loss_mlp": 1.02247286, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.9929676254170292, + "language_loss": 0.81594473, + "learning_rate": 3.133884793883107e-06, + "loss": 0.83736289, + "num_input_tokens_seen": 117438220, + "step": 5470, + "time_per_iteration": 3.9554443359375 + }, + { + "auxiliary_loss_clip": 0.01138424, + "auxiliary_loss_mlp": 0.01043712, + "balance_loss_clip": 1.04743767, + "balance_loss_mlp": 1.02766597, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 2.078775880847211, + "language_loss": 0.68066019, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.70248151, + "num_input_tokens_seen": 117462560, + "step": 5471, + "time_per_iteration": 2.719928503036499 + }, + { + "auxiliary_loss_clip": 0.01141978, + "auxiliary_loss_mlp": 0.01046229, + "balance_loss_clip": 1.04937708, + "balance_loss_mlp": 1.02841902, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 1.6283952670546744, + "language_loss": 0.64703798, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.66892004, + "num_input_tokens_seen": 117483665, + "step": 5472, + "time_per_iteration": 2.569080114364624 + }, + { + "auxiliary_loss_clip": 0.01125789, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_clip": 1.0487268, + "balance_loss_mlp": 1.03527749, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.8344562156713669, + "language_loss": 0.88350111, + "learning_rate": 3.13292213457912e-06, + "loss": 0.90528119, + "num_input_tokens_seen": 117503565, + "step": 5473, + "time_per_iteration": 2.4841134548187256 + }, + { + "auxiliary_loss_clip": 0.0110432, + "auxiliary_loss_mlp": 0.01044346, + "balance_loss_clip": 1.0455004, + "balance_loss_mlp": 1.02690601, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 1.6444329824339143, + "language_loss": 0.77980542, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80129206, + "num_input_tokens_seen": 117521460, + "step": 5474, + "time_per_iteration": 2.5707497596740723 + }, + { + "auxiliary_loss_clip": 0.01035545, + "auxiliary_loss_mlp": 0.01014005, + "balance_loss_clip": 1.02470517, + "balance_loss_mlp": 1.01213384, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.8119544435889754, + "language_loss": 0.60185736, + "learning_rate": 3.132280146886911e-06, + "loss": 0.6223529, + "num_input_tokens_seen": 117580550, + "step": 5475, + "time_per_iteration": 3.096194267272949 + }, + { + "auxiliary_loss_clip": 0.01092182, + "auxiliary_loss_mlp": 0.01065116, + "balance_loss_clip": 1.040411, + "balance_loss_mlp": 1.04380167, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 4.081183292960252, + "language_loss": 0.76944637, + "learning_rate": 3.131959088630455e-06, + "loss": 0.79101944, + "num_input_tokens_seen": 117600645, + "step": 5476, + "time_per_iteration": 2.615300416946411 + }, + { + "auxiliary_loss_clip": 0.01100425, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_clip": 1.04406166, + "balance_loss_mlp": 1.02535319, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 2.837924819911592, + "language_loss": 0.74722922, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76864505, + "num_input_tokens_seen": 117618880, + "step": 5477, + "time_per_iteration": 2.5586161613464355 + }, + { + "auxiliary_loss_clip": 0.01132321, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.04693091, + "balance_loss_mlp": 1.02432823, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 2.498137255511663, + "language_loss": 0.76099575, + "learning_rate": 3.131316843357713e-06, + "loss": 0.78270674, + "num_input_tokens_seen": 117636445, + "step": 5478, + "time_per_iteration": 2.4741883277893066 + }, + { + "auxiliary_loss_clip": 0.0112543, + "auxiliary_loss_mlp": 0.01037384, + "balance_loss_clip": 1.0461812, + "balance_loss_mlp": 1.02179146, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 2.1475912005591167, + "language_loss": 0.80472201, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82635015, + "num_input_tokens_seen": 117653105, + "step": 5479, + "time_per_iteration": 2.46291184425354 + }, + { + "auxiliary_loss_clip": 0.01038234, + "auxiliary_loss_mlp": 0.01002091, + "balance_loss_clip": 1.02551866, + "balance_loss_mlp": 1.00021923, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7499437876932878, + "language_loss": 0.56512427, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58552754, + "num_input_tokens_seen": 117719225, + "step": 5480, + "time_per_iteration": 3.1421711444854736 + }, + { + "auxiliary_loss_clip": 0.01124296, + "auxiliary_loss_mlp": 0.00783146, + "balance_loss_clip": 1.04526711, + "balance_loss_mlp": 1.00109148, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 2.76428664528919, + "language_loss": 0.77445966, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.7935341, + "num_input_tokens_seen": 117738725, + "step": 5481, + "time_per_iteration": 2.505162239074707 + }, + { + "auxiliary_loss_clip": 0.01117002, + "auxiliary_loss_mlp": 0.01038068, + "balance_loss_clip": 1.04866946, + "balance_loss_mlp": 1.02286887, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.6657043715169322, + "language_loss": 0.7839601, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80551088, + "num_input_tokens_seen": 117757765, + "step": 5482, + "time_per_iteration": 2.619135856628418 + }, + { + "auxiliary_loss_clip": 0.01129196, + "auxiliary_loss_mlp": 0.01037528, + "balance_loss_clip": 1.04626656, + "balance_loss_mlp": 1.02109504, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 1.8581236984587717, + "language_loss": 0.73706305, + "learning_rate": 3.129710479645185e-06, + "loss": 0.75873029, + "num_input_tokens_seen": 117776810, + "step": 5483, + "time_per_iteration": 4.009558439254761 + }, + { + "auxiliary_loss_clip": 0.011201, + "auxiliary_loss_mlp": 0.01043578, + "balance_loss_clip": 1.04506338, + "balance_loss_mlp": 1.02762794, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 1.874603751337511, + "language_loss": 0.75742924, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77906609, + "num_input_tokens_seen": 117797730, + "step": 5484, + "time_per_iteration": 2.557971954345703 + }, + { + "auxiliary_loss_clip": 0.01137382, + "auxiliary_loss_mlp": 0.01040218, + "balance_loss_clip": 1.04956746, + "balance_loss_mlp": 1.0240649, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 2.305197464920171, + "language_loss": 0.71612287, + "learning_rate": 3.129067634203742e-06, + "loss": 0.73789883, + "num_input_tokens_seen": 117815365, + "step": 5485, + "time_per_iteration": 2.4343597888946533 + }, + { + "auxiliary_loss_clip": 0.01081096, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.04642737, + "balance_loss_mlp": 1.02551115, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.6021138353815327, + "language_loss": 0.80264544, + "learning_rate": 3.128746147255388e-06, + "loss": 0.82386005, + "num_input_tokens_seen": 117836095, + "step": 5486, + "time_per_iteration": 2.698591947555542 + }, + { + "auxiliary_loss_clip": 0.01104215, + "auxiliary_loss_mlp": 0.01041322, + "balance_loss_clip": 1.04168367, + "balance_loss_mlp": 1.02470386, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 2.3191308098111043, + "language_loss": 0.84020764, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86166298, + "num_input_tokens_seen": 117854655, + "step": 5487, + "time_per_iteration": 2.5239646434783936 + }, + { + "auxiliary_loss_clip": 0.01090395, + "auxiliary_loss_mlp": 0.01042523, + "balance_loss_clip": 1.04251075, + "balance_loss_mlp": 1.02509475, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 3.7128801843419534, + "language_loss": 0.74272645, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76405573, + "num_input_tokens_seen": 117873300, + "step": 5488, + "time_per_iteration": 2.5777342319488525 + }, + { + "auxiliary_loss_clip": 0.01137335, + "auxiliary_loss_mlp": 0.01040462, + "balance_loss_clip": 1.0483768, + "balance_loss_mlp": 1.02434444, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 3.2858042641974183, + "language_loss": 0.72578186, + "learning_rate": 3.127781429646098e-06, + "loss": 0.74755979, + "num_input_tokens_seen": 117891540, + "step": 5489, + "time_per_iteration": 2.4230685234069824 + }, + { + "auxiliary_loss_clip": 0.01132013, + "auxiliary_loss_mlp": 0.01035233, + "balance_loss_clip": 1.04391098, + "balance_loss_mlp": 1.01966453, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 3.477230589270236, + "language_loss": 0.88839251, + "learning_rate": 3.127459771562238e-06, + "loss": 0.91006494, + "num_input_tokens_seen": 117907690, + "step": 5490, + "time_per_iteration": 2.449683427810669 + }, + { + "auxiliary_loss_clip": 0.01124369, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.04555178, + "balance_loss_mlp": 1.01870978, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 2.202071979289657, + "language_loss": 0.83209634, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85368049, + "num_input_tokens_seen": 117925640, + "step": 5491, + "time_per_iteration": 2.4523377418518066 + }, + { + "auxiliary_loss_clip": 0.01111315, + "auxiliary_loss_mlp": 0.01040639, + "balance_loss_clip": 1.0481112, + "balance_loss_mlp": 1.0242002, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 1.798673947796688, + "language_loss": 0.77956605, + "learning_rate": 3.126816327146554e-06, + "loss": 0.80108559, + "num_input_tokens_seen": 117944525, + "step": 5492, + "time_per_iteration": 2.540423631668091 + }, + { + "auxiliary_loss_clip": 0.01142411, + "auxiliary_loss_mlp": 0.01044326, + "balance_loss_clip": 1.05117893, + "balance_loss_mlp": 1.02720797, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 10.024904948031331, + "language_loss": 0.74904478, + "learning_rate": 3.12649454083913e-06, + "loss": 0.77091217, + "num_input_tokens_seen": 117962515, + "step": 5493, + "time_per_iteration": 2.4263994693756104 + }, + { + "auxiliary_loss_clip": 0.01011026, + "auxiliary_loss_mlp": 0.01009529, + "balance_loss_clip": 1.02631354, + "balance_loss_mlp": 1.00782454, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7869196156147311, + "language_loss": 0.53901649, + "learning_rate": 3.12617271181492e-06, + "loss": 0.5592221, + "num_input_tokens_seen": 118018780, + "step": 5494, + "time_per_iteration": 3.0994327068328857 + }, + { + "auxiliary_loss_clip": 0.01116078, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.04556417, + "balance_loss_mlp": 1.02369356, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.5486280651862505, + "language_loss": 0.86860323, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89016294, + "num_input_tokens_seen": 118038610, + "step": 5495, + "time_per_iteration": 2.5224978923797607 + }, + { + "auxiliary_loss_clip": 0.01104272, + "auxiliary_loss_mlp": 0.01046995, + "balance_loss_clip": 1.04825711, + "balance_loss_mlp": 1.02980459, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 2.336524828197387, + "language_loss": 0.73320937, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.754722, + "num_input_tokens_seen": 118055905, + "step": 5496, + "time_per_iteration": 2.6641197204589844 + }, + { + "auxiliary_loss_clip": 0.0110747, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.04494834, + "balance_loss_mlp": 1.01921463, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 4.585250701489895, + "language_loss": 0.72232234, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74374545, + "num_input_tokens_seen": 118073695, + "step": 5497, + "time_per_iteration": 2.5447158813476562 + }, + { + "auxiliary_loss_clip": 0.01113245, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.04811871, + "balance_loss_mlp": 1.02572501, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 2.061759917413593, + "language_loss": 0.80531037, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82685363, + "num_input_tokens_seen": 118094030, + "step": 5498, + "time_per_iteration": 2.5757153034210205 + }, + { + "auxiliary_loss_clip": 0.01119568, + "auxiliary_loss_mlp": 0.01042539, + "balance_loss_clip": 1.0414902, + "balance_loss_mlp": 1.02579045, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 2.1337784352543965, + "language_loss": 0.76171714, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78333819, + "num_input_tokens_seen": 118111665, + "step": 5499, + "time_per_iteration": 2.4822607040405273 + }, + { + "auxiliary_loss_clip": 0.01121633, + "auxiliary_loss_mlp": 0.01041603, + "balance_loss_clip": 1.05244076, + "balance_loss_mlp": 1.02533078, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.5927252843646917, + "language_loss": 0.78738487, + "learning_rate": 3.124240841300681e-06, + "loss": 0.80901724, + "num_input_tokens_seen": 118132435, + "step": 5500, + "time_per_iteration": 2.589662790298462 + }, + { + "auxiliary_loss_clip": 0.01129414, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.04933226, + "balance_loss_mlp": 1.01948142, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 2.097859757077392, + "language_loss": 0.65931308, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68096685, + "num_input_tokens_seen": 118155255, + "step": 5501, + "time_per_iteration": 4.067103385925293 + }, + { + "auxiliary_loss_clip": 0.01130014, + "auxiliary_loss_mlp": 0.01046594, + "balance_loss_clip": 1.04914331, + "balance_loss_mlp": 1.03010726, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.252787021850016, + "language_loss": 0.77695352, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79871958, + "num_input_tokens_seen": 118169865, + "step": 5502, + "time_per_iteration": 2.440213680267334 + }, + { + "auxiliary_loss_clip": 0.01117348, + "auxiliary_loss_mlp": 0.01042906, + "balance_loss_clip": 1.05306959, + "balance_loss_mlp": 1.02672899, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 1.7544945218837449, + "language_loss": 0.7283932, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74999571, + "num_input_tokens_seen": 118190760, + "step": 5503, + "time_per_iteration": 2.5426342487335205 + }, + { + "auxiliary_loss_clip": 0.01109753, + "auxiliary_loss_mlp": 0.01052544, + "balance_loss_clip": 1.04305696, + "balance_loss_mlp": 1.0349133, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 2.3450163556195602, + "language_loss": 0.75445902, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77608192, + "num_input_tokens_seen": 118213620, + "step": 5504, + "time_per_iteration": 2.5730509757995605 + }, + { + "auxiliary_loss_clip": 0.01118422, + "auxiliary_loss_mlp": 0.0104801, + "balance_loss_clip": 1.04940867, + "balance_loss_mlp": 1.03210747, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.665928693549226, + "language_loss": 0.69987202, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72153634, + "num_input_tokens_seen": 118235010, + "step": 5505, + "time_per_iteration": 4.068846940994263 + }, + { + "auxiliary_loss_clip": 0.01124787, + "auxiliary_loss_mlp": 0.010508, + "balance_loss_clip": 1.04826152, + "balance_loss_mlp": 1.03462291, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.6345733885786922, + "language_loss": 0.82302439, + "learning_rate": 3.122307436058899e-06, + "loss": 0.84478033, + "num_input_tokens_seen": 118255820, + "step": 5506, + "time_per_iteration": 2.5328798294067383 + }, + { + "auxiliary_loss_clip": 0.01128342, + "auxiliary_loss_mlp": 0.01043344, + "balance_loss_clip": 1.04921722, + "balance_loss_mlp": 1.02723885, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.8027403447863546, + "language_loss": 0.79240084, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81411773, + "num_input_tokens_seen": 118274160, + "step": 5507, + "time_per_iteration": 2.5052781105041504 + }, + { + "auxiliary_loss_clip": 0.01118697, + "auxiliary_loss_mlp": 0.01050244, + "balance_loss_clip": 1.05037189, + "balance_loss_mlp": 1.03430569, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.8421934264572017, + "language_loss": 0.71282816, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73451757, + "num_input_tokens_seen": 118294385, + "step": 5508, + "time_per_iteration": 2.5609049797058105 + }, + { + "auxiliary_loss_clip": 0.01109197, + "auxiliary_loss_mlp": 0.01040578, + "balance_loss_clip": 1.04820168, + "balance_loss_mlp": 1.02533126, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 2.274175469113212, + "language_loss": 0.71928692, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74078465, + "num_input_tokens_seen": 118313105, + "step": 5509, + "time_per_iteration": 2.5689144134521484 + }, + { + "auxiliary_loss_clip": 0.01121723, + "auxiliary_loss_mlp": 0.01035328, + "balance_loss_clip": 1.05015612, + "balance_loss_mlp": 1.01985478, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.598260826602121, + "language_loss": 0.72928935, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75085986, + "num_input_tokens_seen": 118335250, + "step": 5510, + "time_per_iteration": 3.916597604751587 + }, + { + "auxiliary_loss_clip": 0.01096702, + "auxiliary_loss_mlp": 0.0104771, + "balance_loss_clip": 1.04248619, + "balance_loss_mlp": 1.03220046, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.25415094119727, + "language_loss": 0.88029307, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.90173721, + "num_input_tokens_seen": 118351470, + "step": 5511, + "time_per_iteration": 2.5219290256500244 + }, + { + "auxiliary_loss_clip": 0.01081306, + "auxiliary_loss_mlp": 0.0104673, + "balance_loss_clip": 1.04491711, + "balance_loss_mlp": 1.03107822, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 2.2241690626914394, + "language_loss": 0.73173499, + "learning_rate": 3.12037249872891e-06, + "loss": 0.7530154, + "num_input_tokens_seen": 118370970, + "step": 5512, + "time_per_iteration": 2.5968265533447266 + }, + { + "auxiliary_loss_clip": 0.01097039, + "auxiliary_loss_mlp": 0.01048401, + "balance_loss_clip": 1.04695415, + "balance_loss_mlp": 1.03372657, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 2.5023911354140624, + "language_loss": 0.72279876, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.74425316, + "num_input_tokens_seen": 118393125, + "step": 5513, + "time_per_iteration": 2.705090284347534 + }, + { + "auxiliary_loss_clip": 0.01102699, + "auxiliary_loss_mlp": 0.01036636, + "balance_loss_clip": 1.04559445, + "balance_loss_mlp": 1.02129328, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.950430236605196, + "language_loss": 0.68298435, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70437771, + "num_input_tokens_seen": 118410860, + "step": 5514, + "time_per_iteration": 2.535696268081665 + }, + { + "auxiliary_loss_clip": 0.01112663, + "auxiliary_loss_mlp": 0.0104789, + "balance_loss_clip": 1.04634571, + "balance_loss_mlp": 1.03017604, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.1584447469162305, + "language_loss": 0.65898407, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.68058956, + "num_input_tokens_seen": 118429570, + "step": 5515, + "time_per_iteration": 2.562440872192383 + }, + { + "auxiliary_loss_clip": 0.01122052, + "auxiliary_loss_mlp": 0.01037866, + "balance_loss_clip": 1.04803896, + "balance_loss_mlp": 1.02209413, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.6592167098554973, + "language_loss": 0.69508898, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71668816, + "num_input_tokens_seen": 118450285, + "step": 5516, + "time_per_iteration": 2.549008846282959 + }, + { + "auxiliary_loss_clip": 0.01128571, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.04781604, + "balance_loss_mlp": 1.02688551, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 1.9356488784110524, + "language_loss": 0.80430788, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82602119, + "num_input_tokens_seen": 118468270, + "step": 5517, + "time_per_iteration": 2.4704277515411377 + }, + { + "auxiliary_loss_clip": 0.01117073, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.04417038, + "balance_loss_mlp": 1.02388549, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 1.8001930803927735, + "language_loss": 0.74047101, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76204962, + "num_input_tokens_seen": 118486615, + "step": 5518, + "time_per_iteration": 2.4844589233398438 + }, + { + "auxiliary_loss_clip": 0.01043858, + "auxiliary_loss_mlp": 0.01008176, + "balance_loss_clip": 1.03127313, + "balance_loss_mlp": 1.00643528, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6186991118849439, + "language_loss": 0.54330087, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56382126, + "num_input_tokens_seen": 118553580, + "step": 5519, + "time_per_iteration": 3.201590061187744 + }, + { + "auxiliary_loss_clip": 0.0112715, + "auxiliary_loss_mlp": 0.01040818, + "balance_loss_clip": 1.04848909, + "balance_loss_mlp": 1.02433133, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 3.5487432937653676, + "language_loss": 0.78764796, + "learning_rate": 3.117790203606336e-06, + "loss": 0.8093276, + "num_input_tokens_seen": 118570280, + "step": 5520, + "time_per_iteration": 2.481940746307373 + }, + { + "auxiliary_loss_clip": 0.01110741, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.04572022, + "balance_loss_mlp": 1.02051485, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 1.94652978769838, + "language_loss": 0.76552963, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78699249, + "num_input_tokens_seen": 118590455, + "step": 5521, + "time_per_iteration": 2.569601058959961 + }, + { + "auxiliary_loss_clip": 0.01126785, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.04639983, + "balance_loss_mlp": 1.03019142, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 2.189052929795115, + "language_loss": 0.70227861, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72401148, + "num_input_tokens_seen": 118609495, + "step": 5522, + "time_per_iteration": 2.4856700897216797 + }, + { + "auxiliary_loss_clip": 0.01114785, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_clip": 1.0466783, + "balance_loss_mlp": 1.02121484, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.738283567395391, + "language_loss": 0.73292875, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.75444245, + "num_input_tokens_seen": 118628720, + "step": 5523, + "time_per_iteration": 4.093482732772827 + }, + { + "auxiliary_loss_clip": 0.01108187, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.04358053, + "balance_loss_mlp": 1.02130985, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.881728273884042, + "language_loss": 0.81989443, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84135002, + "num_input_tokens_seen": 118645955, + "step": 5524, + "time_per_iteration": 2.485557794570923 + }, + { + "auxiliary_loss_clip": 0.01097492, + "auxiliary_loss_mlp": 0.00781967, + "balance_loss_clip": 1.0472312, + "balance_loss_mlp": 1.00108123, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.6695061218705562, + "language_loss": 0.8345241, + "learning_rate": 3.116174891188636e-06, + "loss": 0.85331869, + "num_input_tokens_seen": 118665605, + "step": 5525, + "time_per_iteration": 2.5841455459594727 + }, + { + "auxiliary_loss_clip": 0.01058615, + "auxiliary_loss_mlp": 0.01005803, + "balance_loss_clip": 1.02828944, + "balance_loss_mlp": 1.00396729, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7582515476271626, + "language_loss": 0.52615654, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54680073, + "num_input_tokens_seen": 118728155, + "step": 5526, + "time_per_iteration": 3.027667999267578 + }, + { + "auxiliary_loss_clip": 0.0109712, + "auxiliary_loss_mlp": 0.00783025, + "balance_loss_clip": 1.04559827, + "balance_loss_mlp": 1.00113356, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.1497147274951143, + "language_loss": 0.7792241, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79802555, + "num_input_tokens_seen": 118743955, + "step": 5527, + "time_per_iteration": 2.5691232681274414 + }, + { + "auxiliary_loss_clip": 0.01096326, + "auxiliary_loss_mlp": 0.0104695, + "balance_loss_clip": 1.05141735, + "balance_loss_mlp": 1.03173876, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 1.694944252567218, + "language_loss": 0.71776915, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.7392019, + "num_input_tokens_seen": 118763275, + "step": 5528, + "time_per_iteration": 2.626265048980713 + }, + { + "auxiliary_loss_clip": 0.01115917, + "auxiliary_loss_mlp": 0.01036082, + "balance_loss_clip": 1.04747069, + "balance_loss_mlp": 1.02158582, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 2.2512166504897855, + "language_loss": 0.83141142, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.85293138, + "num_input_tokens_seen": 118781110, + "step": 5529, + "time_per_iteration": 2.5226023197174072 + }, + { + "auxiliary_loss_clip": 0.01116382, + "auxiliary_loss_mlp": 0.00783584, + "balance_loss_clip": 1.04845858, + "balance_loss_mlp": 1.00103045, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 1.6944202060106892, + "language_loss": 0.69702202, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71602172, + "num_input_tokens_seen": 118800620, + "step": 5530, + "time_per_iteration": 2.5723767280578613 + }, + { + "auxiliary_loss_clip": 0.01126466, + "auxiliary_loss_mlp": 0.01048226, + "balance_loss_clip": 1.04713082, + "balance_loss_mlp": 1.03157222, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.5481993920924455, + "language_loss": 0.76203281, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78377968, + "num_input_tokens_seen": 118818725, + "step": 5531, + "time_per_iteration": 2.554718494415283 + }, + { + "auxiliary_loss_clip": 0.01118299, + "auxiliary_loss_mlp": 0.01039454, + "balance_loss_clip": 1.04931521, + "balance_loss_mlp": 1.02318239, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 1.7522920684294934, + "language_loss": 0.7312215, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75279897, + "num_input_tokens_seen": 118839390, + "step": 5532, + "time_per_iteration": 2.5643668174743652 + }, + { + "auxiliary_loss_clip": 0.01115709, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.05072165, + "balance_loss_mlp": 1.01907277, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 2.8495310456855707, + "language_loss": 0.66270697, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.68420035, + "num_input_tokens_seen": 118856275, + "step": 5533, + "time_per_iteration": 2.496163845062256 + }, + { + "auxiliary_loss_clip": 0.01087004, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.04827809, + "balance_loss_mlp": 1.02068722, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 1.8323417727902687, + "language_loss": 0.71398276, + "learning_rate": 3.113264663362451e-06, + "loss": 0.73521221, + "num_input_tokens_seen": 118873830, + "step": 5534, + "time_per_iteration": 2.604780435562134 + }, + { + "auxiliary_loss_clip": 0.01092513, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.0465349, + "balance_loss_mlp": 1.01951003, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 2.0432783862767487, + "language_loss": 0.67150009, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69277763, + "num_input_tokens_seen": 118891560, + "step": 5535, + "time_per_iteration": 2.563861846923828 + }, + { + "auxiliary_loss_clip": 0.011268, + "auxiliary_loss_mlp": 0.00782287, + "balance_loss_clip": 1.04693031, + "balance_loss_mlp": 1.00106502, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 2.449305659482382, + "language_loss": 0.72481382, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.74390465, + "num_input_tokens_seen": 118910260, + "step": 5536, + "time_per_iteration": 2.556793212890625 + }, + { + "auxiliary_loss_clip": 0.01125919, + "auxiliary_loss_mlp": 0.010385, + "balance_loss_clip": 1.04828918, + "balance_loss_mlp": 1.02384961, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.5455202812837565, + "language_loss": 0.81804812, + "learning_rate": 3.112293827106917e-06, + "loss": 0.83969235, + "num_input_tokens_seen": 118929985, + "step": 5537, + "time_per_iteration": 2.5159003734588623 + }, + { + "auxiliary_loss_clip": 0.01134451, + "auxiliary_loss_mlp": 0.01040557, + "balance_loss_clip": 1.05400658, + "balance_loss_mlp": 1.02501178, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 2.0920475856598864, + "language_loss": 0.71839571, + "learning_rate": 3.111970130648789e-06, + "loss": 0.7401458, + "num_input_tokens_seen": 118951355, + "step": 5538, + "time_per_iteration": 2.5654873847961426 + }, + { + "auxiliary_loss_clip": 0.0112372, + "auxiliary_loss_mlp": 0.01038499, + "balance_loss_clip": 1.0479399, + "balance_loss_mlp": 1.02326441, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 1.8909531369949164, + "language_loss": 0.74542588, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76704812, + "num_input_tokens_seen": 118970910, + "step": 5539, + "time_per_iteration": 2.517429828643799 + }, + { + "auxiliary_loss_clip": 0.01144716, + "auxiliary_loss_mlp": 0.01048024, + "balance_loss_clip": 1.05062652, + "balance_loss_mlp": 1.03201461, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.88317794999386, + "language_loss": 0.70904642, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.73097384, + "num_input_tokens_seen": 118989200, + "step": 5540, + "time_per_iteration": 3.8494722843170166 + }, + { + "auxiliary_loss_clip": 0.01123183, + "auxiliary_loss_mlp": 0.01039335, + "balance_loss_clip": 1.04568374, + "balance_loss_mlp": 1.02424347, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 1.7154340271149466, + "language_loss": 0.60994428, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.63156945, + "num_input_tokens_seen": 119011030, + "step": 5541, + "time_per_iteration": 2.6424176692962646 + }, + { + "auxiliary_loss_clip": 0.011213, + "auxiliary_loss_mlp": 0.01045206, + "balance_loss_clip": 1.05102539, + "balance_loss_mlp": 1.02897, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.5882118241740903, + "language_loss": 0.68462646, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.70629156, + "num_input_tokens_seen": 119030620, + "step": 5542, + "time_per_iteration": 2.528303384780884 + }, + { + "auxiliary_loss_clip": 0.01126045, + "auxiliary_loss_mlp": 0.01039429, + "balance_loss_clip": 1.04756665, + "balance_loss_mlp": 1.02487957, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.6922148756519437, + "language_loss": 0.75560784, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77726257, + "num_input_tokens_seen": 119048015, + "step": 5543, + "time_per_iteration": 2.463300943374634 + }, + { + "auxiliary_loss_clip": 0.01065015, + "auxiliary_loss_mlp": 0.01048096, + "balance_loss_clip": 1.0479933, + "balance_loss_mlp": 1.03103709, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 1.8193297920446534, + "language_loss": 0.75330621, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77443731, + "num_input_tokens_seen": 119066280, + "step": 5544, + "time_per_iteration": 4.265071630477905 + }, + { + "auxiliary_loss_clip": 0.01134443, + "auxiliary_loss_mlp": 0.01035404, + "balance_loss_clip": 1.04708409, + "balance_loss_mlp": 1.02061641, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.4666214261635702, + "language_loss": 0.70434564, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.72604412, + "num_input_tokens_seen": 119087680, + "step": 5545, + "time_per_iteration": 2.677765130996704 + }, + { + "auxiliary_loss_clip": 0.01092019, + "auxiliary_loss_mlp": 0.01041226, + "balance_loss_clip": 1.04474998, + "balance_loss_mlp": 1.02609897, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.6225459981447992, + "language_loss": 0.69203806, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.71337056, + "num_input_tokens_seen": 119105820, + "step": 5546, + "time_per_iteration": 2.5753936767578125 + }, + { + "auxiliary_loss_clip": 0.01105946, + "auxiliary_loss_mlp": 0.01039758, + "balance_loss_clip": 1.04752803, + "balance_loss_mlp": 1.02504742, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.567586957648527, + "language_loss": 0.64751959, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.6689766, + "num_input_tokens_seen": 119126630, + "step": 5547, + "time_per_iteration": 2.6261417865753174 + }, + { + "auxiliary_loss_clip": 0.01119338, + "auxiliary_loss_mlp": 0.01038564, + "balance_loss_clip": 1.05134058, + "balance_loss_mlp": 1.0241574, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.491173104219708, + "language_loss": 0.85009539, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87167436, + "num_input_tokens_seen": 119143375, + "step": 5548, + "time_per_iteration": 2.5139098167419434 + }, + { + "auxiliary_loss_clip": 0.01127906, + "auxiliary_loss_mlp": 0.0104197, + "balance_loss_clip": 1.04874599, + "balance_loss_mlp": 1.02512538, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 1.9931801345978069, + "language_loss": 0.74698263, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76868141, + "num_input_tokens_seen": 119166450, + "step": 5549, + "time_per_iteration": 4.0467164516448975 + }, + { + "auxiliary_loss_clip": 0.01132093, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.05029345, + "balance_loss_mlp": 1.02563596, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.72681416680985, + "language_loss": 0.6819399, + "learning_rate": 3.108082487713921e-06, + "loss": 0.7036835, + "num_input_tokens_seen": 119189645, + "step": 5550, + "time_per_iteration": 2.7227697372436523 + }, + { + "auxiliary_loss_clip": 0.01096845, + "auxiliary_loss_mlp": 0.01051476, + "balance_loss_clip": 1.04598343, + "balance_loss_mlp": 1.03516817, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 2.1707193820914656, + "language_loss": 0.60560012, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.6270833, + "num_input_tokens_seen": 119208045, + "step": 5551, + "time_per_iteration": 2.5420336723327637 + }, + { + "auxiliary_loss_clip": 0.01098606, + "auxiliary_loss_mlp": 0.01048267, + "balance_loss_clip": 1.04794478, + "balance_loss_mlp": 1.03156567, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.8066308661311425, + "language_loss": 0.70261157, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72408032, + "num_input_tokens_seen": 119224910, + "step": 5552, + "time_per_iteration": 2.5330827236175537 + }, + { + "auxiliary_loss_clip": 0.01103021, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.04389715, + "balance_loss_mlp": 1.02147615, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 2.6767594005004898, + "language_loss": 0.82918298, + "learning_rate": 3.107109630732192e-06, + "loss": 0.85058582, + "num_input_tokens_seen": 119243290, + "step": 5553, + "time_per_iteration": 2.5609493255615234 + }, + { + "auxiliary_loss_clip": 0.01117509, + "auxiliary_loss_mlp": 0.00782749, + "balance_loss_clip": 1.04845321, + "balance_loss_mlp": 1.00106525, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 2.1600220527407643, + "language_loss": 0.80999541, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.82899803, + "num_input_tokens_seen": 119261195, + "step": 5554, + "time_per_iteration": 2.5280213356018066 + }, + { + "auxiliary_loss_clip": 0.01126684, + "auxiliary_loss_mlp": 0.01043136, + "balance_loss_clip": 1.04880035, + "balance_loss_mlp": 1.02786577, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.4980554592298068, + "language_loss": 0.81578338, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83748162, + "num_input_tokens_seen": 119282845, + "step": 5555, + "time_per_iteration": 2.565319299697876 + }, + { + "auxiliary_loss_clip": 0.01123406, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.04561853, + "balance_loss_mlp": 1.02487183, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.693257603809905, + "language_loss": 0.7425288, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76416373, + "num_input_tokens_seen": 119304430, + "step": 5556, + "time_per_iteration": 2.578383207321167 + }, + { + "auxiliary_loss_clip": 0.0112552, + "auxiliary_loss_mlp": 0.01040231, + "balance_loss_clip": 1.04812264, + "balance_loss_mlp": 1.02530575, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 2.6720426769065075, + "language_loss": 0.82438433, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84604192, + "num_input_tokens_seen": 119323830, + "step": 5557, + "time_per_iteration": 2.5105090141296387 + }, + { + "auxiliary_loss_clip": 0.01119922, + "auxiliary_loss_mlp": 0.01039934, + "balance_loss_clip": 1.05110002, + "balance_loss_mlp": 1.02398407, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 3.21291278063664, + "language_loss": 0.8008666, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82246518, + "num_input_tokens_seen": 119346340, + "step": 5558, + "time_per_iteration": 2.6127729415893555 + }, + { + "auxiliary_loss_clip": 0.01108426, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.04459894, + "balance_loss_mlp": 1.02115488, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.6302037502100155, + "language_loss": 0.81540644, + "learning_rate": 3.105162783594788e-06, + "loss": 0.83685088, + "num_input_tokens_seen": 119367285, + "step": 5559, + "time_per_iteration": 2.5731611251831055 + }, + { + "auxiliary_loss_clip": 0.01101928, + "auxiliary_loss_mlp": 0.01040585, + "balance_loss_clip": 1.04698622, + "balance_loss_mlp": 1.02512324, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 1.8557638655779012, + "language_loss": 0.72299933, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74442446, + "num_input_tokens_seen": 119385370, + "step": 5560, + "time_per_iteration": 2.531609058380127 + }, + { + "auxiliary_loss_clip": 0.01120599, + "auxiliary_loss_mlp": 0.01046233, + "balance_loss_clip": 1.04884231, + "balance_loss_mlp": 1.03041339, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 1.5026441512541746, + "language_loss": 0.75189978, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77356803, + "num_input_tokens_seen": 119409150, + "step": 5561, + "time_per_iteration": 2.634615182876587 + }, + { + "auxiliary_loss_clip": 0.01115936, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.0499239, + "balance_loss_mlp": 1.02206945, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 2.3764017130877537, + "language_loss": 0.70000535, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.72154099, + "num_input_tokens_seen": 119426475, + "step": 5562, + "time_per_iteration": 2.5130293369293213 + }, + { + "auxiliary_loss_clip": 0.01125428, + "auxiliary_loss_mlp": 0.01039891, + "balance_loss_clip": 1.04924428, + "balance_loss_mlp": 1.02553797, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.610787777917899, + "language_loss": 0.64748877, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.66914201, + "num_input_tokens_seen": 119446900, + "step": 5563, + "time_per_iteration": 4.052947759628296 + }, + { + "auxiliary_loss_clip": 0.01086816, + "auxiliary_loss_mlp": 0.01044853, + "balance_loss_clip": 1.05203629, + "balance_loss_mlp": 1.02817595, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 9.609959357425678, + "language_loss": 0.74090225, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76221895, + "num_input_tokens_seen": 119470945, + "step": 5564, + "time_per_iteration": 2.9077606201171875 + }, + { + "auxiliary_loss_clip": 0.0104009, + "auxiliary_loss_mlp": 0.01002268, + "balance_loss_clip": 1.03598559, + "balance_loss_mlp": 1.00067031, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7812846634562318, + "language_loss": 0.55473578, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57515937, + "num_input_tokens_seen": 119529925, + "step": 5565, + "time_per_iteration": 3.0853569507598877 + }, + { + "auxiliary_loss_clip": 0.01140523, + "auxiliary_loss_mlp": 0.01037853, + "balance_loss_clip": 1.05338156, + "balance_loss_mlp": 1.02323198, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.9126943804276926, + "language_loss": 0.64648068, + "learning_rate": 3.102889555312721e-06, + "loss": 0.66826445, + "num_input_tokens_seen": 119550700, + "step": 5566, + "time_per_iteration": 2.609206438064575 + }, + { + "auxiliary_loss_clip": 0.01121575, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.05271924, + "balance_loss_mlp": 1.02601302, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 1.963946065954169, + "language_loss": 0.77867311, + "learning_rate": 3.102564641030016e-06, + "loss": 0.80030042, + "num_input_tokens_seen": 119569295, + "step": 5567, + "time_per_iteration": 2.5084004402160645 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_clip": 1.05134809, + "balance_loss_mlp": 1.02455306, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.6907214253034553, + "language_loss": 0.76608473, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78768659, + "num_input_tokens_seen": 119587375, + "step": 5568, + "time_per_iteration": 2.501331329345703 + }, + { + "auxiliary_loss_clip": 0.01106074, + "auxiliary_loss_mlp": 0.01045671, + "balance_loss_clip": 1.05310476, + "balance_loss_mlp": 1.02981067, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 2.605815941495928, + "language_loss": 0.71218383, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73370135, + "num_input_tokens_seen": 119604530, + "step": 5569, + "time_per_iteration": 2.5671985149383545 + }, + { + "auxiliary_loss_clip": 0.01105357, + "auxiliary_loss_mlp": 0.01041889, + "balance_loss_clip": 1.04791772, + "balance_loss_mlp": 1.02467537, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.9456194446837376, + "language_loss": 0.89433593, + "learning_rate": 3.10158964737502e-06, + "loss": 0.91580838, + "num_input_tokens_seen": 119621025, + "step": 5570, + "time_per_iteration": 2.5312135219573975 + }, + { + "auxiliary_loss_clip": 0.01100693, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.04486704, + "balance_loss_mlp": 1.02007747, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.6702275619391083, + "language_loss": 0.79941082, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82077813, + "num_input_tokens_seen": 119641725, + "step": 5571, + "time_per_iteration": 2.586954355239868 + }, + { + "auxiliary_loss_clip": 0.01067143, + "auxiliary_loss_mlp": 0.00756405, + "balance_loss_clip": 1.03653038, + "balance_loss_mlp": 1.00090778, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.8963740793310949, + "language_loss": 0.55991912, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.57815468, + "num_input_tokens_seen": 119693560, + "step": 5572, + "time_per_iteration": 2.9910168647766113 + }, + { + "auxiliary_loss_clip": 0.01144764, + "auxiliary_loss_mlp": 0.01047396, + "balance_loss_clip": 1.05684209, + "balance_loss_mlp": 1.03198862, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 2.008304796571621, + "language_loss": 0.78308618, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.80500782, + "num_input_tokens_seen": 119712935, + "step": 5573, + "time_per_iteration": 2.523134708404541 + }, + { + "auxiliary_loss_clip": 0.01115812, + "auxiliary_loss_mlp": 0.01046103, + "balance_loss_clip": 1.05762076, + "balance_loss_mlp": 1.02960479, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.018216853280142, + "language_loss": 0.7244994, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.74611855, + "num_input_tokens_seen": 119731680, + "step": 5574, + "time_per_iteration": 2.676769971847534 + }, + { + "auxiliary_loss_clip": 0.01126504, + "auxiliary_loss_mlp": 0.01034883, + "balance_loss_clip": 1.05044007, + "balance_loss_mlp": 1.02006519, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 1.686579328227153, + "language_loss": 0.87913209, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.90074599, + "num_input_tokens_seen": 119752155, + "step": 5575, + "time_per_iteration": 2.5623888969421387 + }, + { + "auxiliary_loss_clip": 0.01128322, + "auxiliary_loss_mlp": 0.01046265, + "balance_loss_clip": 1.05682635, + "balance_loss_mlp": 1.02936077, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.1844957665588622, + "language_loss": 0.82900035, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.85074621, + "num_input_tokens_seen": 119769195, + "step": 5576, + "time_per_iteration": 2.568674325942993 + }, + { + "auxiliary_loss_clip": 0.01128238, + "auxiliary_loss_mlp": 0.01039049, + "balance_loss_clip": 1.04950762, + "balance_loss_mlp": 1.02305079, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 2.143199210533924, + "language_loss": 0.7338202, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75549299, + "num_input_tokens_seen": 119786810, + "step": 5577, + "time_per_iteration": 2.5301549434661865 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.0104084, + "balance_loss_clip": 1.05476725, + "balance_loss_mlp": 1.02416217, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.556655638683543, + "language_loss": 0.81443435, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.83592165, + "num_input_tokens_seen": 119805395, + "step": 5578, + "time_per_iteration": 2.5625462532043457 + }, + { + "auxiliary_loss_clip": 0.01080752, + "auxiliary_loss_mlp": 0.00782328, + "balance_loss_clip": 1.05188406, + "balance_loss_mlp": 1.00067878, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 1.7870232379954625, + "language_loss": 0.71556199, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73419285, + "num_input_tokens_seen": 119823135, + "step": 5579, + "time_per_iteration": 2.6088743209838867 + }, + { + "auxiliary_loss_clip": 0.0108765, + "auxiliary_loss_mlp": 0.01045198, + "balance_loss_clip": 1.04687262, + "balance_loss_mlp": 1.02860379, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 1.9351780239630185, + "language_loss": 0.81224686, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83357537, + "num_input_tokens_seen": 119842265, + "step": 5580, + "time_per_iteration": 3.975846529006958 + }, + { + "auxiliary_loss_clip": 0.01121148, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.05205512, + "balance_loss_mlp": 1.01761663, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.7612930084947322, + "language_loss": 0.77686322, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.79841208, + "num_input_tokens_seen": 119862500, + "step": 5581, + "time_per_iteration": 2.5579915046691895 + }, + { + "auxiliary_loss_clip": 0.01108606, + "auxiliary_loss_mlp": 0.01047301, + "balance_loss_clip": 1.04735887, + "balance_loss_mlp": 1.02944374, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 3.3053332389668153, + "language_loss": 0.74603403, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76759315, + "num_input_tokens_seen": 119880160, + "step": 5582, + "time_per_iteration": 2.5100624561309814 + }, + { + "auxiliary_loss_clip": 0.01118747, + "auxiliary_loss_mlp": 0.01047408, + "balance_loss_clip": 1.04843235, + "balance_loss_mlp": 1.03108859, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 3.000581153754226, + "language_loss": 0.82457173, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84623331, + "num_input_tokens_seen": 119899040, + "step": 5583, + "time_per_iteration": 3.9998221397399902 + }, + { + "auxiliary_loss_clip": 0.01119057, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_clip": 1.05137014, + "balance_loss_mlp": 1.0349071, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.8743601626136324, + "language_loss": 0.77564359, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79733336, + "num_input_tokens_seen": 119921120, + "step": 5584, + "time_per_iteration": 2.630998134613037 + }, + { + "auxiliary_loss_clip": 0.01122881, + "auxiliary_loss_mlp": 0.01044688, + "balance_loss_clip": 1.05361986, + "balance_loss_mlp": 1.0289402, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 2.104297478741434, + "language_loss": 0.76154113, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78321689, + "num_input_tokens_seen": 119940165, + "step": 5585, + "time_per_iteration": 2.5967392921447754 + }, + { + "auxiliary_loss_clip": 0.01123771, + "auxiliary_loss_mlp": 0.01042747, + "balance_loss_clip": 1.04667461, + "balance_loss_mlp": 1.02709532, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 2.2022309607059354, + "language_loss": 0.77737582, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79904103, + "num_input_tokens_seen": 119959730, + "step": 5586, + "time_per_iteration": 2.5999066829681396 + }, + { + "auxiliary_loss_clip": 0.01104399, + "auxiliary_loss_mlp": 0.01056192, + "balance_loss_clip": 1.04992473, + "balance_loss_mlp": 1.03684461, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 2.050272006089287, + "language_loss": 0.80469817, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.82630408, + "num_input_tokens_seen": 119979315, + "step": 5587, + "time_per_iteration": 2.6127266883850098 + }, + { + "auxiliary_loss_clip": 0.0113876, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.05373669, + "balance_loss_mlp": 1.02604055, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.8117772609367044, + "language_loss": 0.66921496, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69100714, + "num_input_tokens_seen": 119996140, + "step": 5588, + "time_per_iteration": 2.4447340965270996 + }, + { + "auxiliary_loss_clip": 0.0111899, + "auxiliary_loss_mlp": 0.00783087, + "balance_loss_clip": 1.04828501, + "balance_loss_mlp": 1.0006268, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 1.7141874053775945, + "language_loss": 0.70376998, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72279072, + "num_input_tokens_seen": 120017720, + "step": 5589, + "time_per_iteration": 4.100366115570068 + }, + { + "auxiliary_loss_clip": 0.01119266, + "auxiliary_loss_mlp": 0.01044631, + "balance_loss_clip": 1.04955554, + "balance_loss_mlp": 1.02779841, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 1.8050790975667794, + "language_loss": 0.66614348, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.68778247, + "num_input_tokens_seen": 120036335, + "step": 5590, + "time_per_iteration": 2.701064109802246 + }, + { + "auxiliary_loss_clip": 0.01111791, + "auxiliary_loss_mlp": 0.01042186, + "balance_loss_clip": 1.05374289, + "balance_loss_mlp": 1.0262121, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 1.9877696794101885, + "language_loss": 0.7359823, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75752205, + "num_input_tokens_seen": 120056120, + "step": 5591, + "time_per_iteration": 2.7056353092193604 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.05112529, + "balance_loss_mlp": 1.02567291, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 2.053504485599545, + "language_loss": 0.69939148, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.72118115, + "num_input_tokens_seen": 120073650, + "step": 5592, + "time_per_iteration": 2.4451308250427246 + }, + { + "auxiliary_loss_clip": 0.01116089, + "auxiliary_loss_mlp": 0.01037542, + "balance_loss_clip": 1.05080521, + "balance_loss_mlp": 1.02227139, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.1309664935987036, + "language_loss": 0.76484597, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78638238, + "num_input_tokens_seen": 120093260, + "step": 5593, + "time_per_iteration": 2.5523643493652344 + }, + { + "auxiliary_loss_clip": 0.01104884, + "auxiliary_loss_mlp": 0.00784855, + "balance_loss_clip": 1.04381371, + "balance_loss_mlp": 1.00060678, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.5313486308314808, + "language_loss": 0.72123694, + "learning_rate": 3.093776191858731e-06, + "loss": 0.74013436, + "num_input_tokens_seen": 120111830, + "step": 5594, + "time_per_iteration": 2.557530403137207 + }, + { + "auxiliary_loss_clip": 0.01087015, + "auxiliary_loss_mlp": 0.00787658, + "balance_loss_clip": 1.04460955, + "balance_loss_mlp": 1.00070214, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.7489494956300273, + "language_loss": 0.80148685, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.82023358, + "num_input_tokens_seen": 120130470, + "step": 5595, + "time_per_iteration": 2.67352294921875 + }, + { + "auxiliary_loss_clip": 0.01116492, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.05010533, + "balance_loss_mlp": 1.01771426, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 2.0507068623473166, + "language_loss": 0.8144592, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83594525, + "num_input_tokens_seen": 120150735, + "step": 5596, + "time_per_iteration": 2.6099114418029785 + }, + { + "auxiliary_loss_clip": 0.01118182, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.05086637, + "balance_loss_mlp": 1.01882315, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.6050959710664703, + "language_loss": 0.75969684, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.7812109, + "num_input_tokens_seen": 120173230, + "step": 5597, + "time_per_iteration": 2.6543304920196533 + }, + { + "auxiliary_loss_clip": 0.01127852, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.05044818, + "balance_loss_mlp": 1.02271938, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 2.7728900511842363, + "language_loss": 0.78743583, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.8090924, + "num_input_tokens_seen": 120191860, + "step": 5598, + "time_per_iteration": 2.54213809967041 + }, + { + "auxiliary_loss_clip": 0.01147287, + "auxiliary_loss_mlp": 0.01038827, + "balance_loss_clip": 1.05394077, + "balance_loss_mlp": 1.02201891, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.513815407191818, + "language_loss": 0.64921498, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.67107618, + "num_input_tokens_seen": 120219195, + "step": 5599, + "time_per_iteration": 2.6563873291015625 + }, + { + "auxiliary_loss_clip": 0.01109604, + "auxiliary_loss_mlp": 0.0105263, + "balance_loss_clip": 1.04896188, + "balance_loss_mlp": 1.03260314, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 2.8043978231330025, + "language_loss": 0.82027304, + "learning_rate": 3.091819088459249e-06, + "loss": 0.8418954, + "num_input_tokens_seen": 120232950, + "step": 5600, + "time_per_iteration": 2.4965426921844482 + }, + { + "auxiliary_loss_clip": 0.011337, + "auxiliary_loss_mlp": 0.01045698, + "balance_loss_clip": 1.05164218, + "balance_loss_mlp": 1.02844834, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 2.094382443157601, + "language_loss": 0.83403176, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.85582566, + "num_input_tokens_seen": 120248865, + "step": 5601, + "time_per_iteration": 2.463759183883667 + }, + { + "auxiliary_loss_clip": 0.01131043, + "auxiliary_loss_mlp": 0.01034232, + "balance_loss_clip": 1.05707932, + "balance_loss_mlp": 1.01941454, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.5722598711943625, + "language_loss": 0.83252001, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.85417277, + "num_input_tokens_seen": 120267820, + "step": 5602, + "time_per_iteration": 3.9228768348693848 + }, + { + "auxiliary_loss_clip": 0.01147209, + "auxiliary_loss_mlp": 0.01049789, + "balance_loss_clip": 1.05690503, + "balance_loss_mlp": 1.03397024, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.7782025820634055, + "language_loss": 0.69944304, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.72141302, + "num_input_tokens_seen": 120286540, + "step": 5603, + "time_per_iteration": 2.4412314891815186 + }, + { + "auxiliary_loss_clip": 0.0111702, + "auxiliary_loss_mlp": 0.01039145, + "balance_loss_clip": 1.04855084, + "balance_loss_mlp": 1.02316523, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.4629135468987267, + "language_loss": 0.83296406, + "learning_rate": 3.090513524656898e-06, + "loss": 0.85452569, + "num_input_tokens_seen": 120307305, + "step": 5604, + "time_per_iteration": 2.5196187496185303 + }, + { + "auxiliary_loss_clip": 0.01102729, + "auxiliary_loss_mlp": 0.01039591, + "balance_loss_clip": 1.04876363, + "balance_loss_mlp": 1.02346849, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 1.323189845828559, + "language_loss": 0.73530608, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75672936, + "num_input_tokens_seen": 120327845, + "step": 5605, + "time_per_iteration": 2.554112195968628 + }, + { + "auxiliary_loss_clip": 0.01125476, + "auxiliary_loss_mlp": 0.01040039, + "balance_loss_clip": 1.05157924, + "balance_loss_mlp": 1.02367163, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.7454715707794555, + "language_loss": 0.83466071, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85631585, + "num_input_tokens_seen": 120343255, + "step": 5606, + "time_per_iteration": 2.494563341140747 + }, + { + "auxiliary_loss_clip": 0.01115274, + "auxiliary_loss_mlp": 0.01041075, + "balance_loss_clip": 1.04630852, + "balance_loss_mlp": 1.02595973, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.87448718609162, + "language_loss": 0.68108445, + "learning_rate": 3.089533917561809e-06, + "loss": 0.70264798, + "num_input_tokens_seen": 120361745, + "step": 5607, + "time_per_iteration": 2.566521644592285 + }, + { + "auxiliary_loss_clip": 0.01129406, + "auxiliary_loss_mlp": 0.01051083, + "balance_loss_clip": 1.05063951, + "balance_loss_mlp": 1.03277218, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 2.327719023895629, + "language_loss": 0.70985472, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73165953, + "num_input_tokens_seen": 120380565, + "step": 5608, + "time_per_iteration": 2.5192995071411133 + }, + { + "auxiliary_loss_clip": 0.01061122, + "auxiliary_loss_mlp": 0.01040654, + "balance_loss_clip": 1.04173994, + "balance_loss_mlp": 1.02520418, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 15.409914635955568, + "language_loss": 0.79143733, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81245512, + "num_input_tokens_seen": 120399235, + "step": 5609, + "time_per_iteration": 2.6187777519226074 + }, + { + "auxiliary_loss_clip": 0.01136756, + "auxiliary_loss_mlp": 0.010436, + "balance_loss_clip": 1.05645764, + "balance_loss_mlp": 1.02604043, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 1.8298486111694168, + "language_loss": 0.82337081, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84517443, + "num_input_tokens_seen": 120420095, + "step": 5610, + "time_per_iteration": 2.5154073238372803 + }, + { + "auxiliary_loss_clip": 0.01128871, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.05202162, + "balance_loss_mlp": 1.02545142, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 1.8684255428865384, + "language_loss": 0.81853908, + "learning_rate": 3.088227196412879e-06, + "loss": 0.8402552, + "num_input_tokens_seen": 120437690, + "step": 5611, + "time_per_iteration": 2.6000397205352783 + }, + { + "auxiliary_loss_clip": 0.01123373, + "auxiliary_loss_mlp": 0.01045708, + "balance_loss_clip": 1.05308533, + "balance_loss_mlp": 1.02806461, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.7156000622991847, + "language_loss": 0.79247224, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81416303, + "num_input_tokens_seen": 120459240, + "step": 5612, + "time_per_iteration": 2.6889255046844482 + }, + { + "auxiliary_loss_clip": 0.01087392, + "auxiliary_loss_mlp": 0.01040951, + "balance_loss_clip": 1.04870033, + "balance_loss_mlp": 1.02442873, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.182828535153932, + "language_loss": 0.70131731, + "learning_rate": 3.087573588194753e-06, + "loss": 0.72260076, + "num_input_tokens_seen": 120481090, + "step": 5613, + "time_per_iteration": 2.7548201084136963 + }, + { + "auxiliary_loss_clip": 0.011259, + "auxiliary_loss_mlp": 0.01037265, + "balance_loss_clip": 1.05312586, + "balance_loss_mlp": 1.0207541, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 1.7787336067457098, + "language_loss": 0.79824591, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81987756, + "num_input_tokens_seen": 120500045, + "step": 5614, + "time_per_iteration": 2.5341572761535645 + }, + { + "auxiliary_loss_clip": 0.01114463, + "auxiliary_loss_mlp": 0.01048692, + "balance_loss_clip": 1.04935455, + "balance_loss_mlp": 1.03027427, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 2.8460715104513996, + "language_loss": 0.91196251, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93359405, + "num_input_tokens_seen": 120521125, + "step": 5615, + "time_per_iteration": 2.587909698486328 + }, + { + "auxiliary_loss_clip": 0.01130835, + "auxiliary_loss_mlp": 0.01039634, + "balance_loss_clip": 1.05388904, + "balance_loss_mlp": 1.02455389, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.7046370368795767, + "language_loss": 0.80896771, + "learning_rate": 3.086592866591809e-06, + "loss": 0.83067238, + "num_input_tokens_seen": 120539180, + "step": 5616, + "time_per_iteration": 2.5214452743530273 + }, + { + "auxiliary_loss_clip": 0.01138738, + "auxiliary_loss_mlp": 0.00784823, + "balance_loss_clip": 1.05405271, + "balance_loss_mlp": 1.00075197, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 1.7580016484826053, + "language_loss": 0.84098399, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.8602196, + "num_input_tokens_seen": 120556280, + "step": 5617, + "time_per_iteration": 2.479529619216919 + }, + { + "auxiliary_loss_clip": 0.01073765, + "auxiliary_loss_mlp": 0.01046662, + "balance_loss_clip": 1.04502451, + "balance_loss_mlp": 1.02895904, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.7910407223355196, + "language_loss": 0.8002677, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82147199, + "num_input_tokens_seen": 120575395, + "step": 5618, + "time_per_iteration": 2.779834270477295 + }, + { + "auxiliary_loss_clip": 0.01103045, + "auxiliary_loss_mlp": 0.01038007, + "balance_loss_clip": 1.05331051, + "balance_loss_mlp": 1.02210486, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 2.5802246682257834, + "language_loss": 0.71175349, + "learning_rate": 3.085611774155481e-06, + "loss": 0.73316401, + "num_input_tokens_seen": 120596075, + "step": 5619, + "time_per_iteration": 4.2044970989227295 + }, + { + "auxiliary_loss_clip": 0.01122843, + "auxiliary_loss_mlp": 0.0105133, + "balance_loss_clip": 1.05232978, + "balance_loss_mlp": 1.0352726, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 3.3009033461178947, + "language_loss": 0.70110434, + "learning_rate": 3.085284660993821e-06, + "loss": 0.72284609, + "num_input_tokens_seen": 120614195, + "step": 5620, + "time_per_iteration": 2.545714855194092 + }, + { + "auxiliary_loss_clip": 0.01144526, + "auxiliary_loss_mlp": 0.01044843, + "balance_loss_clip": 1.05624092, + "balance_loss_mlp": 1.02954841, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 1.888133018050881, + "language_loss": 0.68366981, + "learning_rate": 3.084957506678058e-06, + "loss": 0.70556349, + "num_input_tokens_seen": 120634475, + "step": 5621, + "time_per_iteration": 2.508659839630127 + }, + { + "auxiliary_loss_clip": 0.01113437, + "auxiliary_loss_mlp": 0.01041141, + "balance_loss_clip": 1.05122077, + "balance_loss_mlp": 1.02596569, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.9327529925263787, + "language_loss": 0.82766068, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.84920651, + "num_input_tokens_seen": 120654980, + "step": 5622, + "time_per_iteration": 2.5569350719451904 + }, + { + "auxiliary_loss_clip": 0.01106836, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.04940438, + "balance_loss_mlp": 1.0236938, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.4093114899623644, + "language_loss": 0.7356208, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75707537, + "num_input_tokens_seen": 120676245, + "step": 5623, + "time_per_iteration": 4.329667806625366 + }, + { + "auxiliary_loss_clip": 0.01064455, + "auxiliary_loss_mlp": 0.01005275, + "balance_loss_clip": 1.05117667, + "balance_loss_mlp": 1.00326014, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7537544027453362, + "language_loss": 0.54907596, + "learning_rate": 3.083975796930215e-06, + "loss": 0.56977332, + "num_input_tokens_seen": 120741965, + "step": 5624, + "time_per_iteration": 3.2693254947662354 + }, + { + "auxiliary_loss_clip": 0.01097695, + "auxiliary_loss_mlp": 0.01053913, + "balance_loss_clip": 1.04715383, + "balance_loss_mlp": 1.03557897, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 2.5873607473662257, + "language_loss": 0.73232377, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75383985, + "num_input_tokens_seen": 120760410, + "step": 5625, + "time_per_iteration": 2.5768885612487793 + }, + { + "auxiliary_loss_clip": 0.01133453, + "auxiliary_loss_mlp": 0.01039929, + "balance_loss_clip": 1.05117536, + "balance_loss_mlp": 1.02339435, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 3.9298571914719798, + "language_loss": 0.70568776, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72742152, + "num_input_tokens_seen": 120777705, + "step": 5626, + "time_per_iteration": 2.4979827404022217 + }, + { + "auxiliary_loss_clip": 0.01117734, + "auxiliary_loss_mlp": 0.0103726, + "balance_loss_clip": 1.05077767, + "balance_loss_mlp": 1.0216676, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.8895236296529017, + "language_loss": 0.80965948, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83120942, + "num_input_tokens_seen": 120798660, + "step": 5627, + "time_per_iteration": 2.5941848754882812 + }, + { + "auxiliary_loss_clip": 0.01137858, + "auxiliary_loss_mlp": 0.00783237, + "balance_loss_clip": 1.05607986, + "balance_loss_mlp": 1.00069416, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 1.7380949396323158, + "language_loss": 0.80384409, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82305509, + "num_input_tokens_seen": 120816705, + "step": 5628, + "time_per_iteration": 3.9064035415649414 + }, + { + "auxiliary_loss_clip": 0.01086257, + "auxiliary_loss_mlp": 0.01039463, + "balance_loss_clip": 1.04629612, + "balance_loss_mlp": 1.02252388, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.8574327399407828, + "language_loss": 0.76992011, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79117739, + "num_input_tokens_seen": 120835375, + "step": 5629, + "time_per_iteration": 2.621758222579956 + }, + { + "auxiliary_loss_clip": 0.01124147, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.05053067, + "balance_loss_mlp": 1.02610612, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 1.6922000387332898, + "language_loss": 0.84780431, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.86948436, + "num_input_tokens_seen": 120854260, + "step": 5630, + "time_per_iteration": 2.526289463043213 + }, + { + "auxiliary_loss_clip": 0.01098003, + "auxiliary_loss_mlp": 0.01049349, + "balance_loss_clip": 1.05338955, + "balance_loss_mlp": 1.03280258, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 2.1239794561883807, + "language_loss": 0.715913, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.73738652, + "num_input_tokens_seen": 120871590, + "step": 5631, + "time_per_iteration": 2.582690715789795 + }, + { + "auxiliary_loss_clip": 0.01048121, + "auxiliary_loss_mlp": 0.01011489, + "balance_loss_clip": 1.03638208, + "balance_loss_mlp": 1.00972462, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.8657721151791853, + "language_loss": 0.56136847, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58196461, + "num_input_tokens_seen": 120925550, + "step": 5632, + "time_per_iteration": 3.1611530780792236 + }, + { + "auxiliary_loss_clip": 0.01128869, + "auxiliary_loss_mlp": 0.0103428, + "balance_loss_clip": 1.05439103, + "balance_loss_mlp": 1.01841354, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.6310030300315483, + "language_loss": 0.80152619, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82315767, + "num_input_tokens_seen": 120947620, + "step": 5633, + "time_per_iteration": 2.5552284717559814 + }, + { + "auxiliary_loss_clip": 0.01108524, + "auxiliary_loss_mlp": 0.01040177, + "balance_loss_clip": 1.0466224, + "balance_loss_mlp": 1.02382159, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 2.638902880003122, + "language_loss": 0.59181571, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61330277, + "num_input_tokens_seen": 120965205, + "step": 5634, + "time_per_iteration": 2.6133716106414795 + }, + { + "auxiliary_loss_clip": 0.01107603, + "auxiliary_loss_mlp": 0.0103615, + "balance_loss_clip": 1.0492785, + "balance_loss_mlp": 1.02097487, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.6754427783003139, + "language_loss": 0.92605507, + "learning_rate": 3.080373032026589e-06, + "loss": 0.9474926, + "num_input_tokens_seen": 120983560, + "step": 5635, + "time_per_iteration": 2.504169464111328 + }, + { + "auxiliary_loss_clip": 0.01098613, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.0534699, + "balance_loss_mlp": 1.0190587, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 2.3538074315592983, + "language_loss": 0.75052965, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.77186298, + "num_input_tokens_seen": 121001400, + "step": 5636, + "time_per_iteration": 2.5262184143066406 + }, + { + "auxiliary_loss_clip": 0.01130609, + "auxiliary_loss_mlp": 0.01041208, + "balance_loss_clip": 1.05066395, + "balance_loss_mlp": 1.0249722, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 2.0208669813169315, + "language_loss": 0.83359408, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85531223, + "num_input_tokens_seen": 121021760, + "step": 5637, + "time_per_iteration": 2.5079843997955322 + }, + { + "auxiliary_loss_clip": 0.01094388, + "auxiliary_loss_mlp": 0.01045366, + "balance_loss_clip": 1.0496738, + "balance_loss_mlp": 1.02678096, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.8655945653395893, + "language_loss": 0.69981635, + "learning_rate": 3.079389598759495e-06, + "loss": 0.72121382, + "num_input_tokens_seen": 121041070, + "step": 5638, + "time_per_iteration": 2.565218687057495 + }, + { + "auxiliary_loss_clip": 0.011153, + "auxiliary_loss_mlp": 0.01051803, + "balance_loss_clip": 1.05303991, + "balance_loss_mlp": 1.03556693, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.6191717724096808, + "language_loss": 0.80669749, + "learning_rate": 3.079061705792765e-06, + "loss": 0.82836854, + "num_input_tokens_seen": 121060890, + "step": 5639, + "time_per_iteration": 2.601036787033081 + }, + { + "auxiliary_loss_clip": 0.01148655, + "auxiliary_loss_mlp": 0.01048276, + "balance_loss_clip": 1.05606377, + "balance_loss_mlp": 1.03156328, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.4678273406420956, + "language_loss": 0.67708749, + "learning_rate": 3.078733771907907e-06, + "loss": 0.6990568, + "num_input_tokens_seen": 121079135, + "step": 5640, + "time_per_iteration": 2.4673287868499756 + }, + { + "auxiliary_loss_clip": 0.01116187, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_clip": 1.04912567, + "balance_loss_mlp": 1.02307391, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.640584843311317, + "language_loss": 0.70007324, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72162247, + "num_input_tokens_seen": 121097685, + "step": 5641, + "time_per_iteration": 2.5415525436401367 + }, + { + "auxiliary_loss_clip": 0.01145548, + "auxiliary_loss_mlp": 0.01043757, + "balance_loss_clip": 1.05454946, + "balance_loss_mlp": 1.02843893, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 2.124451373220178, + "language_loss": 0.87385321, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89574623, + "num_input_tokens_seen": 121115640, + "step": 5642, + "time_per_iteration": 3.931314468383789 + }, + { + "auxiliary_loss_clip": 0.01124784, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.05286968, + "balance_loss_mlp": 1.02020216, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.7737963355491808, + "language_loss": 0.84039557, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86197996, + "num_input_tokens_seen": 121132485, + "step": 5643, + "time_per_iteration": 2.4763851165771484 + }, + { + "auxiliary_loss_clip": 0.01112487, + "auxiliary_loss_mlp": 0.01046902, + "balance_loss_clip": 1.04733074, + "balance_loss_mlp": 1.03150034, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 1.6581243445505955, + "language_loss": 0.77078748, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79238135, + "num_input_tokens_seen": 121152935, + "step": 5644, + "time_per_iteration": 2.593820810317993 + }, + { + "auxiliary_loss_clip": 0.01126776, + "auxiliary_loss_mlp": 0.01045497, + "balance_loss_clip": 1.05219865, + "balance_loss_mlp": 1.02988052, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 5.918157673026993, + "language_loss": 0.62918508, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65090781, + "num_input_tokens_seen": 121169835, + "step": 5645, + "time_per_iteration": 2.477781295776367 + }, + { + "auxiliary_loss_clip": 0.01127428, + "auxiliary_loss_mlp": 0.01040568, + "balance_loss_clip": 1.05127966, + "balance_loss_mlp": 1.02547646, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 1.8387251285369364, + "language_loss": 0.76261562, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78429562, + "num_input_tokens_seen": 121190290, + "step": 5646, + "time_per_iteration": 2.5524940490722656 + }, + { + "auxiliary_loss_clip": 0.01131634, + "auxiliary_loss_mlp": 0.01044563, + "balance_loss_clip": 1.05398428, + "balance_loss_mlp": 1.0286845, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 2.742040497862227, + "language_loss": 0.79512066, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81688261, + "num_input_tokens_seen": 121209060, + "step": 5647, + "time_per_iteration": 2.4977810382843018 + }, + { + "auxiliary_loss_clip": 0.01116151, + "auxiliary_loss_mlp": 0.00780981, + "balance_loss_clip": 1.05418086, + "balance_loss_mlp": 1.0005455, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 2.0410184116229644, + "language_loss": 0.77098763, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.78995895, + "num_input_tokens_seen": 121227480, + "step": 5648, + "time_per_iteration": 2.5587165355682373 + }, + { + "auxiliary_loss_clip": 0.01006222, + "auxiliary_loss_mlp": 0.01008963, + "balance_loss_clip": 1.04114425, + "balance_loss_mlp": 1.00704348, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7801870945675364, + "language_loss": 0.56327647, + "learning_rate": 3.075780527680754e-06, + "loss": 0.58342832, + "num_input_tokens_seen": 121291305, + "step": 5649, + "time_per_iteration": 3.48738694190979 + }, + { + "auxiliary_loss_clip": 0.01113959, + "auxiliary_loss_mlp": 0.00784756, + "balance_loss_clip": 1.04846907, + "balance_loss_mlp": 1.00060093, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.462986385819792, + "language_loss": 0.85688287, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87586999, + "num_input_tokens_seen": 121312740, + "step": 5650, + "time_per_iteration": 2.942862033843994 + }, + { + "auxiliary_loss_clip": 0.01128432, + "auxiliary_loss_mlp": 0.01031658, + "balance_loss_clip": 1.04966223, + "balance_loss_mlp": 1.01694155, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.5087359819203088, + "language_loss": 0.70616889, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.72776973, + "num_input_tokens_seen": 121334220, + "step": 5651, + "time_per_iteration": 3.0530426502227783 + }, + { + "auxiliary_loss_clip": 0.01092804, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.0480305, + "balance_loss_mlp": 1.02688658, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 1.9014468396608386, + "language_loss": 0.80999124, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83134311, + "num_input_tokens_seen": 121351870, + "step": 5652, + "time_per_iteration": 2.6417784690856934 + }, + { + "auxiliary_loss_clip": 0.01144692, + "auxiliary_loss_mlp": 0.01042376, + "balance_loss_clip": 1.05400753, + "balance_loss_mlp": 1.02623534, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.895955841115862, + "language_loss": 0.77280647, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79467714, + "num_input_tokens_seen": 121373400, + "step": 5653, + "time_per_iteration": 2.6644105911254883 + }, + { + "auxiliary_loss_clip": 0.01125437, + "auxiliary_loss_mlp": 0.01041925, + "balance_loss_clip": 1.04958916, + "balance_loss_mlp": 1.02572453, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 2.656247616443698, + "language_loss": 0.86098856, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.88266218, + "num_input_tokens_seen": 121385225, + "step": 5654, + "time_per_iteration": 2.437903881072998 + }, + { + "auxiliary_loss_clip": 0.01122761, + "auxiliary_loss_mlp": 0.01039598, + "balance_loss_clip": 1.04812348, + "balance_loss_mlp": 1.024423, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 2.5484484464334622, + "language_loss": 0.64922929, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67085284, + "num_input_tokens_seen": 121404735, + "step": 5655, + "time_per_iteration": 2.6721978187561035 + }, + { + "auxiliary_loss_clip": 0.01131132, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.05314541, + "balance_loss_mlp": 1.02319002, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.8588785781477373, + "language_loss": 0.76512361, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78681511, + "num_input_tokens_seen": 121426780, + "step": 5656, + "time_per_iteration": 2.655317544937134 + }, + { + "auxiliary_loss_clip": 0.01106009, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.04460537, + "balance_loss_mlp": 1.02225399, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 1.8370356465559983, + "language_loss": 0.8269161, + "learning_rate": 3.073152647447525e-06, + "loss": 0.84835732, + "num_input_tokens_seen": 121447245, + "step": 5657, + "time_per_iteration": 4.070458173751831 + }, + { + "auxiliary_loss_clip": 0.0111283, + "auxiliary_loss_mlp": 0.01037878, + "balance_loss_clip": 1.04860616, + "balance_loss_mlp": 1.02319753, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 3.204641878496659, + "language_loss": 0.84873116, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87023824, + "num_input_tokens_seen": 121468165, + "step": 5658, + "time_per_iteration": 2.577155590057373 + }, + { + "auxiliary_loss_clip": 0.01060576, + "auxiliary_loss_mlp": 0.01003838, + "balance_loss_clip": 1.03816688, + "balance_loss_mlp": 1.00193024, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8156385659961748, + "language_loss": 0.60057247, + "learning_rate": 3.072495270199477e-06, + "loss": 0.6212166, + "num_input_tokens_seen": 121523795, + "step": 5659, + "time_per_iteration": 3.054455518722534 + }, + { + "auxiliary_loss_clip": 0.01140126, + "auxiliary_loss_mlp": 0.01036278, + "balance_loss_clip": 1.05449986, + "balance_loss_mlp": 1.02160931, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.8612962327918194, + "language_loss": 0.68119133, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.70295537, + "num_input_tokens_seen": 121542950, + "step": 5660, + "time_per_iteration": 2.5065553188323975 + }, + { + "auxiliary_loss_clip": 0.01143526, + "auxiliary_loss_mlp": 0.01046869, + "balance_loss_clip": 1.05509257, + "balance_loss_mlp": 1.03145552, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 2.3642563566999826, + "language_loss": 0.67991167, + "learning_rate": 3.071837730274918e-06, + "loss": 0.70181561, + "num_input_tokens_seen": 121562765, + "step": 5661, + "time_per_iteration": 4.016141414642334 + }, + { + "auxiliary_loss_clip": 0.01119537, + "auxiliary_loss_mlp": 0.0104002, + "balance_loss_clip": 1.0535593, + "balance_loss_mlp": 1.0258584, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 2.1998638744221277, + "language_loss": 0.79023218, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81182778, + "num_input_tokens_seen": 121581610, + "step": 5662, + "time_per_iteration": 2.5508041381835938 + }, + { + "auxiliary_loss_clip": 0.01103092, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.04855037, + "balance_loss_mlp": 1.0227685, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 1.8708479238770652, + "language_loss": 0.73524404, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75666261, + "num_input_tokens_seen": 121601885, + "step": 5663, + "time_per_iteration": 2.598572254180908 + }, + { + "auxiliary_loss_clip": 0.01097713, + "auxiliary_loss_mlp": 0.01037768, + "balance_loss_clip": 1.04912019, + "balance_loss_mlp": 1.0233674, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 1.621262003158948, + "language_loss": 0.86446047, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88581526, + "num_input_tokens_seen": 121621335, + "step": 5664, + "time_per_iteration": 2.5743048191070557 + }, + { + "auxiliary_loss_clip": 0.01143132, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.0542239, + "balance_loss_mlp": 1.02456141, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 2.1153513799138857, + "language_loss": 0.68817735, + "learning_rate": 3.070522162795235e-06, + "loss": 0.70999992, + "num_input_tokens_seen": 121641310, + "step": 5665, + "time_per_iteration": 2.464111089706421 + }, + { + "auxiliary_loss_clip": 0.01139668, + "auxiliary_loss_mlp": 0.01039872, + "balance_loss_clip": 1.05081272, + "balance_loss_mlp": 1.02383804, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 3.3220214880643537, + "language_loss": 0.7321893, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.75398469, + "num_input_tokens_seen": 121659625, + "step": 5666, + "time_per_iteration": 2.461148977279663 + }, + { + "auxiliary_loss_clip": 0.01131698, + "auxiliary_loss_mlp": 0.01037808, + "balance_loss_clip": 1.05024004, + "balance_loss_mlp": 1.02264452, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 2.976869991605678, + "language_loss": 0.73042607, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75212109, + "num_input_tokens_seen": 121679205, + "step": 5667, + "time_per_iteration": 3.8865129947662354 + }, + { + "auxiliary_loss_clip": 0.01041822, + "auxiliary_loss_mlp": 0.01000238, + "balance_loss_clip": 1.03437757, + "balance_loss_mlp": 0.99840242, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.870902906947516, + "language_loss": 0.63275176, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65317237, + "num_input_tokens_seen": 121751085, + "step": 5668, + "time_per_iteration": 3.2599847316741943 + }, + { + "auxiliary_loss_clip": 0.01051254, + "auxiliary_loss_mlp": 0.0104517, + "balance_loss_clip": 1.04012132, + "balance_loss_mlp": 1.02895784, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 2.50224080264903, + "language_loss": 0.71892953, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.73989367, + "num_input_tokens_seen": 121768565, + "step": 5669, + "time_per_iteration": 2.688992500305176 + }, + { + "auxiliary_loss_clip": 0.01104541, + "auxiliary_loss_mlp": 0.00780006, + "balance_loss_clip": 1.04933882, + "balance_loss_mlp": 1.00051129, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 2.086315425817454, + "language_loss": 0.80451834, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82336378, + "num_input_tokens_seen": 121784925, + "step": 5670, + "time_per_iteration": 2.5249602794647217 + }, + { + "auxiliary_loss_clip": 0.01091582, + "auxiliary_loss_mlp": 0.01040529, + "balance_loss_clip": 1.04341602, + "balance_loss_mlp": 1.02511573, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.6451433322223659, + "language_loss": 0.77033228, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79165339, + "num_input_tokens_seen": 121804425, + "step": 5671, + "time_per_iteration": 2.6494100093841553 + }, + { + "auxiliary_loss_clip": 0.01139752, + "auxiliary_loss_mlp": 0.00781873, + "balance_loss_clip": 1.05106235, + "balance_loss_mlp": 1.00047672, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 2.1201246767253568, + "language_loss": 0.74578202, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.7649982, + "num_input_tokens_seen": 121825145, + "step": 5672, + "time_per_iteration": 2.519521474838257 + }, + { + "auxiliary_loss_clip": 0.01125202, + "auxiliary_loss_mlp": 0.01048054, + "balance_loss_clip": 1.04811049, + "balance_loss_mlp": 1.03155518, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.7676234200082555, + "language_loss": 0.7365278, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75826031, + "num_input_tokens_seen": 121842185, + "step": 5673, + "time_per_iteration": 2.4780852794647217 + }, + { + "auxiliary_loss_clip": 0.01129683, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.05110431, + "balance_loss_mlp": 1.02262068, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.8099697946394466, + "language_loss": 0.80174005, + "learning_rate": 3.067559762415682e-06, + "loss": 0.82341576, + "num_input_tokens_seen": 121862260, + "step": 5674, + "time_per_iteration": 2.509626626968384 + }, + { + "auxiliary_loss_clip": 0.01066413, + "auxiliary_loss_mlp": 0.01008128, + "balance_loss_clip": 1.03551722, + "balance_loss_mlp": 1.00658989, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7900643178684715, + "language_loss": 0.56111622, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58186162, + "num_input_tokens_seen": 121923560, + "step": 5675, + "time_per_iteration": 3.192898750305176 + }, + { + "auxiliary_loss_clip": 0.01119003, + "auxiliary_loss_mlp": 0.0078071, + "balance_loss_clip": 1.05249548, + "balance_loss_mlp": 1.00042927, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.7963156755774818, + "language_loss": 0.78855503, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.8075521, + "num_input_tokens_seen": 121943515, + "step": 5676, + "time_per_iteration": 2.561347723007202 + }, + { + "auxiliary_loss_clip": 0.01125347, + "auxiliary_loss_mlp": 0.01036277, + "balance_loss_clip": 1.04725623, + "balance_loss_mlp": 1.02104235, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.8208771286817038, + "language_loss": 0.85497129, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87658751, + "num_input_tokens_seen": 121962540, + "step": 5677, + "time_per_iteration": 2.5007851123809814 + }, + { + "auxiliary_loss_clip": 0.01115785, + "auxiliary_loss_mlp": 0.01042952, + "balance_loss_clip": 1.05094326, + "balance_loss_mlp": 1.02691245, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 2.0004919928070057, + "language_loss": 0.79221237, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81379968, + "num_input_tokens_seen": 121979830, + "step": 5678, + "time_per_iteration": 2.551588296890259 + }, + { + "auxiliary_loss_clip": 0.0112816, + "auxiliary_loss_mlp": 0.01033233, + "balance_loss_clip": 1.04999781, + "balance_loss_mlp": 1.01884472, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.668929546416599, + "language_loss": 0.7497282, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.77134216, + "num_input_tokens_seen": 121999055, + "step": 5679, + "time_per_iteration": 2.537214756011963 + }, + { + "auxiliary_loss_clip": 0.01059877, + "auxiliary_loss_mlp": 0.01002615, + "balance_loss_clip": 1.0390811, + "balance_loss_mlp": 1.00116658, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7252124178382523, + "language_loss": 0.59449106, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.615116, + "num_input_tokens_seen": 122067015, + "step": 5680, + "time_per_iteration": 3.122957944869995 + }, + { + "auxiliary_loss_clip": 0.01113637, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.04729557, + "balance_loss_mlp": 1.02206421, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 3.133830369582173, + "language_loss": 0.72378623, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.7452904, + "num_input_tokens_seen": 122085295, + "step": 5681, + "time_per_iteration": 2.535559892654419 + }, + { + "auxiliary_loss_clip": 0.01114312, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.04933679, + "balance_loss_mlp": 1.02559328, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.1627692574969184, + "language_loss": 0.71487379, + "learning_rate": 3.064923764577233e-06, + "loss": 0.73641694, + "num_input_tokens_seen": 122104020, + "step": 5682, + "time_per_iteration": 4.424094200134277 + }, + { + "auxiliary_loss_clip": 0.01137445, + "auxiliary_loss_mlp": 0.01040654, + "balance_loss_clip": 1.04879916, + "balance_loss_mlp": 1.02551436, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.9644949138402519, + "language_loss": 0.83718908, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.85897005, + "num_input_tokens_seen": 122125080, + "step": 5683, + "time_per_iteration": 2.5336108207702637 + }, + { + "auxiliary_loss_clip": 0.01119215, + "auxiliary_loss_mlp": 0.01046656, + "balance_loss_clip": 1.05207586, + "balance_loss_mlp": 1.03129613, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 2.200253290682589, + "language_loss": 0.70617539, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72783411, + "num_input_tokens_seen": 122146350, + "step": 5684, + "time_per_iteration": 2.577786684036255 + }, + { + "auxiliary_loss_clip": 0.01136869, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.05165029, + "balance_loss_mlp": 1.02065492, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.483257085015041, + "language_loss": 0.7500912, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77180731, + "num_input_tokens_seen": 122168085, + "step": 5685, + "time_per_iteration": 2.514254331588745 + }, + { + "auxiliary_loss_clip": 0.0111967, + "auxiliary_loss_mlp": 0.01042407, + "balance_loss_clip": 1.04685462, + "balance_loss_mlp": 1.02698135, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 3.8727406329220586, + "language_loss": 0.70361292, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72523367, + "num_input_tokens_seen": 122191040, + "step": 5686, + "time_per_iteration": 2.5719048976898193 + }, + { + "auxiliary_loss_clip": 0.01128013, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.04987645, + "balance_loss_mlp": 1.02557755, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 1.9393800590880514, + "language_loss": 0.77654302, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79823351, + "num_input_tokens_seen": 122209225, + "step": 5687, + "time_per_iteration": 2.4857137203216553 + }, + { + "auxiliary_loss_clip": 0.01110422, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.04885268, + "balance_loss_mlp": 1.02064037, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 1.7602708453303504, + "language_loss": 0.86544812, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88690996, + "num_input_tokens_seen": 122226160, + "step": 5688, + "time_per_iteration": 2.519315004348755 + }, + { + "auxiliary_loss_clip": 0.01124082, + "auxiliary_loss_mlp": 0.0104161, + "balance_loss_clip": 1.04925096, + "balance_loss_mlp": 1.0253973, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 2.0892186135611293, + "language_loss": 0.79386002, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.81551695, + "num_input_tokens_seen": 122243115, + "step": 5689, + "time_per_iteration": 2.606715202331543 + }, + { + "auxiliary_loss_clip": 0.01128461, + "auxiliary_loss_mlp": 0.01043423, + "balance_loss_clip": 1.04849839, + "balance_loss_mlp": 1.02685261, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 2.3157557567199603, + "language_loss": 0.73725367, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75897253, + "num_input_tokens_seen": 122261105, + "step": 5690, + "time_per_iteration": 2.507883071899414 + }, + { + "auxiliary_loss_clip": 0.0111897, + "auxiliary_loss_mlp": 0.01044061, + "balance_loss_clip": 1.04415154, + "balance_loss_mlp": 1.02832496, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 1.9972863473082052, + "language_loss": 0.76311392, + "learning_rate": 3.061955178104237e-06, + "loss": 0.7847442, + "num_input_tokens_seen": 122279995, + "step": 5691, + "time_per_iteration": 2.5972414016723633 + }, + { + "auxiliary_loss_clip": 0.01121983, + "auxiliary_loss_mlp": 0.01034333, + "balance_loss_clip": 1.04802597, + "balance_loss_mlp": 1.02072501, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.5743577414502286, + "language_loss": 0.67901015, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70057333, + "num_input_tokens_seen": 122299070, + "step": 5692, + "time_per_iteration": 2.5593082904815674 + }, + { + "auxiliary_loss_clip": 0.01125021, + "auxiliary_loss_mlp": 0.01043442, + "balance_loss_clip": 1.04638076, + "balance_loss_mlp": 1.02728927, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.844383042423285, + "language_loss": 0.72189802, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74358261, + "num_input_tokens_seen": 122316800, + "step": 5693, + "time_per_iteration": 2.4708852767944336 + }, + { + "auxiliary_loss_clip": 0.01090854, + "auxiliary_loss_mlp": 0.01042736, + "balance_loss_clip": 1.04101634, + "balance_loss_mlp": 1.02783513, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.7824379691480354, + "language_loss": 0.75291586, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.7742517, + "num_input_tokens_seen": 122335275, + "step": 5694, + "time_per_iteration": 2.582296133041382 + }, + { + "auxiliary_loss_clip": 0.0109428, + "auxiliary_loss_mlp": 0.01039632, + "balance_loss_clip": 1.04697418, + "balance_loss_mlp": 1.02530932, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.9186625152911285, + "language_loss": 0.79872149, + "learning_rate": 3.060634758790747e-06, + "loss": 0.82006061, + "num_input_tokens_seen": 122353215, + "step": 5695, + "time_per_iteration": 2.5608091354370117 + }, + { + "auxiliary_loss_clip": 0.01083366, + "auxiliary_loss_mlp": 0.01041624, + "balance_loss_clip": 1.04174447, + "balance_loss_mlp": 1.02647257, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 2.1739580419647044, + "language_loss": 0.73478276, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75603265, + "num_input_tokens_seen": 122372495, + "step": 5696, + "time_per_iteration": 2.6543667316436768 + }, + { + "auxiliary_loss_clip": 0.01093336, + "auxiliary_loss_mlp": 0.01057071, + "balance_loss_clip": 1.04012132, + "balance_loss_mlp": 1.03966618, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 1.7064406984625555, + "language_loss": 0.70820123, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.72970533, + "num_input_tokens_seen": 122394600, + "step": 5697, + "time_per_iteration": 4.106454610824585 + }, + { + "auxiliary_loss_clip": 0.01112971, + "auxiliary_loss_mlp": 0.01032334, + "balance_loss_clip": 1.04809713, + "balance_loss_mlp": 1.01767111, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 1.8452510628861571, + "language_loss": 0.82200456, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84345764, + "num_input_tokens_seen": 122414700, + "step": 5698, + "time_per_iteration": 2.546560287475586 + }, + { + "auxiliary_loss_clip": 0.01083597, + "auxiliary_loss_mlp": 0.01054168, + "balance_loss_clip": 1.04474783, + "balance_loss_mlp": 1.03592873, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 2.1682699180645577, + "language_loss": 0.6932199, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71459758, + "num_input_tokens_seen": 122432760, + "step": 5699, + "time_per_iteration": 2.620602607727051 + }, + { + "auxiliary_loss_clip": 0.01112004, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.04589415, + "balance_loss_mlp": 1.0190016, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 5.248386577563968, + "language_loss": 0.72469509, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74615169, + "num_input_tokens_seen": 122449105, + "step": 5700, + "time_per_iteration": 2.5555646419525146 + }, + { + "auxiliary_loss_clip": 0.01109959, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.05070961, + "balance_loss_mlp": 1.02107549, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 1.9868301933961605, + "language_loss": 0.81808859, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.83954179, + "num_input_tokens_seen": 122468700, + "step": 5701, + "time_per_iteration": 4.018882989883423 + }, + { + "auxiliary_loss_clip": 0.01119546, + "auxiliary_loss_mlp": 0.01042327, + "balance_loss_clip": 1.04751158, + "balance_loss_mlp": 1.02766454, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.7224144949429419, + "language_loss": 0.71628535, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73790407, + "num_input_tokens_seen": 122488160, + "step": 5702, + "time_per_iteration": 2.4770407676696777 + }, + { + "auxiliary_loss_clip": 0.0103483, + "auxiliary_loss_mlp": 0.01002685, + "balance_loss_clip": 1.02708256, + "balance_loss_mlp": 1.0009799, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.7820971018084307, + "language_loss": 0.5745163, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59489155, + "num_input_tokens_seen": 122542890, + "step": 5703, + "time_per_iteration": 3.023967742919922 + }, + { + "auxiliary_loss_clip": 0.01123916, + "auxiliary_loss_mlp": 0.01042873, + "balance_loss_clip": 1.04504681, + "balance_loss_mlp": 1.0263561, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 1.7550343181029182, + "language_loss": 0.75059116, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77225912, + "num_input_tokens_seen": 122561770, + "step": 5704, + "time_per_iteration": 2.5258281230926514 + }, + { + "auxiliary_loss_clip": 0.01097952, + "auxiliary_loss_mlp": 0.01038047, + "balance_loss_clip": 1.04505277, + "balance_loss_mlp": 1.02449882, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 2.149933928107565, + "language_loss": 0.7255553, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.74691534, + "num_input_tokens_seen": 122580580, + "step": 5705, + "time_per_iteration": 2.555765151977539 + }, + { + "auxiliary_loss_clip": 0.01097795, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.0477773, + "balance_loss_mlp": 1.02042961, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 4.724657855897227, + "language_loss": 0.79684424, + "learning_rate": 3.057000289991289e-06, + "loss": 0.81817436, + "num_input_tokens_seen": 122599810, + "step": 5706, + "time_per_iteration": 2.660663366317749 + }, + { + "auxiliary_loss_clip": 0.01121685, + "auxiliary_loss_mlp": 0.01033519, + "balance_loss_clip": 1.05055356, + "balance_loss_mlp": 1.01811111, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 2.3931213900116033, + "language_loss": 0.83025503, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85180706, + "num_input_tokens_seen": 122616035, + "step": 5707, + "time_per_iteration": 3.9280917644500732 + }, + { + "auxiliary_loss_clip": 0.01124814, + "auxiliary_loss_mlp": 0.01036593, + "balance_loss_clip": 1.04847121, + "balance_loss_mlp": 1.02201343, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.5336323703419843, + "language_loss": 0.75362885, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77524292, + "num_input_tokens_seen": 122633785, + "step": 5708, + "time_per_iteration": 2.461540937423706 + }, + { + "auxiliary_loss_clip": 0.0110324, + "auxiliary_loss_mlp": 0.01041519, + "balance_loss_clip": 1.04422641, + "balance_loss_mlp": 1.02713048, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.7018124196860187, + "language_loss": 0.81199992, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83344746, + "num_input_tokens_seen": 122652100, + "step": 5709, + "time_per_iteration": 2.5656092166900635 + }, + { + "auxiliary_loss_clip": 0.01115632, + "auxiliary_loss_mlp": 0.01040204, + "balance_loss_clip": 1.04846895, + "balance_loss_mlp": 1.02383661, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.0940646434712584, + "language_loss": 0.7880187, + "learning_rate": 3.055677461649329e-06, + "loss": 0.80957705, + "num_input_tokens_seen": 122669720, + "step": 5710, + "time_per_iteration": 2.5627315044403076 + }, + { + "auxiliary_loss_clip": 0.01126699, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.04677773, + "balance_loss_mlp": 1.02177167, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 2.692368065858351, + "language_loss": 0.70306259, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72471118, + "num_input_tokens_seen": 122688715, + "step": 5711, + "time_per_iteration": 2.481107711791992 + }, + { + "auxiliary_loss_clip": 0.01103451, + "auxiliary_loss_mlp": 0.00783529, + "balance_loss_clip": 1.04402936, + "balance_loss_mlp": 1.00039756, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.96117695264813, + "language_loss": 0.67066348, + "learning_rate": 3.055015807239812e-06, + "loss": 0.68953329, + "num_input_tokens_seen": 122706970, + "step": 5712, + "time_per_iteration": 2.5606679916381836 + }, + { + "auxiliary_loss_clip": 0.01032962, + "auxiliary_loss_mlp": 0.00999348, + "balance_loss_clip": 1.03010106, + "balance_loss_mlp": 0.99742854, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8490560894347438, + "language_loss": 0.58112723, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60145032, + "num_input_tokens_seen": 122758095, + "step": 5713, + "time_per_iteration": 3.0973591804504395 + }, + { + "auxiliary_loss_clip": 0.01134618, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.04619157, + "balance_loss_mlp": 1.02103782, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.6270572204567706, + "language_loss": 0.80557072, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82727659, + "num_input_tokens_seen": 122777815, + "step": 5714, + "time_per_iteration": 2.459057092666626 + }, + { + "auxiliary_loss_clip": 0.01134961, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.04715621, + "balance_loss_mlp": 1.02668619, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 2.8806725342299373, + "language_loss": 0.72155035, + "learning_rate": 3.05402302560962e-06, + "loss": 0.74332446, + "num_input_tokens_seen": 122797555, + "step": 5715, + "time_per_iteration": 2.518897771835327 + }, + { + "auxiliary_loss_clip": 0.01043007, + "auxiliary_loss_mlp": 0.01022851, + "balance_loss_clip": 1.02825141, + "balance_loss_mlp": 1.02090764, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.9267962452524166, + "language_loss": 0.65832251, + "learning_rate": 3.053692018445505e-06, + "loss": 0.67898113, + "num_input_tokens_seen": 122863955, + "step": 5716, + "time_per_iteration": 3.0879368782043457 + }, + { + "auxiliary_loss_clip": 0.01120665, + "auxiliary_loss_mlp": 0.01045318, + "balance_loss_clip": 1.04568458, + "balance_loss_mlp": 1.03004682, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 2.381001777849116, + "language_loss": 0.74587822, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76753801, + "num_input_tokens_seen": 122883000, + "step": 5717, + "time_per_iteration": 2.458216428756714 + }, + { + "auxiliary_loss_clip": 0.01088599, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.0521003, + "balance_loss_mlp": 1.02820301, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 1.8429205499896222, + "language_loss": 0.7524296, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77374494, + "num_input_tokens_seen": 122903265, + "step": 5718, + "time_per_iteration": 2.6746954917907715 + }, + { + "auxiliary_loss_clip": 0.01098199, + "auxiliary_loss_mlp": 0.01047498, + "balance_loss_clip": 1.04634643, + "balance_loss_mlp": 1.03146994, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.761142743510998, + "language_loss": 0.63434112, + "learning_rate": 3.052698757266734e-06, + "loss": 0.65579808, + "num_input_tokens_seen": 122923860, + "step": 5719, + "time_per_iteration": 2.731374502182007 + }, + { + "auxiliary_loss_clip": 0.010933, + "auxiliary_loss_mlp": 0.01041323, + "balance_loss_clip": 1.04265773, + "balance_loss_mlp": 1.02420402, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 1.8225115807239567, + "language_loss": 0.73579776, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75714397, + "num_input_tokens_seen": 122945305, + "step": 5720, + "time_per_iteration": 2.704456090927124 + }, + { + "auxiliary_loss_clip": 0.01120495, + "auxiliary_loss_mlp": 0.01053436, + "balance_loss_clip": 1.04494286, + "balance_loss_mlp": 1.03522062, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.6818306695724652, + "language_loss": 0.74183965, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76357895, + "num_input_tokens_seen": 122962535, + "step": 5721, + "time_per_iteration": 2.5114221572875977 + }, + { + "auxiliary_loss_clip": 0.01111883, + "auxiliary_loss_mlp": 0.00783892, + "balance_loss_clip": 1.04493904, + "balance_loss_mlp": 1.0003351, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 1.9955333822818804, + "language_loss": 0.80121946, + "learning_rate": 3.051705136821992e-06, + "loss": 0.8201772, + "num_input_tokens_seen": 122979750, + "step": 5722, + "time_per_iteration": 4.14131498336792 + }, + { + "auxiliary_loss_clip": 0.01083513, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.04261208, + "balance_loss_mlp": 1.02172327, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.7511704603829592, + "language_loss": 0.8157208, + "learning_rate": 3.051373850228801e-06, + "loss": 0.83691132, + "num_input_tokens_seen": 122998955, + "step": 5723, + "time_per_iteration": 2.5830039978027344 + }, + { + "auxiliary_loss_clip": 0.01100646, + "auxiliary_loss_mlp": 0.01057359, + "balance_loss_clip": 1.04477906, + "balance_loss_mlp": 1.04115856, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.9152082541362871, + "language_loss": 0.81174016, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.8333202, + "num_input_tokens_seen": 123016165, + "step": 5724, + "time_per_iteration": 2.5312397480010986 + }, + { + "auxiliary_loss_clip": 0.01112177, + "auxiliary_loss_mlp": 0.01048351, + "balance_loss_clip": 1.04697299, + "balance_loss_mlp": 1.03162622, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 1.8370656209652805, + "language_loss": 0.6923033, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71390855, + "num_input_tokens_seen": 123036900, + "step": 5725, + "time_per_iteration": 2.715420961380005 + }, + { + "auxiliary_loss_clip": 0.01127761, + "auxiliary_loss_mlp": 0.01044609, + "balance_loss_clip": 1.04897499, + "balance_loss_mlp": 1.02768135, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.6102309660093235, + "language_loss": 0.6959132, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71763682, + "num_input_tokens_seen": 123057480, + "step": 5726, + "time_per_iteration": 2.6806788444519043 + }, + { + "auxiliary_loss_clip": 0.01106581, + "auxiliary_loss_mlp": 0.01040597, + "balance_loss_clip": 1.04927444, + "balance_loss_mlp": 1.02636981, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 2.887105528574855, + "language_loss": 0.73596299, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.75743479, + "num_input_tokens_seen": 123076890, + "step": 5727, + "time_per_iteration": 2.6849591732025146 + }, + { + "auxiliary_loss_clip": 0.01098747, + "auxiliary_loss_mlp": 0.01046269, + "balance_loss_clip": 1.04686666, + "balance_loss_mlp": 1.02994978, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 2.105291378388228, + "language_loss": 0.88205469, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90350485, + "num_input_tokens_seen": 123092530, + "step": 5728, + "time_per_iteration": 2.643383502960205 + }, + { + "auxiliary_loss_clip": 0.01089164, + "auxiliary_loss_mlp": 0.0104627, + "balance_loss_clip": 1.04273307, + "balance_loss_mlp": 1.03073096, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.779039449555202, + "language_loss": 0.69841301, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.71976739, + "num_input_tokens_seen": 123110560, + "step": 5729, + "time_per_iteration": 2.7237987518310547 + }, + { + "auxiliary_loss_clip": 0.01122341, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.04730463, + "balance_loss_mlp": 1.01791167, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 1.8593484224157901, + "language_loss": 0.74322987, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.76478046, + "num_input_tokens_seen": 123128655, + "step": 5730, + "time_per_iteration": 2.566742181777954 + }, + { + "auxiliary_loss_clip": 0.01096929, + "auxiliary_loss_mlp": 0.01050708, + "balance_loss_clip": 1.04135633, + "balance_loss_mlp": 1.03404272, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 2.2207733440077098, + "language_loss": 0.79522359, + "learning_rate": 3.048722123283578e-06, + "loss": 0.81669998, + "num_input_tokens_seen": 123145130, + "step": 5731, + "time_per_iteration": 2.5719852447509766 + }, + { + "auxiliary_loss_clip": 0.0112652, + "auxiliary_loss_mlp": 0.01039337, + "balance_loss_clip": 1.04929495, + "balance_loss_mlp": 1.02442956, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 1.969541688253782, + "language_loss": 0.77929914, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.80095768, + "num_input_tokens_seen": 123162265, + "step": 5732, + "time_per_iteration": 2.525360107421875 + }, + { + "auxiliary_loss_clip": 0.01027331, + "auxiliary_loss_mlp": 0.01039638, + "balance_loss_clip": 1.02487183, + "balance_loss_mlp": 1.03795767, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.7491996299720339, + "language_loss": 0.53526843, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55593812, + "num_input_tokens_seen": 123218620, + "step": 5733, + "time_per_iteration": 3.2304935455322266 + }, + { + "auxiliary_loss_clip": 0.01117443, + "auxiliary_loss_mlp": 0.01042568, + "balance_loss_clip": 1.04873133, + "balance_loss_mlp": 1.02726138, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.9119867873280838, + "language_loss": 0.83434367, + "learning_rate": 3.047727069167207e-06, + "loss": 0.8559438, + "num_input_tokens_seen": 123237325, + "step": 5734, + "time_per_iteration": 2.623608112335205 + }, + { + "auxiliary_loss_clip": 0.01115009, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.04782701, + "balance_loss_mlp": 1.01645756, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 2.0842068177039565, + "language_loss": 0.92499912, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.946468, + "num_input_tokens_seen": 123258650, + "step": 5735, + "time_per_iteration": 2.660661458969116 + }, + { + "auxiliary_loss_clip": 0.01098992, + "auxiliary_loss_mlp": 0.01040419, + "balance_loss_clip": 1.05079794, + "balance_loss_mlp": 1.02451634, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.702164799753229, + "language_loss": 0.76641321, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78780729, + "num_input_tokens_seen": 123277155, + "step": 5736, + "time_per_iteration": 4.114710569381714 + }, + { + "auxiliary_loss_clip": 0.01120314, + "auxiliary_loss_mlp": 0.01040223, + "balance_loss_clip": 1.05113792, + "balance_loss_mlp": 1.02494073, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.6545738319076506, + "language_loss": 0.78952843, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.8111338, + "num_input_tokens_seen": 123297640, + "step": 5737, + "time_per_iteration": 2.5613651275634766 + }, + { + "auxiliary_loss_clip": 0.01088296, + "auxiliary_loss_mlp": 0.01047873, + "balance_loss_clip": 1.04368174, + "balance_loss_mlp": 1.02971697, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.247889812089208, + "language_loss": 0.71167076, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73303246, + "num_input_tokens_seen": 123314370, + "step": 5738, + "time_per_iteration": 2.6306090354919434 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.04447126, + "balance_loss_mlp": 1.01958954, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 2.249716379153133, + "language_loss": 0.8167212, + "learning_rate": 3.046067851209389e-06, + "loss": 0.83809072, + "num_input_tokens_seen": 123336085, + "step": 5739, + "time_per_iteration": 2.642112970352173 + }, + { + "auxiliary_loss_clip": 0.01107795, + "auxiliary_loss_mlp": 0.01037738, + "balance_loss_clip": 1.0497911, + "balance_loss_mlp": 1.02216959, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 1.9054645186185382, + "language_loss": 0.82675087, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.84820622, + "num_input_tokens_seen": 123354460, + "step": 5740, + "time_per_iteration": 4.059816122055054 + }, + { + "auxiliary_loss_clip": 0.01132986, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.05419052, + "balance_loss_mlp": 1.01661515, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 3.2364808487897605, + "language_loss": 0.76960278, + "learning_rate": 3.045403886269181e-06, + "loss": 0.79126626, + "num_input_tokens_seen": 123373420, + "step": 5741, + "time_per_iteration": 2.4972126483917236 + }, + { + "auxiliary_loss_clip": 0.01113632, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.04596782, + "balance_loss_mlp": 1.02137411, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.6362676442377677, + "language_loss": 0.77111024, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79261446, + "num_input_tokens_seen": 123394730, + "step": 5742, + "time_per_iteration": 2.6037678718566895 + }, + { + "auxiliary_loss_clip": 0.01125435, + "auxiliary_loss_mlp": 0.01037403, + "balance_loss_clip": 1.04661512, + "balance_loss_mlp": 1.02201271, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 1.8903710413624384, + "language_loss": 0.75668156, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.77831, + "num_input_tokens_seen": 123412895, + "step": 5743, + "time_per_iteration": 2.477011203765869 + }, + { + "auxiliary_loss_clip": 0.01127173, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.05093336, + "balance_loss_mlp": 1.02359092, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 3.4014269033386397, + "language_loss": 0.70119441, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.7228508, + "num_input_tokens_seen": 123432320, + "step": 5744, + "time_per_iteration": 2.596914768218994 + }, + { + "auxiliary_loss_clip": 0.01135709, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.04968214, + "balance_loss_mlp": 1.01744819, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 2.1798076372217747, + "language_loss": 0.79631412, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81800002, + "num_input_tokens_seen": 123450980, + "step": 5745, + "time_per_iteration": 2.440426826477051 + }, + { + "auxiliary_loss_clip": 0.0109685, + "auxiliary_loss_mlp": 0.01041047, + "balance_loss_clip": 1.04989409, + "balance_loss_mlp": 1.02484655, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.761290588675956, + "language_loss": 0.89332896, + "learning_rate": 3.043743280407182e-06, + "loss": 0.9147079, + "num_input_tokens_seen": 123469365, + "step": 5746, + "time_per_iteration": 3.955479145050049 + }, + { + "auxiliary_loss_clip": 0.01130358, + "auxiliary_loss_mlp": 0.01038709, + "balance_loss_clip": 1.05013239, + "balance_loss_mlp": 1.02315235, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 1.7007618387491183, + "language_loss": 0.64554524, + "learning_rate": 3.043411040447849e-06, + "loss": 0.66723597, + "num_input_tokens_seen": 123489425, + "step": 5747, + "time_per_iteration": 2.5044970512390137 + }, + { + "auxiliary_loss_clip": 0.01115118, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.0442251, + "balance_loss_mlp": 1.02299809, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5692549153498605, + "language_loss": 0.72811985, + "learning_rate": 3.043078760922264e-06, + "loss": 0.74964291, + "num_input_tokens_seen": 123509970, + "step": 5748, + "time_per_iteration": 2.6688244342803955 + }, + { + "auxiliary_loss_clip": 0.01080386, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.04505014, + "balance_loss_mlp": 1.01746917, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.5972644007234946, + "language_loss": 0.75503021, + "learning_rate": 3.042746441843029e-06, + "loss": 0.7761445, + "num_input_tokens_seen": 123531055, + "step": 5749, + "time_per_iteration": 2.6216351985931396 + }, + { + "auxiliary_loss_clip": 0.01040982, + "auxiliary_loss_mlp": 0.01002294, + "balance_loss_clip": 1.02756238, + "balance_loss_mlp": 1.00052965, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8821056370281806, + "language_loss": 0.62707347, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64750624, + "num_input_tokens_seen": 123584720, + "step": 5750, + "time_per_iteration": 2.9753830432891846 + }, + { + "auxiliary_loss_clip": 0.01110269, + "auxiliary_loss_mlp": 0.01033132, + "balance_loss_clip": 1.04902864, + "balance_loss_mlp": 1.01919007, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 2.2350678547704494, + "language_loss": 0.80462497, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82605904, + "num_input_tokens_seen": 123604465, + "step": 5751, + "time_per_iteration": 2.533536195755005 + }, + { + "auxiliary_loss_clip": 0.01135756, + "auxiliary_loss_mlp": 0.01044182, + "balance_loss_clip": 1.04992795, + "balance_loss_mlp": 1.02938807, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 2.189775197148131, + "language_loss": 0.83983672, + "learning_rate": 3.041749247409439e-06, + "loss": 0.86163616, + "num_input_tokens_seen": 123622320, + "step": 5752, + "time_per_iteration": 2.4350905418395996 + }, + { + "auxiliary_loss_clip": 0.01036135, + "auxiliary_loss_mlp": 0.00755538, + "balance_loss_clip": 1.02516174, + "balance_loss_mlp": 1.00025678, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7457118993538018, + "language_loss": 0.63077748, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.64869416, + "num_input_tokens_seen": 123678010, + "step": 5753, + "time_per_iteration": 2.958757162094116 + }, + { + "auxiliary_loss_clip": 0.01111475, + "auxiliary_loss_mlp": 0.01037368, + "balance_loss_clip": 1.04841709, + "balance_loss_mlp": 1.02148962, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.7573394126973787, + "language_loss": 0.70295411, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.72444254, + "num_input_tokens_seen": 123696830, + "step": 5754, + "time_per_iteration": 2.488718032836914 + }, + { + "auxiliary_loss_clip": 0.01129128, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.0498836, + "balance_loss_mlp": 1.02303779, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 1.8241780882880554, + "language_loss": 0.7287733, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75044686, + "num_input_tokens_seen": 123714360, + "step": 5755, + "time_per_iteration": 2.4551854133605957 + }, + { + "auxiliary_loss_clip": 0.0112249, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.04654241, + "balance_loss_mlp": 1.01866317, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.7008789592314593, + "language_loss": 0.72664773, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74820125, + "num_input_tokens_seen": 123739250, + "step": 5756, + "time_per_iteration": 2.6302530765533447 + }, + { + "auxiliary_loss_clip": 0.01046854, + "auxiliary_loss_mlp": 0.0100562, + "balance_loss_clip": 1.02640212, + "balance_loss_mlp": 1.00392687, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.7225483678005024, + "language_loss": 0.62560415, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64612889, + "num_input_tokens_seen": 123802845, + "step": 5757, + "time_per_iteration": 3.067471981048584 + }, + { + "auxiliary_loss_clip": 0.01025026, + "auxiliary_loss_mlp": 0.007557, + "balance_loss_clip": 1.02325749, + "balance_loss_mlp": 1.0003047, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.821135930405923, + "language_loss": 0.59231865, + "learning_rate": 3.039753792295362e-06, + "loss": 0.6101259, + "num_input_tokens_seen": 123861805, + "step": 5758, + "time_per_iteration": 3.0870556831359863 + }, + { + "auxiliary_loss_clip": 0.01119145, + "auxiliary_loss_mlp": 0.01041319, + "balance_loss_clip": 1.05168188, + "balance_loss_mlp": 1.02734733, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.645673831240158, + "language_loss": 0.71385002, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73545468, + "num_input_tokens_seen": 123881820, + "step": 5759, + "time_per_iteration": 2.543752908706665 + }, + { + "auxiliary_loss_clip": 0.01083038, + "auxiliary_loss_mlp": 0.01057936, + "balance_loss_clip": 1.04265952, + "balance_loss_mlp": 1.04078174, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 1.7672220499118843, + "language_loss": 0.8301506, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85156035, + "num_input_tokens_seen": 123903700, + "step": 5760, + "time_per_iteration": 2.6106250286102295 + }, + { + "auxiliary_loss_clip": 0.01030766, + "auxiliary_loss_mlp": 0.01008775, + "balance_loss_clip": 1.0278306, + "balance_loss_mlp": 1.00704634, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8358658489081676, + "language_loss": 0.56514931, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58554471, + "num_input_tokens_seen": 123960075, + "step": 5761, + "time_per_iteration": 4.6813085079193115 + }, + { + "auxiliary_loss_clip": 0.01119358, + "auxiliary_loss_mlp": 0.00780466, + "balance_loss_clip": 1.04309046, + "balance_loss_mlp": 1.00054169, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.1269007267367055, + "language_loss": 0.95232338, + "learning_rate": 3.038422700166474e-06, + "loss": 0.97132158, + "num_input_tokens_seen": 123975805, + "step": 5762, + "time_per_iteration": 2.449319839477539 + }, + { + "auxiliary_loss_clip": 0.01103921, + "auxiliary_loss_mlp": 0.01043248, + "balance_loss_clip": 1.04204619, + "balance_loss_mlp": 1.02757192, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 1.6210144910936746, + "language_loss": 0.69559544, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71706712, + "num_input_tokens_seen": 123997530, + "step": 5763, + "time_per_iteration": 2.6665990352630615 + }, + { + "auxiliary_loss_clip": 0.01126843, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.04718721, + "balance_loss_mlp": 1.02945065, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 1.743605898861913, + "language_loss": 0.83481485, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.85653961, + "num_input_tokens_seen": 124016375, + "step": 5764, + "time_per_iteration": 2.5270297527313232 + }, + { + "auxiliary_loss_clip": 0.01105152, + "auxiliary_loss_mlp": 0.01039334, + "balance_loss_clip": 1.04765844, + "balance_loss_mlp": 1.02451015, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.359193501300126, + "language_loss": 0.67772567, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.69917053, + "num_input_tokens_seen": 124033975, + "step": 5765, + "time_per_iteration": 2.5917158126831055 + }, + { + "auxiliary_loss_clip": 0.01112357, + "auxiliary_loss_mlp": 0.01044829, + "balance_loss_clip": 1.04926372, + "balance_loss_mlp": 1.02841353, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 1.8596585677517257, + "language_loss": 0.7710706, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79264247, + "num_input_tokens_seen": 124051930, + "step": 5766, + "time_per_iteration": 2.511326313018799 + }, + { + "auxiliary_loss_clip": 0.01081504, + "auxiliary_loss_mlp": 0.01039682, + "balance_loss_clip": 1.04508996, + "balance_loss_mlp": 1.02580619, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.5956629051989144, + "language_loss": 0.73197079, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75318265, + "num_input_tokens_seen": 124071220, + "step": 5767, + "time_per_iteration": 2.586455821990967 + }, + { + "auxiliary_loss_clip": 0.01110641, + "auxiliary_loss_mlp": 0.01044349, + "balance_loss_clip": 1.04905927, + "balance_loss_mlp": 1.02901888, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 1.8076361131450032, + "language_loss": 0.78264582, + "learning_rate": 3.036424880912893e-06, + "loss": 0.8041957, + "num_input_tokens_seen": 124090140, + "step": 5768, + "time_per_iteration": 2.5664663314819336 + }, + { + "auxiliary_loss_clip": 0.0104644, + "auxiliary_loss_mlp": 0.01004586, + "balance_loss_clip": 1.02602756, + "balance_loss_mlp": 1.00278556, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7715264794083121, + "language_loss": 0.57457453, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59508479, + "num_input_tokens_seen": 124152025, + "step": 5769, + "time_per_iteration": 3.088996171951294 + }, + { + "auxiliary_loss_clip": 0.01110362, + "auxiliary_loss_mlp": 0.01038358, + "balance_loss_clip": 1.04777777, + "balance_loss_mlp": 1.02072716, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.7810457656021295, + "language_loss": 0.85591745, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.87740463, + "num_input_tokens_seen": 124165795, + "step": 5770, + "time_per_iteration": 2.523674964904785 + }, + { + "auxiliary_loss_clip": 0.01039251, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.03243029, + "balance_loss_mlp": 1.02422607, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7742374177055293, + "language_loss": 0.59775209, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61841238, + "num_input_tokens_seen": 124222925, + "step": 5771, + "time_per_iteration": 2.871022939682007 + }, + { + "auxiliary_loss_clip": 0.01127777, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.04964662, + "balance_loss_mlp": 1.03319025, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 1.8141757192296388, + "language_loss": 0.71495372, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.73671782, + "num_input_tokens_seen": 124240915, + "step": 5772, + "time_per_iteration": 2.661123514175415 + }, + { + "auxiliary_loss_clip": 0.01112801, + "auxiliary_loss_mlp": 0.00781404, + "balance_loss_clip": 1.05183721, + "balance_loss_mlp": 1.00063121, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.57253520932542, + "language_loss": 0.76512229, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78406429, + "num_input_tokens_seen": 124262770, + "step": 5773, + "time_per_iteration": 2.7275662422180176 + }, + { + "auxiliary_loss_clip": 0.01129354, + "auxiliary_loss_mlp": 0.01038101, + "balance_loss_clip": 1.04835272, + "balance_loss_mlp": 1.02216244, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.360617662369923, + "language_loss": 0.70676017, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72843468, + "num_input_tokens_seen": 124280950, + "step": 5774, + "time_per_iteration": 2.5648081302642822 + }, + { + "auxiliary_loss_clip": 0.0111301, + "auxiliary_loss_mlp": 0.00780154, + "balance_loss_clip": 1.04747033, + "balance_loss_mlp": 1.00060821, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.740080927488621, + "language_loss": 0.76360583, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.78253746, + "num_input_tokens_seen": 124299540, + "step": 5775, + "time_per_iteration": 4.138142824172974 + }, + { + "auxiliary_loss_clip": 0.011158, + "auxiliary_loss_mlp": 0.0103819, + "balance_loss_clip": 1.04617083, + "balance_loss_mlp": 1.02208519, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.0390398000948835, + "language_loss": 0.77426499, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.79580486, + "num_input_tokens_seen": 124316285, + "step": 5776, + "time_per_iteration": 2.487018585205078 + }, + { + "auxiliary_loss_clip": 0.01040236, + "auxiliary_loss_mlp": 0.0101186, + "balance_loss_clip": 1.02908182, + "balance_loss_mlp": 1.01000011, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.880428296781552, + "language_loss": 0.63369066, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65421164, + "num_input_tokens_seen": 124376650, + "step": 5777, + "time_per_iteration": 3.1309969425201416 + }, + { + "auxiliary_loss_clip": 0.01099345, + "auxiliary_loss_mlp": 0.01037105, + "balance_loss_clip": 1.04843032, + "balance_loss_mlp": 1.02171469, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 2.6154957123066085, + "language_loss": 0.64746892, + "learning_rate": 3.033092039398119e-06, + "loss": 0.66883337, + "num_input_tokens_seen": 124396475, + "step": 5778, + "time_per_iteration": 2.603090763092041 + }, + { + "auxiliary_loss_clip": 0.01116864, + "auxiliary_loss_mlp": 0.01048574, + "balance_loss_clip": 1.04752433, + "balance_loss_mlp": 1.03305244, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 1.934408623290316, + "language_loss": 0.71394551, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73559988, + "num_input_tokens_seen": 124416480, + "step": 5779, + "time_per_iteration": 4.162119626998901 + }, + { + "auxiliary_loss_clip": 0.01142693, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.05076718, + "balance_loss_mlp": 1.02961314, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 5.2321782157017696, + "language_loss": 0.622949, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64482921, + "num_input_tokens_seen": 124435950, + "step": 5780, + "time_per_iteration": 2.4897067546844482 + }, + { + "auxiliary_loss_clip": 0.01098273, + "auxiliary_loss_mlp": 0.01041992, + "balance_loss_clip": 1.04517865, + "balance_loss_mlp": 1.02643502, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.6741451086846577, + "language_loss": 0.72184336, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74324596, + "num_input_tokens_seen": 124455410, + "step": 5781, + "time_per_iteration": 2.564687967300415 + }, + { + "auxiliary_loss_clip": 0.01082302, + "auxiliary_loss_mlp": 0.01053777, + "balance_loss_clip": 1.04310465, + "balance_loss_mlp": 1.03642607, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 2.1904719319905985, + "language_loss": 0.77001798, + "learning_rate": 3.031757805185612e-06, + "loss": 0.79137874, + "num_input_tokens_seen": 124474870, + "step": 5782, + "time_per_iteration": 2.597277879714966 + }, + { + "auxiliary_loss_clip": 0.01110434, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.04939258, + "balance_loss_mlp": 1.02009296, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 1.8877220977678963, + "language_loss": 0.62640929, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64786232, + "num_input_tokens_seen": 124494105, + "step": 5783, + "time_per_iteration": 2.5170116424560547 + }, + { + "auxiliary_loss_clip": 0.01094416, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.04837239, + "balance_loss_mlp": 1.01706648, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.864137623674045, + "language_loss": 0.88613898, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90739655, + "num_input_tokens_seen": 124512030, + "step": 5784, + "time_per_iteration": 2.5815107822418213 + }, + { + "auxiliary_loss_clip": 0.01090223, + "auxiliary_loss_mlp": 0.0103622, + "balance_loss_clip": 1.05048418, + "balance_loss_mlp": 1.02144992, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.7591629129462585, + "language_loss": 0.81622094, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83748537, + "num_input_tokens_seen": 124530980, + "step": 5785, + "time_per_iteration": 4.066852807998657 + }, + { + "auxiliary_loss_clip": 0.01111202, + "auxiliary_loss_mlp": 0.01043466, + "balance_loss_clip": 1.0464499, + "balance_loss_mlp": 1.02808833, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 2.299933188364638, + "language_loss": 0.80595422, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82750082, + "num_input_tokens_seen": 124549330, + "step": 5786, + "time_per_iteration": 2.5544092655181885 + }, + { + "auxiliary_loss_clip": 0.01138222, + "auxiliary_loss_mlp": 0.00781355, + "balance_loss_clip": 1.05267525, + "balance_loss_mlp": 1.00057054, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.787463940365769, + "language_loss": 0.74915612, + "learning_rate": 3.030089132216836e-06, + "loss": 0.76835191, + "num_input_tokens_seen": 124567200, + "step": 5787, + "time_per_iteration": 2.447582483291626 + }, + { + "auxiliary_loss_clip": 0.01109851, + "auxiliary_loss_mlp": 0.00782233, + "balance_loss_clip": 1.04395652, + "balance_loss_mlp": 1.00065553, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.561370731147942, + "language_loss": 0.81208134, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83100212, + "num_input_tokens_seen": 124587025, + "step": 5788, + "time_per_iteration": 2.596297025680542 + }, + { + "auxiliary_loss_clip": 0.01146354, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.05546403, + "balance_loss_mlp": 1.02251673, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 1.7484357945380968, + "language_loss": 0.85686791, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87871224, + "num_input_tokens_seen": 124605860, + "step": 5789, + "time_per_iteration": 2.448669195175171 + }, + { + "auxiliary_loss_clip": 0.01134801, + "auxiliary_loss_mlp": 0.01057655, + "balance_loss_clip": 1.05559587, + "balance_loss_mlp": 1.04215789, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.9998046968235463, + "language_loss": 0.85020328, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87212777, + "num_input_tokens_seen": 124624270, + "step": 5790, + "time_per_iteration": 2.474276065826416 + }, + { + "auxiliary_loss_clip": 0.01131767, + "auxiliary_loss_mlp": 0.01044625, + "balance_loss_clip": 1.05123162, + "balance_loss_mlp": 1.02889514, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 1.9379331565396059, + "language_loss": 0.81527758, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.8370415, + "num_input_tokens_seen": 124644005, + "step": 5791, + "time_per_iteration": 2.529733657836914 + }, + { + "auxiliary_loss_clip": 0.01130563, + "auxiliary_loss_mlp": 0.01041467, + "balance_loss_clip": 1.05194259, + "balance_loss_mlp": 1.0259285, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 1.8229129512249664, + "language_loss": 0.7781226, + "learning_rate": 3.028419482721056e-06, + "loss": 0.79984295, + "num_input_tokens_seen": 124663020, + "step": 5792, + "time_per_iteration": 2.5284745693206787 + }, + { + "auxiliary_loss_clip": 0.01114376, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.04528034, + "balance_loss_mlp": 1.02086461, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.5182588374535746, + "language_loss": 0.81387436, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83537686, + "num_input_tokens_seen": 124682975, + "step": 5793, + "time_per_iteration": 2.535125970840454 + }, + { + "auxiliary_loss_clip": 0.01129857, + "auxiliary_loss_mlp": 0.01051776, + "balance_loss_clip": 1.05282724, + "balance_loss_mlp": 1.03581429, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 2.005483913631165, + "language_loss": 0.75857949, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78039587, + "num_input_tokens_seen": 124701340, + "step": 5794, + "time_per_iteration": 2.471832513809204 + }, + { + "auxiliary_loss_clip": 0.01128597, + "auxiliary_loss_mlp": 0.01041608, + "balance_loss_clip": 1.05000913, + "balance_loss_mlp": 1.02637339, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 2.5960454946960203, + "language_loss": 0.56998372, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59168577, + "num_input_tokens_seen": 124719165, + "step": 5795, + "time_per_iteration": 2.4915292263031006 + }, + { + "auxiliary_loss_clip": 0.01111573, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.04983258, + "balance_loss_mlp": 1.02026296, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 1.9395718936675748, + "language_loss": 0.82762474, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84909654, + "num_input_tokens_seen": 124738670, + "step": 5796, + "time_per_iteration": 2.545471429824829 + }, + { + "auxiliary_loss_clip": 0.01126591, + "auxiliary_loss_mlp": 0.01029684, + "balance_loss_clip": 1.05267882, + "balance_loss_mlp": 1.01567674, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.605503230706221, + "language_loss": 0.83900881, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.86057156, + "num_input_tokens_seen": 124758760, + "step": 5797, + "time_per_iteration": 2.5202114582061768 + }, + { + "auxiliary_loss_clip": 0.011363, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.05163908, + "balance_loss_mlp": 1.02179146, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.7225481469839168, + "language_loss": 0.73441237, + "learning_rate": 3.026414616539167e-06, + "loss": 0.75614583, + "num_input_tokens_seen": 124777765, + "step": 5798, + "time_per_iteration": 2.495974063873291 + }, + { + "auxiliary_loss_clip": 0.0113961, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.05076993, + "balance_loss_mlp": 1.02739334, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 2.8061446860067303, + "language_loss": 0.76121587, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78304523, + "num_input_tokens_seen": 124796775, + "step": 5799, + "time_per_iteration": 2.451108694076538 + }, + { + "auxiliary_loss_clip": 0.01075366, + "auxiliary_loss_mlp": 0.01040121, + "balance_loss_clip": 1.05090618, + "balance_loss_mlp": 1.02586401, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.7272870318429625, + "language_loss": 0.75671029, + "learning_rate": 3.025746016302734e-06, + "loss": 0.77786517, + "num_input_tokens_seen": 124815825, + "step": 5800, + "time_per_iteration": 4.238599538803101 + }, + { + "auxiliary_loss_clip": 0.01119037, + "auxiliary_loss_mlp": 0.00782939, + "balance_loss_clip": 1.04890966, + "balance_loss_mlp": 1.00057232, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 2.341252079436275, + "language_loss": 0.67522955, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69424927, + "num_input_tokens_seen": 124838420, + "step": 5801, + "time_per_iteration": 3.0180912017822266 + }, + { + "auxiliary_loss_clip": 0.01109376, + "auxiliary_loss_mlp": 0.01041438, + "balance_loss_clip": 1.04595041, + "balance_loss_mlp": 1.02564311, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 2.0776024388652683, + "language_loss": 0.76684988, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78835803, + "num_input_tokens_seen": 124857320, + "step": 5802, + "time_per_iteration": 2.538663864135742 + }, + { + "auxiliary_loss_clip": 0.01060474, + "auxiliary_loss_mlp": 0.01036055, + "balance_loss_clip": 1.04354751, + "balance_loss_mlp": 1.02118945, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.6930146287004857, + "language_loss": 0.79037416, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81133938, + "num_input_tokens_seen": 124875685, + "step": 5803, + "time_per_iteration": 2.654755115509033 + }, + { + "auxiliary_loss_clip": 0.0111339, + "auxiliary_loss_mlp": 0.00783285, + "balance_loss_clip": 1.04504621, + "balance_loss_mlp": 1.0005933, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 2.1454514940701754, + "language_loss": 0.67683512, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69580185, + "num_input_tokens_seen": 124895960, + "step": 5804, + "time_per_iteration": 2.6100480556488037 + }, + { + "auxiliary_loss_clip": 0.01113984, + "auxiliary_loss_mlp": 0.01043766, + "balance_loss_clip": 1.05314934, + "balance_loss_mlp": 1.02792275, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 1.877888830207285, + "language_loss": 0.76698244, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78855991, + "num_input_tokens_seen": 124914140, + "step": 5805, + "time_per_iteration": 2.519735813140869 + }, + { + "auxiliary_loss_clip": 0.01091788, + "auxiliary_loss_mlp": 0.01039079, + "balance_loss_clip": 1.04306412, + "balance_loss_mlp": 1.02401114, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 2.005405969021183, + "language_loss": 0.67068481, + "learning_rate": 3.023739282485814e-06, + "loss": 0.69199347, + "num_input_tokens_seen": 124934180, + "step": 5806, + "time_per_iteration": 2.620448350906372 + }, + { + "auxiliary_loss_clip": 0.01123949, + "auxiliary_loss_mlp": 0.01047703, + "balance_loss_clip": 1.05220008, + "balance_loss_mlp": 1.03269458, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.846702725795363, + "language_loss": 0.72512096, + "learning_rate": 3.023404690904629e-06, + "loss": 0.7468375, + "num_input_tokens_seen": 124956060, + "step": 5807, + "time_per_iteration": 2.569770336151123 + }, + { + "auxiliary_loss_clip": 0.0113771, + "auxiliary_loss_mlp": 0.01039359, + "balance_loss_clip": 1.04864526, + "balance_loss_mlp": 1.02317083, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.955145741665802, + "language_loss": 0.74466544, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.76643616, + "num_input_tokens_seen": 124976070, + "step": 5808, + "time_per_iteration": 2.5264792442321777 + }, + { + "auxiliary_loss_clip": 0.01132991, + "auxiliary_loss_mlp": 0.01047084, + "balance_loss_clip": 1.05039203, + "balance_loss_mlp": 1.03264797, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.6235769680388192, + "language_loss": 0.84683645, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.8686372, + "num_input_tokens_seen": 124996995, + "step": 5809, + "time_per_iteration": 2.458919048309326 + }, + { + "auxiliary_loss_clip": 0.01107865, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.04585934, + "balance_loss_mlp": 1.0223279, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 1.7439539889510645, + "language_loss": 0.80371064, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82515037, + "num_input_tokens_seen": 125015600, + "step": 5810, + "time_per_iteration": 2.564265489578247 + }, + { + "auxiliary_loss_clip": 0.01136349, + "auxiliary_loss_mlp": 0.01040506, + "balance_loss_clip": 1.05042124, + "balance_loss_mlp": 1.02633214, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.844468300984279, + "language_loss": 0.7564702, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.77823877, + "num_input_tokens_seen": 125035290, + "step": 5811, + "time_per_iteration": 2.5158886909484863 + }, + { + "auxiliary_loss_clip": 0.01115984, + "auxiliary_loss_mlp": 0.01041446, + "balance_loss_clip": 1.04569268, + "balance_loss_mlp": 1.02680755, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.4796281473495154, + "language_loss": 0.79858464, + "learning_rate": 3.021731151138386e-06, + "loss": 0.8201589, + "num_input_tokens_seen": 125057130, + "step": 5812, + "time_per_iteration": 2.562319755554199 + }, + { + "auxiliary_loss_clip": 0.0107215, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.04097652, + "balance_loss_mlp": 1.02083743, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.907352883536566, + "language_loss": 0.69203424, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71311915, + "num_input_tokens_seen": 125073720, + "step": 5813, + "time_per_iteration": 2.5769011974334717 + }, + { + "auxiliary_loss_clip": 0.01105984, + "auxiliary_loss_mlp": 0.00781937, + "balance_loss_clip": 1.04250956, + "balance_loss_mlp": 1.00054002, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 1.861836453433973, + "language_loss": 0.76366234, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.78254157, + "num_input_tokens_seen": 125090635, + "step": 5814, + "time_per_iteration": 3.958754301071167 + }, + { + "auxiliary_loss_clip": 0.01115907, + "auxiliary_loss_mlp": 0.00782289, + "balance_loss_clip": 1.04774451, + "balance_loss_mlp": 1.0005517, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.5153419129344567, + "language_loss": 0.84593976, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86492169, + "num_input_tokens_seen": 125110070, + "step": 5815, + "time_per_iteration": 2.5571250915527344 + }, + { + "auxiliary_loss_clip": 0.01122895, + "auxiliary_loss_mlp": 0.01033104, + "balance_loss_clip": 1.05176675, + "balance_loss_mlp": 1.01881695, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 2.0570046146826777, + "language_loss": 0.77356267, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79512262, + "num_input_tokens_seen": 125125730, + "step": 5816, + "time_per_iteration": 2.4829494953155518 + }, + { + "auxiliary_loss_clip": 0.01127172, + "auxiliary_loss_mlp": 0.01042168, + "balance_loss_clip": 1.05354691, + "balance_loss_mlp": 1.02741027, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 1.795833944128589, + "language_loss": 0.59079409, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.61248744, + "num_input_tokens_seen": 125146195, + "step": 5817, + "time_per_iteration": 2.5526695251464844 + }, + { + "auxiliary_loss_clip": 0.01065357, + "auxiliary_loss_mlp": 0.01016127, + "balance_loss_clip": 1.03585505, + "balance_loss_mlp": 1.01444614, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 1.0855691184168719, + "language_loss": 0.59856904, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61938387, + "num_input_tokens_seen": 125207790, + "step": 5818, + "time_per_iteration": 3.0943710803985596 + }, + { + "auxiliary_loss_clip": 0.01096257, + "auxiliary_loss_mlp": 0.01037252, + "balance_loss_clip": 1.0439992, + "balance_loss_mlp": 1.02183187, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 1.8168320904198925, + "language_loss": 0.83187521, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85321033, + "num_input_tokens_seen": 125226220, + "step": 5819, + "time_per_iteration": 4.0563805103302 + }, + { + "auxiliary_loss_clip": 0.01111215, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.04635262, + "balance_loss_mlp": 1.01540875, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 1.9169118761119277, + "language_loss": 0.71143788, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.73284483, + "num_input_tokens_seen": 125247485, + "step": 5820, + "time_per_iteration": 2.575561046600342 + }, + { + "auxiliary_loss_clip": 0.01128754, + "auxiliary_loss_mlp": 0.01038323, + "balance_loss_clip": 1.0504117, + "balance_loss_mlp": 1.02337384, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 1.5925414536194662, + "language_loss": 0.70722234, + "learning_rate": 3.018716339744759e-06, + "loss": 0.72889316, + "num_input_tokens_seen": 125268625, + "step": 5821, + "time_per_iteration": 2.6106724739074707 + }, + { + "auxiliary_loss_clip": 0.01131793, + "auxiliary_loss_mlp": 0.01044637, + "balance_loss_clip": 1.05119586, + "balance_loss_mlp": 1.02779245, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.5350552228506578, + "language_loss": 0.73913485, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.76089913, + "num_input_tokens_seen": 125287530, + "step": 5822, + "time_per_iteration": 2.530059576034546 + }, + { + "auxiliary_loss_clip": 0.01116839, + "auxiliary_loss_mlp": 0.01038265, + "balance_loss_clip": 1.05098617, + "balance_loss_mlp": 1.02194476, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 1.6349077764908746, + "language_loss": 0.78567904, + "learning_rate": 3.018045956403094e-06, + "loss": 0.80723006, + "num_input_tokens_seen": 125307020, + "step": 5823, + "time_per_iteration": 2.5085339546203613 + }, + { + "auxiliary_loss_clip": 0.01051585, + "auxiliary_loss_mlp": 0.01003859, + "balance_loss_clip": 1.03178668, + "balance_loss_mlp": 1.00228548, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 0.7109734869797005, + "language_loss": 0.59249687, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61305135, + "num_input_tokens_seen": 125370445, + "step": 5824, + "time_per_iteration": 4.431643724441528 + }, + { + "auxiliary_loss_clip": 0.01114316, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.04790545, + "balance_loss_mlp": 1.02132702, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 3.6898307806928172, + "language_loss": 0.84451687, + "learning_rate": 3.017375418643811e-06, + "loss": 0.86603606, + "num_input_tokens_seen": 125388900, + "step": 5825, + "time_per_iteration": 2.521084785461426 + }, + { + "auxiliary_loss_clip": 0.01126421, + "auxiliary_loss_mlp": 0.00782174, + "balance_loss_clip": 1.04976618, + "balance_loss_mlp": 1.00065339, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 3.338114849468987, + "language_loss": 0.83496916, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.85405517, + "num_input_tokens_seen": 125402675, + "step": 5826, + "time_per_iteration": 2.451535940170288 + }, + { + "auxiliary_loss_clip": 0.01115489, + "auxiliary_loss_mlp": 0.01043155, + "balance_loss_clip": 1.04996634, + "balance_loss_mlp": 1.02719331, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.5314883956050893, + "language_loss": 0.80924422, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.83083063, + "num_input_tokens_seen": 125421360, + "step": 5827, + "time_per_iteration": 2.509274959564209 + }, + { + "auxiliary_loss_clip": 0.01084648, + "auxiliary_loss_mlp": 0.01042673, + "balance_loss_clip": 1.04326224, + "balance_loss_mlp": 1.0273782, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.064503093928561, + "language_loss": 0.71120423, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73247743, + "num_input_tokens_seen": 125440000, + "step": 5828, + "time_per_iteration": 2.5869123935699463 + }, + { + "auxiliary_loss_clip": 0.01132222, + "auxiliary_loss_mlp": 0.01047879, + "balance_loss_clip": 1.0531354, + "balance_loss_mlp": 1.02964008, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.7164028614932492, + "language_loss": 0.7961061, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81790709, + "num_input_tokens_seen": 125460390, + "step": 5829, + "time_per_iteration": 2.5547354221343994 + }, + { + "auxiliary_loss_clip": 0.0110215, + "auxiliary_loss_mlp": 0.0104829, + "balance_loss_clip": 1.05219948, + "balance_loss_mlp": 1.03113544, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 2.508747769528271, + "language_loss": 0.72525477, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74675918, + "num_input_tokens_seen": 125478410, + "step": 5830, + "time_per_iteration": 2.590752601623535 + }, + { + "auxiliary_loss_clip": 0.01093641, + "auxiliary_loss_mlp": 0.01041211, + "balance_loss_clip": 1.04060626, + "balance_loss_mlp": 1.02416408, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.068273042657399, + "language_loss": 0.88546932, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90681791, + "num_input_tokens_seen": 125495975, + "step": 5831, + "time_per_iteration": 2.536870241165161 + }, + { + "auxiliary_loss_clip": 0.01081561, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_clip": 1.04462481, + "balance_loss_mlp": 1.02630353, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 2.5883227676400122, + "language_loss": 0.78933388, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.81057644, + "num_input_tokens_seen": 125515035, + "step": 5832, + "time_per_iteration": 2.5764248371124268 + }, + { + "auxiliary_loss_clip": 0.01101228, + "auxiliary_loss_mlp": 0.01049679, + "balance_loss_clip": 1.04546714, + "balance_loss_mlp": 1.03086829, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.8284703697353686, + "language_loss": 0.71366668, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73517573, + "num_input_tokens_seen": 125535555, + "step": 5833, + "time_per_iteration": 2.6109321117401123 + }, + { + "auxiliary_loss_clip": 0.01123635, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.04715753, + "balance_loss_mlp": 1.01931012, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.4449220305104453, + "language_loss": 0.81178129, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83336085, + "num_input_tokens_seen": 125558195, + "step": 5834, + "time_per_iteration": 2.54508376121521 + }, + { + "auxiliary_loss_clip": 0.01089855, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.05410349, + "balance_loss_mlp": 1.02284157, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 2.1194702454323573, + "language_loss": 0.8369475, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.85823679, + "num_input_tokens_seen": 125575375, + "step": 5835, + "time_per_iteration": 2.5566813945770264 + }, + { + "auxiliary_loss_clip": 0.01081791, + "auxiliary_loss_mlp": 0.01044101, + "balance_loss_clip": 1.04809189, + "balance_loss_mlp": 1.02846098, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 1.525649056167432, + "language_loss": 0.76975191, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.79101086, + "num_input_tokens_seen": 125596745, + "step": 5836, + "time_per_iteration": 2.6035733222961426 + }, + { + "auxiliary_loss_clip": 0.01097013, + "auxiliary_loss_mlp": 0.01046971, + "balance_loss_clip": 1.04694343, + "balance_loss_mlp": 1.02890491, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 2.345138826035342, + "language_loss": 0.7711308, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79257065, + "num_input_tokens_seen": 125613980, + "step": 5837, + "time_per_iteration": 2.5268008708953857 + }, + { + "auxiliary_loss_clip": 0.01126491, + "auxiliary_loss_mlp": 0.01047967, + "balance_loss_clip": 1.04958999, + "balance_loss_mlp": 1.03221917, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 1.5677812377396865, + "language_loss": 0.67646104, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.69820559, + "num_input_tokens_seen": 125632100, + "step": 5838, + "time_per_iteration": 2.505915403366089 + }, + { + "auxiliary_loss_clip": 0.01134435, + "auxiliary_loss_mlp": 0.0104027, + "balance_loss_clip": 1.0481739, + "balance_loss_mlp": 1.02455866, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 3.2922563725705998, + "language_loss": 0.83250344, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85425043, + "num_input_tokens_seen": 125649190, + "step": 5839, + "time_per_iteration": 3.9462156295776367 + }, + { + "auxiliary_loss_clip": 0.0112779, + "auxiliary_loss_mlp": 0.01044665, + "balance_loss_clip": 1.04665887, + "balance_loss_mlp": 1.02798784, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 1.898849105164266, + "language_loss": 0.59151679, + "learning_rate": 3.012341473657572e-06, + "loss": 0.61324137, + "num_input_tokens_seen": 125668680, + "step": 5840, + "time_per_iteration": 2.514188289642334 + }, + { + "auxiliary_loss_clip": 0.01098964, + "auxiliary_loss_mlp": 0.01043572, + "balance_loss_clip": 1.04705644, + "balance_loss_mlp": 1.02758598, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.236994075624295, + "language_loss": 0.87565535, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89708072, + "num_input_tokens_seen": 125686935, + "step": 5841, + "time_per_iteration": 2.5829241275787354 + }, + { + "auxiliary_loss_clip": 0.01118882, + "auxiliary_loss_mlp": 0.01045532, + "balance_loss_clip": 1.04899776, + "balance_loss_mlp": 1.02817535, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.7273186335053865, + "language_loss": 0.75227666, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77392083, + "num_input_tokens_seen": 125707180, + "step": 5842, + "time_per_iteration": 2.5401320457458496 + }, + { + "auxiliary_loss_clip": 0.01127498, + "auxiliary_loss_mlp": 0.01051973, + "balance_loss_clip": 1.04862666, + "balance_loss_mlp": 1.03594565, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 2.91984624116685, + "language_loss": 0.6842128, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70600754, + "num_input_tokens_seen": 125722780, + "step": 5843, + "time_per_iteration": 2.4304282665252686 + }, + { + "auxiliary_loss_clip": 0.01135628, + "auxiliary_loss_mlp": 0.01048728, + "balance_loss_clip": 1.04960513, + "balance_loss_mlp": 1.03249204, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 2.091310447522322, + "language_loss": 0.65338302, + "learning_rate": 3.010997627806655e-06, + "loss": 0.67522657, + "num_input_tokens_seen": 125742110, + "step": 5844, + "time_per_iteration": 2.4960620403289795 + }, + { + "auxiliary_loss_clip": 0.01123567, + "auxiliary_loss_mlp": 0.01050945, + "balance_loss_clip": 1.05005336, + "balance_loss_mlp": 1.03385055, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.0882778029239155, + "language_loss": 0.75123179, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77297699, + "num_input_tokens_seen": 125759980, + "step": 5845, + "time_per_iteration": 2.494209051132202 + }, + { + "auxiliary_loss_clip": 0.01125071, + "auxiliary_loss_mlp": 0.01047898, + "balance_loss_clip": 1.04917383, + "balance_loss_mlp": 1.0323416, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 3.1176139258060367, + "language_loss": 0.7354362, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75716585, + "num_input_tokens_seen": 125772660, + "step": 5846, + "time_per_iteration": 2.480451822280884 + }, + { + "auxiliary_loss_clip": 0.01099498, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.04797494, + "balance_loss_mlp": 1.03496921, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.5726041588153477, + "language_loss": 0.75700295, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.77850807, + "num_input_tokens_seen": 125791935, + "step": 5847, + "time_per_iteration": 2.5226871967315674 + }, + { + "auxiliary_loss_clip": 0.01110638, + "auxiliary_loss_mlp": 0.01045751, + "balance_loss_clip": 1.0441227, + "balance_loss_mlp": 1.03051567, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 2.06655678627576, + "language_loss": 0.72369236, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74525625, + "num_input_tokens_seen": 125813455, + "step": 5848, + "time_per_iteration": 2.623079299926758 + }, + { + "auxiliary_loss_clip": 0.01126256, + "auxiliary_loss_mlp": 0.01055262, + "balance_loss_clip": 1.05382133, + "balance_loss_mlp": 1.03864455, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 2.5621468558857643, + "language_loss": 0.89228356, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91409874, + "num_input_tokens_seen": 125827660, + "step": 5849, + "time_per_iteration": 2.4808311462402344 + }, + { + "auxiliary_loss_clip": 0.01117467, + "auxiliary_loss_mlp": 0.01040021, + "balance_loss_clip": 1.05015349, + "balance_loss_mlp": 1.02484536, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 1.9334840447333315, + "language_loss": 0.75102913, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.77260399, + "num_input_tokens_seen": 125846655, + "step": 5850, + "time_per_iteration": 2.550398111343384 + }, + { + "auxiliary_loss_clip": 0.01125618, + "auxiliary_loss_mlp": 0.01043432, + "balance_loss_clip": 1.0501442, + "balance_loss_mlp": 1.02808976, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.4559146262155984, + "language_loss": 0.75427306, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77596354, + "num_input_tokens_seen": 125866290, + "step": 5851, + "time_per_iteration": 2.4966330528259277 + }, + { + "auxiliary_loss_clip": 0.01110422, + "auxiliary_loss_mlp": 0.01043475, + "balance_loss_clip": 1.04647911, + "balance_loss_mlp": 1.02617836, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 1.9843797707152673, + "language_loss": 0.86966932, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89120835, + "num_input_tokens_seen": 125884620, + "step": 5852, + "time_per_iteration": 2.542618751525879 + }, + { + "auxiliary_loss_clip": 0.01134707, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_clip": 1.04933941, + "balance_loss_mlp": 1.02747941, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 2.186095106360882, + "language_loss": 0.67233199, + "learning_rate": 3.007971733162737e-06, + "loss": 0.69409943, + "num_input_tokens_seen": 125902430, + "step": 5853, + "time_per_iteration": 2.4661765098571777 + }, + { + "auxiliary_loss_clip": 0.01114233, + "auxiliary_loss_mlp": 0.01041295, + "balance_loss_clip": 1.04634035, + "balance_loss_mlp": 1.02549946, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.7155228724287948, + "language_loss": 0.80712807, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.82868332, + "num_input_tokens_seen": 125920570, + "step": 5854, + "time_per_iteration": 4.030916929244995 + }, + { + "auxiliary_loss_clip": 0.01117902, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.05346441, + "balance_loss_mlp": 1.02255988, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.3990980007098281, + "language_loss": 0.73061752, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75216663, + "num_input_tokens_seen": 125939800, + "step": 5855, + "time_per_iteration": 2.51257061958313 + }, + { + "auxiliary_loss_clip": 0.0113292, + "auxiliary_loss_mlp": 0.01040954, + "balance_loss_clip": 1.04870319, + "balance_loss_mlp": 1.02626705, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 2.890161178286382, + "language_loss": 0.70782793, + "learning_rate": 3.006962413152691e-06, + "loss": 0.72956669, + "num_input_tokens_seen": 125958720, + "step": 5856, + "time_per_iteration": 2.488271474838257 + }, + { + "auxiliary_loss_clip": 0.01120478, + "auxiliary_loss_mlp": 0.0104912, + "balance_loss_clip": 1.04592681, + "balance_loss_mlp": 1.03265679, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.4961564476483262, + "language_loss": 0.61096007, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63265598, + "num_input_tokens_seen": 125984310, + "step": 5857, + "time_per_iteration": 2.6878836154937744 + }, + { + "auxiliary_loss_clip": 0.01128143, + "auxiliary_loss_mlp": 0.01040617, + "balance_loss_clip": 1.05143809, + "balance_loss_mlp": 1.02492952, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 1.940853885291862, + "language_loss": 0.73223829, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75392592, + "num_input_tokens_seen": 126002410, + "step": 5858, + "time_per_iteration": 4.027983903884888 + }, + { + "auxiliary_loss_clip": 0.01139804, + "auxiliary_loss_mlp": 0.01040466, + "balance_loss_clip": 1.05087137, + "balance_loss_mlp": 1.02561283, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.87626273752023, + "language_loss": 0.76078451, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.78258717, + "num_input_tokens_seen": 126022490, + "step": 5859, + "time_per_iteration": 2.5047624111175537 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01039571, + "balance_loss_clip": 1.05159616, + "balance_loss_mlp": 1.02286971, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 5.890632664082108, + "language_loss": 0.71829081, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.73988986, + "num_input_tokens_seen": 126042895, + "step": 5860, + "time_per_iteration": 2.522794246673584 + }, + { + "auxiliary_loss_clip": 0.01107506, + "auxiliary_loss_mlp": 0.01042094, + "balance_loss_clip": 1.0442704, + "balance_loss_mlp": 1.0252738, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.1076620297732296, + "language_loss": 0.65996778, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68146384, + "num_input_tokens_seen": 126060130, + "step": 5861, + "time_per_iteration": 2.4917256832122803 + }, + { + "auxiliary_loss_clip": 0.01114357, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.04855525, + "balance_loss_mlp": 1.01960468, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 1.9643561252461834, + "language_loss": 0.6642189, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68570495, + "num_input_tokens_seen": 126077850, + "step": 5862, + "time_per_iteration": 2.5052857398986816 + }, + { + "auxiliary_loss_clip": 0.01109441, + "auxiliary_loss_mlp": 0.01041797, + "balance_loss_clip": 1.04846478, + "balance_loss_mlp": 1.02517939, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 2.4879181632247325, + "language_loss": 0.77069747, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79220986, + "num_input_tokens_seen": 126095985, + "step": 5863, + "time_per_iteration": 2.5122337341308594 + }, + { + "auxiliary_loss_clip": 0.0112335, + "auxiliary_loss_mlp": 0.01037724, + "balance_loss_clip": 1.05046606, + "balance_loss_mlp": 1.02260804, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 1.987333500179463, + "language_loss": 0.75642383, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77803457, + "num_input_tokens_seen": 126116070, + "step": 5864, + "time_per_iteration": 4.007255554199219 + }, + { + "auxiliary_loss_clip": 0.01124729, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.04693198, + "balance_loss_mlp": 1.03143644, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 2.040021613942858, + "language_loss": 0.79410195, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81581402, + "num_input_tokens_seen": 126135205, + "step": 5865, + "time_per_iteration": 2.537675142288208 + }, + { + "auxiliary_loss_clip": 0.01133739, + "auxiliary_loss_mlp": 0.01043175, + "balance_loss_clip": 1.05288529, + "balance_loss_mlp": 1.02721286, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 2.011920220033297, + "language_loss": 0.81411815, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.83588731, + "num_input_tokens_seen": 126151895, + "step": 5866, + "time_per_iteration": 2.462498903274536 + }, + { + "auxiliary_loss_clip": 0.01099024, + "auxiliary_loss_mlp": 0.0104286, + "balance_loss_clip": 1.05333745, + "balance_loss_mlp": 1.02637362, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 1.8501412994570985, + "language_loss": 0.8397584, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.86117727, + "num_input_tokens_seen": 126168515, + "step": 5867, + "time_per_iteration": 2.5724141597747803 + }, + { + "auxiliary_loss_clip": 0.01137388, + "auxiliary_loss_mlp": 0.01047307, + "balance_loss_clip": 1.0487926, + "balance_loss_mlp": 1.0314523, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 1.7301194547762535, + "language_loss": 0.74221081, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76405776, + "num_input_tokens_seen": 126186460, + "step": 5868, + "time_per_iteration": 2.441835403442383 + }, + { + "auxiliary_loss_clip": 0.0113003, + "auxiliary_loss_mlp": 0.01039052, + "balance_loss_clip": 1.05193424, + "balance_loss_mlp": 1.02292275, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 2.08868218930775, + "language_loss": 0.6156888, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63737965, + "num_input_tokens_seen": 126206170, + "step": 5869, + "time_per_iteration": 2.4676105976104736 + }, + { + "auxiliary_loss_clip": 0.01127587, + "auxiliary_loss_mlp": 0.01042411, + "balance_loss_clip": 1.05042768, + "balance_loss_mlp": 1.02678299, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 1.854525284679716, + "language_loss": 0.74448442, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76618439, + "num_input_tokens_seen": 126225605, + "step": 5870, + "time_per_iteration": 2.479691743850708 + }, + { + "auxiliary_loss_clip": 0.01125518, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.04919803, + "balance_loss_mlp": 1.02093101, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.537021777939156, + "language_loss": 0.71867931, + "learning_rate": 3.001910665140316e-06, + "loss": 0.74030137, + "num_input_tokens_seen": 126250230, + "step": 5871, + "time_per_iteration": 2.602248191833496 + }, + { + "auxiliary_loss_clip": 0.0111775, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.04446256, + "balance_loss_mlp": 1.02000165, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 2.6544179888008186, + "language_loss": 0.73744917, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.75897139, + "num_input_tokens_seen": 126268315, + "step": 5872, + "time_per_iteration": 2.4582908153533936 + }, + { + "auxiliary_loss_clip": 0.01117637, + "auxiliary_loss_mlp": 0.00781046, + "balance_loss_clip": 1.05100036, + "balance_loss_mlp": 1.00044644, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.7795890588863688, + "language_loss": 0.82567227, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84465909, + "num_input_tokens_seen": 126288390, + "step": 5873, + "time_per_iteration": 2.5473248958587646 + }, + { + "auxiliary_loss_clip": 0.01116119, + "auxiliary_loss_mlp": 0.01044649, + "balance_loss_clip": 1.04543638, + "balance_loss_mlp": 1.02785194, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.9406497620978769, + "language_loss": 0.66072541, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68233311, + "num_input_tokens_seen": 126305750, + "step": 5874, + "time_per_iteration": 2.5429527759552 + }, + { + "auxiliary_loss_clip": 0.0105249, + "auxiliary_loss_mlp": 0.01002084, + "balance_loss_clip": 1.03100967, + "balance_loss_mlp": 1.00043905, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.7623556074995738, + "language_loss": 0.61554587, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63609165, + "num_input_tokens_seen": 126362495, + "step": 5875, + "time_per_iteration": 2.9820778369903564 + }, + { + "auxiliary_loss_clip": 0.01075858, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.04403615, + "balance_loss_mlp": 1.02406669, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 1.8363132128766848, + "language_loss": 0.80074799, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82190251, + "num_input_tokens_seen": 126378320, + "step": 5876, + "time_per_iteration": 2.620344400405884 + }, + { + "auxiliary_loss_clip": 0.01024712, + "auxiliary_loss_mlp": 0.00755815, + "balance_loss_clip": 1.02275658, + "balance_loss_mlp": 1.00008285, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 0.6726205494132719, + "language_loss": 0.56798375, + "learning_rate": 2.999887569990088e-06, + "loss": 0.58578897, + "num_input_tokens_seen": 126442735, + "step": 5877, + "time_per_iteration": 3.19815731048584 + }, + { + "auxiliary_loss_clip": 0.01105719, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.04650462, + "balance_loss_mlp": 1.0213486, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.5654577251296646, + "language_loss": 0.71626997, + "learning_rate": 2.999550254685024e-06, + "loss": 0.73769712, + "num_input_tokens_seen": 126463090, + "step": 5878, + "time_per_iteration": 4.156183242797852 + }, + { + "auxiliary_loss_clip": 0.01112949, + "auxiliary_loss_mlp": 0.01043018, + "balance_loss_clip": 1.05124354, + "balance_loss_mlp": 1.02760446, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 2.3562119751927533, + "language_loss": 0.78653383, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80809355, + "num_input_tokens_seen": 126482105, + "step": 5879, + "time_per_iteration": 2.5319724082946777 + }, + { + "auxiliary_loss_clip": 0.01110215, + "auxiliary_loss_mlp": 0.01045808, + "balance_loss_clip": 1.05169237, + "balance_loss_mlp": 1.02802205, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.0771113466758147, + "language_loss": 0.63539684, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65695715, + "num_input_tokens_seen": 126502125, + "step": 5880, + "time_per_iteration": 2.5651814937591553 + }, + { + "auxiliary_loss_clip": 0.01117041, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_clip": 1.05028737, + "balance_loss_mlp": 1.02137017, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 1.979925044082772, + "language_loss": 0.65784848, + "learning_rate": 2.998538081402727e-06, + "loss": 0.679398, + "num_input_tokens_seen": 126521950, + "step": 5881, + "time_per_iteration": 2.5450806617736816 + }, + { + "auxiliary_loss_clip": 0.01117887, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.05045605, + "balance_loss_mlp": 1.0168047, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.4217800765898974, + "language_loss": 0.75612438, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77762067, + "num_input_tokens_seen": 126542445, + "step": 5882, + "time_per_iteration": 2.563850164413452 + }, + { + "auxiliary_loss_clip": 0.01110632, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_clip": 1.04634058, + "balance_loss_mlp": 1.02906024, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.0443479952828993, + "language_loss": 0.70501363, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.72659391, + "num_input_tokens_seen": 126560690, + "step": 5883, + "time_per_iteration": 2.556690216064453 + }, + { + "auxiliary_loss_clip": 0.01108717, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_clip": 1.05177498, + "balance_loss_mlp": 1.026057, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 3.071448105874792, + "language_loss": 0.78417844, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80568737, + "num_input_tokens_seen": 126577620, + "step": 5884, + "time_per_iteration": 2.5476837158203125 + }, + { + "auxiliary_loss_clip": 0.01111146, + "auxiliary_loss_mlp": 0.0103547, + "balance_loss_clip": 1.04845071, + "balance_loss_mlp": 1.02035403, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 2.004574392694689, + "language_loss": 0.75472176, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.7761879, + "num_input_tokens_seen": 126596235, + "step": 5885, + "time_per_iteration": 2.5106325149536133 + }, + { + "auxiliary_loss_clip": 0.0108827, + "auxiliary_loss_mlp": 0.01043896, + "balance_loss_clip": 1.04204023, + "balance_loss_mlp": 1.02643168, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.6017036061338152, + "language_loss": 0.82943177, + "learning_rate": 2.996850368809606e-06, + "loss": 0.85075343, + "num_input_tokens_seen": 126612830, + "step": 5886, + "time_per_iteration": 2.5619421005249023 + }, + { + "auxiliary_loss_clip": 0.0113721, + "auxiliary_loss_mlp": 0.01037568, + "balance_loss_clip": 1.05019605, + "balance_loss_mlp": 1.02019882, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 2.152164039949296, + "language_loss": 0.78527963, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.8070274, + "num_input_tokens_seen": 126630910, + "step": 5887, + "time_per_iteration": 2.451263189315796 + }, + { + "auxiliary_loss_clip": 0.01081226, + "auxiliary_loss_mlp": 0.01043011, + "balance_loss_clip": 1.04656136, + "balance_loss_mlp": 1.02774012, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 1.999085297777928, + "language_loss": 0.65511894, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67636132, + "num_input_tokens_seen": 126648365, + "step": 5888, + "time_per_iteration": 2.6020355224609375 + }, + { + "auxiliary_loss_clip": 0.011069, + "auxiliary_loss_mlp": 0.01040546, + "balance_loss_clip": 1.04584002, + "balance_loss_mlp": 1.02482247, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 1.7055232108746048, + "language_loss": 0.76914907, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.79062355, + "num_input_tokens_seen": 126667500, + "step": 5889, + "time_per_iteration": 2.5501792430877686 + }, + { + "auxiliary_loss_clip": 0.01106568, + "auxiliary_loss_mlp": 0.01039226, + "balance_loss_clip": 1.05486679, + "balance_loss_mlp": 1.02390742, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.8176672940482341, + "language_loss": 0.80989629, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.8313542, + "num_input_tokens_seen": 126686820, + "step": 5890, + "time_per_iteration": 2.591092824935913 + }, + { + "auxiliary_loss_clip": 0.01110882, + "auxiliary_loss_mlp": 0.01036978, + "balance_loss_clip": 1.04610538, + "balance_loss_mlp": 1.02328694, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.8035443814407484, + "language_loss": 0.79618901, + "learning_rate": 2.99516171119991e-06, + "loss": 0.8176676, + "num_input_tokens_seen": 126706965, + "step": 5891, + "time_per_iteration": 2.555415630340576 + }, + { + "auxiliary_loss_clip": 0.01096507, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.04350114, + "balance_loss_mlp": 1.0304091, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 1.981220982646883, + "language_loss": 0.73289478, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75433224, + "num_input_tokens_seen": 126724015, + "step": 5892, + "time_per_iteration": 2.516256093978882 + }, + { + "auxiliary_loss_clip": 0.01113448, + "auxiliary_loss_mlp": 0.01047082, + "balance_loss_clip": 1.04658318, + "balance_loss_mlp": 1.03080964, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 2.1948132950192885, + "language_loss": 0.67466128, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69626659, + "num_input_tokens_seen": 126737565, + "step": 5893, + "time_per_iteration": 3.9953112602233887 + }, + { + "auxiliary_loss_clip": 0.01084155, + "auxiliary_loss_mlp": 0.01039551, + "balance_loss_clip": 1.04219818, + "balance_loss_mlp": 1.02290928, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 3.714213882166086, + "language_loss": 0.69696355, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71820062, + "num_input_tokens_seen": 126756095, + "step": 5894, + "time_per_iteration": 2.582050323486328 + }, + { + "auxiliary_loss_clip": 0.01109943, + "auxiliary_loss_mlp": 0.0078085, + "balance_loss_clip": 1.04566216, + "balance_loss_mlp": 1.00052857, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.8265265334607317, + "language_loss": 0.74726808, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.76617604, + "num_input_tokens_seen": 126775455, + "step": 5895, + "time_per_iteration": 2.5077784061431885 + }, + { + "auxiliary_loss_clip": 0.01108194, + "auxiliary_loss_mlp": 0.01037917, + "balance_loss_clip": 1.04494703, + "balance_loss_mlp": 1.02193069, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 1.9058668980436202, + "language_loss": 0.8402673, + "learning_rate": 2.993472110174491e-06, + "loss": 0.86172831, + "num_input_tokens_seen": 126792320, + "step": 5896, + "time_per_iteration": 2.534858465194702 + }, + { + "auxiliary_loss_clip": 0.01111677, + "auxiliary_loss_mlp": 0.00781109, + "balance_loss_clip": 1.04743695, + "balance_loss_mlp": 1.0004766, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 2.232373940210109, + "language_loss": 0.69831926, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.71724707, + "num_input_tokens_seen": 126813680, + "step": 5897, + "time_per_iteration": 2.6523914337158203 + }, + { + "auxiliary_loss_clip": 0.01110323, + "auxiliary_loss_mlp": 0.01045184, + "balance_loss_clip": 1.04622424, + "balance_loss_mlp": 1.02792287, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 1.6522233676611344, + "language_loss": 0.81730843, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.83886355, + "num_input_tokens_seen": 126834395, + "step": 5898, + "time_per_iteration": 4.028563022613525 + }, + { + "auxiliary_loss_clip": 0.01131018, + "auxiliary_loss_mlp": 0.01038868, + "balance_loss_clip": 1.04645264, + "balance_loss_mlp": 1.0246284, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 2.4468044060187073, + "language_loss": 0.74284136, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76454026, + "num_input_tokens_seen": 126855145, + "step": 5899, + "time_per_iteration": 2.4713525772094727 + }, + { + "auxiliary_loss_clip": 0.01135092, + "auxiliary_loss_mlp": 0.00779653, + "balance_loss_clip": 1.04751766, + "balance_loss_mlp": 1.00056505, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.93622199588178, + "language_loss": 0.79721165, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81635904, + "num_input_tokens_seen": 126873790, + "step": 5900, + "time_per_iteration": 2.5022027492523193 + }, + { + "auxiliary_loss_clip": 0.01111004, + "auxiliary_loss_mlp": 0.01043139, + "balance_loss_clip": 1.04282618, + "balance_loss_mlp": 1.02679515, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 1.9191038831186726, + "language_loss": 0.81408787, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83562934, + "num_input_tokens_seen": 126892865, + "step": 5901, + "time_per_iteration": 2.526657819747925 + }, + { + "auxiliary_loss_clip": 0.01121739, + "auxiliary_loss_mlp": 0.00780215, + "balance_loss_clip": 1.0482949, + "balance_loss_mlp": 1.00049758, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 1.9714620643504668, + "language_loss": 0.76535702, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.7843765, + "num_input_tokens_seen": 126911935, + "step": 5902, + "time_per_iteration": 2.485626459121704 + }, + { + "auxiliary_loss_clip": 0.01123159, + "auxiliary_loss_mlp": 0.0103917, + "balance_loss_clip": 1.04745817, + "balance_loss_mlp": 1.02416122, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 3.9759536937809092, + "language_loss": 0.70286787, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72449112, + "num_input_tokens_seen": 126930040, + "step": 5903, + "time_per_iteration": 3.884284019470215 + }, + { + "auxiliary_loss_clip": 0.0112579, + "auxiliary_loss_mlp": 0.01041331, + "balance_loss_clip": 1.04700613, + "balance_loss_mlp": 1.02552962, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 2.4338634809830637, + "language_loss": 0.73842573, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.76009691, + "num_input_tokens_seen": 126948390, + "step": 5904, + "time_per_iteration": 2.5139293670654297 + }, + { + "auxiliary_loss_clip": 0.01116695, + "auxiliary_loss_mlp": 0.00781053, + "balance_loss_clip": 1.05117941, + "balance_loss_mlp": 1.00045729, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 2.1446417987158153, + "language_loss": 0.79167879, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.81065619, + "num_input_tokens_seen": 126964905, + "step": 5905, + "time_per_iteration": 2.485560894012451 + }, + { + "auxiliary_loss_clip": 0.01097257, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.04416168, + "balance_loss_mlp": 1.02108717, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 1.882949545631637, + "language_loss": 0.72528946, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74660873, + "num_input_tokens_seen": 126982000, + "step": 5906, + "time_per_iteration": 2.5539586544036865 + }, + { + "auxiliary_loss_clip": 0.01106288, + "auxiliary_loss_mlp": 0.01037197, + "balance_loss_clip": 1.04549265, + "balance_loss_mlp": 1.02024543, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 2.022302338820392, + "language_loss": 0.74976689, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.77120173, + "num_input_tokens_seen": 126998390, + "step": 5907, + "time_per_iteration": 2.495547294616699 + }, + { + "auxiliary_loss_clip": 0.01066741, + "auxiliary_loss_mlp": 0.01038739, + "balance_loss_clip": 1.03545809, + "balance_loss_mlp": 1.020226, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.6443144578619238, + "language_loss": 0.75273836, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77379316, + "num_input_tokens_seen": 127020220, + "step": 5908, + "time_per_iteration": 2.6745498180389404 + }, + { + "auxiliary_loss_clip": 0.01110589, + "auxiliary_loss_mlp": 0.01039424, + "balance_loss_clip": 1.04826379, + "balance_loss_mlp": 1.0245167, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 1.9979066978754405, + "language_loss": 0.68114591, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70264608, + "num_input_tokens_seen": 127038585, + "step": 5909, + "time_per_iteration": 2.5602567195892334 + }, + { + "auxiliary_loss_clip": 0.01119499, + "auxiliary_loss_mlp": 0.01032957, + "balance_loss_clip": 1.04898679, + "balance_loss_mlp": 1.01840091, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 1.8401732772124884, + "language_loss": 0.78568745, + "learning_rate": 2.988736221969144e-06, + "loss": 0.807212, + "num_input_tokens_seen": 127056215, + "step": 5910, + "time_per_iteration": 2.5028579235076904 + }, + { + "auxiliary_loss_clip": 0.01113714, + "auxiliary_loss_mlp": 0.01045712, + "balance_loss_clip": 1.04370928, + "balance_loss_mlp": 1.02899265, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 2.1897055760255926, + "language_loss": 0.71063977, + "learning_rate": 2.98839766262581e-06, + "loss": 0.732234, + "num_input_tokens_seen": 127075825, + "step": 5911, + "time_per_iteration": 2.500631332397461 + }, + { + "auxiliary_loss_clip": 0.01119289, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.04496586, + "balance_loss_mlp": 1.02312589, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 2.589679934884553, + "language_loss": 0.86848617, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89005959, + "num_input_tokens_seen": 127091205, + "step": 5912, + "time_per_iteration": 2.4355506896972656 + }, + { + "auxiliary_loss_clip": 0.01113508, + "auxiliary_loss_mlp": 0.0103536, + "balance_loss_clip": 1.04797375, + "balance_loss_mlp": 1.02054238, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 1.9186255736616622, + "language_loss": 0.76804447, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.78953314, + "num_input_tokens_seen": 127109210, + "step": 5913, + "time_per_iteration": 2.502581834793091 + }, + { + "auxiliary_loss_clip": 0.01097771, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.04752934, + "balance_loss_mlp": 1.02262044, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.79488187364069, + "language_loss": 0.82683676, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84819216, + "num_input_tokens_seen": 127128400, + "step": 5914, + "time_per_iteration": 2.542438268661499 + }, + { + "auxiliary_loss_clip": 0.0113642, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.05025458, + "balance_loss_mlp": 1.02064669, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.1527642037034886, + "language_loss": 0.70397055, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72569728, + "num_input_tokens_seen": 127149965, + "step": 5915, + "time_per_iteration": 2.5568103790283203 + }, + { + "auxiliary_loss_clip": 0.01121787, + "auxiliary_loss_mlp": 0.01039967, + "balance_loss_clip": 1.04574454, + "balance_loss_mlp": 1.02509534, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.8705481714426426, + "language_loss": 0.76437926, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78599685, + "num_input_tokens_seen": 127169865, + "step": 5916, + "time_per_iteration": 2.4789838790893555 + }, + { + "auxiliary_loss_clip": 0.0110369, + "auxiliary_loss_mlp": 0.01040181, + "balance_loss_clip": 1.04454112, + "balance_loss_mlp": 1.02578044, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 2.062783271013308, + "language_loss": 0.88443089, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90586954, + "num_input_tokens_seen": 127188075, + "step": 5917, + "time_per_iteration": 4.068307876586914 + }, + { + "auxiliary_loss_clip": 0.01060391, + "auxiliary_loss_mlp": 0.01043768, + "balance_loss_clip": 1.03784394, + "balance_loss_mlp": 1.02545774, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 1.9866780312897143, + "language_loss": 0.74616444, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76720595, + "num_input_tokens_seen": 127206065, + "step": 5918, + "time_per_iteration": 2.587766647338867 + }, + { + "auxiliary_loss_clip": 0.01049433, + "auxiliary_loss_mlp": 0.01008805, + "balance_loss_clip": 1.0357424, + "balance_loss_mlp": 1.00671887, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 0.9223918898406204, + "language_loss": 0.63796353, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65854591, + "num_input_tokens_seen": 127257885, + "step": 5919, + "time_per_iteration": 2.815065860748291 + }, + { + "auxiliary_loss_clip": 0.01127598, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.05331612, + "balance_loss_mlp": 1.01915276, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 1.9848436526378332, + "language_loss": 0.73342335, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.7550472, + "num_input_tokens_seen": 127275550, + "step": 5920, + "time_per_iteration": 2.482759714126587 + }, + { + "auxiliary_loss_clip": 0.01091619, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_clip": 1.04469681, + "balance_loss_mlp": 1.01678228, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 1.7721609983372189, + "language_loss": 0.77506745, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79630125, + "num_input_tokens_seen": 127295110, + "step": 5921, + "time_per_iteration": 2.567786931991577 + }, + { + "auxiliary_loss_clip": 0.01114688, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.05190778, + "balance_loss_mlp": 1.02568817, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 2.0286403262820674, + "language_loss": 0.67378956, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.69533765, + "num_input_tokens_seen": 127312865, + "step": 5922, + "time_per_iteration": 2.5084846019744873 + }, + { + "auxiliary_loss_clip": 0.01122463, + "auxiliary_loss_mlp": 0.01037212, + "balance_loss_clip": 1.04781818, + "balance_loss_mlp": 1.0218581, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 2.1169228844310815, + "language_loss": 0.78879941, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81039619, + "num_input_tokens_seen": 127331710, + "step": 5923, + "time_per_iteration": 2.4784815311431885 + }, + { + "auxiliary_loss_clip": 0.01115813, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.05129051, + "balance_loss_mlp": 1.02677596, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 1.7133927000692792, + "language_loss": 0.85574329, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87731016, + "num_input_tokens_seen": 127350950, + "step": 5924, + "time_per_iteration": 2.511172294616699 + }, + { + "auxiliary_loss_clip": 0.0111089, + "auxiliary_loss_mlp": 0.01045224, + "balance_loss_clip": 1.04536712, + "balance_loss_mlp": 1.02907085, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 2.0721024549119313, + "language_loss": 0.77422643, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79578757, + "num_input_tokens_seen": 127369385, + "step": 5925, + "time_per_iteration": 2.568741798400879 + }, + { + "auxiliary_loss_clip": 0.01077592, + "auxiliary_loss_mlp": 0.01042917, + "balance_loss_clip": 1.04462123, + "balance_loss_mlp": 1.02845049, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 2.3911777765343127, + "language_loss": 0.762676, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.78388107, + "num_input_tokens_seen": 127386965, + "step": 5926, + "time_per_iteration": 2.5833487510681152 + }, + { + "auxiliary_loss_clip": 0.01108994, + "auxiliary_loss_mlp": 0.00781102, + "balance_loss_clip": 1.0481391, + "balance_loss_mlp": 1.00050294, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.466364295241751, + "language_loss": 0.69121331, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71011424, + "num_input_tokens_seen": 127406075, + "step": 5927, + "time_per_iteration": 2.578113555908203 + }, + { + "auxiliary_loss_clip": 0.0113542, + "auxiliary_loss_mlp": 0.01035197, + "balance_loss_clip": 1.05006218, + "balance_loss_mlp": 1.0210464, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 1.841240369872903, + "language_loss": 0.79757929, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81928545, + "num_input_tokens_seen": 127425350, + "step": 5928, + "time_per_iteration": 2.4535927772521973 + }, + { + "auxiliary_loss_clip": 0.0113645, + "auxiliary_loss_mlp": 0.01037503, + "balance_loss_clip": 1.05054045, + "balance_loss_mlp": 1.02278638, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.3189483906005761, + "language_loss": 0.81881338, + "learning_rate": 2.982297197789215e-06, + "loss": 0.84055293, + "num_input_tokens_seen": 127446335, + "step": 5929, + "time_per_iteration": 2.46044921875 + }, + { + "auxiliary_loss_clip": 0.01120202, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.04613221, + "balance_loss_mlp": 1.02112973, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.840769948114923, + "language_loss": 0.70297641, + "learning_rate": 2.981957928520201e-06, + "loss": 0.72452748, + "num_input_tokens_seen": 127462795, + "step": 5930, + "time_per_iteration": 2.4531450271606445 + }, + { + "auxiliary_loss_clip": 0.01125089, + "auxiliary_loss_mlp": 0.01043404, + "balance_loss_clip": 1.05092478, + "balance_loss_mlp": 1.02821684, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 2.0028014809173276, + "language_loss": 0.67952311, + "learning_rate": 2.981618622015244e-06, + "loss": 0.70120806, + "num_input_tokens_seen": 127482675, + "step": 5931, + "time_per_iteration": 2.587118625640869 + }, + { + "auxiliary_loss_clip": 0.01123512, + "auxiliary_loss_mlp": 0.01034655, + "balance_loss_clip": 1.04928303, + "balance_loss_mlp": 1.0205524, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.6382201886866412, + "language_loss": 0.67263997, + "learning_rate": 2.981279278287211e-06, + "loss": 0.69422162, + "num_input_tokens_seen": 127502275, + "step": 5932, + "time_per_iteration": 4.073146820068359 + }, + { + "auxiliary_loss_clip": 0.01082812, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.04737866, + "balance_loss_mlp": 1.01971221, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 3.74566346065504, + "language_loss": 0.78380227, + "learning_rate": 2.980939897348969e-06, + "loss": 0.80497301, + "num_input_tokens_seen": 127520195, + "step": 5933, + "time_per_iteration": 2.5489420890808105 + }, + { + "auxiliary_loss_clip": 0.01120519, + "auxiliary_loss_mlp": 0.01056635, + "balance_loss_clip": 1.04531014, + "balance_loss_mlp": 1.0401485, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.4469368755395153, + "language_loss": 0.69485378, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71662533, + "num_input_tokens_seen": 127544495, + "step": 5934, + "time_per_iteration": 2.580432176589966 + }, + { + "auxiliary_loss_clip": 0.01116518, + "auxiliary_loss_mlp": 0.00787286, + "balance_loss_clip": 1.04861259, + "balance_loss_mlp": 1.00046301, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 1.7818272406877818, + "language_loss": 0.70846605, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.72750413, + "num_input_tokens_seen": 127563810, + "step": 5935, + "time_per_iteration": 2.5470378398895264 + }, + { + "auxiliary_loss_clip": 0.01108711, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.04898989, + "balance_loss_mlp": 1.02129841, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.9262753450318315, + "language_loss": 0.78063786, + "learning_rate": 2.979921531401692e-06, + "loss": 0.80209273, + "num_input_tokens_seen": 127579065, + "step": 5936, + "time_per_iteration": 2.585658550262451 + }, + { + "auxiliary_loss_clip": 0.01123384, + "auxiliary_loss_mlp": 0.00781639, + "balance_loss_clip": 1.04798424, + "balance_loss_mlp": 1.00052118, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 1.9684136983940574, + "language_loss": 0.64306545, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66211563, + "num_input_tokens_seen": 127599105, + "step": 5937, + "time_per_iteration": 4.052783489227295 + }, + { + "auxiliary_loss_clip": 0.01137558, + "auxiliary_loss_mlp": 0.00780183, + "balance_loss_clip": 1.04958272, + "balance_loss_mlp": 1.0004487, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.5530232674791895, + "language_loss": 0.78530788, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80448532, + "num_input_tokens_seen": 127614940, + "step": 5938, + "time_per_iteration": 2.4057445526123047 + }, + { + "auxiliary_loss_clip": 0.01102117, + "auxiliary_loss_mlp": 0.01044566, + "balance_loss_clip": 1.05284774, + "balance_loss_mlp": 1.029724, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 1.54382299733026, + "language_loss": 0.80587506, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82734191, + "num_input_tokens_seen": 127634960, + "step": 5939, + "time_per_iteration": 2.5786659717559814 + }, + { + "auxiliary_loss_clip": 0.01115861, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.04904246, + "balance_loss_mlp": 1.02095592, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.861847416352224, + "language_loss": 0.78961265, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81113666, + "num_input_tokens_seen": 127654545, + "step": 5940, + "time_per_iteration": 2.610811471939087 + }, + { + "auxiliary_loss_clip": 0.01117629, + "auxiliary_loss_mlp": 0.01035716, + "balance_loss_clip": 1.04928625, + "balance_loss_mlp": 1.01961732, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 2.129829001506825, + "language_loss": 0.72190803, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74344146, + "num_input_tokens_seen": 127672320, + "step": 5941, + "time_per_iteration": 2.5038583278656006 + }, + { + "auxiliary_loss_clip": 0.01129592, + "auxiliary_loss_mlp": 0.01038019, + "balance_loss_clip": 1.05047667, + "balance_loss_mlp": 1.0218184, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 2.2814959101088657, + "language_loss": 0.6452297, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66690582, + "num_input_tokens_seen": 127693315, + "step": 5942, + "time_per_iteration": 4.065264701843262 + }, + { + "auxiliary_loss_clip": 0.01122364, + "auxiliary_loss_mlp": 0.01039291, + "balance_loss_clip": 1.04687321, + "balance_loss_mlp": 1.02381814, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 1.9015271096070647, + "language_loss": 0.74180979, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.7634263, + "num_input_tokens_seen": 127711570, + "step": 5943, + "time_per_iteration": 2.4709486961364746 + }, + { + "auxiliary_loss_clip": 0.01058436, + "auxiliary_loss_mlp": 0.01014472, + "balance_loss_clip": 1.02841723, + "balance_loss_mlp": 1.01279163, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7888404522296322, + "language_loss": 0.6072849, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62801397, + "num_input_tokens_seen": 127772475, + "step": 5944, + "time_per_iteration": 3.141786575317383 + }, + { + "auxiliary_loss_clip": 0.0111327, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.04859769, + "balance_loss_mlp": 1.0192759, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 1.8637322639926504, + "language_loss": 0.72975194, + "learning_rate": 2.976864428379655e-06, + "loss": 0.75123245, + "num_input_tokens_seen": 127790940, + "step": 5945, + "time_per_iteration": 2.5092413425445557 + }, + { + "auxiliary_loss_clip": 0.01112014, + "auxiliary_loss_mlp": 0.00781449, + "balance_loss_clip": 1.04515755, + "balance_loss_mlp": 1.00041533, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.7079275919576586, + "language_loss": 0.81065029, + "learning_rate": 2.976524564880326e-06, + "loss": 0.82958496, + "num_input_tokens_seen": 127808275, + "step": 5946, + "time_per_iteration": 2.55267333984375 + }, + { + "auxiliary_loss_clip": 0.01138191, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_clip": 1.05065012, + "balance_loss_mlp": 1.03025973, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.3567446472845712, + "language_loss": 0.68705845, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.70889926, + "num_input_tokens_seen": 127828840, + "step": 5947, + "time_per_iteration": 2.4544713497161865 + }, + { + "auxiliary_loss_clip": 0.01105921, + "auxiliary_loss_mlp": 0.01043261, + "balance_loss_clip": 1.04352975, + "balance_loss_mlp": 1.02822852, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1.9983516786235977, + "language_loss": 0.7581141, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77960587, + "num_input_tokens_seen": 127846240, + "step": 5948, + "time_per_iteration": 2.5354976654052734 + }, + { + "auxiliary_loss_clip": 0.01084726, + "auxiliary_loss_mlp": 0.01042867, + "balance_loss_clip": 1.05047083, + "balance_loss_mlp": 1.02723825, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.2930515852982976, + "language_loss": 0.70901012, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.73028606, + "num_input_tokens_seen": 127866880, + "step": 5949, + "time_per_iteration": 2.6829187870025635 + }, + { + "auxiliary_loss_clip": 0.01112696, + "auxiliary_loss_mlp": 0.01039594, + "balance_loss_clip": 1.04762244, + "balance_loss_mlp": 1.02509212, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 16.738130962212896, + "language_loss": 0.76967144, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79119426, + "num_input_tokens_seen": 127883560, + "step": 5950, + "time_per_iteration": 2.5207488536834717 + }, + { + "auxiliary_loss_clip": 0.011272, + "auxiliary_loss_mlp": 0.01040486, + "balance_loss_clip": 1.0475924, + "balance_loss_mlp": 1.02458334, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.5933174167721096, + "language_loss": 0.72932088, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.75099766, + "num_input_tokens_seen": 127902330, + "step": 5951, + "time_per_iteration": 2.462242841720581 + }, + { + "auxiliary_loss_clip": 0.01126968, + "auxiliary_loss_mlp": 0.01040537, + "balance_loss_clip": 1.04825568, + "balance_loss_mlp": 1.02499223, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 2.3493908420655267, + "language_loss": 0.70054531, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.7222203, + "num_input_tokens_seen": 127922325, + "step": 5952, + "time_per_iteration": 2.5405852794647217 + }, + { + "auxiliary_loss_clip": 0.01081423, + "auxiliary_loss_mlp": 0.01041565, + "balance_loss_clip": 1.04148293, + "balance_loss_mlp": 1.02568638, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 2.6298977693050722, + "language_loss": 0.69765127, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71888113, + "num_input_tokens_seen": 127942635, + "step": 5953, + "time_per_iteration": 2.728285074234009 + }, + { + "auxiliary_loss_clip": 0.0111013, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.04460263, + "balance_loss_mlp": 1.01922607, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.5600851035156205, + "language_loss": 0.66556889, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68701124, + "num_input_tokens_seen": 127962520, + "step": 5954, + "time_per_iteration": 2.5428247451782227 + }, + { + "auxiliary_loss_clip": 0.01110344, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.04719377, + "balance_loss_mlp": 1.0263586, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 2.205837562599455, + "language_loss": 0.75319302, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.77470684, + "num_input_tokens_seen": 127981180, + "step": 5955, + "time_per_iteration": 2.4933857917785645 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.04716992, + "balance_loss_mlp": 1.02061427, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.4605332638651354, + "language_loss": 0.75881064, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78028953, + "num_input_tokens_seen": 127999725, + "step": 5956, + "time_per_iteration": 2.5095274448394775 + }, + { + "auxiliary_loss_clip": 0.01129236, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.04722524, + "balance_loss_mlp": 1.02218378, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.755775586126365, + "language_loss": 0.73110127, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75275624, + "num_input_tokens_seen": 128018885, + "step": 5957, + "time_per_iteration": 4.0280702114105225 + }, + { + "auxiliary_loss_clip": 0.01112038, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.04928589, + "balance_loss_mlp": 1.02545953, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 1.7702554457460102, + "language_loss": 0.70935565, + "learning_rate": 2.972443318242726e-06, + "loss": 0.73087597, + "num_input_tokens_seen": 128037875, + "step": 5958, + "time_per_iteration": 2.625875949859619 + }, + { + "auxiliary_loss_clip": 0.01095337, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.04545045, + "balance_loss_mlp": 1.01664317, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 1.8786301329378752, + "language_loss": 0.88288498, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90413874, + "num_input_tokens_seen": 128056045, + "step": 5959, + "time_per_iteration": 2.6136465072631836 + }, + { + "auxiliary_loss_clip": 0.01134354, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.04951096, + "balance_loss_mlp": 1.02258611, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.4225769006927729, + "language_loss": 0.57989705, + "learning_rate": 2.971762593615679e-06, + "loss": 0.60161406, + "num_input_tokens_seen": 128077815, + "step": 5960, + "time_per_iteration": 2.5243632793426514 + }, + { + "auxiliary_loss_clip": 0.01134086, + "auxiliary_loss_mlp": 0.01040501, + "balance_loss_clip": 1.04836011, + "balance_loss_mlp": 1.02407408, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 1.960593634032498, + "language_loss": 0.76340485, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78515077, + "num_input_tokens_seen": 128095460, + "step": 5961, + "time_per_iteration": 2.4149117469787598 + }, + { + "auxiliary_loss_clip": 0.01100257, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.04781389, + "balance_loss_mlp": 1.01867867, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.6883293909834216, + "language_loss": 0.70117468, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72251201, + "num_input_tokens_seen": 128118605, + "step": 5962, + "time_per_iteration": 2.6508166790008545 + }, + { + "auxiliary_loss_clip": 0.01117006, + "auxiliary_loss_mlp": 0.01036246, + "balance_loss_clip": 1.05244052, + "balance_loss_mlp": 1.0233115, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.7160044726122936, + "language_loss": 0.74147785, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76301038, + "num_input_tokens_seen": 128139205, + "step": 5963, + "time_per_iteration": 2.5374538898468018 + }, + { + "auxiliary_loss_clip": 0.0113476, + "auxiliary_loss_mlp": 0.01042128, + "balance_loss_clip": 1.05124187, + "balance_loss_mlp": 1.02751279, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.7599629831008532, + "language_loss": 0.78680092, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80856979, + "num_input_tokens_seen": 128158765, + "step": 5964, + "time_per_iteration": 2.452921152114868 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.01036047, + "balance_loss_clip": 1.04872751, + "balance_loss_mlp": 1.02079988, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 1.7533069253252347, + "language_loss": 0.6634385, + "learning_rate": 2.970060137410626e-06, + "loss": 0.68489039, + "num_input_tokens_seen": 128177850, + "step": 5965, + "time_per_iteration": 2.5171539783477783 + }, + { + "auxiliary_loss_clip": 0.01131137, + "auxiliary_loss_mlp": 0.00780038, + "balance_loss_clip": 1.04670095, + "balance_loss_mlp": 1.00042105, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 2.0476564692811454, + "language_loss": 0.78753203, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.80664378, + "num_input_tokens_seen": 128196925, + "step": 5966, + "time_per_iteration": 2.516273260116577 + }, + { + "auxiliary_loss_clip": 0.01079713, + "auxiliary_loss_mlp": 0.01043852, + "balance_loss_clip": 1.04429543, + "balance_loss_mlp": 1.02718699, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 2.2238515498841522, + "language_loss": 0.90711737, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.92835301, + "num_input_tokens_seen": 128213955, + "step": 5967, + "time_per_iteration": 2.645979881286621 + }, + { + "auxiliary_loss_clip": 0.01097328, + "auxiliary_loss_mlp": 0.01049097, + "balance_loss_clip": 1.04444194, + "balance_loss_mlp": 1.03156185, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 2.2320305976045725, + "language_loss": 0.80401123, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.82547545, + "num_input_tokens_seen": 128232980, + "step": 5968, + "time_per_iteration": 2.5498862266540527 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01053688, + "balance_loss_clip": 1.04414511, + "balance_loss_mlp": 1.03730869, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.2582312856395705, + "language_loss": 0.85010028, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.87173223, + "num_input_tokens_seen": 128252795, + "step": 5969, + "time_per_iteration": 2.544158458709717 + }, + { + "auxiliary_loss_clip": 0.01088287, + "auxiliary_loss_mlp": 0.01037198, + "balance_loss_clip": 1.04282308, + "balance_loss_mlp": 1.02312541, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.9446981793524638, + "language_loss": 0.72204149, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74329638, + "num_input_tokens_seen": 128273115, + "step": 5970, + "time_per_iteration": 2.6376452445983887 + }, + { + "auxiliary_loss_clip": 0.01109086, + "auxiliary_loss_mlp": 0.01037176, + "balance_loss_clip": 1.04548383, + "balance_loss_mlp": 1.02281094, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.7810128305019464, + "language_loss": 0.79639119, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81785381, + "num_input_tokens_seen": 128292220, + "step": 5971, + "time_per_iteration": 2.5083189010620117 + }, + { + "auxiliary_loss_clip": 0.01099623, + "auxiliary_loss_mlp": 0.01041068, + "balance_loss_clip": 1.05071235, + "balance_loss_mlp": 1.025249, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 2.4865874900322344, + "language_loss": 0.78529507, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80670202, + "num_input_tokens_seen": 128310305, + "step": 5972, + "time_per_iteration": 3.9766573905944824 + }, + { + "auxiliary_loss_clip": 0.01091344, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.0403018, + "balance_loss_mlp": 1.02180016, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 3.148461886320178, + "language_loss": 0.81832486, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83960605, + "num_input_tokens_seen": 128328305, + "step": 5973, + "time_per_iteration": 2.5166358947753906 + }, + { + "auxiliary_loss_clip": 0.010416, + "auxiliary_loss_mlp": 0.01003213, + "balance_loss_clip": 1.02981734, + "balance_loss_mlp": 1.00122249, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.94326579577721, + "language_loss": 0.56789398, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58834207, + "num_input_tokens_seen": 128378380, + "step": 5974, + "time_per_iteration": 2.9474754333496094 + }, + { + "auxiliary_loss_clip": 0.01122242, + "auxiliary_loss_mlp": 0.01041474, + "balance_loss_clip": 1.0454998, + "balance_loss_mlp": 1.0275147, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.7264974436725105, + "language_loss": 0.68633991, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.70797706, + "num_input_tokens_seen": 128394315, + "step": 5975, + "time_per_iteration": 2.4668612480163574 + }, + { + "auxiliary_loss_clip": 0.01130558, + "auxiliary_loss_mlp": 0.01037372, + "balance_loss_clip": 1.04689741, + "balance_loss_mlp": 1.02415168, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.6643420765417023, + "language_loss": 0.80092263, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82260185, + "num_input_tokens_seen": 128414515, + "step": 5976, + "time_per_iteration": 3.972522020339966 + }, + { + "auxiliary_loss_clip": 0.01070449, + "auxiliary_loss_mlp": 0.01042708, + "balance_loss_clip": 1.03909016, + "balance_loss_mlp": 1.02711523, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 3.217371918491848, + "language_loss": 0.79211676, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81324834, + "num_input_tokens_seen": 128430615, + "step": 5977, + "time_per_iteration": 2.582965850830078 + }, + { + "auxiliary_loss_clip": 0.01092431, + "auxiliary_loss_mlp": 0.01040191, + "balance_loss_clip": 1.04440784, + "balance_loss_mlp": 1.02637434, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 1.7950289644047865, + "language_loss": 0.80133522, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.82266146, + "num_input_tokens_seen": 128449480, + "step": 5978, + "time_per_iteration": 2.6060378551483154 + }, + { + "auxiliary_loss_clip": 0.01132834, + "auxiliary_loss_mlp": 0.0078222, + "balance_loss_clip": 1.04770398, + "balance_loss_mlp": 1.00054348, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.4948533484775932, + "language_loss": 0.6790784, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69822896, + "num_input_tokens_seen": 128471465, + "step": 5979, + "time_per_iteration": 2.5459022521972656 + }, + { + "auxiliary_loss_clip": 0.01104124, + "auxiliary_loss_mlp": 0.01039865, + "balance_loss_clip": 1.04311311, + "balance_loss_mlp": 1.02456498, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 1.962994168487873, + "language_loss": 0.6738143, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69525421, + "num_input_tokens_seen": 128490645, + "step": 5980, + "time_per_iteration": 2.5211448669433594 + }, + { + "auxiliary_loss_clip": 0.01109787, + "auxiliary_loss_mlp": 0.01044492, + "balance_loss_clip": 1.04501009, + "balance_loss_mlp": 1.02907848, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 1.8381949735697578, + "language_loss": 0.71203244, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73357522, + "num_input_tokens_seen": 128510225, + "step": 5981, + "time_per_iteration": 4.001246452331543 + }, + { + "auxiliary_loss_clip": 0.01109712, + "auxiliary_loss_mlp": 0.01045534, + "balance_loss_clip": 1.05056632, + "balance_loss_mlp": 1.02903509, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 1.6896778828916184, + "language_loss": 0.71219802, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73375046, + "num_input_tokens_seen": 128530195, + "step": 5982, + "time_per_iteration": 2.588282585144043 + }, + { + "auxiliary_loss_clip": 0.0111412, + "auxiliary_loss_mlp": 0.01051401, + "balance_loss_clip": 1.04437709, + "balance_loss_mlp": 1.03591537, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.6194428251249642, + "language_loss": 0.75863361, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.78028882, + "num_input_tokens_seen": 128549990, + "step": 5983, + "time_per_iteration": 2.489670515060425 + }, + { + "auxiliary_loss_clip": 0.01139481, + "auxiliary_loss_mlp": 0.01046367, + "balance_loss_clip": 1.05028367, + "balance_loss_mlp": 1.02985644, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 2.97235385288475, + "language_loss": 0.76283681, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78469527, + "num_input_tokens_seen": 128567925, + "step": 5984, + "time_per_iteration": 2.405806303024292 + }, + { + "auxiliary_loss_clip": 0.01118867, + "auxiliary_loss_mlp": 0.00780957, + "balance_loss_clip": 1.04608285, + "balance_loss_mlp": 1.00041616, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.9654656370171428, + "language_loss": 0.86027122, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.87926948, + "num_input_tokens_seen": 128585655, + "step": 5985, + "time_per_iteration": 2.4729456901550293 + }, + { + "auxiliary_loss_clip": 0.01119725, + "auxiliary_loss_mlp": 0.01042229, + "balance_loss_clip": 1.04673505, + "balance_loss_mlp": 1.02661264, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.6620195050894069, + "language_loss": 0.72473341, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74635291, + "num_input_tokens_seen": 128604820, + "step": 5986, + "time_per_iteration": 2.4540646076202393 + }, + { + "auxiliary_loss_clip": 0.01103129, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.04299867, + "balance_loss_mlp": 1.0234189, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 2.2889429129353127, + "language_loss": 0.73570931, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75713003, + "num_input_tokens_seen": 128623070, + "step": 5987, + "time_per_iteration": 2.5272250175476074 + }, + { + "auxiliary_loss_clip": 0.01138654, + "auxiliary_loss_mlp": 0.01039831, + "balance_loss_clip": 1.05095375, + "balance_loss_mlp": 1.0240593, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 4.3888370921154936, + "language_loss": 0.69888884, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.72067368, + "num_input_tokens_seen": 128642430, + "step": 5988, + "time_per_iteration": 2.424828290939331 + }, + { + "auxiliary_loss_clip": 0.01127794, + "auxiliary_loss_mlp": 0.01038248, + "balance_loss_clip": 1.04925585, + "balance_loss_mlp": 1.02323985, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 1.8194450118819099, + "language_loss": 0.7338587, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75551915, + "num_input_tokens_seen": 128661285, + "step": 5989, + "time_per_iteration": 2.4564433097839355 + }, + { + "auxiliary_loss_clip": 0.01090032, + "auxiliary_loss_mlp": 0.01038207, + "balance_loss_clip": 1.04031157, + "balance_loss_mlp": 1.02282953, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.5191410678409707, + "language_loss": 0.79969215, + "learning_rate": 2.961534094403931e-06, + "loss": 0.82097447, + "num_input_tokens_seen": 128682210, + "step": 5990, + "time_per_iteration": 2.5874931812286377 + }, + { + "auxiliary_loss_clip": 0.01125337, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.04926372, + "balance_loss_mlp": 1.01944208, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 1.7591340846679306, + "language_loss": 0.84137923, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86297834, + "num_input_tokens_seen": 128700445, + "step": 5991, + "time_per_iteration": 2.466567277908325 + }, + { + "auxiliary_loss_clip": 0.01109979, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.0481112, + "balance_loss_mlp": 1.02734566, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 1.9869042741203613, + "language_loss": 0.75522071, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77674824, + "num_input_tokens_seen": 128716855, + "step": 5992, + "time_per_iteration": 2.47648024559021 + }, + { + "auxiliary_loss_clip": 0.01134421, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.04938364, + "balance_loss_mlp": 1.02167845, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 2.712086756764766, + "language_loss": 0.77781844, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79953665, + "num_input_tokens_seen": 128735835, + "step": 5993, + "time_per_iteration": 2.423848867416382 + }, + { + "auxiliary_loss_clip": 0.01111227, + "auxiliary_loss_mlp": 0.01052015, + "balance_loss_clip": 1.04599357, + "balance_loss_mlp": 1.03431273, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 2.1635569595367423, + "language_loss": 0.74311477, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.7647472, + "num_input_tokens_seen": 128752465, + "step": 5994, + "time_per_iteration": 2.4763267040252686 + }, + { + "auxiliary_loss_clip": 0.01096869, + "auxiliary_loss_mlp": 0.01039143, + "balance_loss_clip": 1.04945195, + "balance_loss_mlp": 1.02413476, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.8060901459992524, + "language_loss": 0.6883989, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.70975906, + "num_input_tokens_seen": 128770865, + "step": 5995, + "time_per_iteration": 2.542767286300659 + }, + { + "auxiliary_loss_clip": 0.01106413, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_clip": 1.0434258, + "balance_loss_mlp": 1.0301224, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 1.7366135228492352, + "language_loss": 0.82322478, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84475803, + "num_input_tokens_seen": 128789730, + "step": 5996, + "time_per_iteration": 3.971369504928589 + }, + { + "auxiliary_loss_clip": 0.01133373, + "auxiliary_loss_mlp": 0.01041346, + "balance_loss_clip": 1.04785526, + "balance_loss_mlp": 1.02584267, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.9050625431164419, + "language_loss": 0.73630011, + "learning_rate": 2.959142709981763e-06, + "loss": 0.75804734, + "num_input_tokens_seen": 128806610, + "step": 5997, + "time_per_iteration": 2.4310715198516846 + }, + { + "auxiliary_loss_clip": 0.0112273, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.04830444, + "balance_loss_mlp": 1.02054131, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.109972267511052, + "language_loss": 0.69107175, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.71264946, + "num_input_tokens_seen": 128824830, + "step": 5998, + "time_per_iteration": 2.458409547805786 + }, + { + "auxiliary_loss_clip": 0.01090179, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_clip": 1.0466572, + "balance_loss_mlp": 1.02711058, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.3143087170611225, + "language_loss": 0.77373278, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.79508203, + "num_input_tokens_seen": 128838170, + "step": 5999, + "time_per_iteration": 2.552027463912964 + }, + { + "auxiliary_loss_clip": 0.01098328, + "auxiliary_loss_mlp": 0.01043366, + "balance_loss_clip": 1.04693687, + "balance_loss_mlp": 1.02838182, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 2.2156537849709386, + "language_loss": 0.7795068, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80092376, + "num_input_tokens_seen": 128855625, + "step": 6000, + "time_per_iteration": 2.525974988937378 + }, + { + "auxiliary_loss_clip": 0.01096056, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.0470612, + "balance_loss_mlp": 1.01937318, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.6323778509666345, + "language_loss": 0.78010201, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80140555, + "num_input_tokens_seen": 128873540, + "step": 6001, + "time_per_iteration": 2.5308971405029297 + }, + { + "auxiliary_loss_clip": 0.01131398, + "auxiliary_loss_mlp": 0.00781141, + "balance_loss_clip": 1.04753757, + "balance_loss_mlp": 1.00055075, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 2.31561869765529, + "language_loss": 0.83284074, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85196614, + "num_input_tokens_seen": 128889925, + "step": 6002, + "time_per_iteration": 2.5152366161346436 + }, + { + "auxiliary_loss_clip": 0.0110408, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.04307556, + "balance_loss_mlp": 1.02267909, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.132463562918525, + "language_loss": 0.90538973, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92680037, + "num_input_tokens_seen": 128906890, + "step": 6003, + "time_per_iteration": 2.5175888538360596 + }, + { + "auxiliary_loss_clip": 0.01028124, + "auxiliary_loss_mlp": 0.01012108, + "balance_loss_clip": 1.0287323, + "balance_loss_mlp": 1.01021278, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8612744696403271, + "language_loss": 0.53329968, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55370212, + "num_input_tokens_seen": 128965940, + "step": 6004, + "time_per_iteration": 3.048940420150757 + }, + { + "auxiliary_loss_clip": 0.01114087, + "auxiliary_loss_mlp": 0.00783044, + "balance_loss_clip": 1.04845977, + "balance_loss_mlp": 1.00049138, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.6919366244272072, + "language_loss": 0.77731305, + "learning_rate": 2.956407517225883e-06, + "loss": 0.79628432, + "num_input_tokens_seen": 128985835, + "step": 6005, + "time_per_iteration": 2.5377376079559326 + }, + { + "auxiliary_loss_clip": 0.01114955, + "auxiliary_loss_mlp": 0.01041683, + "balance_loss_clip": 1.04427803, + "balance_loss_mlp": 1.02617419, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 1.939672090162318, + "language_loss": 0.79832673, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81989306, + "num_input_tokens_seen": 129003120, + "step": 6006, + "time_per_iteration": 2.4808642864227295 + }, + { + "auxiliary_loss_clip": 0.01138967, + "auxiliary_loss_mlp": 0.01035511, + "balance_loss_clip": 1.05017066, + "balance_loss_mlp": 1.01861942, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 2.0550390773795857, + "language_loss": 0.84370673, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86545146, + "num_input_tokens_seen": 129021645, + "step": 6007, + "time_per_iteration": 2.465888023376465 + }, + { + "auxiliary_loss_clip": 0.01119224, + "auxiliary_loss_mlp": 0.01036305, + "balance_loss_clip": 1.04903674, + "balance_loss_mlp": 1.01897204, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 2.2642207661405034, + "language_loss": 0.72196001, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74351525, + "num_input_tokens_seen": 129038375, + "step": 6008, + "time_per_iteration": 2.5263211727142334 + }, + { + "auxiliary_loss_clip": 0.01118306, + "auxiliary_loss_mlp": 0.01037424, + "balance_loss_clip": 1.04199719, + "balance_loss_mlp": 1.02170014, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 2.1524121074461164, + "language_loss": 0.82558376, + "learning_rate": 2.955039050023368e-06, + "loss": 0.84714103, + "num_input_tokens_seen": 129056235, + "step": 6009, + "time_per_iteration": 2.465827226638794 + }, + { + "auxiliary_loss_clip": 0.01105501, + "auxiliary_loss_mlp": 0.01048571, + "balance_loss_clip": 1.04754043, + "balance_loss_mlp": 1.03265667, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 1.7647226544724957, + "language_loss": 0.76442003, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78596073, + "num_input_tokens_seen": 129072405, + "step": 6010, + "time_per_iteration": 2.533473253250122 + }, + { + "auxiliary_loss_clip": 0.01110033, + "auxiliary_loss_mlp": 0.01038916, + "balance_loss_clip": 1.05033863, + "balance_loss_mlp": 1.02400303, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.6880411214137911, + "language_loss": 0.82981408, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85130358, + "num_input_tokens_seen": 129090225, + "step": 6011, + "time_per_iteration": 2.511796236038208 + }, + { + "auxiliary_loss_clip": 0.0113985, + "auxiliary_loss_mlp": 0.01039358, + "balance_loss_clip": 1.04874372, + "balance_loss_mlp": 1.02321744, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 4.029447831804395, + "language_loss": 0.62836897, + "learning_rate": 2.954012319316727e-06, + "loss": 0.65016103, + "num_input_tokens_seen": 129107685, + "step": 6012, + "time_per_iteration": 3.87965989112854 + }, + { + "auxiliary_loss_clip": 0.01107435, + "auxiliary_loss_mlp": 0.01036921, + "balance_loss_clip": 1.04523039, + "balance_loss_mlp": 1.02163792, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 2.0892250802921204, + "language_loss": 0.84024131, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86168486, + "num_input_tokens_seen": 129125315, + "step": 6013, + "time_per_iteration": 2.5371625423431396 + }, + { + "auxiliary_loss_clip": 0.01134053, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.0466938, + "balance_loss_mlp": 1.02270937, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 1.781249728091132, + "language_loss": 0.91652441, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93825573, + "num_input_tokens_seen": 129141600, + "step": 6014, + "time_per_iteration": 2.4447343349456787 + }, + { + "auxiliary_loss_clip": 0.01132631, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.04578114, + "balance_loss_mlp": 1.02656507, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 2.0827306034423323, + "language_loss": 0.73544401, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75719166, + "num_input_tokens_seen": 129160665, + "step": 6015, + "time_per_iteration": 2.436842203140259 + }, + { + "auxiliary_loss_clip": 0.01060433, + "auxiliary_loss_mlp": 0.01044018, + "balance_loss_clip": 1.04221916, + "balance_loss_mlp": 1.02486134, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 1.7011811959681944, + "language_loss": 0.65110075, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67214525, + "num_input_tokens_seen": 129179220, + "step": 6016, + "time_per_iteration": 4.255064010620117 + }, + { + "auxiliary_loss_clip": 0.01126165, + "auxiliary_loss_mlp": 0.01041443, + "balance_loss_clip": 1.04915905, + "balance_loss_mlp": 1.02356195, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 1.7847714433944568, + "language_loss": 0.71903479, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74071097, + "num_input_tokens_seen": 129200385, + "step": 6017, + "time_per_iteration": 2.6338460445404053 + }, + { + "auxiliary_loss_clip": 0.01123289, + "auxiliary_loss_mlp": 0.01039473, + "balance_loss_clip": 1.04544699, + "balance_loss_mlp": 1.02341521, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 1.7540163066488743, + "language_loss": 0.73494935, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75657696, + "num_input_tokens_seen": 129217395, + "step": 6018, + "time_per_iteration": 2.458263397216797 + }, + { + "auxiliary_loss_clip": 0.01090206, + "auxiliary_loss_mlp": 0.01038912, + "balance_loss_clip": 1.04882395, + "balance_loss_mlp": 1.02295005, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.8722915569218546, + "language_loss": 0.69095457, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.7122457, + "num_input_tokens_seen": 129238940, + "step": 6019, + "time_per_iteration": 2.5811734199523926 + }, + { + "auxiliary_loss_clip": 0.01114769, + "auxiliary_loss_mlp": 0.01037528, + "balance_loss_clip": 1.04556644, + "balance_loss_mlp": 1.02104139, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 1.672388886755521, + "language_loss": 0.7645756, + "learning_rate": 2.95127277996311e-06, + "loss": 0.7860986, + "num_input_tokens_seen": 129258240, + "step": 6020, + "time_per_iteration": 2.5090978145599365 + }, + { + "auxiliary_loss_clip": 0.01126039, + "auxiliary_loss_mlp": 0.0104229, + "balance_loss_clip": 1.0493567, + "balance_loss_mlp": 1.02543414, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 2.05997246106009, + "language_loss": 0.74290198, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.76458526, + "num_input_tokens_seen": 129279040, + "step": 6021, + "time_per_iteration": 3.9175631999969482 + }, + { + "auxiliary_loss_clip": 0.01094441, + "auxiliary_loss_mlp": 0.01035359, + "balance_loss_clip": 1.04530621, + "balance_loss_mlp": 1.0209291, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 1.6329665878218347, + "language_loss": 0.81157935, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.8328774, + "num_input_tokens_seen": 129295415, + "step": 6022, + "time_per_iteration": 2.4995460510253906 + }, + { + "auxiliary_loss_clip": 0.01122476, + "auxiliary_loss_mlp": 0.01040818, + "balance_loss_clip": 1.05047321, + "balance_loss_mlp": 1.02640581, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.8322839299100222, + "language_loss": 0.81762624, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83925915, + "num_input_tokens_seen": 129312620, + "step": 6023, + "time_per_iteration": 2.5004208087921143 + }, + { + "auxiliary_loss_clip": 0.0111168, + "auxiliary_loss_mlp": 0.01040567, + "balance_loss_clip": 1.04634213, + "balance_loss_mlp": 1.02427149, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 4.018649408011186, + "language_loss": 0.7961235, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81764597, + "num_input_tokens_seen": 129331825, + "step": 6024, + "time_per_iteration": 2.5265870094299316 + }, + { + "auxiliary_loss_clip": 0.01098442, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.04046416, + "balance_loss_mlp": 1.0226779, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.8806759141137042, + "language_loss": 0.75699782, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.77836937, + "num_input_tokens_seen": 129350400, + "step": 6025, + "time_per_iteration": 2.4890482425689697 + }, + { + "auxiliary_loss_clip": 0.01118761, + "auxiliary_loss_mlp": 0.0078121, + "balance_loss_clip": 1.045367, + "balance_loss_mlp": 1.00041306, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.6170278738953066, + "language_loss": 0.72580183, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74480152, + "num_input_tokens_seen": 129371155, + "step": 6026, + "time_per_iteration": 2.4827656745910645 + }, + { + "auxiliary_loss_clip": 0.01130543, + "auxiliary_loss_mlp": 0.01047467, + "balance_loss_clip": 1.05025923, + "balance_loss_mlp": 1.03074217, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 2.2587603385566815, + "language_loss": 0.79255366, + "learning_rate": 2.948873789002833e-06, + "loss": 0.8143338, + "num_input_tokens_seen": 129391230, + "step": 6027, + "time_per_iteration": 2.534209966659546 + }, + { + "auxiliary_loss_clip": 0.01114238, + "auxiliary_loss_mlp": 0.0104235, + "balance_loss_clip": 1.04781628, + "balance_loss_mlp": 1.02587533, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 1.8254002509307488, + "language_loss": 0.67240226, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.69396818, + "num_input_tokens_seen": 129410065, + "step": 6028, + "time_per_iteration": 2.573849678039551 + }, + { + "auxiliary_loss_clip": 0.01093002, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.04435277, + "balance_loss_mlp": 1.02117074, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 2.5799830684459595, + "language_loss": 0.854976, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.8762635, + "num_input_tokens_seen": 129428655, + "step": 6029, + "time_per_iteration": 2.5202181339263916 + }, + { + "auxiliary_loss_clip": 0.01097957, + "auxiliary_loss_mlp": 0.01040132, + "balance_loss_clip": 1.04696083, + "balance_loss_mlp": 1.02525437, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.812469463223319, + "language_loss": 0.7344718, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.7558527, + "num_input_tokens_seen": 129447845, + "step": 6030, + "time_per_iteration": 2.52024245262146 + }, + { + "auxiliary_loss_clip": 0.01117045, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.0523138, + "balance_loss_mlp": 1.02572489, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.5157470180839945, + "language_loss": 0.74197674, + "learning_rate": 2.94750214514905e-06, + "loss": 0.7635892, + "num_input_tokens_seen": 129463275, + "step": 6031, + "time_per_iteration": 2.4832825660705566 + }, + { + "auxiliary_loss_clip": 0.0109309, + "auxiliary_loss_mlp": 0.0103808, + "balance_loss_clip": 1.04351914, + "balance_loss_mlp": 1.02214205, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.921039552749758, + "language_loss": 0.73415947, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75547123, + "num_input_tokens_seen": 129483205, + "step": 6032, + "time_per_iteration": 2.541780710220337 + }, + { + "auxiliary_loss_clip": 0.01087098, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.04391253, + "balance_loss_mlp": 1.02621806, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 1.8317507757635896, + "language_loss": 0.77804726, + "learning_rate": 2.946816107593884e-06, + "loss": 0.79933405, + "num_input_tokens_seen": 129499885, + "step": 6033, + "time_per_iteration": 2.5479331016540527 + }, + { + "auxiliary_loss_clip": 0.01026691, + "auxiliary_loss_mlp": 0.01006266, + "balance_loss_clip": 1.03868818, + "balance_loss_mlp": 1.00445449, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.7995517886736364, + "language_loss": 0.64804721, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.6683768, + "num_input_tokens_seen": 129561885, + "step": 6034, + "time_per_iteration": 3.1966967582702637 + }, + { + "auxiliary_loss_clip": 0.01119075, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.04601932, + "balance_loss_mlp": 1.02227378, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 1.6024619565021034, + "language_loss": 0.89820892, + "learning_rate": 2.946129926425273e-06, + "loss": 0.91978812, + "num_input_tokens_seen": 129582325, + "step": 6035, + "time_per_iteration": 2.5182857513427734 + }, + { + "auxiliary_loss_clip": 0.01116, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.05295253, + "balance_loss_mlp": 1.01969039, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 2.3306553793258056, + "language_loss": 0.73902941, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.7605449, + "num_input_tokens_seen": 129600350, + "step": 6036, + "time_per_iteration": 4.0156097412109375 + }, + { + "auxiliary_loss_clip": 0.01119613, + "auxiliary_loss_mlp": 0.01034255, + "balance_loss_clip": 1.04788899, + "balance_loss_mlp": 1.01791096, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 1.8744863492284705, + "language_loss": 0.75441337, + "learning_rate": 2.945443601747297e-06, + "loss": 0.77595204, + "num_input_tokens_seen": 129618425, + "step": 6037, + "time_per_iteration": 2.5072057247161865 + }, + { + "auxiliary_loss_clip": 0.01115656, + "auxiliary_loss_mlp": 0.01057806, + "balance_loss_clip": 1.04521537, + "balance_loss_mlp": 1.03986514, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.619359145015846, + "language_loss": 0.78024244, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80197716, + "num_input_tokens_seen": 129636750, + "step": 6038, + "time_per_iteration": 2.4695873260498047 + }, + { + "auxiliary_loss_clip": 0.01082109, + "auxiliary_loss_mlp": 0.01008094, + "balance_loss_clip": 1.06713009, + "balance_loss_mlp": 1.00603151, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8340028839354057, + "language_loss": 0.63500631, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65590835, + "num_input_tokens_seen": 129699030, + "step": 6039, + "time_per_iteration": 3.120103597640991 + }, + { + "auxiliary_loss_clip": 0.01107822, + "auxiliary_loss_mlp": 0.01047406, + "balance_loss_clip": 1.0470109, + "balance_loss_mlp": 1.0307163, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 2.120959894860074, + "language_loss": 0.71089262, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73244488, + "num_input_tokens_seen": 129717135, + "step": 6040, + "time_per_iteration": 2.5223443508148193 + }, + { + "auxiliary_loss_clip": 0.01129428, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.05155516, + "balance_loss_mlp": 1.0214808, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 1.7860053722401388, + "language_loss": 0.8127926, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83445799, + "num_input_tokens_seen": 129735940, + "step": 6041, + "time_per_iteration": 2.4832725524902344 + }, + { + "auxiliary_loss_clip": 0.01114299, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.04681754, + "balance_loss_mlp": 1.01835847, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 2.06246918856876, + "language_loss": 0.83671725, + "learning_rate": 2.943727162882107e-06, + "loss": 0.85821253, + "num_input_tokens_seen": 129752790, + "step": 6042, + "time_per_iteration": 2.48183012008667 + }, + { + "auxiliary_loss_clip": 0.01115065, + "auxiliary_loss_mlp": 0.01045047, + "balance_loss_clip": 1.05016637, + "balance_loss_mlp": 1.0298003, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.7433929591616828, + "language_loss": 0.78283572, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80443686, + "num_input_tokens_seen": 129773655, + "step": 6043, + "time_per_iteration": 2.517057418823242 + }, + { + "auxiliary_loss_clip": 0.01109948, + "auxiliary_loss_mlp": 0.01040186, + "balance_loss_clip": 1.0543586, + "balance_loss_mlp": 1.02390194, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 2.1323360274987166, + "language_loss": 0.65377057, + "learning_rate": 2.943040336741298e-06, + "loss": 0.67527187, + "num_input_tokens_seen": 129791605, + "step": 6044, + "time_per_iteration": 2.519024610519409 + }, + { + "auxiliary_loss_clip": 0.0110931, + "auxiliary_loss_mlp": 0.01038967, + "balance_loss_clip": 1.04943824, + "balance_loss_mlp": 1.02325559, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 1.7057607323620663, + "language_loss": 0.80732298, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.82880569, + "num_input_tokens_seen": 129811075, + "step": 6045, + "time_per_iteration": 2.542829990386963 + }, + { + "auxiliary_loss_clip": 0.01103005, + "auxiliary_loss_mlp": 0.01042676, + "balance_loss_clip": 1.04711103, + "balance_loss_mlp": 1.02623725, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 2.094782354023505, + "language_loss": 0.64705473, + "learning_rate": 2.942353367559755e-06, + "loss": 0.66851151, + "num_input_tokens_seen": 129833755, + "step": 6046, + "time_per_iteration": 2.6239919662475586 + }, + { + "auxiliary_loss_clip": 0.0110129, + "auxiliary_loss_mlp": 0.01043044, + "balance_loss_clip": 1.04571891, + "balance_loss_mlp": 1.02726054, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 2.433019625911653, + "language_loss": 0.77792966, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.79937303, + "num_input_tokens_seen": 129854475, + "step": 6047, + "time_per_iteration": 2.5535855293273926 + }, + { + "auxiliary_loss_clip": 0.01135046, + "auxiliary_loss_mlp": 0.01046436, + "balance_loss_clip": 1.04965806, + "balance_loss_mlp": 1.0283047, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.672564201940134, + "language_loss": 0.79424942, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81606424, + "num_input_tokens_seen": 129873530, + "step": 6048, + "time_per_iteration": 2.508078098297119 + }, + { + "auxiliary_loss_clip": 0.01051472, + "auxiliary_loss_mlp": 0.01019855, + "balance_loss_clip": 1.03316152, + "balance_loss_mlp": 1.01801956, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.7586586677644631, + "language_loss": 0.52624357, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54695678, + "num_input_tokens_seen": 129940400, + "step": 6049, + "time_per_iteration": 3.1137828826904297 + }, + { + "auxiliary_loss_clip": 0.01104144, + "auxiliary_loss_mlp": 0.01041148, + "balance_loss_clip": 1.0475812, + "balance_loss_mlp": 1.02432716, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 1.8232442450150044, + "language_loss": 0.86366165, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88511455, + "num_input_tokens_seen": 129958635, + "step": 6050, + "time_per_iteration": 2.585118532180786 + }, + { + "auxiliary_loss_clip": 0.01124049, + "auxiliary_loss_mlp": 0.00781254, + "balance_loss_clip": 1.05518913, + "balance_loss_mlp": 1.00043321, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 1.839538770472371, + "language_loss": 0.78625512, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80530822, + "num_input_tokens_seen": 129977685, + "step": 6051, + "time_per_iteration": 3.9353041648864746 + }, + { + "auxiliary_loss_clip": 0.01126455, + "auxiliary_loss_mlp": 0.01039595, + "balance_loss_clip": 1.04879725, + "balance_loss_mlp": 1.02446151, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 2.0503536350648446, + "language_loss": 0.82253635, + "learning_rate": 2.940291602812822e-06, + "loss": 0.8441968, + "num_input_tokens_seen": 129997530, + "step": 6052, + "time_per_iteration": 2.5358309745788574 + }, + { + "auxiliary_loss_clip": 0.0109701, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.04484677, + "balance_loss_mlp": 1.02238369, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 1.8144561864091457, + "language_loss": 0.72133654, + "learning_rate": 2.939947850483145e-06, + "loss": 0.7426697, + "num_input_tokens_seen": 130017955, + "step": 6053, + "time_per_iteration": 2.568587303161621 + }, + { + "auxiliary_loss_clip": 0.01019727, + "auxiliary_loss_mlp": 0.01056666, + "balance_loss_clip": 1.03427446, + "balance_loss_mlp": 1.05435324, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7958659451225798, + "language_loss": 0.61234707, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.633111, + "num_input_tokens_seen": 130074275, + "step": 6054, + "time_per_iteration": 4.709692478179932 + }, + { + "auxiliary_loss_clip": 0.01111162, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.04809785, + "balance_loss_mlp": 1.02409792, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 9.686235077889783, + "language_loss": 0.75813997, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.77965945, + "num_input_tokens_seen": 130091375, + "step": 6055, + "time_per_iteration": 2.6272006034851074 + }, + { + "auxiliary_loss_clip": 0.01137821, + "auxiliary_loss_mlp": 0.01044229, + "balance_loss_clip": 1.05211639, + "balance_loss_mlp": 1.02857673, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.7640075226776162, + "language_loss": 0.75093877, + "learning_rate": 2.938916379688765e-06, + "loss": 0.77275932, + "num_input_tokens_seen": 130111595, + "step": 6056, + "time_per_iteration": 2.5093305110931396 + }, + { + "auxiliary_loss_clip": 0.01115101, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.05010557, + "balance_loss_mlp": 1.02601588, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 1.715529207827451, + "language_loss": 0.80006015, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82162452, + "num_input_tokens_seen": 130131440, + "step": 6057, + "time_per_iteration": 2.555555582046509 + }, + { + "auxiliary_loss_clip": 0.01108973, + "auxiliary_loss_mlp": 0.01039803, + "balance_loss_clip": 1.04660225, + "balance_loss_mlp": 1.02410316, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 2.1494702522387703, + "language_loss": 0.80162632, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.8231141, + "num_input_tokens_seen": 130151375, + "step": 6058, + "time_per_iteration": 2.6806867122650146 + }, + { + "auxiliary_loss_clip": 0.01105063, + "auxiliary_loss_mlp": 0.00781566, + "balance_loss_clip": 1.04573011, + "balance_loss_mlp": 1.00034833, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 1.7093979305693983, + "language_loss": 0.8518033, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.8706696, + "num_input_tokens_seen": 130169960, + "step": 6059, + "time_per_iteration": 2.5496487617492676 + }, + { + "auxiliary_loss_clip": 0.0109682, + "auxiliary_loss_mlp": 0.01046815, + "balance_loss_clip": 1.04528427, + "balance_loss_mlp": 1.02976799, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.6531755384618296, + "language_loss": 0.87968576, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90112209, + "num_input_tokens_seen": 130189800, + "step": 6060, + "time_per_iteration": 3.9520890712738037 + }, + { + "auxiliary_loss_clip": 0.01128614, + "auxiliary_loss_mlp": 0.01042822, + "balance_loss_clip": 1.0532192, + "balance_loss_mlp": 1.02620423, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 2.07829994416909, + "language_loss": 0.67850757, + "learning_rate": 2.937196549795971e-06, + "loss": 0.7002219, + "num_input_tokens_seen": 130206370, + "step": 6061, + "time_per_iteration": 2.485919237136841 + }, + { + "auxiliary_loss_clip": 0.01121352, + "auxiliary_loss_mlp": 0.01043571, + "balance_loss_clip": 1.05404639, + "balance_loss_mlp": 1.02733421, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.0050605458750175, + "language_loss": 0.75452369, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.776173, + "num_input_tokens_seen": 130224445, + "step": 6062, + "time_per_iteration": 2.5079050064086914 + }, + { + "auxiliary_loss_clip": 0.01116379, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.0565846, + "balance_loss_mlp": 1.01732254, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 1.647206673416066, + "language_loss": 0.72252929, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74403435, + "num_input_tokens_seen": 130245380, + "step": 6063, + "time_per_iteration": 2.5952045917510986 + }, + { + "auxiliary_loss_clip": 0.01122837, + "auxiliary_loss_mlp": 0.01044744, + "balance_loss_clip": 1.04853344, + "balance_loss_mlp": 1.02921081, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 2.089363877710116, + "language_loss": 0.68181717, + "learning_rate": 2.936164225292901e-06, + "loss": 0.703493, + "num_input_tokens_seen": 130265575, + "step": 6064, + "time_per_iteration": 2.5091965198516846 + }, + { + "auxiliary_loss_clip": 0.01115962, + "auxiliary_loss_mlp": 0.01051394, + "balance_loss_clip": 1.04860234, + "balance_loss_mlp": 1.03488362, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 2.13249041275951, + "language_loss": 0.74351346, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76518703, + "num_input_tokens_seen": 130286195, + "step": 6065, + "time_per_iteration": 2.5553958415985107 + }, + { + "auxiliary_loss_clip": 0.01120132, + "auxiliary_loss_mlp": 0.01044198, + "balance_loss_clip": 1.05262518, + "balance_loss_mlp": 1.02679336, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 1.86811106778838, + "language_loss": 0.74711931, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.76876259, + "num_input_tokens_seen": 130306095, + "step": 6066, + "time_per_iteration": 2.6199841499328613 + }, + { + "auxiliary_loss_clip": 0.01124799, + "auxiliary_loss_mlp": 0.01036591, + "balance_loss_clip": 1.05273342, + "balance_loss_mlp": 1.02172542, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.1564271649845566, + "language_loss": 0.76352406, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.78513789, + "num_input_tokens_seen": 130324685, + "step": 6067, + "time_per_iteration": 2.5458664894104004 + }, + { + "auxiliary_loss_clip": 0.01137731, + "auxiliary_loss_mlp": 0.01047136, + "balance_loss_clip": 1.0546751, + "balance_loss_mlp": 1.03289676, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 1.8735057629087313, + "language_loss": 0.70812017, + "learning_rate": 2.934787295690886e-06, + "loss": 0.72996879, + "num_input_tokens_seen": 130343855, + "step": 6068, + "time_per_iteration": 2.5142202377319336 + }, + { + "auxiliary_loss_clip": 0.0112557, + "auxiliary_loss_mlp": 0.0104343, + "balance_loss_clip": 1.04826605, + "balance_loss_mlp": 1.02776003, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 1.8832802810783587, + "language_loss": 0.73845553, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76014549, + "num_input_tokens_seen": 130362320, + "step": 6069, + "time_per_iteration": 2.481670618057251 + }, + { + "auxiliary_loss_clip": 0.01121295, + "auxiliary_loss_mlp": 0.01043246, + "balance_loss_clip": 1.05496681, + "balance_loss_mlp": 1.02740288, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 3.692164738451763, + "language_loss": 0.66140372, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68304908, + "num_input_tokens_seen": 130383165, + "step": 6070, + "time_per_iteration": 2.5419652462005615 + }, + { + "auxiliary_loss_clip": 0.01124317, + "auxiliary_loss_mlp": 0.01041101, + "balance_loss_clip": 1.0553093, + "balance_loss_mlp": 1.02609837, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.785970123210791, + "language_loss": 0.74309343, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76474762, + "num_input_tokens_seen": 130402425, + "step": 6071, + "time_per_iteration": 2.503251075744629 + }, + { + "auxiliary_loss_clip": 0.01125944, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.04970431, + "balance_loss_mlp": 1.02135539, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 1.7842306483629442, + "language_loss": 0.88535297, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90698975, + "num_input_tokens_seen": 130419440, + "step": 6072, + "time_per_iteration": 2.5205698013305664 + }, + { + "auxiliary_loss_clip": 0.01120716, + "auxiliary_loss_mlp": 0.01040807, + "balance_loss_clip": 1.05006027, + "balance_loss_mlp": 1.02571511, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 2.0652707307782863, + "language_loss": 0.72846496, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.75008017, + "num_input_tokens_seen": 130438495, + "step": 6073, + "time_per_iteration": 2.505309581756592 + }, + { + "auxiliary_loss_clip": 0.01067322, + "auxiliary_loss_mlp": 0.01044192, + "balance_loss_clip": 1.04595804, + "balance_loss_mlp": 1.02693069, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 2.011328514215317, + "language_loss": 0.66884792, + "learning_rate": 2.932720838132236e-06, + "loss": 0.6899631, + "num_input_tokens_seen": 130455575, + "step": 6074, + "time_per_iteration": 2.6379034519195557 + }, + { + "auxiliary_loss_clip": 0.01101671, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.04717636, + "balance_loss_mlp": 1.02192664, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 2.096889487422646, + "language_loss": 0.7281974, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.74958497, + "num_input_tokens_seen": 130476385, + "step": 6075, + "time_per_iteration": 2.676487684249878 + }, + { + "auxiliary_loss_clip": 0.01103412, + "auxiliary_loss_mlp": 0.01050735, + "balance_loss_clip": 1.04972243, + "balance_loss_mlp": 1.03410494, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.0200588380853284, + "language_loss": 0.89548993, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91703141, + "num_input_tokens_seen": 130493630, + "step": 6076, + "time_per_iteration": 4.191144704818726 + }, + { + "auxiliary_loss_clip": 0.01125023, + "auxiliary_loss_mlp": 0.01041282, + "balance_loss_clip": 1.05124104, + "balance_loss_mlp": 1.02491462, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 1.9739400875296569, + "language_loss": 0.69166362, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71332669, + "num_input_tokens_seen": 130510735, + "step": 6077, + "time_per_iteration": 2.48138165473938 + }, + { + "auxiliary_loss_clip": 0.01075172, + "auxiliary_loss_mlp": 0.01025838, + "balance_loss_clip": 1.04512334, + "balance_loss_mlp": 1.02390718, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7580372400735675, + "language_loss": 0.61733556, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63834566, + "num_input_tokens_seen": 130577050, + "step": 6078, + "time_per_iteration": 3.1507489681243896 + }, + { + "auxiliary_loss_clip": 0.01109175, + "auxiliary_loss_mlp": 0.01051849, + "balance_loss_clip": 1.04436934, + "balance_loss_mlp": 1.03399193, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 2.076684035193343, + "language_loss": 0.78318709, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80479735, + "num_input_tokens_seen": 130593780, + "step": 6079, + "time_per_iteration": 2.642929792404175 + }, + { + "auxiliary_loss_clip": 0.01126546, + "auxiliary_loss_mlp": 0.01043756, + "balance_loss_clip": 1.05007577, + "balance_loss_mlp": 1.02724576, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 1.8296207962162374, + "language_loss": 0.62783229, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.64953542, + "num_input_tokens_seen": 130615510, + "step": 6080, + "time_per_iteration": 2.698357582092285 + }, + { + "auxiliary_loss_clip": 0.01110805, + "auxiliary_loss_mlp": 0.01041767, + "balance_loss_clip": 1.04993391, + "balance_loss_mlp": 1.02451706, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.5786877207913177, + "language_loss": 0.67493743, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69646311, + "num_input_tokens_seen": 130635410, + "step": 6081, + "time_per_iteration": 2.579759120941162 + }, + { + "auxiliary_loss_clip": 0.01113929, + "auxiliary_loss_mlp": 0.00783149, + "balance_loss_clip": 1.05047226, + "balance_loss_mlp": 1.00034904, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.787115445408206, + "language_loss": 0.75003815, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76900893, + "num_input_tokens_seen": 130657725, + "step": 6082, + "time_per_iteration": 2.6026248931884766 + }, + { + "auxiliary_loss_clip": 0.01075411, + "auxiliary_loss_mlp": 0.00782416, + "balance_loss_clip": 1.04299951, + "balance_loss_mlp": 1.00028312, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 1.67736358390403, + "language_loss": 0.83056295, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84914124, + "num_input_tokens_seen": 130678360, + "step": 6083, + "time_per_iteration": 2.6797595024108887 + }, + { + "auxiliary_loss_clip": 0.01045599, + "auxiliary_loss_mlp": 0.01004078, + "balance_loss_clip": 1.03477776, + "balance_loss_mlp": 1.00214672, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.92623346295763, + "language_loss": 0.59281981, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61331654, + "num_input_tokens_seen": 130742110, + "step": 6084, + "time_per_iteration": 3.1948163509368896 + }, + { + "auxiliary_loss_clip": 0.01097064, + "auxiliary_loss_mlp": 0.01051536, + "balance_loss_clip": 1.04748809, + "balance_loss_mlp": 1.03553843, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 1.8318255061446607, + "language_loss": 0.73027879, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75176477, + "num_input_tokens_seen": 130759870, + "step": 6085, + "time_per_iteration": 2.5602569580078125 + }, + { + "auxiliary_loss_clip": 0.01103507, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.05219364, + "balance_loss_mlp": 1.02503562, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 2.020504030656346, + "language_loss": 0.78650749, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80794138, + "num_input_tokens_seen": 130778510, + "step": 6086, + "time_per_iteration": 2.5457162857055664 + }, + { + "auxiliary_loss_clip": 0.01116725, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.04556036, + "balance_loss_mlp": 1.02346444, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 2.9421742349048814, + "language_loss": 0.76941818, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.79096985, + "num_input_tokens_seen": 130798535, + "step": 6087, + "time_per_iteration": 2.5539958477020264 + }, + { + "auxiliary_loss_clip": 0.0108382, + "auxiliary_loss_mlp": 0.01046046, + "balance_loss_clip": 1.04648852, + "balance_loss_mlp": 1.02891552, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 2.5434814562350474, + "language_loss": 0.70622706, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72752577, + "num_input_tokens_seen": 130816655, + "step": 6088, + "time_per_iteration": 2.5524256229400635 + }, + { + "auxiliary_loss_clip": 0.01131906, + "auxiliary_loss_mlp": 0.01043402, + "balance_loss_clip": 1.05032825, + "balance_loss_mlp": 1.02604532, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.7054258110879914, + "language_loss": 0.79746342, + "learning_rate": 2.92754912981472e-06, + "loss": 0.81921649, + "num_input_tokens_seen": 130841225, + "step": 6089, + "time_per_iteration": 2.643127202987671 + }, + { + "auxiliary_loss_clip": 0.01103429, + "auxiliary_loss_mlp": 0.01038099, + "balance_loss_clip": 1.0479728, + "balance_loss_mlp": 1.02313864, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 1.7801747376646493, + "language_loss": 0.71236771, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73378301, + "num_input_tokens_seen": 130861050, + "step": 6090, + "time_per_iteration": 4.071444272994995 + }, + { + "auxiliary_loss_clip": 0.01106597, + "auxiliary_loss_mlp": 0.01056217, + "balance_loss_clip": 1.05251253, + "balance_loss_mlp": 1.04123807, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 1.6994082148805474, + "language_loss": 0.74088871, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76251686, + "num_input_tokens_seen": 130879775, + "step": 6091, + "time_per_iteration": 2.4843173027038574 + }, + { + "auxiliary_loss_clip": 0.01076477, + "auxiliary_loss_mlp": 0.01045033, + "balance_loss_clip": 1.046857, + "balance_loss_mlp": 1.02954757, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 2.035539409402891, + "language_loss": 0.72518325, + "learning_rate": 2.926513837074284e-06, + "loss": 0.74639827, + "num_input_tokens_seen": 130898070, + "step": 6092, + "time_per_iteration": 2.6522765159606934 + }, + { + "auxiliary_loss_clip": 0.01129451, + "auxiliary_loss_mlp": 0.01053356, + "balance_loss_clip": 1.05025113, + "balance_loss_mlp": 1.03715563, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 2.2029540338107907, + "language_loss": 0.78246075, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.80428886, + "num_input_tokens_seen": 130915250, + "step": 6093, + "time_per_iteration": 2.504849910736084 + }, + { + "auxiliary_loss_clip": 0.01126375, + "auxiliary_loss_mlp": 0.01041325, + "balance_loss_clip": 1.04928923, + "balance_loss_mlp": 1.02603006, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.9988373750807824, + "language_loss": 0.74503314, + "learning_rate": 2.925823466224696e-06, + "loss": 0.7667101, + "num_input_tokens_seen": 130936995, + "step": 6094, + "time_per_iteration": 4.088507890701294 + }, + { + "auxiliary_loss_clip": 0.01143421, + "auxiliary_loss_mlp": 0.01062596, + "balance_loss_clip": 1.05381393, + "balance_loss_mlp": 1.04622829, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.734921895500921, + "language_loss": 0.79507053, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81713068, + "num_input_tokens_seen": 130957970, + "step": 6095, + "time_per_iteration": 2.5138583183288574 + }, + { + "auxiliary_loss_clip": 0.01118196, + "auxiliary_loss_mlp": 0.00784942, + "balance_loss_clip": 1.05080783, + "balance_loss_mlp": 1.00029516, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.8413874964010004, + "language_loss": 0.73340213, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75243348, + "num_input_tokens_seen": 130974915, + "step": 6096, + "time_per_iteration": 2.489821672439575 + }, + { + "auxiliary_loss_clip": 0.01101586, + "auxiliary_loss_mlp": 0.01043006, + "balance_loss_clip": 1.04461408, + "balance_loss_mlp": 1.02667463, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 3.9350220942241925, + "language_loss": 0.67232841, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69377434, + "num_input_tokens_seen": 130995745, + "step": 6097, + "time_per_iteration": 2.5849366188049316 + }, + { + "auxiliary_loss_clip": 0.01083131, + "auxiliary_loss_mlp": 0.01043334, + "balance_loss_clip": 1.04863858, + "balance_loss_mlp": 1.02752662, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.576931108589423, + "language_loss": 0.77863151, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79989612, + "num_input_tokens_seen": 131015545, + "step": 6098, + "time_per_iteration": 2.6425626277923584 + }, + { + "auxiliary_loss_clip": 0.01124514, + "auxiliary_loss_mlp": 0.01045009, + "balance_loss_clip": 1.04794979, + "balance_loss_mlp": 1.02877283, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 2.3656074720271807, + "language_loss": 0.73761094, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.75930619, + "num_input_tokens_seen": 131033990, + "step": 6099, + "time_per_iteration": 3.864466428756714 + }, + { + "auxiliary_loss_clip": 0.01113829, + "auxiliary_loss_mlp": 0.01047838, + "balance_loss_clip": 1.04994106, + "balance_loss_mlp": 1.0327456, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.826088217087916, + "language_loss": 0.84660095, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86821765, + "num_input_tokens_seen": 131050710, + "step": 6100, + "time_per_iteration": 2.4935336112976074 + }, + { + "auxiliary_loss_clip": 0.01105342, + "auxiliary_loss_mlp": 0.01037738, + "balance_loss_clip": 1.04661846, + "balance_loss_mlp": 1.02162051, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 1.9138876369251547, + "language_loss": 0.70293808, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72436887, + "num_input_tokens_seen": 131071435, + "step": 6101, + "time_per_iteration": 2.562908411026001 + }, + { + "auxiliary_loss_clip": 0.01110453, + "auxiliary_loss_mlp": 0.0104961, + "balance_loss_clip": 1.04972601, + "balance_loss_mlp": 1.03282535, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.0484202577615482, + "language_loss": 0.76077116, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78237182, + "num_input_tokens_seen": 131088775, + "step": 6102, + "time_per_iteration": 2.47674822807312 + }, + { + "auxiliary_loss_clip": 0.0113021, + "auxiliary_loss_mlp": 0.01039576, + "balance_loss_clip": 1.05084205, + "balance_loss_mlp": 1.02183819, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.7335015959473257, + "language_loss": 0.70237422, + "learning_rate": 2.922715061101625e-06, + "loss": 0.7240721, + "num_input_tokens_seen": 131112800, + "step": 6103, + "time_per_iteration": 2.701225757598877 + }, + { + "auxiliary_loss_clip": 0.01099531, + "auxiliary_loss_mlp": 0.0103837, + "balance_loss_clip": 1.05441046, + "balance_loss_mlp": 1.02245593, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.7485167502578567, + "language_loss": 0.71974206, + "learning_rate": 2.922369507632716e-06, + "loss": 0.74112105, + "num_input_tokens_seen": 131131150, + "step": 6104, + "time_per_iteration": 2.556119441986084 + }, + { + "auxiliary_loss_clip": 0.01126414, + "auxiliary_loss_mlp": 0.01039589, + "balance_loss_clip": 1.04822516, + "balance_loss_mlp": 1.02354348, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 1.8319189013241077, + "language_loss": 0.81301814, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83467817, + "num_input_tokens_seen": 131150365, + "step": 6105, + "time_per_iteration": 2.4708609580993652 + }, + { + "auxiliary_loss_clip": 0.01143358, + "auxiliary_loss_mlp": 0.01039257, + "balance_loss_clip": 1.05199933, + "balance_loss_mlp": 1.02235341, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.7764793063160447, + "language_loss": 0.80765963, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.82948577, + "num_input_tokens_seen": 131169310, + "step": 6106, + "time_per_iteration": 2.4846551418304443 + }, + { + "auxiliary_loss_clip": 0.01040391, + "auxiliary_loss_mlp": 0.00755842, + "balance_loss_clip": 1.03717136, + "balance_loss_mlp": 0.9999277, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6884122893258188, + "language_loss": 0.59230596, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.61026829, + "num_input_tokens_seen": 131232900, + "step": 6107, + "time_per_iteration": 3.1810200214385986 + }, + { + "auxiliary_loss_clip": 0.01110551, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.04747176, + "balance_loss_mlp": 1.0179584, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.8088195273662708, + "language_loss": 0.74667406, + "learning_rate": 2.92098694412469e-06, + "loss": 0.76811343, + "num_input_tokens_seen": 131250920, + "step": 6108, + "time_per_iteration": 2.5340311527252197 + }, + { + "auxiliary_loss_clip": 0.0112717, + "auxiliary_loss_mlp": 0.01038388, + "balance_loss_clip": 1.04827166, + "balance_loss_mlp": 1.02295017, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 2.6110498926569816, + "language_loss": 0.73815298, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.7598086, + "num_input_tokens_seen": 131267910, + "step": 6109, + "time_per_iteration": 2.4521219730377197 + }, + { + "auxiliary_loss_clip": 0.01068216, + "auxiliary_loss_mlp": 0.01043524, + "balance_loss_clip": 1.04214168, + "balance_loss_mlp": 1.0270617, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 2.0001028593593286, + "language_loss": 0.53143847, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55255592, + "num_input_tokens_seen": 131287150, + "step": 6110, + "time_per_iteration": 2.605982780456543 + }, + { + "auxiliary_loss_clip": 0.01124155, + "auxiliary_loss_mlp": 0.01044466, + "balance_loss_clip": 1.04777098, + "balance_loss_mlp": 1.0278486, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.4599961679241895, + "language_loss": 0.80532199, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82700825, + "num_input_tokens_seen": 131308225, + "step": 6111, + "time_per_iteration": 2.5492918491363525 + }, + { + "auxiliary_loss_clip": 0.01090732, + "auxiliary_loss_mlp": 0.01046341, + "balance_loss_clip": 1.04956663, + "balance_loss_mlp": 1.03060508, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.4647574621638602, + "language_loss": 0.72515512, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74652582, + "num_input_tokens_seen": 131332115, + "step": 6112, + "time_per_iteration": 2.6527295112609863 + }, + { + "auxiliary_loss_clip": 0.01125916, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.04827738, + "balance_loss_mlp": 1.0306772, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.6552798097051675, + "language_loss": 0.85081053, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87252575, + "num_input_tokens_seen": 131351885, + "step": 6113, + "time_per_iteration": 2.4823429584503174 + }, + { + "auxiliary_loss_clip": 0.01127158, + "auxiliary_loss_mlp": 0.0104694, + "balance_loss_clip": 1.04731393, + "balance_loss_mlp": 1.03031015, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 1.915678930203789, + "language_loss": 0.78612715, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80786812, + "num_input_tokens_seen": 131370245, + "step": 6114, + "time_per_iteration": 2.593441963195801 + }, + { + "auxiliary_loss_clip": 0.01130182, + "auxiliary_loss_mlp": 0.01047168, + "balance_loss_clip": 1.048581, + "balance_loss_mlp": 1.02847624, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.593470640110657, + "language_loss": 0.66971791, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69149137, + "num_input_tokens_seen": 131388115, + "step": 6115, + "time_per_iteration": 4.142830848693848 + }, + { + "auxiliary_loss_clip": 0.01105312, + "auxiliary_loss_mlp": 0.01037568, + "balance_loss_clip": 1.04412425, + "balance_loss_mlp": 1.02211809, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.402638958905686, + "language_loss": 0.76411277, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78554153, + "num_input_tokens_seen": 131404595, + "step": 6116, + "time_per_iteration": 2.630577802658081 + }, + { + "auxiliary_loss_clip": 0.01090628, + "auxiliary_loss_mlp": 0.01040528, + "balance_loss_clip": 1.04479671, + "balance_loss_mlp": 1.02550721, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 2.151055237108658, + "language_loss": 0.63347924, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.65479088, + "num_input_tokens_seen": 131423760, + "step": 6117, + "time_per_iteration": 2.7077438831329346 + }, + { + "auxiliary_loss_clip": 0.01104967, + "auxiliary_loss_mlp": 0.01042033, + "balance_loss_clip": 1.04238236, + "balance_loss_mlp": 1.02528381, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 2.0661445358568606, + "language_loss": 0.73216558, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75363553, + "num_input_tokens_seen": 131444955, + "step": 6118, + "time_per_iteration": 2.640856981277466 + }, + { + "auxiliary_loss_clip": 0.01132254, + "auxiliary_loss_mlp": 0.01048675, + "balance_loss_clip": 1.05242801, + "balance_loss_mlp": 1.0314616, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 1.8041031435749177, + "language_loss": 0.73177737, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.75358671, + "num_input_tokens_seen": 131465720, + "step": 6119, + "time_per_iteration": 2.623471260070801 + }, + { + "auxiliary_loss_clip": 0.01109568, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.04773688, + "balance_loss_mlp": 1.0207088, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.92493258651398, + "language_loss": 0.80479306, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82626116, + "num_input_tokens_seen": 131483080, + "step": 6120, + "time_per_iteration": 2.649430513381958 + }, + { + "auxiliary_loss_clip": 0.01094449, + "auxiliary_loss_mlp": 0.01043534, + "balance_loss_clip": 1.04880571, + "balance_loss_mlp": 1.02723825, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 1.9789012976307994, + "language_loss": 0.64114046, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66252029, + "num_input_tokens_seen": 131502545, + "step": 6121, + "time_per_iteration": 2.581413984298706 + }, + { + "auxiliary_loss_clip": 0.01126666, + "auxiliary_loss_mlp": 0.0104904, + "balance_loss_clip": 1.05273223, + "balance_loss_mlp": 1.03257132, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 1.891130605613604, + "language_loss": 0.7097379, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.7314949, + "num_input_tokens_seen": 131522155, + "step": 6122, + "time_per_iteration": 2.5234415531158447 + }, + { + "auxiliary_loss_clip": 0.01106991, + "auxiliary_loss_mlp": 0.01049142, + "balance_loss_clip": 1.04865754, + "balance_loss_mlp": 1.03190434, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 2.252407085757052, + "language_loss": 0.69184983, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71341121, + "num_input_tokens_seen": 131543865, + "step": 6123, + "time_per_iteration": 2.5846848487854004 + }, + { + "auxiliary_loss_clip": 0.01127733, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_clip": 1.04711854, + "balance_loss_mlp": 1.02690744, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 3.6190163506082507, + "language_loss": 0.736359, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.75808305, + "num_input_tokens_seen": 131562155, + "step": 6124, + "time_per_iteration": 2.4981634616851807 + }, + { + "auxiliary_loss_clip": 0.01115981, + "auxiliary_loss_mlp": 0.01039967, + "balance_loss_clip": 1.04853988, + "balance_loss_mlp": 1.0219543, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 1.9488506373201986, + "language_loss": 0.74149537, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76305485, + "num_input_tokens_seen": 131581695, + "step": 6125, + "time_per_iteration": 2.5440900325775146 + }, + { + "auxiliary_loss_clip": 0.01128663, + "auxiliary_loss_mlp": 0.01054745, + "balance_loss_clip": 1.04761958, + "balance_loss_mlp": 1.0368036, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.9578142668931786, + "language_loss": 0.78150946, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80334353, + "num_input_tokens_seen": 131599465, + "step": 6126, + "time_per_iteration": 2.4517736434936523 + }, + { + "auxiliary_loss_clip": 0.01125285, + "auxiliary_loss_mlp": 0.01044867, + "balance_loss_clip": 1.04614663, + "balance_loss_mlp": 1.02607965, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.2698062223461073, + "language_loss": 0.65900874, + "learning_rate": 2.914412150914888e-06, + "loss": 0.6807102, + "num_input_tokens_seen": 131618330, + "step": 6127, + "time_per_iteration": 2.456998109817505 + }, + { + "auxiliary_loss_clip": 0.01118819, + "auxiliary_loss_mlp": 0.01043436, + "balance_loss_clip": 1.05274463, + "balance_loss_mlp": 1.02698493, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.7059132370351149, + "language_loss": 0.70276439, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72438699, + "num_input_tokens_seen": 131638960, + "step": 6128, + "time_per_iteration": 2.6459155082702637 + }, + { + "auxiliary_loss_clip": 0.01118999, + "auxiliary_loss_mlp": 0.01043755, + "balance_loss_clip": 1.05092645, + "balance_loss_mlp": 1.02683949, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 1.8632879959871584, + "language_loss": 0.75285459, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77448213, + "num_input_tokens_seen": 131657440, + "step": 6129, + "time_per_iteration": 3.9973349571228027 + }, + { + "auxiliary_loss_clip": 0.01118357, + "auxiliary_loss_mlp": 0.01047304, + "balance_loss_clip": 1.04488635, + "balance_loss_mlp": 1.03042388, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.6831878207502737, + "language_loss": 0.84911472, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.87077141, + "num_input_tokens_seen": 131678035, + "step": 6130, + "time_per_iteration": 2.5111429691314697 + }, + { + "auxiliary_loss_clip": 0.01055894, + "auxiliary_loss_mlp": 0.0101091, + "balance_loss_clip": 1.05529511, + "balance_loss_mlp": 1.00888371, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.806709933374375, + "language_loss": 0.60252082, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62318891, + "num_input_tokens_seen": 131742470, + "step": 6131, + "time_per_iteration": 3.1819117069244385 + }, + { + "auxiliary_loss_clip": 0.01098059, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.04522395, + "balance_loss_mlp": 1.01695085, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.562929678210747, + "language_loss": 0.73381901, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.75513572, + "num_input_tokens_seen": 131764570, + "step": 6132, + "time_per_iteration": 2.619145631790161 + }, + { + "auxiliary_loss_clip": 0.01126608, + "auxiliary_loss_mlp": 0.01038826, + "balance_loss_clip": 1.0473665, + "balance_loss_mlp": 1.02117062, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.5300307050716027, + "language_loss": 0.73905116, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76070547, + "num_input_tokens_seen": 131785720, + "step": 6133, + "time_per_iteration": 4.053387403488159 + }, + { + "auxiliary_loss_clip": 0.01073128, + "auxiliary_loss_mlp": 0.01049129, + "balance_loss_clip": 1.03908253, + "balance_loss_mlp": 1.0310688, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.893411418884123, + "language_loss": 0.7151739, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73639649, + "num_input_tokens_seen": 131804430, + "step": 6134, + "time_per_iteration": 2.6136372089385986 + }, + { + "auxiliary_loss_clip": 0.01104282, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.04871678, + "balance_loss_mlp": 1.01935041, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.7286123566714455, + "language_loss": 0.75204539, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77344644, + "num_input_tokens_seen": 131822060, + "step": 6135, + "time_per_iteration": 2.5491127967834473 + }, + { + "auxiliary_loss_clip": 0.01035267, + "auxiliary_loss_mlp": 0.01003237, + "balance_loss_clip": 1.03336477, + "balance_loss_mlp": 1.0016396, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.808224967848437, + "language_loss": 0.58792818, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.6083132, + "num_input_tokens_seen": 131880715, + "step": 6136, + "time_per_iteration": 3.0551552772521973 + }, + { + "auxiliary_loss_clip": 0.0110631, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.04407954, + "balance_loss_mlp": 1.01896787, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 2.4642473773528075, + "language_loss": 0.79680061, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.81822205, + "num_input_tokens_seen": 131895850, + "step": 6137, + "time_per_iteration": 2.4941353797912598 + }, + { + "auxiliary_loss_clip": 0.0112698, + "auxiliary_loss_mlp": 0.01043163, + "balance_loss_clip": 1.04870176, + "balance_loss_mlp": 1.02588964, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 1.8889472016633877, + "language_loss": 0.73891628, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76061773, + "num_input_tokens_seen": 131915775, + "step": 6138, + "time_per_iteration": 2.5150198936462402 + }, + { + "auxiliary_loss_clip": 0.01091421, + "auxiliary_loss_mlp": 0.01042674, + "balance_loss_clip": 1.04241467, + "balance_loss_mlp": 1.02585387, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 2.003430120239922, + "language_loss": 0.65567124, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.67701221, + "num_input_tokens_seen": 131935715, + "step": 6139, + "time_per_iteration": 4.072701454162598 + }, + { + "auxiliary_loss_clip": 0.01098328, + "auxiliary_loss_mlp": 0.01043032, + "balance_loss_clip": 1.04647529, + "balance_loss_mlp": 1.02584207, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.2814408149995744, + "language_loss": 0.71370447, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73511803, + "num_input_tokens_seen": 131954120, + "step": 6140, + "time_per_iteration": 2.5175271034240723 + }, + { + "auxiliary_loss_clip": 0.01035685, + "auxiliary_loss_mlp": 0.01001661, + "balance_loss_clip": 1.03972769, + "balance_loss_mlp": 1.00008702, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7496423128741421, + "language_loss": 0.59329182, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61366528, + "num_input_tokens_seen": 132017485, + "step": 6141, + "time_per_iteration": 3.168349266052246 + }, + { + "auxiliary_loss_clip": 0.01122137, + "auxiliary_loss_mlp": 0.01040129, + "balance_loss_clip": 1.04802144, + "balance_loss_mlp": 1.02351141, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 2.0518034341114673, + "language_loss": 0.75448155, + "learning_rate": 2.909212678216192e-06, + "loss": 0.77610421, + "num_input_tokens_seen": 132036760, + "step": 6142, + "time_per_iteration": 2.49143385887146 + }, + { + "auxiliary_loss_clip": 0.01121953, + "auxiliary_loss_mlp": 0.01035771, + "balance_loss_clip": 1.04778981, + "balance_loss_mlp": 1.02049422, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 1.7096970187499676, + "language_loss": 0.7674706, + "learning_rate": 2.908865770392555e-06, + "loss": 0.78904784, + "num_input_tokens_seen": 132056935, + "step": 6143, + "time_per_iteration": 2.5158939361572266 + }, + { + "auxiliary_loss_clip": 0.01119803, + "auxiliary_loss_mlp": 0.01036196, + "balance_loss_clip": 1.04519355, + "balance_loss_mlp": 1.02122331, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 2.8536091445258025, + "language_loss": 0.82129759, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.8428576, + "num_input_tokens_seen": 132077285, + "step": 6144, + "time_per_iteration": 2.503422498703003 + }, + { + "auxiliary_loss_clip": 0.01124485, + "auxiliary_loss_mlp": 0.01039315, + "balance_loss_clip": 1.04589653, + "balance_loss_mlp": 1.0233289, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 2.174977803647987, + "language_loss": 0.77375484, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79539281, + "num_input_tokens_seen": 132095520, + "step": 6145, + "time_per_iteration": 2.491511344909668 + }, + { + "auxiliary_loss_clip": 0.01119382, + "auxiliary_loss_mlp": 0.01035228, + "balance_loss_clip": 1.0471617, + "balance_loss_mlp": 1.01881289, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 1.7896342674288075, + "language_loss": 0.77176106, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79330719, + "num_input_tokens_seen": 132112810, + "step": 6146, + "time_per_iteration": 2.448511838912964 + }, + { + "auxiliary_loss_clip": 0.01110041, + "auxiliary_loss_mlp": 0.01043915, + "balance_loss_clip": 1.04792619, + "balance_loss_mlp": 1.0265944, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 1.7053866159208848, + "language_loss": 0.80732465, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82886428, + "num_input_tokens_seen": 132131615, + "step": 6147, + "time_per_iteration": 2.477383852005005 + }, + { + "auxiliary_loss_clip": 0.01102159, + "auxiliary_loss_mlp": 0.00782079, + "balance_loss_clip": 1.04502022, + "balance_loss_mlp": 1.00046587, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.973633360364268, + "language_loss": 0.83449602, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85333842, + "num_input_tokens_seen": 132149585, + "step": 6148, + "time_per_iteration": 2.5611977577209473 + }, + { + "auxiliary_loss_clip": 0.01122295, + "auxiliary_loss_mlp": 0.01039068, + "balance_loss_clip": 1.05035937, + "balance_loss_mlp": 1.02323151, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 3.3988542130668344, + "language_loss": 0.74319804, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76481169, + "num_input_tokens_seen": 132165555, + "step": 6149, + "time_per_iteration": 2.4911301136016846 + }, + { + "auxiliary_loss_clip": 0.01139013, + "auxiliary_loss_mlp": 0.01040333, + "balance_loss_clip": 1.05060863, + "balance_loss_mlp": 1.02305925, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 1.942023848468543, + "language_loss": 0.71030927, + "learning_rate": 2.906436451364054e-06, + "loss": 0.73210275, + "num_input_tokens_seen": 132185100, + "step": 6150, + "time_per_iteration": 2.4811184406280518 + }, + { + "auxiliary_loss_clip": 0.01111018, + "auxiliary_loss_mlp": 0.01041398, + "balance_loss_clip": 1.04697394, + "balance_loss_mlp": 1.02541232, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 1.5129094663936486, + "language_loss": 0.81490463, + "learning_rate": 2.906089268194611e-06, + "loss": 0.83642876, + "num_input_tokens_seen": 132203930, + "step": 6151, + "time_per_iteration": 2.509460687637329 + }, + { + "auxiliary_loss_clip": 0.01039023, + "auxiliary_loss_mlp": 0.01006188, + "balance_loss_clip": 1.02770805, + "balance_loss_mlp": 1.00439978, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.8368497680546049, + "language_loss": 0.63166577, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65211785, + "num_input_tokens_seen": 132263845, + "step": 6152, + "time_per_iteration": 3.201577663421631 + }, + { + "auxiliary_loss_clip": 0.01085648, + "auxiliary_loss_mlp": 0.01052271, + "balance_loss_clip": 1.04674423, + "balance_loss_mlp": 1.03540301, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 2.0975384934095485, + "language_loss": 0.69976342, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72114265, + "num_input_tokens_seen": 132282350, + "step": 6153, + "time_per_iteration": 2.603621244430542 + }, + { + "auxiliary_loss_clip": 0.01126476, + "auxiliary_loss_mlp": 0.01043793, + "balance_loss_clip": 1.05125093, + "balance_loss_mlp": 1.02744925, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 1.803198888700706, + "language_loss": 0.72640359, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74810636, + "num_input_tokens_seen": 132301930, + "step": 6154, + "time_per_iteration": 4.014392852783203 + }, + { + "auxiliary_loss_clip": 0.01108104, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.04852176, + "balance_loss_mlp": 1.02065587, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 2.0205216754397552, + "language_loss": 0.67918998, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.700634, + "num_input_tokens_seen": 132320915, + "step": 6155, + "time_per_iteration": 2.5228660106658936 + }, + { + "auxiliary_loss_clip": 0.01121745, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.04576766, + "balance_loss_mlp": 1.01883841, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 2.653133058549036, + "language_loss": 0.67926669, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.70082819, + "num_input_tokens_seen": 132340415, + "step": 6156, + "time_per_iteration": 2.4662508964538574 + }, + { + "auxiliary_loss_clip": 0.01108957, + "auxiliary_loss_mlp": 0.01043031, + "balance_loss_clip": 1.04471064, + "balance_loss_mlp": 1.0286901, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.974750102084213, + "language_loss": 0.82202578, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84354556, + "num_input_tokens_seen": 132358600, + "step": 6157, + "time_per_iteration": 2.505356788635254 + }, + { + "auxiliary_loss_clip": 0.01094419, + "auxiliary_loss_mlp": 0.01041283, + "balance_loss_clip": 1.04848027, + "balance_loss_mlp": 1.02428341, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.414414023740366, + "language_loss": 0.76755941, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78891635, + "num_input_tokens_seen": 132373160, + "step": 6158, + "time_per_iteration": 2.581129550933838 + }, + { + "auxiliary_loss_clip": 0.01136747, + "auxiliary_loss_mlp": 0.01037378, + "balance_loss_clip": 1.04918969, + "balance_loss_mlp": 1.02048564, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.1882578098517587, + "language_loss": 0.68885362, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71059483, + "num_input_tokens_seen": 132392345, + "step": 6159, + "time_per_iteration": 2.448986768722534 + }, + { + "auxiliary_loss_clip": 0.01108999, + "auxiliary_loss_mlp": 0.0104576, + "balance_loss_clip": 1.04969311, + "balance_loss_mlp": 1.03146672, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 1.796396484861908, + "language_loss": 0.71157038, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73311794, + "num_input_tokens_seen": 132412620, + "step": 6160, + "time_per_iteration": 2.555814504623413 + }, + { + "auxiliary_loss_clip": 0.01106202, + "auxiliary_loss_mlp": 0.01037871, + "balance_loss_clip": 1.04581797, + "balance_loss_mlp": 1.0237211, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.7023745895935223, + "language_loss": 0.78881401, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81025469, + "num_input_tokens_seen": 132431570, + "step": 6161, + "time_per_iteration": 2.499570846557617 + }, + { + "auxiliary_loss_clip": 0.011351, + "auxiliary_loss_mlp": 0.01042165, + "balance_loss_clip": 1.04987216, + "balance_loss_mlp": 1.02620256, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.8118017644471927, + "language_loss": 0.7919969, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81376958, + "num_input_tokens_seen": 132451525, + "step": 6162, + "time_per_iteration": 2.463496685028076 + }, + { + "auxiliary_loss_clip": 0.01107561, + "auxiliary_loss_mlp": 0.00782951, + "balance_loss_clip": 1.04377985, + "balance_loss_mlp": 1.00044823, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 1.9518081048171616, + "language_loss": 0.7998684, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81877351, + "num_input_tokens_seen": 132469875, + "step": 6163, + "time_per_iteration": 2.487520694732666 + }, + { + "auxiliary_loss_clip": 0.01123128, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.04929841, + "balance_loss_mlp": 1.02503276, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.6197118335510838, + "language_loss": 0.68221951, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70386869, + "num_input_tokens_seen": 132488360, + "step": 6164, + "time_per_iteration": 2.474132537841797 + }, + { + "auxiliary_loss_clip": 0.01110802, + "auxiliary_loss_mlp": 0.01047078, + "balance_loss_clip": 1.05000043, + "balance_loss_mlp": 1.02979267, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.328453009491835, + "language_loss": 0.83091462, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.85249335, + "num_input_tokens_seen": 132508630, + "step": 6165, + "time_per_iteration": 2.546581268310547 + }, + { + "auxiliary_loss_clip": 0.01118432, + "auxiliary_loss_mlp": 0.01041338, + "balance_loss_clip": 1.05025613, + "balance_loss_mlp": 1.02404058, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 3.098954665796511, + "language_loss": 0.69427001, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71586776, + "num_input_tokens_seen": 132527465, + "step": 6166, + "time_per_iteration": 2.5224337577819824 + }, + { + "auxiliary_loss_clip": 0.01041956, + "auxiliary_loss_mlp": 0.01006638, + "balance_loss_clip": 1.04218149, + "balance_loss_mlp": 1.00506425, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.7911838490618306, + "language_loss": 0.56985623, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.59034216, + "num_input_tokens_seen": 132579940, + "step": 6167, + "time_per_iteration": 2.978386878967285 + }, + { + "auxiliary_loss_clip": 0.01111702, + "auxiliary_loss_mlp": 0.01043783, + "balance_loss_clip": 1.04752064, + "balance_loss_mlp": 1.02872717, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 1.7708834892290648, + "language_loss": 0.75682497, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77837992, + "num_input_tokens_seen": 132598390, + "step": 6168, + "time_per_iteration": 4.0782630443573 + }, + { + "auxiliary_loss_clip": 0.01119951, + "auxiliary_loss_mlp": 0.00782229, + "balance_loss_clip": 1.04687667, + "balance_loss_mlp": 1.00052714, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 1.5605282817869892, + "language_loss": 0.74024439, + "learning_rate": 2.899834108519755e-06, + "loss": 0.75926614, + "num_input_tokens_seen": 132616920, + "step": 6169, + "time_per_iteration": 2.47414493560791 + }, + { + "auxiliary_loss_clip": 0.01135653, + "auxiliary_loss_mlp": 0.01039667, + "balance_loss_clip": 1.05264938, + "balance_loss_mlp": 1.02508759, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 2.456267626499914, + "language_loss": 0.79328632, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81503958, + "num_input_tokens_seen": 132637660, + "step": 6170, + "time_per_iteration": 2.4853079319000244 + }, + { + "auxiliary_loss_clip": 0.01123774, + "auxiliary_loss_mlp": 0.01048891, + "balance_loss_clip": 1.05054355, + "balance_loss_mlp": 1.03196287, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.5666582563063811, + "language_loss": 0.76242971, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78415632, + "num_input_tokens_seen": 132657635, + "step": 6171, + "time_per_iteration": 2.6189324855804443 + }, + { + "auxiliary_loss_clip": 0.01115939, + "auxiliary_loss_mlp": 0.01038676, + "balance_loss_clip": 1.0552125, + "balance_loss_mlp": 1.02300024, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 1.8443536189457626, + "language_loss": 0.80124462, + "learning_rate": 2.898790504994232e-06, + "loss": 0.8227908, + "num_input_tokens_seen": 132674455, + "step": 6172, + "time_per_iteration": 4.0184409618377686 + }, + { + "auxiliary_loss_clip": 0.01128048, + "auxiliary_loss_mlp": 0.01046063, + "balance_loss_clip": 1.05042887, + "balance_loss_mlp": 1.02921891, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 1.976590518637473, + "language_loss": 0.59223449, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61397564, + "num_input_tokens_seen": 132695140, + "step": 6173, + "time_per_iteration": 2.6008007526397705 + }, + { + "auxiliary_loss_clip": 0.01111333, + "auxiliary_loss_mlp": 0.01041789, + "balance_loss_clip": 1.04506123, + "balance_loss_mlp": 1.02526641, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 2.0323843096342604, + "language_loss": 0.81012821, + "learning_rate": 2.898094598877435e-06, + "loss": 0.83165944, + "num_input_tokens_seen": 132712470, + "step": 6174, + "time_per_iteration": 2.479736566543579 + }, + { + "auxiliary_loss_clip": 0.01130331, + "auxiliary_loss_mlp": 0.01045923, + "balance_loss_clip": 1.04785538, + "balance_loss_mlp": 1.03124857, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.7829534251897505, + "language_loss": 0.79913378, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.82089633, + "num_input_tokens_seen": 132732945, + "step": 6175, + "time_per_iteration": 2.51385235786438 + }, + { + "auxiliary_loss_clip": 0.01127502, + "auxiliary_loss_mlp": 0.01048785, + "balance_loss_clip": 1.0534867, + "balance_loss_mlp": 1.03341901, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 2.112026521671805, + "language_loss": 0.88838458, + "learning_rate": 2.89739855653729e-06, + "loss": 0.91014743, + "num_input_tokens_seen": 132752470, + "step": 6176, + "time_per_iteration": 2.5012879371643066 + }, + { + "auxiliary_loss_clip": 0.01127478, + "auxiliary_loss_mlp": 0.01043588, + "balance_loss_clip": 1.05180407, + "balance_loss_mlp": 1.02850819, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.5250373913514688, + "language_loss": 0.73476857, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75647926, + "num_input_tokens_seen": 132771485, + "step": 6177, + "time_per_iteration": 2.4683706760406494 + }, + { + "auxiliary_loss_clip": 0.01104216, + "auxiliary_loss_mlp": 0.01047891, + "balance_loss_clip": 1.04738808, + "balance_loss_mlp": 1.03232193, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.838888948378839, + "language_loss": 0.7533502, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77487129, + "num_input_tokens_seen": 132791465, + "step": 6178, + "time_per_iteration": 3.9699418544769287 + }, + { + "auxiliary_loss_clip": 0.01075766, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_clip": 1.04665756, + "balance_loss_mlp": 1.03145862, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 1.6114545423725974, + "language_loss": 0.72036934, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.7416141, + "num_input_tokens_seen": 132810160, + "step": 6179, + "time_per_iteration": 2.6070361137390137 + }, + { + "auxiliary_loss_clip": 0.01138863, + "auxiliary_loss_mlp": 0.01045708, + "balance_loss_clip": 1.05120969, + "balance_loss_mlp": 1.02901828, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 1.706392412031757, + "language_loss": 0.69550419, + "learning_rate": 2.896006063609283e-06, + "loss": 0.71734989, + "num_input_tokens_seen": 132831265, + "step": 6180, + "time_per_iteration": 2.4840526580810547 + }, + { + "auxiliary_loss_clip": 0.01113173, + "auxiliary_loss_mlp": 0.01037674, + "balance_loss_clip": 1.04866648, + "balance_loss_mlp": 1.02229643, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.7123298730899745, + "language_loss": 0.77454901, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.79605746, + "num_input_tokens_seen": 132850005, + "step": 6181, + "time_per_iteration": 2.505289077758789 + }, + { + "auxiliary_loss_clip": 0.01123418, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.050771, + "balance_loss_mlp": 1.02290356, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 1.9258084950288783, + "language_loss": 0.78521025, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80682778, + "num_input_tokens_seen": 132865790, + "step": 6182, + "time_per_iteration": 2.5035910606384277 + }, + { + "auxiliary_loss_clip": 0.01034518, + "auxiliary_loss_mlp": 0.01003855, + "balance_loss_clip": 1.02521896, + "balance_loss_mlp": 1.00246036, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7817311298318234, + "language_loss": 0.57465553, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59503925, + "num_input_tokens_seen": 132921775, + "step": 6183, + "time_per_iteration": 3.0741024017333984 + }, + { + "auxiliary_loss_clip": 0.01131033, + "auxiliary_loss_mlp": 0.00784517, + "balance_loss_clip": 1.0494349, + "balance_loss_mlp": 1.00058126, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 2.0704932428979412, + "language_loss": 0.76738387, + "learning_rate": 2.894613027055066e-06, + "loss": 0.78653938, + "num_input_tokens_seen": 132941060, + "step": 6184, + "time_per_iteration": 2.4798784255981445 + }, + { + "auxiliary_loss_clip": 0.0109565, + "auxiliary_loss_mlp": 0.01040619, + "balance_loss_clip": 1.04523706, + "balance_loss_mlp": 1.02534795, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 2.177853064723418, + "language_loss": 0.72335243, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74471515, + "num_input_tokens_seen": 132961850, + "step": 6185, + "time_per_iteration": 2.563821315765381 + }, + { + "auxiliary_loss_clip": 0.01082633, + "auxiliary_loss_mlp": 0.01039349, + "balance_loss_clip": 1.04165196, + "balance_loss_mlp": 1.02252889, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 1.5023219118835909, + "language_loss": 0.76898158, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79020143, + "num_input_tokens_seen": 132981625, + "step": 6186, + "time_per_iteration": 2.5780861377716064 + }, + { + "auxiliary_loss_clip": 0.01130204, + "auxiliary_loss_mlp": 0.01042936, + "balance_loss_clip": 1.05131876, + "balance_loss_mlp": 1.02637744, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.7425432571740862, + "language_loss": 0.83208132, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85381269, + "num_input_tokens_seen": 133001225, + "step": 6187, + "time_per_iteration": 2.5090181827545166 + }, + { + "auxiliary_loss_clip": 0.0112255, + "auxiliary_loss_mlp": 0.01043609, + "balance_loss_clip": 1.04696524, + "balance_loss_mlp": 1.02765846, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 1.9983400820229567, + "language_loss": 0.84902191, + "learning_rate": 2.893219447719824e-06, + "loss": 0.87068355, + "num_input_tokens_seen": 133018820, + "step": 6188, + "time_per_iteration": 2.4756178855895996 + }, + { + "auxiliary_loss_clip": 0.01108145, + "auxiliary_loss_mlp": 0.01039314, + "balance_loss_clip": 1.05015373, + "balance_loss_mlp": 1.02277946, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 1.8034914220628473, + "language_loss": 0.64943731, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67091191, + "num_input_tokens_seen": 133040205, + "step": 6189, + "time_per_iteration": 2.582577705383301 + }, + { + "auxiliary_loss_clip": 0.01112319, + "auxiliary_loss_mlp": 0.01042767, + "balance_loss_clip": 1.04554188, + "balance_loss_mlp": 1.02626801, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 2.4726148219754855, + "language_loss": 0.84175986, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.86331069, + "num_input_tokens_seen": 133058095, + "step": 6190, + "time_per_iteration": 2.485137701034546 + }, + { + "auxiliary_loss_clip": 0.0111047, + "auxiliary_loss_mlp": 0.01044645, + "balance_loss_clip": 1.04700089, + "balance_loss_mlp": 1.02844477, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 7.782046450398735, + "language_loss": 0.88834262, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90989375, + "num_input_tokens_seen": 133071530, + "step": 6191, + "time_per_iteration": 2.463423252105713 + }, + { + "auxiliary_loss_clip": 0.01090964, + "auxiliary_loss_mlp": 0.01038425, + "balance_loss_clip": 1.04644847, + "balance_loss_mlp": 1.01923275, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.7121779660210885, + "language_loss": 0.74104142, + "learning_rate": 2.891825326449073e-06, + "loss": 0.7623353, + "num_input_tokens_seen": 133091410, + "step": 6192, + "time_per_iteration": 2.6033718585968018 + }, + { + "auxiliary_loss_clip": 0.01135163, + "auxiliary_loss_mlp": 0.01043543, + "balance_loss_clip": 1.04929328, + "balance_loss_mlp": 1.02829623, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.6953558920824485, + "language_loss": 0.79825443, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.82004148, + "num_input_tokens_seen": 133110365, + "step": 6193, + "time_per_iteration": 4.009162425994873 + }, + { + "auxiliary_loss_clip": 0.01101772, + "auxiliary_loss_mlp": 0.01040213, + "balance_loss_clip": 1.04417133, + "balance_loss_mlp": 1.02453661, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 2.7370434401871693, + "language_loss": 0.84011567, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86153543, + "num_input_tokens_seen": 133128255, + "step": 6194, + "time_per_iteration": 2.5341427326202393 + }, + { + "auxiliary_loss_clip": 0.01118755, + "auxiliary_loss_mlp": 0.01039449, + "balance_loss_clip": 1.054896, + "balance_loss_mlp": 1.02397513, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.444926163827493, + "language_loss": 0.76694953, + "learning_rate": 2.890779380359646e-06, + "loss": 0.78853154, + "num_input_tokens_seen": 133143975, + "step": 6195, + "time_per_iteration": 2.499642848968506 + }, + { + "auxiliary_loss_clip": 0.01112849, + "auxiliary_loss_mlp": 0.01042461, + "balance_loss_clip": 1.04787302, + "balance_loss_mlp": 1.02618897, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 2.0438593825265543, + "language_loss": 0.79059637, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81214947, + "num_input_tokens_seen": 133162935, + "step": 6196, + "time_per_iteration": 2.528102159500122 + }, + { + "auxiliary_loss_clip": 0.01126275, + "auxiliary_loss_mlp": 0.01042948, + "balance_loss_clip": 1.05143976, + "balance_loss_mlp": 1.02802336, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 2.3918588990295446, + "language_loss": 0.83152509, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85321736, + "num_input_tokens_seen": 133181180, + "step": 6197, + "time_per_iteration": 2.4603705406188965 + }, + { + "auxiliary_loss_clip": 0.01129968, + "auxiliary_loss_mlp": 0.0104268, + "balance_loss_clip": 1.04641044, + "balance_loss_mlp": 1.02571642, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 2.0238268286314174, + "language_loss": 0.64518589, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66691232, + "num_input_tokens_seen": 133199615, + "step": 6198, + "time_per_iteration": 2.5076003074645996 + }, + { + "auxiliary_loss_clip": 0.01118893, + "auxiliary_loss_mlp": 0.01048273, + "balance_loss_clip": 1.04631925, + "balance_loss_mlp": 1.03332973, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 1.465564549341521, + "language_loss": 0.7402755, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76194715, + "num_input_tokens_seen": 133219650, + "step": 6199, + "time_per_iteration": 2.49932599067688 + }, + { + "auxiliary_loss_clip": 0.0110822, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_clip": 1.0479542, + "balance_loss_mlp": 1.02812314, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 2.570614544541696, + "language_loss": 0.80623776, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82775545, + "num_input_tokens_seen": 133245675, + "step": 6200, + "time_per_iteration": 2.8985753059387207 + }, + { + "auxiliary_loss_clip": 0.01098891, + "auxiliary_loss_mlp": 0.01047302, + "balance_loss_clip": 1.04832363, + "balance_loss_mlp": 1.03129244, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 1.9104240499852971, + "language_loss": 0.60181898, + "learning_rate": 2.88868657651991e-06, + "loss": 0.62328088, + "num_input_tokens_seen": 133266905, + "step": 6201, + "time_per_iteration": 2.6923556327819824 + }, + { + "auxiliary_loss_clip": 0.01124384, + "auxiliary_loss_mlp": 0.01037296, + "balance_loss_clip": 1.04835296, + "balance_loss_mlp": 1.02089334, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 1.577973700513646, + "language_loss": 0.7307958, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75241256, + "num_input_tokens_seen": 133286865, + "step": 6202, + "time_per_iteration": 2.4803266525268555 + }, + { + "auxiliary_loss_clip": 0.01115811, + "auxiliary_loss_mlp": 0.01037896, + "balance_loss_clip": 1.05107713, + "balance_loss_mlp": 1.0216831, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 1.8015836158158995, + "language_loss": 0.73765236, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.75918943, + "num_input_tokens_seen": 133305295, + "step": 6203, + "time_per_iteration": 2.4877820014953613 + }, + { + "auxiliary_loss_clip": 0.01104303, + "auxiliary_loss_mlp": 0.01034701, + "balance_loss_clip": 1.04401493, + "balance_loss_mlp": 1.02090228, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.7669247207147307, + "language_loss": 0.81618899, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83757901, + "num_input_tokens_seen": 133324625, + "step": 6204, + "time_per_iteration": 2.505173683166504 + }, + { + "auxiliary_loss_clip": 0.01126788, + "auxiliary_loss_mlp": 0.01043555, + "balance_loss_clip": 1.05010939, + "balance_loss_mlp": 1.02716339, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.6157466242929506, + "language_loss": 0.75351721, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77522063, + "num_input_tokens_seen": 133344625, + "step": 6205, + "time_per_iteration": 2.5118565559387207 + }, + { + "auxiliary_loss_clip": 0.01119259, + "auxiliary_loss_mlp": 0.01042472, + "balance_loss_clip": 1.04522431, + "balance_loss_mlp": 1.02580619, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.972825093826491, + "language_loss": 0.78256094, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80417824, + "num_input_tokens_seen": 133363605, + "step": 6206, + "time_per_iteration": 2.464735984802246 + }, + { + "auxiliary_loss_clip": 0.01134035, + "auxiliary_loss_mlp": 0.01039605, + "balance_loss_clip": 1.04824519, + "balance_loss_mlp": 1.02347589, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.0147792899644315, + "language_loss": 0.93133783, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95307422, + "num_input_tokens_seen": 133379405, + "step": 6207, + "time_per_iteration": 2.4465367794036865 + }, + { + "auxiliary_loss_clip": 0.01105839, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.04820681, + "balance_loss_mlp": 1.01906145, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 2.2059131198169264, + "language_loss": 0.8273561, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84875512, + "num_input_tokens_seen": 133397585, + "step": 6208, + "time_per_iteration": 4.047696113586426 + }, + { + "auxiliary_loss_clip": 0.01123483, + "auxiliary_loss_mlp": 0.01039848, + "balance_loss_clip": 1.04588771, + "balance_loss_mlp": 1.02301598, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 2.1080608852592047, + "language_loss": 0.73149157, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75312483, + "num_input_tokens_seen": 133415365, + "step": 6209, + "time_per_iteration": 2.469222068786621 + }, + { + "auxiliary_loss_clip": 0.01099638, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.05177188, + "balance_loss_mlp": 1.02448654, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.589668310426465, + "language_loss": 0.70278358, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72419059, + "num_input_tokens_seen": 133435700, + "step": 6210, + "time_per_iteration": 2.573272705078125 + }, + { + "auxiliary_loss_clip": 0.01077035, + "auxiliary_loss_mlp": 0.01051193, + "balance_loss_clip": 1.03932655, + "balance_loss_mlp": 1.03184533, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.5486817813943434, + "language_loss": 0.7787776, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.8000598, + "num_input_tokens_seen": 133455180, + "step": 6211, + "time_per_iteration": 2.5816140174865723 + }, + { + "auxiliary_loss_clip": 0.01125512, + "auxiliary_loss_mlp": 0.01039433, + "balance_loss_clip": 1.04939151, + "balance_loss_mlp": 1.02330434, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.488397706582383, + "language_loss": 0.73315001, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75479949, + "num_input_tokens_seen": 133476715, + "step": 6212, + "time_per_iteration": 4.139477491378784 + }, + { + "auxiliary_loss_clip": 0.01130769, + "auxiliary_loss_mlp": 0.01050729, + "balance_loss_clip": 1.05227852, + "balance_loss_mlp": 1.03425479, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 1.9507088744423957, + "language_loss": 0.81844902, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84026402, + "num_input_tokens_seen": 133494550, + "step": 6213, + "time_per_iteration": 2.5064926147460938 + }, + { + "auxiliary_loss_clip": 0.01093332, + "auxiliary_loss_mlp": 0.01050117, + "balance_loss_clip": 1.04534793, + "balance_loss_mlp": 1.03276002, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.7030182687083544, + "language_loss": 0.79103696, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81247139, + "num_input_tokens_seen": 133512640, + "step": 6214, + "time_per_iteration": 2.5581254959106445 + }, + { + "auxiliary_loss_clip": 0.01109803, + "auxiliary_loss_mlp": 0.01045378, + "balance_loss_clip": 1.04529691, + "balance_loss_mlp": 1.02959466, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.717665574092126, + "language_loss": 0.84933525, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87088704, + "num_input_tokens_seen": 133535540, + "step": 6215, + "time_per_iteration": 2.6768760681152344 + }, + { + "auxiliary_loss_clip": 0.01102757, + "auxiliary_loss_mlp": 0.01045474, + "balance_loss_clip": 1.04640436, + "balance_loss_mlp": 1.02859354, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 1.8047621162025493, + "language_loss": 0.68064582, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.70212817, + "num_input_tokens_seen": 133555795, + "step": 6216, + "time_per_iteration": 2.5779433250427246 + }, + { + "auxiliary_loss_clip": 0.0111366, + "auxiliary_loss_mlp": 0.01046433, + "balance_loss_clip": 1.0482676, + "balance_loss_mlp": 1.02997017, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 3.1420808344157973, + "language_loss": 0.65813518, + "learning_rate": 2.883099843007303e-06, + "loss": 0.67973614, + "num_input_tokens_seen": 133575905, + "step": 6217, + "time_per_iteration": 3.951240062713623 + }, + { + "auxiliary_loss_clip": 0.0111621, + "auxiliary_loss_mlp": 0.01037911, + "balance_loss_clip": 1.05049515, + "balance_loss_mlp": 1.02197313, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 1.7461584708517364, + "language_loss": 0.80424726, + "learning_rate": 2.88275038695833e-06, + "loss": 0.8257885, + "num_input_tokens_seen": 133592585, + "step": 6218, + "time_per_iteration": 2.4905476570129395 + }, + { + "auxiliary_loss_clip": 0.01119482, + "auxiliary_loss_mlp": 0.01038696, + "balance_loss_clip": 1.04893923, + "balance_loss_mlp": 1.02348542, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 4.231951200512491, + "language_loss": 0.78550363, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.80708539, + "num_input_tokens_seen": 133615070, + "step": 6219, + "time_per_iteration": 2.595980644226074 + }, + { + "auxiliary_loss_clip": 0.01107646, + "auxiliary_loss_mlp": 0.01045794, + "balance_loss_clip": 1.04568422, + "balance_loss_mlp": 1.03038013, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 1.8732034811786866, + "language_loss": 0.76873451, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.7902689, + "num_input_tokens_seen": 133633490, + "step": 6220, + "time_per_iteration": 2.5572829246520996 + }, + { + "auxiliary_loss_clip": 0.01104822, + "auxiliary_loss_mlp": 0.01042912, + "balance_loss_clip": 1.04762602, + "balance_loss_mlp": 1.02643681, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.632902697824174, + "language_loss": 0.83049494, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85197222, + "num_input_tokens_seen": 133653425, + "step": 6221, + "time_per_iteration": 2.5640885829925537 + }, + { + "auxiliary_loss_clip": 0.01111399, + "auxiliary_loss_mlp": 0.01046812, + "balance_loss_clip": 1.04736423, + "balance_loss_mlp": 1.03104043, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.9700528806675124, + "language_loss": 0.76324034, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.78482246, + "num_input_tokens_seen": 133670220, + "step": 6222, + "time_per_iteration": 2.5114521980285645 + }, + { + "auxiliary_loss_clip": 0.01107316, + "auxiliary_loss_mlp": 0.00782184, + "balance_loss_clip": 1.05220795, + "balance_loss_mlp": 1.00054622, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 2.6019715708439506, + "language_loss": 0.70641738, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72531235, + "num_input_tokens_seen": 133688910, + "step": 6223, + "time_per_iteration": 2.593526840209961 + }, + { + "auxiliary_loss_clip": 0.0110838, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.0544821, + "balance_loss_mlp": 1.02813244, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 2.1717750999989676, + "language_loss": 0.68394327, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.70546031, + "num_input_tokens_seen": 133708690, + "step": 6224, + "time_per_iteration": 2.693009614944458 + }, + { + "auxiliary_loss_clip": 0.01090627, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.04969692, + "balance_loss_mlp": 1.02389574, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.795195970926548, + "language_loss": 0.7011984, + "learning_rate": 2.880303258086228e-06, + "loss": 0.72249615, + "num_input_tokens_seen": 133728095, + "step": 6225, + "time_per_iteration": 2.57676362991333 + }, + { + "auxiliary_loss_clip": 0.01091787, + "auxiliary_loss_mlp": 0.01045655, + "balance_loss_clip": 1.04575348, + "balance_loss_mlp": 1.02853656, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 2.0829452233190113, + "language_loss": 0.7901625, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81153691, + "num_input_tokens_seen": 133745590, + "step": 6226, + "time_per_iteration": 2.605365037918091 + }, + { + "auxiliary_loss_clip": 0.01114831, + "auxiliary_loss_mlp": 0.0104366, + "balance_loss_clip": 1.0546087, + "balance_loss_mlp": 1.0271734, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 1.7688316069353613, + "language_loss": 0.6792807, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70086563, + "num_input_tokens_seen": 133766155, + "step": 6227, + "time_per_iteration": 2.572873115539551 + }, + { + "auxiliary_loss_clip": 0.01101052, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.04577851, + "balance_loss_mlp": 1.01917434, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.59327175800682, + "language_loss": 0.83186358, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85321677, + "num_input_tokens_seen": 133783185, + "step": 6228, + "time_per_iteration": 2.5821292400360107 + }, + { + "auxiliary_loss_clip": 0.01092137, + "auxiliary_loss_mlp": 0.01055404, + "balance_loss_clip": 1.04566932, + "balance_loss_mlp": 1.03804731, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.5417432821123982, + "language_loss": 0.75021088, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.77168638, + "num_input_tokens_seen": 133800975, + "step": 6229, + "time_per_iteration": 2.5356459617614746 + }, + { + "auxiliary_loss_clip": 0.01101231, + "auxiliary_loss_mlp": 0.010391, + "balance_loss_clip": 1.0500071, + "balance_loss_mlp": 1.02244604, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 1.7327028468433776, + "language_loss": 0.83479643, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85619974, + "num_input_tokens_seen": 133818020, + "step": 6230, + "time_per_iteration": 2.5729660987854004 + }, + { + "auxiliary_loss_clip": 0.01126562, + "auxiliary_loss_mlp": 0.01046508, + "balance_loss_clip": 1.05105782, + "balance_loss_mlp": 1.03051054, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 1.9162339499079653, + "language_loss": 0.7345041, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75623488, + "num_input_tokens_seen": 133840690, + "step": 6231, + "time_per_iteration": 2.5760395526885986 + }, + { + "auxiliary_loss_clip": 0.01125017, + "auxiliary_loss_mlp": 0.01048978, + "balance_loss_clip": 1.05395854, + "balance_loss_mlp": 1.03280163, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 3.5519304780165784, + "language_loss": 0.7323122, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75405216, + "num_input_tokens_seen": 133858350, + "step": 6232, + "time_per_iteration": 2.5160720348358154 + }, + { + "auxiliary_loss_clip": 0.01111259, + "auxiliary_loss_mlp": 0.01040574, + "balance_loss_clip": 1.0484426, + "balance_loss_mlp": 1.02389646, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.760487039494868, + "language_loss": 0.76997483, + "learning_rate": 2.877504536769561e-06, + "loss": 0.79149318, + "num_input_tokens_seen": 133879775, + "step": 6233, + "time_per_iteration": 4.168405532836914 + }, + { + "auxiliary_loss_clip": 0.01118425, + "auxiliary_loss_mlp": 0.01044422, + "balance_loss_clip": 1.05056715, + "balance_loss_mlp": 1.0283289, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.751924410760683, + "language_loss": 0.69117868, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71280712, + "num_input_tokens_seen": 133898295, + "step": 6234, + "time_per_iteration": 2.5371696949005127 + }, + { + "auxiliary_loss_clip": 0.01125463, + "auxiliary_loss_mlp": 0.01049632, + "balance_loss_clip": 1.04996383, + "balance_loss_mlp": 1.03450489, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 1.7735444735421995, + "language_loss": 0.82371521, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.8454662, + "num_input_tokens_seen": 133915230, + "step": 6235, + "time_per_iteration": 2.4968552589416504 + }, + { + "auxiliary_loss_clip": 0.0114317, + "auxiliary_loss_mlp": 0.01037333, + "balance_loss_clip": 1.05492914, + "balance_loss_mlp": 1.02146602, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.6715436169404065, + "language_loss": 0.77694464, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.79874969, + "num_input_tokens_seen": 133934110, + "step": 6236, + "time_per_iteration": 2.531383752822876 + }, + { + "auxiliary_loss_clip": 0.01123536, + "auxiliary_loss_mlp": 0.01050375, + "balance_loss_clip": 1.04921055, + "balance_loss_mlp": 1.03199327, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 2.157228361485747, + "language_loss": 0.72965813, + "learning_rate": 2.876104377085234e-06, + "loss": 0.75139719, + "num_input_tokens_seen": 133952395, + "step": 6237, + "time_per_iteration": 2.491184711456299 + }, + { + "auxiliary_loss_clip": 0.0111507, + "auxiliary_loss_mlp": 0.00782787, + "balance_loss_clip": 1.04600227, + "balance_loss_mlp": 1.00051653, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 2.106766001585317, + "language_loss": 0.93310702, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.95208561, + "num_input_tokens_seen": 133969635, + "step": 6238, + "time_per_iteration": 2.5868592262268066 + }, + { + "auxiliary_loss_clip": 0.01139022, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.05139685, + "balance_loss_mlp": 1.02253342, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 1.9843829755101923, + "language_loss": 0.71407986, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73586506, + "num_input_tokens_seen": 133987215, + "step": 6239, + "time_per_iteration": 2.417301654815674 + }, + { + "auxiliary_loss_clip": 0.01074089, + "auxiliary_loss_mlp": 0.01040128, + "balance_loss_clip": 1.05085802, + "balance_loss_mlp": 1.02354646, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 2.183970299333539, + "language_loss": 0.65199429, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67313647, + "num_input_tokens_seen": 134009250, + "step": 6240, + "time_per_iteration": 2.7661144733428955 + }, + { + "auxiliary_loss_clip": 0.01106547, + "auxiliary_loss_mlp": 0.00782679, + "balance_loss_clip": 1.04911327, + "balance_loss_mlp": 1.00054193, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 1.8962773034728067, + "language_loss": 0.76081347, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.7797057, + "num_input_tokens_seen": 134026875, + "step": 6241, + "time_per_iteration": 2.520319938659668 + }, + { + "auxiliary_loss_clip": 0.01106048, + "auxiliary_loss_mlp": 0.01046442, + "balance_loss_clip": 1.0472399, + "balance_loss_mlp": 1.02943087, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 2.270691214903349, + "language_loss": 0.83393097, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85545588, + "num_input_tokens_seen": 134047185, + "step": 6242, + "time_per_iteration": 2.582973003387451 + }, + { + "auxiliary_loss_clip": 0.01115212, + "auxiliary_loss_mlp": 0.01046145, + "balance_loss_clip": 1.05209005, + "balance_loss_mlp": 1.03117824, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 7.314735594084172, + "language_loss": 0.67718434, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.69879788, + "num_input_tokens_seen": 134067330, + "step": 6243, + "time_per_iteration": 2.5824551582336426 + }, + { + "auxiliary_loss_clip": 0.0106359, + "auxiliary_loss_mlp": 0.00787391, + "balance_loss_clip": 1.04565179, + "balance_loss_mlp": 1.00056052, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 2.057984197281547, + "language_loss": 0.83355737, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.85206717, + "num_input_tokens_seen": 134085525, + "step": 6244, + "time_per_iteration": 2.6584222316741943 + }, + { + "auxiliary_loss_clip": 0.01091323, + "auxiliary_loss_mlp": 0.01044198, + "balance_loss_clip": 1.05338144, + "balance_loss_mlp": 1.02732968, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 3.7818471947021166, + "language_loss": 0.83044374, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85179889, + "num_input_tokens_seen": 134101855, + "step": 6245, + "time_per_iteration": 2.5634350776672363 + }, + { + "auxiliary_loss_clip": 0.01104854, + "auxiliary_loss_mlp": 0.01052972, + "balance_loss_clip": 1.04495335, + "balance_loss_mlp": 1.0339582, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 2.3171834601275467, + "language_loss": 0.6414876, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66306579, + "num_input_tokens_seen": 134119360, + "step": 6246, + "time_per_iteration": 2.5469422340393066 + }, + { + "auxiliary_loss_clip": 0.0111341, + "auxiliary_loss_mlp": 0.01049631, + "balance_loss_clip": 1.0501492, + "balance_loss_mlp": 1.0319407, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 1.7349129951871654, + "language_loss": 0.74824423, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.76987469, + "num_input_tokens_seen": 134137475, + "step": 6247, + "time_per_iteration": 2.5132460594177246 + }, + { + "auxiliary_loss_clip": 0.01130159, + "auxiliary_loss_mlp": 0.01044657, + "balance_loss_clip": 1.05242443, + "balance_loss_mlp": 1.02787256, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 9.623296561723459, + "language_loss": 0.54782474, + "learning_rate": 2.872251199697598e-06, + "loss": 0.56957293, + "num_input_tokens_seen": 134154580, + "step": 6248, + "time_per_iteration": 4.026439666748047 + }, + { + "auxiliary_loss_clip": 0.01123177, + "auxiliary_loss_mlp": 0.0104258, + "balance_loss_clip": 1.04884434, + "balance_loss_mlp": 1.02640295, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 1.9959200044686323, + "language_loss": 0.84391522, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86557281, + "num_input_tokens_seen": 134174285, + "step": 6249, + "time_per_iteration": 2.5419108867645264 + }, + { + "auxiliary_loss_clip": 0.01116327, + "auxiliary_loss_mlp": 0.01037682, + "balance_loss_clip": 1.05606818, + "balance_loss_mlp": 1.02232218, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.4607069364626042, + "language_loss": 0.67987621, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.70141631, + "num_input_tokens_seen": 134195940, + "step": 6250, + "time_per_iteration": 2.668689727783203 + }, + { + "auxiliary_loss_clip": 0.01118965, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_clip": 1.05149686, + "balance_loss_mlp": 1.02997398, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 1.9148410353123675, + "language_loss": 0.7749362, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79657829, + "num_input_tokens_seen": 134212235, + "step": 6251, + "time_per_iteration": 4.017856597900391 + }, + { + "auxiliary_loss_clip": 0.01127022, + "auxiliary_loss_mlp": 0.01044302, + "balance_loss_clip": 1.05495942, + "balance_loss_mlp": 1.0285306, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.4342552401535973, + "language_loss": 0.57807678, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.59978998, + "num_input_tokens_seen": 134233810, + "step": 6252, + "time_per_iteration": 2.6083333492279053 + }, + { + "auxiliary_loss_clip": 0.01125379, + "auxiliary_loss_mlp": 0.01049792, + "balance_loss_clip": 1.05613828, + "balance_loss_mlp": 1.03312695, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 2.1291955781206524, + "language_loss": 0.89356846, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.91532016, + "num_input_tokens_seen": 134252020, + "step": 6253, + "time_per_iteration": 2.5523853302001953 + }, + { + "auxiliary_loss_clip": 0.01102194, + "auxiliary_loss_mlp": 0.01033872, + "balance_loss_clip": 1.05329275, + "balance_loss_mlp": 1.01927495, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 1.8634235461966588, + "language_loss": 0.76693213, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78829277, + "num_input_tokens_seen": 134269495, + "step": 6254, + "time_per_iteration": 2.5583980083465576 + }, + { + "auxiliary_loss_clip": 0.01105493, + "auxiliary_loss_mlp": 0.01055796, + "balance_loss_clip": 1.05008721, + "balance_loss_mlp": 1.03803349, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 2.530497866417073, + "language_loss": 0.61640012, + "learning_rate": 2.869797092829169e-06, + "loss": 0.63801301, + "num_input_tokens_seen": 134287035, + "step": 6255, + "time_per_iteration": 2.547067165374756 + }, + { + "auxiliary_loss_clip": 0.01137078, + "auxiliary_loss_mlp": 0.01039328, + "balance_loss_clip": 1.05677032, + "balance_loss_mlp": 1.02142274, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 11.953224356587274, + "language_loss": 0.74153101, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76329505, + "num_input_tokens_seen": 134304840, + "step": 6256, + "time_per_iteration": 2.5244638919830322 + }, + { + "auxiliary_loss_clip": 0.01136843, + "auxiliary_loss_mlp": 0.01048312, + "balance_loss_clip": 1.05654252, + "balance_loss_mlp": 1.03044271, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 1.9585157099704125, + "language_loss": 0.70299572, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72484732, + "num_input_tokens_seen": 134323180, + "step": 6257, + "time_per_iteration": 3.9982964992523193 + }, + { + "auxiliary_loss_clip": 0.01118531, + "auxiliary_loss_mlp": 0.01037366, + "balance_loss_clip": 1.05382895, + "balance_loss_mlp": 1.0222261, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.7486944762215588, + "language_loss": 0.84418404, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86574304, + "num_input_tokens_seen": 134341390, + "step": 6258, + "time_per_iteration": 2.5307602882385254 + }, + { + "auxiliary_loss_clip": 0.01103165, + "auxiliary_loss_mlp": 0.01040333, + "balance_loss_clip": 1.05433023, + "balance_loss_mlp": 1.02528894, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.5405458945214092, + "language_loss": 0.80348086, + "learning_rate": 2.868394020133277e-06, + "loss": 0.82491589, + "num_input_tokens_seen": 134360425, + "step": 6259, + "time_per_iteration": 2.612607002258301 + }, + { + "auxiliary_loss_clip": 0.01101043, + "auxiliary_loss_mlp": 0.01048225, + "balance_loss_clip": 1.04943907, + "balance_loss_mlp": 1.03080893, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 2.082415906806046, + "language_loss": 0.71328288, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.73477554, + "num_input_tokens_seen": 134379775, + "step": 6260, + "time_per_iteration": 2.623030185699463 + }, + { + "auxiliary_loss_clip": 0.01112725, + "auxiliary_loss_mlp": 0.01043693, + "balance_loss_clip": 1.04706359, + "balance_loss_mlp": 1.02626479, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 2.11125732892889, + "language_loss": 0.78343529, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80499941, + "num_input_tokens_seen": 134400315, + "step": 6261, + "time_per_iteration": 2.53996205329895 + }, + { + "auxiliary_loss_clip": 0.0111882, + "auxiliary_loss_mlp": 0.01056156, + "balance_loss_clip": 1.05202079, + "balance_loss_mlp": 1.03770208, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 1.6679320966263167, + "language_loss": 0.80551326, + "learning_rate": 2.867341369804132e-06, + "loss": 0.827263, + "num_input_tokens_seen": 134422875, + "step": 6262, + "time_per_iteration": 2.641294479370117 + }, + { + "auxiliary_loss_clip": 0.0112127, + "auxiliary_loss_mlp": 0.01038319, + "balance_loss_clip": 1.0507369, + "balance_loss_mlp": 1.02241659, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 1.9067647199815114, + "language_loss": 0.80768818, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82928407, + "num_input_tokens_seen": 134443025, + "step": 6263, + "time_per_iteration": 2.6018145084381104 + }, + { + "auxiliary_loss_clip": 0.01146799, + "auxiliary_loss_mlp": 0.0104787, + "balance_loss_clip": 1.05824018, + "balance_loss_mlp": 1.03169298, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 1.8029945209995553, + "language_loss": 0.79305804, + "learning_rate": 2.866639438447501e-06, + "loss": 0.81500471, + "num_input_tokens_seen": 134460945, + "step": 6264, + "time_per_iteration": 2.4291303157806396 + }, + { + "auxiliary_loss_clip": 0.01140073, + "auxiliary_loss_mlp": 0.01050485, + "balance_loss_clip": 1.05296957, + "balance_loss_mlp": 1.03435636, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 1.973049026596681, + "language_loss": 0.73964953, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.76155508, + "num_input_tokens_seen": 134480440, + "step": 6265, + "time_per_iteration": 2.4608757495880127 + }, + { + "auxiliary_loss_clip": 0.01129459, + "auxiliary_loss_mlp": 0.01044591, + "balance_loss_clip": 1.05554199, + "balance_loss_mlp": 1.03022051, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.5539870487635643, + "language_loss": 0.69050491, + "learning_rate": 2.865937375638654e-06, + "loss": 0.7122454, + "num_input_tokens_seen": 134501110, + "step": 6266, + "time_per_iteration": 2.5398528575897217 + }, + { + "auxiliary_loss_clip": 0.01137854, + "auxiliary_loss_mlp": 0.01043074, + "balance_loss_clip": 1.05690539, + "balance_loss_mlp": 1.02624202, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 2.7364502211429973, + "language_loss": 0.62536651, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.64717579, + "num_input_tokens_seen": 134522460, + "step": 6267, + "time_per_iteration": 2.525970220565796 + }, + { + "auxiliary_loss_clip": 0.01057229, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.03599524, + "balance_loss_mlp": 1.03698468, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7382828746419061, + "language_loss": 0.58885419, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.609815, + "num_input_tokens_seen": 134589545, + "step": 6268, + "time_per_iteration": 3.2555787563323975 + }, + { + "auxiliary_loss_clip": 0.01143331, + "auxiliary_loss_mlp": 0.01045184, + "balance_loss_clip": 1.05436063, + "balance_loss_mlp": 1.02758908, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.3995863111504234, + "language_loss": 0.64969707, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67158222, + "num_input_tokens_seen": 134610550, + "step": 6269, + "time_per_iteration": 2.507566213607788 + }, + { + "auxiliary_loss_clip": 0.01115859, + "auxiliary_loss_mlp": 0.01037741, + "balance_loss_clip": 1.06334364, + "balance_loss_mlp": 1.02072966, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.6554014978537384, + "language_loss": 0.70538962, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72692561, + "num_input_tokens_seen": 134630485, + "step": 6270, + "time_per_iteration": 2.584643840789795 + }, + { + "auxiliary_loss_clip": 0.01064649, + "auxiliary_loss_mlp": 0.01004141, + "balance_loss_clip": 1.03569603, + "balance_loss_mlp": 1.00275838, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7028415366395023, + "language_loss": 0.56148863, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58217645, + "num_input_tokens_seen": 134693510, + "step": 6271, + "time_per_iteration": 3.0185298919677734 + }, + { + "auxiliary_loss_clip": 0.01128208, + "auxiliary_loss_mlp": 0.01040686, + "balance_loss_clip": 1.05253887, + "balance_loss_mlp": 1.02219725, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 1.776388143609977, + "language_loss": 0.79675686, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.8184458, + "num_input_tokens_seen": 134713115, + "step": 6272, + "time_per_iteration": 4.030563116073608 + }, + { + "auxiliary_loss_clip": 0.01126053, + "auxiliary_loss_mlp": 0.01037481, + "balance_loss_clip": 1.05121589, + "balance_loss_mlp": 1.0222224, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 2.0843479870094264, + "language_loss": 0.7392621, + "learning_rate": 2.863479122159103e-06, + "loss": 0.76089752, + "num_input_tokens_seen": 134732635, + "step": 6273, + "time_per_iteration": 2.516108751296997 + }, + { + "auxiliary_loss_clip": 0.01127603, + "auxiliary_loss_mlp": 0.01052215, + "balance_loss_clip": 1.05680847, + "balance_loss_mlp": 1.036515, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.6311560403336427, + "language_loss": 0.71448922, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.73628742, + "num_input_tokens_seen": 134750695, + "step": 6274, + "time_per_iteration": 2.4887306690216064 + }, + { + "auxiliary_loss_clip": 0.0111462, + "auxiliary_loss_mlp": 0.01039893, + "balance_loss_clip": 1.05317664, + "balance_loss_mlp": 1.02421725, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 2.0061028747497307, + "language_loss": 0.84118724, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.86273241, + "num_input_tokens_seen": 134768935, + "step": 6275, + "time_per_iteration": 2.557966470718384 + }, + { + "auxiliary_loss_clip": 0.01088686, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.05365348, + "balance_loss_mlp": 1.02206457, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.507930101690453, + "language_loss": 0.75581193, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77706325, + "num_input_tokens_seen": 134791260, + "step": 6276, + "time_per_iteration": 2.6801633834838867 + }, + { + "auxiliary_loss_clip": 0.01120007, + "auxiliary_loss_mlp": 0.01037272, + "balance_loss_clip": 1.05059969, + "balance_loss_mlp": 1.02078533, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 3.601722316260723, + "language_loss": 0.85951447, + "learning_rate": 2.862073685241366e-06, + "loss": 0.8810873, + "num_input_tokens_seen": 134808350, + "step": 6277, + "time_per_iteration": 2.52425479888916 + }, + { + "auxiliary_loss_clip": 0.01129257, + "auxiliary_loss_mlp": 0.01040486, + "balance_loss_clip": 1.05556583, + "balance_loss_mlp": 1.0258826, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 1.8216152368272596, + "language_loss": 0.78247303, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80417049, + "num_input_tokens_seen": 134826005, + "step": 6278, + "time_per_iteration": 2.489337205886841 + }, + { + "auxiliary_loss_clip": 0.01111119, + "auxiliary_loss_mlp": 0.01045955, + "balance_loss_clip": 1.05172062, + "balance_loss_mlp": 1.02837133, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.7177710729526976, + "language_loss": 0.82980394, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.85137469, + "num_input_tokens_seen": 134844995, + "step": 6279, + "time_per_iteration": 2.572962760925293 + }, + { + "auxiliary_loss_clip": 0.01114724, + "auxiliary_loss_mlp": 0.01039896, + "balance_loss_clip": 1.04797554, + "balance_loss_mlp": 1.02468526, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 2.3486245184748173, + "language_loss": 0.74475062, + "learning_rate": 2.861019264262269e-06, + "loss": 0.76629686, + "num_input_tokens_seen": 134865285, + "step": 6280, + "time_per_iteration": 2.562976121902466 + }, + { + "auxiliary_loss_clip": 0.01136205, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.0536952, + "balance_loss_mlp": 1.02487576, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.3896029852778629, + "language_loss": 0.76584816, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78760207, + "num_input_tokens_seen": 134886535, + "step": 6281, + "time_per_iteration": 2.4605929851531982 + }, + { + "auxiliary_loss_clip": 0.01107258, + "auxiliary_loss_mlp": 0.0104097, + "balance_loss_clip": 1.04737949, + "balance_loss_mlp": 1.02496588, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 1.4995645563890578, + "language_loss": 0.84375954, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86524177, + "num_input_tokens_seen": 134907435, + "step": 6282, + "time_per_iteration": 2.550229787826538 + }, + { + "auxiliary_loss_clip": 0.01126838, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.05160689, + "balance_loss_mlp": 1.02143478, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 1.6426341313031336, + "language_loss": 0.695445, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.7170828, + "num_input_tokens_seen": 134925360, + "step": 6283, + "time_per_iteration": 2.506413221359253 + }, + { + "auxiliary_loss_clip": 0.0108761, + "auxiliary_loss_mlp": 0.01047116, + "balance_loss_clip": 1.05397439, + "balance_loss_mlp": 1.02870977, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 1.8525634193276297, + "language_loss": 0.75955391, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78090119, + "num_input_tokens_seen": 134944205, + "step": 6284, + "time_per_iteration": 2.65024995803833 + }, + { + "auxiliary_loss_clip": 0.01144256, + "auxiliary_loss_mlp": 0.01037061, + "balance_loss_clip": 1.05436909, + "balance_loss_mlp": 1.0202291, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.191960401345432, + "language_loss": 0.85663074, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.8784439, + "num_input_tokens_seen": 134960255, + "step": 6285, + "time_per_iteration": 2.4578819274902344 + }, + { + "auxiliary_loss_clip": 0.01113406, + "auxiliary_loss_mlp": 0.01038402, + "balance_loss_clip": 1.05035412, + "balance_loss_mlp": 1.02246404, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.8569296579561505, + "language_loss": 0.84509122, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86660933, + "num_input_tokens_seen": 134978605, + "step": 6286, + "time_per_iteration": 2.503345489501953 + }, + { + "auxiliary_loss_clip": 0.01120819, + "auxiliary_loss_mlp": 0.01044728, + "balance_loss_clip": 1.05325198, + "balance_loss_mlp": 1.02961254, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.4810365372036114, + "language_loss": 0.81739008, + "learning_rate": 2.858557806518775e-06, + "loss": 0.83904558, + "num_input_tokens_seen": 134995020, + "step": 6287, + "time_per_iteration": 3.9205329418182373 + }, + { + "auxiliary_loss_clip": 0.01124333, + "auxiliary_loss_mlp": 0.01043953, + "balance_loss_clip": 1.05064917, + "balance_loss_mlp": 1.02825952, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 4.019818591353912, + "language_loss": 0.73173565, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75341851, + "num_input_tokens_seen": 135012620, + "step": 6288, + "time_per_iteration": 2.51167893409729 + }, + { + "auxiliary_loss_clip": 0.01131751, + "auxiliary_loss_mlp": 0.01039362, + "balance_loss_clip": 1.05744231, + "balance_loss_mlp": 1.02352536, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.6199324906748473, + "language_loss": 0.75465286, + "learning_rate": 2.857854239668352e-06, + "loss": 0.77636409, + "num_input_tokens_seen": 135033365, + "step": 6289, + "time_per_iteration": 2.568389654159546 + }, + { + "auxiliary_loss_clip": 0.01126875, + "auxiliary_loss_mlp": 0.01042548, + "balance_loss_clip": 1.05300236, + "balance_loss_mlp": 1.02781391, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.8072012181327128, + "language_loss": 0.73513818, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75683242, + "num_input_tokens_seen": 135052185, + "step": 6290, + "time_per_iteration": 2.518960475921631 + }, + { + "auxiliary_loss_clip": 0.01101082, + "auxiliary_loss_mlp": 0.01045585, + "balance_loss_clip": 1.04773116, + "balance_loss_mlp": 1.02800179, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.5658811000292543, + "language_loss": 0.7972225, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81868923, + "num_input_tokens_seen": 135070425, + "step": 6291, + "time_per_iteration": 4.028718709945679 + }, + { + "auxiliary_loss_clip": 0.01108046, + "auxiliary_loss_mlp": 0.01037591, + "balance_loss_clip": 1.0528965, + "balance_loss_mlp": 1.02124739, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.7625423999904122, + "language_loss": 0.7578935, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.77934986, + "num_input_tokens_seen": 135090525, + "step": 6292, + "time_per_iteration": 2.612510919570923 + }, + { + "auxiliary_loss_clip": 0.01122272, + "auxiliary_loss_mlp": 0.01053332, + "balance_loss_clip": 1.05077624, + "balance_loss_mlp": 1.03642857, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.8680956874661403, + "language_loss": 0.6950022, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71675825, + "num_input_tokens_seen": 135109575, + "step": 6293, + "time_per_iteration": 2.4744884967803955 + }, + { + "auxiliary_loss_clip": 0.01139053, + "auxiliary_loss_mlp": 0.0104372, + "balance_loss_clip": 1.05505037, + "balance_loss_mlp": 1.02701902, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 2.2124134505994553, + "language_loss": 0.71955657, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.74138433, + "num_input_tokens_seen": 135127000, + "step": 6294, + "time_per_iteration": 2.4724183082580566 + }, + { + "auxiliary_loss_clip": 0.0112113, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_clip": 1.05175865, + "balance_loss_mlp": 1.02568221, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.203356024421329, + "language_loss": 0.82902455, + "learning_rate": 2.855742758826011e-06, + "loss": 0.85066545, + "num_input_tokens_seen": 135145285, + "step": 6295, + "time_per_iteration": 2.533637046813965 + }, + { + "auxiliary_loss_clip": 0.01124357, + "auxiliary_loss_mlp": 0.01044517, + "balance_loss_clip": 1.05220032, + "balance_loss_mlp": 1.02853155, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.95536649752773, + "language_loss": 0.7192409, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.74092972, + "num_input_tokens_seen": 135165240, + "step": 6296, + "time_per_iteration": 4.038141965866089 + }, + { + "auxiliary_loss_clip": 0.0113933, + "auxiliary_loss_mlp": 0.01046184, + "balance_loss_clip": 1.05651903, + "balance_loss_mlp": 1.03054333, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.7963797721905599, + "language_loss": 0.76925778, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79111284, + "num_input_tokens_seen": 135184045, + "step": 6297, + "time_per_iteration": 2.4516375064849854 + }, + { + "auxiliary_loss_clip": 0.01118545, + "auxiliary_loss_mlp": 0.01038073, + "balance_loss_clip": 1.05399883, + "balance_loss_mlp": 1.02243245, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 1.7062897180097771, + "language_loss": 0.78902686, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81059301, + "num_input_tokens_seen": 135202365, + "step": 6298, + "time_per_iteration": 2.508314609527588 + }, + { + "auxiliary_loss_clip": 0.01089947, + "auxiliary_loss_mlp": 0.01058053, + "balance_loss_clip": 1.05333579, + "balance_loss_mlp": 1.03932524, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 1.7240517024323188, + "language_loss": 0.84148562, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86296564, + "num_input_tokens_seen": 135220955, + "step": 6299, + "time_per_iteration": 2.629573106765747 + }, + { + "auxiliary_loss_clip": 0.01114254, + "auxiliary_loss_mlp": 0.01042494, + "balance_loss_clip": 1.05683553, + "balance_loss_mlp": 1.02664518, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.294877743043817, + "language_loss": 0.76443684, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.7860043, + "num_input_tokens_seen": 135239715, + "step": 6300, + "time_per_iteration": 2.5723893642425537 + }, + { + "auxiliary_loss_clip": 0.01130399, + "auxiliary_loss_mlp": 0.01041371, + "balance_loss_clip": 1.05924141, + "balance_loss_mlp": 1.02346611, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 5.039866782041531, + "language_loss": 0.82293701, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84465468, + "num_input_tokens_seen": 135257035, + "step": 6301, + "time_per_iteration": 2.5211613178253174 + }, + { + "auxiliary_loss_clip": 0.01126267, + "auxiliary_loss_mlp": 0.01044246, + "balance_loss_clip": 1.05434549, + "balance_loss_mlp": 1.02865982, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.8709607980518361, + "language_loss": 0.67516208, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.69686723, + "num_input_tokens_seen": 135275720, + "step": 6302, + "time_per_iteration": 2.537508726119995 + }, + { + "auxiliary_loss_clip": 0.01101417, + "auxiliary_loss_mlp": 0.01045799, + "balance_loss_clip": 1.05449915, + "balance_loss_mlp": 1.03082681, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.9481488736454007, + "language_loss": 0.68395072, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.70542288, + "num_input_tokens_seen": 135294140, + "step": 6303, + "time_per_iteration": 2.6457161903381348 + }, + { + "auxiliary_loss_clip": 0.01140818, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.05628943, + "balance_loss_mlp": 1.02685618, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.803298306100324, + "language_loss": 0.77216899, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79399544, + "num_input_tokens_seen": 135314845, + "step": 6304, + "time_per_iteration": 2.505546808242798 + }, + { + "auxiliary_loss_clip": 0.01152258, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.0627377, + "balance_loss_mlp": 1.02405477, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 2.2085381970804163, + "language_loss": 0.80074167, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82267272, + "num_input_tokens_seen": 135333055, + "step": 6305, + "time_per_iteration": 2.4576432704925537 + }, + { + "auxiliary_loss_clip": 0.01063369, + "auxiliary_loss_mlp": 0.01021164, + "balance_loss_clip": 1.04433799, + "balance_loss_mlp": 1.01931655, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9845513398191699, + "language_loss": 0.64500129, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66584671, + "num_input_tokens_seen": 135387865, + "step": 6306, + "time_per_iteration": 2.962587833404541 + }, + { + "auxiliary_loss_clip": 0.01116944, + "auxiliary_loss_mlp": 0.01057326, + "balance_loss_clip": 1.05462742, + "balance_loss_mlp": 1.03985012, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.626227188594467, + "language_loss": 0.73339844, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75514108, + "num_input_tokens_seen": 135409095, + "step": 6307, + "time_per_iteration": 2.567028760910034 + }, + { + "auxiliary_loss_clip": 0.01120362, + "auxiliary_loss_mlp": 0.01048038, + "balance_loss_clip": 1.05602193, + "balance_loss_mlp": 1.03195691, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.4899990936635044, + "language_loss": 0.78225714, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80394113, + "num_input_tokens_seen": 135429585, + "step": 6308, + "time_per_iteration": 2.608874559402466 + }, + { + "auxiliary_loss_clip": 0.01107576, + "auxiliary_loss_mlp": 0.01046063, + "balance_loss_clip": 1.054914, + "balance_loss_mlp": 1.02950501, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 3.2611918692275528, + "language_loss": 0.72672105, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.74825746, + "num_input_tokens_seen": 135446320, + "step": 6309, + "time_per_iteration": 2.6409430503845215 + }, + { + "auxiliary_loss_clip": 0.01083827, + "auxiliary_loss_mlp": 0.01050254, + "balance_loss_clip": 1.04656291, + "balance_loss_mlp": 1.03370774, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.4771967690358903, + "language_loss": 0.78972423, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.81106508, + "num_input_tokens_seen": 135465720, + "step": 6310, + "time_per_iteration": 2.623539447784424 + }, + { + "auxiliary_loss_clip": 0.01131118, + "auxiliary_loss_mlp": 0.00781523, + "balance_loss_clip": 1.05414307, + "balance_loss_mlp": 1.00033903, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 1.8229851015009662, + "language_loss": 0.76397097, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.78309733, + "num_input_tokens_seen": 135485155, + "step": 6311, + "time_per_iteration": 2.571442127227783 + }, + { + "auxiliary_loss_clip": 0.01120158, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.05641973, + "balance_loss_mlp": 1.02315807, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.6851622997031317, + "language_loss": 0.70788413, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.72946548, + "num_input_tokens_seen": 135502675, + "step": 6312, + "time_per_iteration": 4.205365419387817 + }, + { + "auxiliary_loss_clip": 0.01050718, + "auxiliary_loss_mlp": 0.01001681, + "balance_loss_clip": 1.0508492, + "balance_loss_mlp": 1.00030994, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7825109015238275, + "language_loss": 0.56112075, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58164465, + "num_input_tokens_seen": 135562005, + "step": 6313, + "time_per_iteration": 3.138334035873413 + }, + { + "auxiliary_loss_clip": 0.01108181, + "auxiliary_loss_mlp": 0.01051821, + "balance_loss_clip": 1.0537163, + "balance_loss_mlp": 1.03602588, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 2.535950831580244, + "language_loss": 0.71183133, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73343134, + "num_input_tokens_seen": 135582600, + "step": 6314, + "time_per_iteration": 2.769648551940918 + }, + { + "auxiliary_loss_clip": 0.01139106, + "auxiliary_loss_mlp": 0.01046902, + "balance_loss_clip": 1.05924678, + "balance_loss_mlp": 1.03076088, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 2.037829870693715, + "language_loss": 0.72896278, + "learning_rate": 2.848696068594545e-06, + "loss": 0.7508229, + "num_input_tokens_seen": 135600280, + "step": 6315, + "time_per_iteration": 2.606565475463867 + }, + { + "auxiliary_loss_clip": 0.01127154, + "auxiliary_loss_mlp": 0.01044008, + "balance_loss_clip": 1.05347705, + "balance_loss_mlp": 1.02855837, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 6.083155685145054, + "language_loss": 0.7092852, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73099679, + "num_input_tokens_seen": 135621560, + "step": 6316, + "time_per_iteration": 2.7174019813537598 + }, + { + "auxiliary_loss_clip": 0.011085, + "auxiliary_loss_mlp": 0.01042275, + "balance_loss_clip": 1.05664301, + "balance_loss_mlp": 1.0277853, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 1.8482954672968537, + "language_loss": 0.65471476, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67622256, + "num_input_tokens_seen": 135641745, + "step": 6317, + "time_per_iteration": 2.6721861362457275 + }, + { + "auxiliary_loss_clip": 0.01128611, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.05593443, + "balance_loss_mlp": 1.02536845, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 3.5559489031186837, + "language_loss": 0.85813487, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.8798157, + "num_input_tokens_seen": 135660650, + "step": 6318, + "time_per_iteration": 2.5502049922943115 + }, + { + "auxiliary_loss_clip": 0.01118331, + "auxiliary_loss_mlp": 0.0104626, + "balance_loss_clip": 1.05283833, + "balance_loss_mlp": 1.02932024, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 3.4934423355952626, + "language_loss": 0.75825608, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.77990198, + "num_input_tokens_seen": 135679980, + "step": 6319, + "time_per_iteration": 2.5135879516601562 + }, + { + "auxiliary_loss_clip": 0.01142612, + "auxiliary_loss_mlp": 0.01044295, + "balance_loss_clip": 1.05702698, + "balance_loss_mlp": 1.02957249, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.831678253916964, + "language_loss": 0.63854963, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66041869, + "num_input_tokens_seen": 135699400, + "step": 6320, + "time_per_iteration": 2.4992096424102783 + }, + { + "auxiliary_loss_clip": 0.01108371, + "auxiliary_loss_mlp": 0.01039928, + "balance_loss_clip": 1.05831289, + "balance_loss_mlp": 1.0250628, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 1.6940210110349705, + "language_loss": 0.70986599, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73134899, + "num_input_tokens_seen": 135723455, + "step": 6321, + "time_per_iteration": 2.722250461578369 + }, + { + "auxiliary_loss_clip": 0.01103225, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.04706597, + "balance_loss_mlp": 1.02681422, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 1.7168783562370387, + "language_loss": 0.74574304, + "learning_rate": 2.846226680280859e-06, + "loss": 0.76719743, + "num_input_tokens_seen": 135744335, + "step": 6322, + "time_per_iteration": 2.615269899368286 + }, + { + "auxiliary_loss_clip": 0.01131312, + "auxiliary_loss_mlp": 0.01042842, + "balance_loss_clip": 1.05501699, + "balance_loss_mlp": 1.02679622, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 2.2118385228961923, + "language_loss": 0.85197896, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87372053, + "num_input_tokens_seen": 135761440, + "step": 6323, + "time_per_iteration": 2.513636589050293 + }, + { + "auxiliary_loss_clip": 0.01118977, + "auxiliary_loss_mlp": 0.01040101, + "balance_loss_clip": 1.05221498, + "balance_loss_mlp": 1.02304256, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 3.692271839401951, + "language_loss": 0.73358226, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75517303, + "num_input_tokens_seen": 135779955, + "step": 6324, + "time_per_iteration": 2.5463712215423584 + }, + { + "auxiliary_loss_clip": 0.01116453, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_clip": 1.05905771, + "balance_loss_mlp": 1.02636254, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 2.200135479036055, + "language_loss": 0.84401011, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86559993, + "num_input_tokens_seen": 135799840, + "step": 6325, + "time_per_iteration": 2.5996506214141846 + }, + { + "auxiliary_loss_clip": 0.01114162, + "auxiliary_loss_mlp": 0.01034439, + "balance_loss_clip": 1.0543983, + "balance_loss_mlp": 1.01979423, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 2.0176027308396236, + "language_loss": 0.80079985, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.82228589, + "num_input_tokens_seen": 135817880, + "step": 6326, + "time_per_iteration": 3.9442214965820312 + }, + { + "auxiliary_loss_clip": 0.0112778, + "auxiliary_loss_mlp": 0.01043628, + "balance_loss_clip": 1.05373836, + "balance_loss_mlp": 1.0287745, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.902983666663338, + "language_loss": 0.72528195, + "learning_rate": 2.844461868547842e-06, + "loss": 0.74699605, + "num_input_tokens_seen": 135838940, + "step": 6327, + "time_per_iteration": 2.6186676025390625 + }, + { + "auxiliary_loss_clip": 0.01140474, + "auxiliary_loss_mlp": 0.00781489, + "balance_loss_clip": 1.05612278, + "balance_loss_mlp": 1.00030255, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 2.035810452700554, + "language_loss": 0.83075982, + "learning_rate": 2.844108810081459e-06, + "loss": 0.8499794, + "num_input_tokens_seen": 135858325, + "step": 6328, + "time_per_iteration": 2.4949588775634766 + }, + { + "auxiliary_loss_clip": 0.01126163, + "auxiliary_loss_mlp": 0.01036136, + "balance_loss_clip": 1.0526669, + "balance_loss_mlp": 1.020818, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.384657668676241, + "language_loss": 0.61420602, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63582903, + "num_input_tokens_seen": 135878430, + "step": 6329, + "time_per_iteration": 2.509634494781494 + }, + { + "auxiliary_loss_clip": 0.01112967, + "auxiliary_loss_mlp": 0.01045358, + "balance_loss_clip": 1.05000138, + "balance_loss_mlp": 1.02971828, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 1.9953739279005276, + "language_loss": 0.5582605, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.57984376, + "num_input_tokens_seen": 135894755, + "step": 6330, + "time_per_iteration": 4.005000352859497 + }, + { + "auxiliary_loss_clip": 0.01095442, + "auxiliary_loss_mlp": 0.010364, + "balance_loss_clip": 1.05628824, + "balance_loss_mlp": 1.02235687, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.583758729458317, + "language_loss": 0.66282797, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.6841464, + "num_input_tokens_seen": 135918275, + "step": 6331, + "time_per_iteration": 2.6398415565490723 + }, + { + "auxiliary_loss_clip": 0.01124359, + "auxiliary_loss_mlp": 0.01044468, + "balance_loss_clip": 1.05533528, + "balance_loss_mlp": 1.02862513, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 2.3884248678722657, + "language_loss": 0.75505662, + "learning_rate": 2.842696256262919e-06, + "loss": 0.77674484, + "num_input_tokens_seen": 135937430, + "step": 6332, + "time_per_iteration": 2.4817941188812256 + }, + { + "auxiliary_loss_clip": 0.01080198, + "auxiliary_loss_mlp": 0.00782189, + "balance_loss_clip": 1.05255687, + "balance_loss_mlp": 1.00039065, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 1.746230887335497, + "language_loss": 0.82009512, + "learning_rate": 2.842343037886987e-06, + "loss": 0.83871901, + "num_input_tokens_seen": 135954210, + "step": 6333, + "time_per_iteration": 2.607361316680908 + }, + { + "auxiliary_loss_clip": 0.01124142, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.05104053, + "balance_loss_mlp": 1.01655483, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.5534121397944696, + "language_loss": 0.86369646, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88525081, + "num_input_tokens_seen": 135974425, + "step": 6334, + "time_per_iteration": 2.5429093837738037 + }, + { + "auxiliary_loss_clip": 0.01129698, + "auxiliary_loss_mlp": 0.01039065, + "balance_loss_clip": 1.0528934, + "balance_loss_mlp": 1.02388978, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 1.613182913044716, + "language_loss": 0.78673798, + "learning_rate": 2.841636505323321e-06, + "loss": 0.80842561, + "num_input_tokens_seen": 135991985, + "step": 6335, + "time_per_iteration": 2.473478078842163 + }, + { + "auxiliary_loss_clip": 0.01127939, + "auxiliary_loss_mlp": 0.01038643, + "balance_loss_clip": 1.05241287, + "balance_loss_mlp": 1.02233481, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 1.832708999190556, + "language_loss": 0.7243017, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.74596751, + "num_input_tokens_seen": 136010015, + "step": 6336, + "time_per_iteration": 3.873383045196533 + }, + { + "auxiliary_loss_clip": 0.01121421, + "auxiliary_loss_mlp": 0.01032319, + "balance_loss_clip": 1.05192447, + "balance_loss_mlp": 1.01793098, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 2.654578962645719, + "language_loss": 0.69522792, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71676528, + "num_input_tokens_seen": 136028440, + "step": 6337, + "time_per_iteration": 2.4780962467193604 + }, + { + "auxiliary_loss_clip": 0.01117238, + "auxiliary_loss_mlp": 0.01037992, + "balance_loss_clip": 1.04940248, + "balance_loss_mlp": 1.02195287, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.7961840260656232, + "language_loss": 0.63470459, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65625691, + "num_input_tokens_seen": 136048360, + "step": 6338, + "time_per_iteration": 2.609447479248047 + }, + { + "auxiliary_loss_clip": 0.01115982, + "auxiliary_loss_mlp": 0.01044452, + "balance_loss_clip": 1.04958749, + "balance_loss_mlp": 1.02813208, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.8960012724322104, + "language_loss": 0.69368398, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71528834, + "num_input_tokens_seen": 136065500, + "step": 6339, + "time_per_iteration": 2.5047976970672607 + }, + { + "auxiliary_loss_clip": 0.01112636, + "auxiliary_loss_mlp": 0.01047156, + "balance_loss_clip": 1.04976201, + "balance_loss_mlp": 1.03177786, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 1.9525399677909836, + "language_loss": 0.67586797, + "learning_rate": 2.839869615637177e-06, + "loss": 0.6974659, + "num_input_tokens_seen": 136084060, + "step": 6340, + "time_per_iteration": 2.535614013671875 + }, + { + "auxiliary_loss_clip": 0.01105565, + "auxiliary_loss_mlp": 0.01040615, + "balance_loss_clip": 1.05181479, + "balance_loss_mlp": 1.02394962, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.1703279433434, + "language_loss": 0.89996737, + "learning_rate": 2.839516142102522e-06, + "loss": 0.92142922, + "num_input_tokens_seen": 136102310, + "step": 6341, + "time_per_iteration": 2.538905143737793 + }, + { + "auxiliary_loss_clip": 0.01129183, + "auxiliary_loss_mlp": 0.0104538, + "balance_loss_clip": 1.05293584, + "balance_loss_mlp": 1.02904844, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 2.3653304937131545, + "language_loss": 0.75237972, + "learning_rate": 2.83916263673333e-06, + "loss": 0.77412534, + "num_input_tokens_seen": 136120725, + "step": 6342, + "time_per_iteration": 2.4809460639953613 + }, + { + "auxiliary_loss_clip": 0.01114108, + "auxiliary_loss_mlp": 0.01036691, + "balance_loss_clip": 1.05088913, + "balance_loss_mlp": 1.0221771, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 1.629460640698632, + "language_loss": 0.83414054, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85564852, + "num_input_tokens_seen": 136139105, + "step": 6343, + "time_per_iteration": 2.5284135341644287 + }, + { + "auxiliary_loss_clip": 0.01078337, + "auxiliary_loss_mlp": 0.01048382, + "balance_loss_clip": 1.04872763, + "balance_loss_mlp": 1.03201485, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 1.642493743713621, + "language_loss": 0.77280569, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79407293, + "num_input_tokens_seen": 136158265, + "step": 6344, + "time_per_iteration": 2.5906167030334473 + }, + { + "auxiliary_loss_clip": 0.01106362, + "auxiliary_loss_mlp": 0.010477, + "balance_loss_clip": 1.05213284, + "balance_loss_mlp": 1.02998567, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.3804341358138146, + "language_loss": 0.73516607, + "learning_rate": 2.838101929752593e-06, + "loss": 0.75670671, + "num_input_tokens_seen": 136176100, + "step": 6345, + "time_per_iteration": 2.5913569927215576 + }, + { + "auxiliary_loss_clip": 0.01104787, + "auxiliary_loss_mlp": 0.00780171, + "balance_loss_clip": 1.05596828, + "balance_loss_mlp": 1.00025618, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.6961944713265398, + "language_loss": 0.69778383, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71663338, + "num_input_tokens_seen": 136195125, + "step": 6346, + "time_per_iteration": 2.5602407455444336 + }, + { + "auxiliary_loss_clip": 0.01127462, + "auxiliary_loss_mlp": 0.01034643, + "balance_loss_clip": 1.0509994, + "balance_loss_mlp": 1.01955652, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 2.064855617376158, + "language_loss": 0.75395119, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.7755723, + "num_input_tokens_seen": 136213885, + "step": 6347, + "time_per_iteration": 2.4895663261413574 + }, + { + "auxiliary_loss_clip": 0.01128208, + "auxiliary_loss_mlp": 0.01043972, + "balance_loss_clip": 1.05293643, + "balance_loss_mlp": 1.02896905, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 2.024752768743908, + "language_loss": 0.74593723, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76765901, + "num_input_tokens_seen": 136232700, + "step": 6348, + "time_per_iteration": 2.4809982776641846 + }, + { + "auxiliary_loss_clip": 0.01112096, + "auxiliary_loss_mlp": 0.01038027, + "balance_loss_clip": 1.05255854, + "balance_loss_mlp": 1.0229708, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 3.5932720818420036, + "language_loss": 0.87285244, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89435363, + "num_input_tokens_seen": 136248975, + "step": 6349, + "time_per_iteration": 2.51523756980896 + }, + { + "auxiliary_loss_clip": 0.01124771, + "auxiliary_loss_mlp": 0.01043875, + "balance_loss_clip": 1.05347586, + "balance_loss_mlp": 1.02865779, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 1.9622898866514562, + "language_loss": 0.76564878, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78733528, + "num_input_tokens_seen": 136266710, + "step": 6350, + "time_per_iteration": 2.4578731060028076 + }, + { + "auxiliary_loss_clip": 0.01105734, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.05162084, + "balance_loss_mlp": 1.02040434, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.054627125657497, + "language_loss": 0.7578842, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.77931011, + "num_input_tokens_seen": 136284445, + "step": 6351, + "time_per_iteration": 4.04201865196228 + }, + { + "auxiliary_loss_clip": 0.01128567, + "auxiliary_loss_mlp": 0.01045125, + "balance_loss_clip": 1.051278, + "balance_loss_mlp": 1.0281136, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.682529777033046, + "language_loss": 0.74351978, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76525664, + "num_input_tokens_seen": 136305730, + "step": 6352, + "time_per_iteration": 2.618638277053833 + }, + { + "auxiliary_loss_clip": 0.01100619, + "auxiliary_loss_mlp": 0.01037908, + "balance_loss_clip": 1.04852438, + "balance_loss_mlp": 1.02326345, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.7995311526739501, + "language_loss": 0.64264488, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66403019, + "num_input_tokens_seen": 136323850, + "step": 6353, + "time_per_iteration": 2.5256009101867676 + }, + { + "auxiliary_loss_clip": 0.01140399, + "auxiliary_loss_mlp": 0.01040238, + "balance_loss_clip": 1.05499625, + "balance_loss_mlp": 1.02511048, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 1.8075096177199594, + "language_loss": 0.83122158, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85302794, + "num_input_tokens_seen": 136344880, + "step": 6354, + "time_per_iteration": 2.4982645511627197 + }, + { + "auxiliary_loss_clip": 0.01139451, + "auxiliary_loss_mlp": 0.01038663, + "balance_loss_clip": 1.05663121, + "balance_loss_mlp": 1.02481079, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.758484585767668, + "language_loss": 0.80966014, + "learning_rate": 2.834564176091943e-06, + "loss": 0.83144128, + "num_input_tokens_seen": 136366060, + "step": 6355, + "time_per_iteration": 2.4780499935150146 + }, + { + "auxiliary_loss_clip": 0.01093756, + "auxiliary_loss_mlp": 0.0104139, + "balance_loss_clip": 1.04677463, + "balance_loss_mlp": 1.02627456, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 2.449528630802444, + "language_loss": 0.75227135, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77362287, + "num_input_tokens_seen": 136385625, + "step": 6356, + "time_per_iteration": 2.5808708667755127 + }, + { + "auxiliary_loss_clip": 0.01131172, + "auxiliary_loss_mlp": 0.00781387, + "balance_loss_clip": 1.0555073, + "balance_loss_mlp": 1.00053096, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 1.848436955656121, + "language_loss": 0.8113358, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83046132, + "num_input_tokens_seen": 136405750, + "step": 6357, + "time_per_iteration": 2.5452003479003906 + }, + { + "auxiliary_loss_clip": 0.01121094, + "auxiliary_loss_mlp": 0.01049058, + "balance_loss_clip": 1.05547655, + "balance_loss_mlp": 1.03176045, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 1.6570013140708206, + "language_loss": 0.7760191, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.79772061, + "num_input_tokens_seen": 136426085, + "step": 6358, + "time_per_iteration": 2.528048038482666 + }, + { + "auxiliary_loss_clip": 0.01116737, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.04924393, + "balance_loss_mlp": 1.03109872, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.290089213812186, + "language_loss": 0.78726959, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.80890208, + "num_input_tokens_seen": 136442670, + "step": 6359, + "time_per_iteration": 2.5140132904052734 + }, + { + "auxiliary_loss_clip": 0.01073686, + "auxiliary_loss_mlp": 0.01055849, + "balance_loss_clip": 1.04441082, + "balance_loss_mlp": 1.03868294, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 2.319298203952304, + "language_loss": 0.69798058, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.71927595, + "num_input_tokens_seen": 136465730, + "step": 6360, + "time_per_iteration": 2.914166212081909 + }, + { + "auxiliary_loss_clip": 0.01110261, + "auxiliary_loss_mlp": 0.01037856, + "balance_loss_clip": 1.05141068, + "balance_loss_mlp": 1.02148271, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.6348968350480888, + "language_loss": 0.78544217, + "learning_rate": 2.83244000399261e-06, + "loss": 0.80692339, + "num_input_tokens_seen": 136487215, + "step": 6361, + "time_per_iteration": 2.5674550533294678 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01039346, + "balance_loss_clip": 1.05182838, + "balance_loss_mlp": 1.0247016, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.4665809390771352, + "language_loss": 0.6549809, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67654395, + "num_input_tokens_seen": 136510365, + "step": 6362, + "time_per_iteration": 2.726186513900757 + }, + { + "auxiliary_loss_clip": 0.01137874, + "auxiliary_loss_mlp": 0.01037687, + "balance_loss_clip": 1.05327082, + "balance_loss_mlp": 1.02122414, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 1.9558936706843608, + "language_loss": 0.81733525, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.83909088, + "num_input_tokens_seen": 136527100, + "step": 6363, + "time_per_iteration": 2.4270946979522705 + }, + { + "auxiliary_loss_clip": 0.01085633, + "auxiliary_loss_mlp": 0.01046773, + "balance_loss_clip": 1.05192685, + "balance_loss_mlp": 1.03132331, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.7106090473059443, + "language_loss": 0.58661568, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60793972, + "num_input_tokens_seen": 136550870, + "step": 6364, + "time_per_iteration": 2.8585855960845947 + }, + { + "auxiliary_loss_clip": 0.01125145, + "auxiliary_loss_mlp": 0.0104101, + "balance_loss_clip": 1.05508184, + "balance_loss_mlp": 1.02456522, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 2.8410299084483146, + "language_loss": 0.69071788, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.71237946, + "num_input_tokens_seen": 136569895, + "step": 6365, + "time_per_iteration": 2.577787160873413 + }, + { + "auxiliary_loss_clip": 0.01126472, + "auxiliary_loss_mlp": 0.0104168, + "balance_loss_clip": 1.05319762, + "balance_loss_mlp": 1.02510977, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 1.90834011309397, + "language_loss": 0.7328341, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75451565, + "num_input_tokens_seen": 136588585, + "step": 6366, + "time_per_iteration": 3.932326078414917 + }, + { + "auxiliary_loss_clip": 0.01121395, + "auxiliary_loss_mlp": 0.01040353, + "balance_loss_clip": 1.05469322, + "balance_loss_mlp": 1.02367544, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 3.6561766028447935, + "language_loss": 0.68080717, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70242471, + "num_input_tokens_seen": 136606640, + "step": 6367, + "time_per_iteration": 2.5633602142333984 + }, + { + "auxiliary_loss_clip": 0.01127554, + "auxiliary_loss_mlp": 0.01042756, + "balance_loss_clip": 1.05370355, + "balance_loss_mlp": 1.02643585, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 2.653004697472422, + "language_loss": 0.64510643, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66680956, + "num_input_tokens_seen": 136624940, + "step": 6368, + "time_per_iteration": 2.5189528465270996 + }, + { + "auxiliary_loss_clip": 0.01142142, + "auxiliary_loss_mlp": 0.01037532, + "balance_loss_clip": 1.05640221, + "balance_loss_mlp": 1.02145052, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.333839338713466, + "language_loss": 0.68207097, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70386767, + "num_input_tokens_seen": 136645540, + "step": 6369, + "time_per_iteration": 4.024900913238525 + }, + { + "auxiliary_loss_clip": 0.01089224, + "auxiliary_loss_mlp": 0.01047129, + "balance_loss_clip": 1.04667819, + "balance_loss_mlp": 1.03113139, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.6356711057662747, + "language_loss": 0.78248096, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80384445, + "num_input_tokens_seen": 136664530, + "step": 6370, + "time_per_iteration": 2.5977044105529785 + }, + { + "auxiliary_loss_clip": 0.01123965, + "auxiliary_loss_mlp": 0.01049848, + "balance_loss_clip": 1.05149126, + "balance_loss_mlp": 1.03228843, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.492493390891292, + "language_loss": 0.64712971, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.66886783, + "num_input_tokens_seen": 136682315, + "step": 6371, + "time_per_iteration": 2.5789248943328857 + }, + { + "auxiliary_loss_clip": 0.01112898, + "auxiliary_loss_mlp": 0.01040528, + "balance_loss_clip": 1.05261004, + "balance_loss_mlp": 1.02373123, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 2.3238446124772927, + "language_loss": 0.72890812, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.75044239, + "num_input_tokens_seen": 136701185, + "step": 6372, + "time_per_iteration": 2.5954689979553223 + }, + { + "auxiliary_loss_clip": 0.01129171, + "auxiliary_loss_mlp": 0.01037316, + "balance_loss_clip": 1.0527159, + "balance_loss_mlp": 1.02150905, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.6860799831303341, + "language_loss": 0.84973305, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.87139791, + "num_input_tokens_seen": 136721265, + "step": 6373, + "time_per_iteration": 2.515915870666504 + }, + { + "auxiliary_loss_clip": 0.0109004, + "auxiliary_loss_mlp": 0.01044618, + "balance_loss_clip": 1.04512334, + "balance_loss_mlp": 1.02822697, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 2.3258038691816796, + "language_loss": 0.74667454, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.76802123, + "num_input_tokens_seen": 136741885, + "step": 6374, + "time_per_iteration": 2.686354160308838 + }, + { + "auxiliary_loss_clip": 0.01131084, + "auxiliary_loss_mlp": 0.01042015, + "balance_loss_clip": 1.05423641, + "balance_loss_mlp": 1.02601647, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 2.94297216890791, + "language_loss": 0.76151204, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.783243, + "num_input_tokens_seen": 136760905, + "step": 6375, + "time_per_iteration": 2.4995381832122803 + }, + { + "auxiliary_loss_clip": 0.0112761, + "auxiliary_loss_mlp": 0.01038629, + "balance_loss_clip": 1.05228972, + "balance_loss_mlp": 1.02214217, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 1.9076570877425718, + "language_loss": 0.7234149, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.74507725, + "num_input_tokens_seen": 136777240, + "step": 6376, + "time_per_iteration": 3.8766207695007324 + }, + { + "auxiliary_loss_clip": 0.01124989, + "auxiliary_loss_mlp": 0.01037911, + "balance_loss_clip": 1.05163467, + "balance_loss_mlp": 1.02120948, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.5923466731491658, + "language_loss": 0.67829418, + "learning_rate": 2.826769997289796e-06, + "loss": 0.69992316, + "num_input_tokens_seen": 136801040, + "step": 6377, + "time_per_iteration": 2.5559794902801514 + }, + { + "auxiliary_loss_clip": 0.01115776, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.0582968, + "balance_loss_mlp": 1.0253191, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 1.9153202807418883, + "language_loss": 0.72810322, + "learning_rate": 2.826415354814344e-06, + "loss": 0.74968547, + "num_input_tokens_seen": 136819495, + "step": 6378, + "time_per_iteration": 2.5510029792785645 + }, + { + "auxiliary_loss_clip": 0.01087795, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.05185342, + "balance_loss_mlp": 1.01988745, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.783970111825476, + "language_loss": 0.69686007, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71809673, + "num_input_tokens_seen": 136838840, + "step": 6379, + "time_per_iteration": 2.6197307109832764 + }, + { + "auxiliary_loss_clip": 0.01132429, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.05926538, + "balance_loss_mlp": 1.02367163, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.7542727817477193, + "language_loss": 0.83176231, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85348207, + "num_input_tokens_seen": 136854425, + "step": 6380, + "time_per_iteration": 2.4611337184906006 + }, + { + "auxiliary_loss_clip": 0.01138819, + "auxiliary_loss_mlp": 0.01038579, + "balance_loss_clip": 1.05622232, + "balance_loss_mlp": 1.02333176, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.442577316364625, + "language_loss": 0.81493926, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83671325, + "num_input_tokens_seen": 136874355, + "step": 6381, + "time_per_iteration": 2.4669339656829834 + }, + { + "auxiliary_loss_clip": 0.0107029, + "auxiliary_loss_mlp": 0.01022192, + "balance_loss_clip": 1.04166031, + "balance_loss_mlp": 1.02046335, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.8004382330937381, + "language_loss": 0.60493684, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62586164, + "num_input_tokens_seen": 136937475, + "step": 6382, + "time_per_iteration": 3.0134594440460205 + }, + { + "auxiliary_loss_clip": 0.01140452, + "auxiliary_loss_mlp": 0.01038259, + "balance_loss_clip": 1.05361366, + "balance_loss_mlp": 1.02222586, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 2.787420424666366, + "language_loss": 0.66308659, + "learning_rate": 2.824641672639794e-06, + "loss": 0.6848737, + "num_input_tokens_seen": 136955805, + "step": 6383, + "time_per_iteration": 2.508176803588867 + }, + { + "auxiliary_loss_clip": 0.01109227, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.05260015, + "balance_loss_mlp": 1.01909947, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 1.5937969464876502, + "language_loss": 0.75077373, + "learning_rate": 2.824286842339587e-06, + "loss": 0.77221787, + "num_input_tokens_seen": 136975240, + "step": 6384, + "time_per_iteration": 2.5628695487976074 + }, + { + "auxiliary_loss_clip": 0.01123573, + "auxiliary_loss_mlp": 0.01039478, + "balance_loss_clip": 1.05494213, + "balance_loss_mlp": 1.0242492, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.426948521801175, + "language_loss": 0.76242876, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78405929, + "num_input_tokens_seen": 136994985, + "step": 6385, + "time_per_iteration": 2.495702028274536 + }, + { + "auxiliary_loss_clip": 0.01058218, + "auxiliary_loss_mlp": 0.01008911, + "balance_loss_clip": 1.03894818, + "balance_loss_mlp": 1.00701606, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.915004095164571, + "language_loss": 0.67013276, + "learning_rate": 2.82357708798151e-06, + "loss": 0.690804, + "num_input_tokens_seen": 137046290, + "step": 6386, + "time_per_iteration": 2.9534082412719727 + }, + { + "auxiliary_loss_clip": 0.01099407, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.04811549, + "balance_loss_mlp": 1.0156374, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.6154065987419872, + "language_loss": 0.72289985, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74419928, + "num_input_tokens_seen": 137064725, + "step": 6387, + "time_per_iteration": 2.561077356338501 + }, + { + "auxiliary_loss_clip": 0.01139157, + "auxiliary_loss_mlp": 0.01041896, + "balance_loss_clip": 1.05661368, + "balance_loss_mlp": 1.02753162, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.5792510549224141, + "language_loss": 0.81229424, + "learning_rate": 2.822867208702932e-06, + "loss": 0.83410484, + "num_input_tokens_seen": 137086030, + "step": 6388, + "time_per_iteration": 2.5536086559295654 + }, + { + "auxiliary_loss_clip": 0.01104969, + "auxiliary_loss_mlp": 0.01037989, + "balance_loss_clip": 1.04884648, + "balance_loss_mlp": 1.02374315, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.7478248871832607, + "language_loss": 0.76284647, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78427613, + "num_input_tokens_seen": 137105400, + "step": 6389, + "time_per_iteration": 2.5162477493286133 + }, + { + "auxiliary_loss_clip": 0.01118723, + "auxiliary_loss_mlp": 0.01047119, + "balance_loss_clip": 1.0547297, + "balance_loss_mlp": 1.03064406, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.5600888063834684, + "language_loss": 0.76253897, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78419745, + "num_input_tokens_seen": 137124985, + "step": 6390, + "time_per_iteration": 4.071063756942749 + }, + { + "auxiliary_loss_clip": 0.01092614, + "auxiliary_loss_mlp": 0.01051322, + "balance_loss_clip": 1.04687953, + "balance_loss_mlp": 1.03446603, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.7926471016178191, + "language_loss": 0.69578373, + "learning_rate": 2.821802155794668e-06, + "loss": 0.71722305, + "num_input_tokens_seen": 137146745, + "step": 6391, + "time_per_iteration": 2.7096498012542725 + }, + { + "auxiliary_loss_clip": 0.01128393, + "auxiliary_loss_mlp": 0.01040327, + "balance_loss_clip": 1.05320013, + "balance_loss_mlp": 1.02453136, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.7233042937804441, + "language_loss": 0.84016436, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86185157, + "num_input_tokens_seen": 137163195, + "step": 6392, + "time_per_iteration": 2.493116855621338 + }, + { + "auxiliary_loss_clip": 0.01122728, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.04877961, + "balance_loss_mlp": 1.01963902, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 1.9924384290664778, + "language_loss": 0.60666299, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.62823141, + "num_input_tokens_seen": 137179330, + "step": 6393, + "time_per_iteration": 2.4518015384674072 + }, + { + "auxiliary_loss_clip": 0.0111507, + "auxiliary_loss_mlp": 0.01035557, + "balance_loss_clip": 1.05454481, + "balance_loss_mlp": 1.01891518, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 1.7884913556588529, + "language_loss": 0.71092373, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73242998, + "num_input_tokens_seen": 137198655, + "step": 6394, + "time_per_iteration": 2.620579481124878 + }, + { + "auxiliary_loss_clip": 0.01125236, + "auxiliary_loss_mlp": 0.01033246, + "balance_loss_clip": 1.04957891, + "balance_loss_mlp": 1.01644945, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.277838433980936, + "language_loss": 0.81543279, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83701766, + "num_input_tokens_seen": 137217120, + "step": 6395, + "time_per_iteration": 2.4834365844726562 + }, + { + "auxiliary_loss_clip": 0.01131271, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.05666971, + "balance_loss_mlp": 1.02644956, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 1.9180864613353625, + "language_loss": 0.70945704, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.73118436, + "num_input_tokens_seen": 137234410, + "step": 6396, + "time_per_iteration": 2.445831775665283 + }, + { + "auxiliary_loss_clip": 0.01046856, + "auxiliary_loss_mlp": 0.01006722, + "balance_loss_clip": 1.02924061, + "balance_loss_mlp": 1.00496948, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8969460269074833, + "language_loss": 0.59728289, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61781859, + "num_input_tokens_seen": 137294940, + "step": 6397, + "time_per_iteration": 3.1112704277038574 + }, + { + "auxiliary_loss_clip": 0.0113836, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.05387163, + "balance_loss_mlp": 1.01386404, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 2.1152712757998144, + "language_loss": 0.84927243, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87095118, + "num_input_tokens_seen": 137315035, + "step": 6398, + "time_per_iteration": 2.5007505416870117 + }, + { + "auxiliary_loss_clip": 0.01137794, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.05346394, + "balance_loss_mlp": 1.01925254, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 2.519664766855179, + "language_loss": 0.79929811, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.82101667, + "num_input_tokens_seen": 137333155, + "step": 6399, + "time_per_iteration": 2.4348888397216797 + }, + { + "auxiliary_loss_clip": 0.01139174, + "auxiliary_loss_mlp": 0.0078292, + "balance_loss_clip": 1.05276358, + "balance_loss_mlp": 1.00041437, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.8344173250846918, + "language_loss": 0.67264473, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69186568, + "num_input_tokens_seen": 137351515, + "step": 6400, + "time_per_iteration": 2.451650381088257 + }, + { + "auxiliary_loss_clip": 0.01124145, + "auxiliary_loss_mlp": 0.01048164, + "balance_loss_clip": 1.05662513, + "balance_loss_mlp": 1.03312039, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 2.096596506839758, + "language_loss": 0.73201996, + "learning_rate": 2.81824995589303e-06, + "loss": 0.75374305, + "num_input_tokens_seen": 137371255, + "step": 6401, + "time_per_iteration": 2.554241895675659 + }, + { + "auxiliary_loss_clip": 0.01104957, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.05213153, + "balance_loss_mlp": 1.02443218, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 2.029187196015271, + "language_loss": 0.72322935, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74467868, + "num_input_tokens_seen": 137388980, + "step": 6402, + "time_per_iteration": 2.528658628463745 + }, + { + "auxiliary_loss_clip": 0.01136149, + "auxiliary_loss_mlp": 0.01039039, + "balance_loss_clip": 1.05407238, + "balance_loss_mlp": 1.02413213, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.9854108196675047, + "language_loss": 0.83172357, + "learning_rate": 2.817539143144128e-06, + "loss": 0.85347545, + "num_input_tokens_seen": 137406885, + "step": 6403, + "time_per_iteration": 2.436976194381714 + }, + { + "auxiliary_loss_clip": 0.01083324, + "auxiliary_loss_mlp": 0.01038794, + "balance_loss_clip": 1.04528332, + "balance_loss_mlp": 1.02258158, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 3.1914739030035673, + "language_loss": 0.83531034, + "learning_rate": 2.817183690261189e-06, + "loss": 0.8565315, + "num_input_tokens_seen": 137425535, + "step": 6404, + "time_per_iteration": 2.585796356201172 + }, + { + "auxiliary_loss_clip": 0.01110381, + "auxiliary_loss_mlp": 0.01039909, + "balance_loss_clip": 1.04900169, + "balance_loss_mlp": 1.02498412, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.6467126316912943, + "language_loss": 0.69862282, + "learning_rate": 2.816828206390563e-06, + "loss": 0.72012568, + "num_input_tokens_seen": 137447700, + "step": 6405, + "time_per_iteration": 4.051243305206299 + }, + { + "auxiliary_loss_clip": 0.01107167, + "auxiliary_loss_mlp": 0.01042412, + "balance_loss_clip": 1.04760456, + "balance_loss_mlp": 1.02818465, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 1.911630163161452, + "language_loss": 0.7940104, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81550622, + "num_input_tokens_seen": 137462245, + "step": 6406, + "time_per_iteration": 2.498032331466675 + }, + { + "auxiliary_loss_clip": 0.01132912, + "auxiliary_loss_mlp": 0.0104241, + "balance_loss_clip": 1.05724311, + "balance_loss_mlp": 1.02636456, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.1870004341570426, + "language_loss": 0.84612954, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86788273, + "num_input_tokens_seen": 137476455, + "step": 6407, + "time_per_iteration": 2.450191020965576 + }, + { + "auxiliary_loss_clip": 0.01054108, + "auxiliary_loss_mlp": 0.01018005, + "balance_loss_clip": 1.035743, + "balance_loss_mlp": 1.01650286, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.8457820807053469, + "language_loss": 0.64902383, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66974497, + "num_input_tokens_seen": 137539845, + "step": 6408, + "time_per_iteration": 3.117112159729004 + }, + { + "auxiliary_loss_clip": 0.01108268, + "auxiliary_loss_mlp": 0.0105147, + "balance_loss_clip": 1.05202305, + "balance_loss_mlp": 1.03516197, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.4617297296645557, + "language_loss": 0.73460376, + "learning_rate": 2.8154059613008e-06, + "loss": 0.75620109, + "num_input_tokens_seen": 137559880, + "step": 6409, + "time_per_iteration": 4.091933488845825 + }, + { + "auxiliary_loss_clip": 0.01095287, + "auxiliary_loss_mlp": 0.0105761, + "balance_loss_clip": 1.04935837, + "balance_loss_mlp": 1.04019308, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 2.0691080653137717, + "language_loss": 0.70492423, + "learning_rate": 2.81505032269396e-06, + "loss": 0.7264533, + "num_input_tokens_seen": 137578225, + "step": 6410, + "time_per_iteration": 2.5804107189178467 + }, + { + "auxiliary_loss_clip": 0.0102871, + "auxiliary_loss_mlp": 0.00755688, + "balance_loss_clip": 1.03822923, + "balance_loss_mlp": 1.00013113, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.6894703494019949, + "language_loss": 0.60264099, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62048495, + "num_input_tokens_seen": 137645770, + "step": 6411, + "time_per_iteration": 3.259977102279663 + }, + { + "auxiliary_loss_clip": 0.01099769, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.04980779, + "balance_loss_mlp": 1.01771843, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 1.9849511972865892, + "language_loss": 0.77647007, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79778874, + "num_input_tokens_seen": 137664090, + "step": 6412, + "time_per_iteration": 2.595421075820923 + }, + { + "auxiliary_loss_clip": 0.01102243, + "auxiliary_loss_mlp": 0.01038388, + "balance_loss_clip": 1.04705131, + "balance_loss_mlp": 1.02079248, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.7168710864252454, + "language_loss": 0.78208488, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80349123, + "num_input_tokens_seen": 137683190, + "step": 6413, + "time_per_iteration": 2.5759334564208984 + }, + { + "auxiliary_loss_clip": 0.01060282, + "auxiliary_loss_mlp": 0.01004603, + "balance_loss_clip": 1.03172255, + "balance_loss_mlp": 1.00298214, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8040286358082441, + "language_loss": 0.61357683, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63422567, + "num_input_tokens_seen": 137737315, + "step": 6414, + "time_per_iteration": 2.8835794925689697 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01041405, + "balance_loss_clip": 1.052351, + "balance_loss_mlp": 1.02566946, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.061053096716713, + "language_loss": 0.77304333, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79449719, + "num_input_tokens_seen": 137753535, + "step": 6415, + "time_per_iteration": 3.94734787940979 + }, + { + "auxiliary_loss_clip": 0.0111356, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.05390739, + "balance_loss_mlp": 1.02258801, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.6619245268716267, + "language_loss": 0.79730904, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.81881189, + "num_input_tokens_seen": 137773405, + "step": 6416, + "time_per_iteration": 2.5650572776794434 + }, + { + "auxiliary_loss_clip": 0.01123242, + "auxiliary_loss_mlp": 0.00780488, + "balance_loss_clip": 1.05072522, + "balance_loss_mlp": 1.00042009, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.7536821658605668, + "language_loss": 0.78953218, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.80856943, + "num_input_tokens_seen": 137790810, + "step": 6417, + "time_per_iteration": 2.480074644088745 + }, + { + "auxiliary_loss_clip": 0.01114529, + "auxiliary_loss_mlp": 0.01039763, + "balance_loss_clip": 1.05597556, + "balance_loss_mlp": 1.02566099, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 2.29822902646084, + "language_loss": 0.7983076, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.8198505, + "num_input_tokens_seen": 137810265, + "step": 6418, + "time_per_iteration": 2.506502628326416 + }, + { + "auxiliary_loss_clip": 0.01106029, + "auxiliary_loss_mlp": 0.0103817, + "balance_loss_clip": 1.04679322, + "balance_loss_mlp": 1.0237931, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 3.1525134003096458, + "language_loss": 0.79850823, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81995022, + "num_input_tokens_seen": 137828580, + "step": 6419, + "time_per_iteration": 2.5153889656066895 + }, + { + "auxiliary_loss_clip": 0.01113528, + "auxiliary_loss_mlp": 0.01040667, + "balance_loss_clip": 1.05182505, + "balance_loss_mlp": 1.02388275, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 2.2085929985843253, + "language_loss": 0.67141789, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.6929599, + "num_input_tokens_seen": 137846145, + "step": 6420, + "time_per_iteration": 2.5637989044189453 + }, + { + "auxiliary_loss_clip": 0.01091051, + "auxiliary_loss_mlp": 0.01050441, + "balance_loss_clip": 1.04717255, + "balance_loss_mlp": 1.03461003, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 1.969465693155772, + "language_loss": 0.81618375, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83759868, + "num_input_tokens_seen": 137863705, + "step": 6421, + "time_per_iteration": 2.5404505729675293 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.05084658, + "balance_loss_mlp": 1.02190948, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 1.8629448045597463, + "language_loss": 0.71801984, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.73951405, + "num_input_tokens_seen": 137880285, + "step": 6422, + "time_per_iteration": 2.5764248371124268 + }, + { + "auxiliary_loss_clip": 0.01110527, + "auxiliary_loss_mlp": 0.01039729, + "balance_loss_clip": 1.0520277, + "balance_loss_mlp": 1.02573359, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.7573986789249338, + "language_loss": 0.66326809, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.68477064, + "num_input_tokens_seen": 137898335, + "step": 6423, + "time_per_iteration": 2.512903928756714 + }, + { + "auxiliary_loss_clip": 0.0112973, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.05418444, + "balance_loss_mlp": 1.02038682, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 2.5868525921551226, + "language_loss": 0.68626159, + "learning_rate": 2.810068143123449e-06, + "loss": 0.70790935, + "num_input_tokens_seen": 137918605, + "step": 6424, + "time_per_iteration": 2.6614902019500732 + }, + { + "auxiliary_loss_clip": 0.01105895, + "auxiliary_loss_mlp": 0.01036357, + "balance_loss_clip": 1.05424905, + "balance_loss_mlp": 1.02170038, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.411642014636323, + "language_loss": 0.72431278, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74573529, + "num_input_tokens_seen": 137938245, + "step": 6425, + "time_per_iteration": 2.626013994216919 + }, + { + "auxiliary_loss_clip": 0.01106009, + "auxiliary_loss_mlp": 0.00782859, + "balance_loss_clip": 1.04901624, + "balance_loss_mlp": 1.00035071, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.503071297890607, + "language_loss": 0.8057512, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82463986, + "num_input_tokens_seen": 137956770, + "step": 6426, + "time_per_iteration": 2.6535730361938477 + }, + { + "auxiliary_loss_clip": 0.01129567, + "auxiliary_loss_mlp": 0.01038085, + "balance_loss_clip": 1.05375779, + "balance_loss_mlp": 1.02285647, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 1.8631608650093527, + "language_loss": 0.74751925, + "learning_rate": 2.80899974864781e-06, + "loss": 0.7691958, + "num_input_tokens_seen": 137977040, + "step": 6427, + "time_per_iteration": 2.5354650020599365 + }, + { + "auxiliary_loss_clip": 0.01085306, + "auxiliary_loss_mlp": 0.01048508, + "balance_loss_clip": 1.04849839, + "balance_loss_mlp": 1.03103137, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 2.364180551001927, + "language_loss": 0.70372045, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.72505856, + "num_input_tokens_seen": 137993545, + "step": 6428, + "time_per_iteration": 2.5599570274353027 + }, + { + "auxiliary_loss_clip": 0.01121952, + "auxiliary_loss_mlp": 0.01044094, + "balance_loss_clip": 1.057181, + "balance_loss_mlp": 1.02957392, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.0315453289901084, + "language_loss": 0.84551179, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.8671723, + "num_input_tokens_seen": 138010140, + "step": 6429, + "time_per_iteration": 4.106291770935059 + }, + { + "auxiliary_loss_clip": 0.01118088, + "auxiliary_loss_mlp": 0.0104031, + "balance_loss_clip": 1.05194294, + "balance_loss_mlp": 1.02530181, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.676780715594038, + "language_loss": 0.80787992, + "learning_rate": 2.807931078076015e-06, + "loss": 0.8294639, + "num_input_tokens_seen": 138028880, + "step": 6430, + "time_per_iteration": 2.5642449855804443 + }, + { + "auxiliary_loss_clip": 0.01042628, + "auxiliary_loss_mlp": 0.01003641, + "balance_loss_clip": 1.04225886, + "balance_loss_mlp": 1.00210321, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7179673755354554, + "language_loss": 0.58800173, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60846442, + "num_input_tokens_seen": 138098090, + "step": 6431, + "time_per_iteration": 3.181104898452759 + }, + { + "auxiliary_loss_clip": 0.0109797, + "auxiliary_loss_mlp": 0.01041932, + "balance_loss_clip": 1.05584097, + "balance_loss_mlp": 1.02594638, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.868041987058605, + "language_loss": 0.79845917, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81985825, + "num_input_tokens_seen": 138114735, + "step": 6432, + "time_per_iteration": 2.628044366836548 + }, + { + "auxiliary_loss_clip": 0.01129199, + "auxiliary_loss_mlp": 0.01048222, + "balance_loss_clip": 1.05118823, + "balance_loss_mlp": 1.03141916, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 1.8921572055393259, + "language_loss": 0.80326688, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82504112, + "num_input_tokens_seen": 138130480, + "step": 6433, + "time_per_iteration": 2.5257086753845215 + }, + { + "auxiliary_loss_clip": 0.01114407, + "auxiliary_loss_mlp": 0.01039561, + "balance_loss_clip": 1.05321133, + "balance_loss_mlp": 1.02324128, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.7485607678817583, + "language_loss": 0.70829159, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72983134, + "num_input_tokens_seen": 138150640, + "step": 6434, + "time_per_iteration": 2.6021127700805664 + }, + { + "auxiliary_loss_clip": 0.0110038, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_clip": 1.0489558, + "balance_loss_mlp": 1.0300082, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 1.9024894382797952, + "language_loss": 0.77359003, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79506648, + "num_input_tokens_seen": 138169700, + "step": 6435, + "time_per_iteration": 2.533958673477173 + }, + { + "auxiliary_loss_clip": 0.01127536, + "auxiliary_loss_mlp": 0.01037297, + "balance_loss_clip": 1.05502868, + "balance_loss_mlp": 1.02249169, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.6215328284987527, + "language_loss": 0.79976922, + "learning_rate": 2.805792910102915e-06, + "loss": 0.82141757, + "num_input_tokens_seen": 138185835, + "step": 6436, + "time_per_iteration": 2.600409746170044 + }, + { + "auxiliary_loss_clip": 0.01109616, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.05289102, + "balance_loss_mlp": 1.01974392, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.7133709700109798, + "language_loss": 0.76731962, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.78875625, + "num_input_tokens_seen": 138204080, + "step": 6437, + "time_per_iteration": 2.572187662124634 + }, + { + "auxiliary_loss_clip": 0.0112373, + "auxiliary_loss_mlp": 0.01040653, + "balance_loss_clip": 1.06072211, + "balance_loss_mlp": 1.02728391, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.1349306561065, + "language_loss": 0.81890053, + "learning_rate": 2.805079942855074e-06, + "loss": 0.8405444, + "num_input_tokens_seen": 138220710, + "step": 6438, + "time_per_iteration": 2.551301956176758 + }, + { + "auxiliary_loss_clip": 0.01120534, + "auxiliary_loss_mlp": 0.00781952, + "balance_loss_clip": 1.05328953, + "balance_loss_mlp": 1.00039935, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.3897495225903724, + "language_loss": 0.75376046, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77278531, + "num_input_tokens_seen": 138241720, + "step": 6439, + "time_per_iteration": 2.592738628387451 + }, + { + "auxiliary_loss_clip": 0.01139444, + "auxiliary_loss_mlp": 0.01034525, + "balance_loss_clip": 1.05758786, + "balance_loss_mlp": 1.02030885, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 5.934017173931028, + "language_loss": 0.73632598, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.75806564, + "num_input_tokens_seen": 138261885, + "step": 6440, + "time_per_iteration": 2.550295114517212 + }, + { + "auxiliary_loss_clip": 0.01132482, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.05677474, + "balance_loss_mlp": 1.0262568, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 2.1830842246793134, + "language_loss": 0.82238257, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84412724, + "num_input_tokens_seen": 138280255, + "step": 6441, + "time_per_iteration": 2.5211784839630127 + }, + { + "auxiliary_loss_clip": 0.01140453, + "auxiliary_loss_mlp": 0.01050521, + "balance_loss_clip": 1.05613935, + "balance_loss_mlp": 1.03616238, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.1686967750567825, + "language_loss": 0.81323874, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83514845, + "num_input_tokens_seen": 138296675, + "step": 6442, + "time_per_iteration": 2.549300193786621 + }, + { + "auxiliary_loss_clip": 0.0109505, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.04847503, + "balance_loss_mlp": 1.02567196, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.6487930428646447, + "language_loss": 0.83973622, + "learning_rate": 2.803296990719624e-06, + "loss": 0.86110193, + "num_input_tokens_seen": 138314985, + "step": 6443, + "time_per_iteration": 2.607146978378296 + }, + { + "auxiliary_loss_clip": 0.01038541, + "auxiliary_loss_mlp": 0.0100644, + "balance_loss_clip": 1.03081536, + "balance_loss_mlp": 1.00496197, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.8256328075114336, + "language_loss": 0.50227249, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.5227223, + "num_input_tokens_seen": 138373275, + "step": 6444, + "time_per_iteration": 4.554289102554321 + }, + { + "auxiliary_loss_clip": 0.01093543, + "auxiliary_loss_mlp": 0.0078307, + "balance_loss_clip": 1.04703176, + "balance_loss_mlp": 1.00035739, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.3782942672494096, + "language_loss": 0.78775972, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80652583, + "num_input_tokens_seen": 138391145, + "step": 6445, + "time_per_iteration": 2.57425856590271 + }, + { + "auxiliary_loss_clip": 0.01120441, + "auxiliary_loss_mlp": 0.01043996, + "balance_loss_clip": 1.05290794, + "balance_loss_mlp": 1.02894032, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.7396255814660473, + "language_loss": 0.80869544, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83033979, + "num_input_tokens_seen": 138409875, + "step": 6446, + "time_per_iteration": 2.56564998626709 + }, + { + "auxiliary_loss_clip": 0.01114086, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_clip": 1.05226326, + "balance_loss_mlp": 1.02790427, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.885134372125654, + "language_loss": 0.77819526, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79975784, + "num_input_tokens_seen": 138428965, + "step": 6447, + "time_per_iteration": 2.549274206161499 + }, + { + "auxiliary_loss_clip": 0.01113434, + "auxiliary_loss_mlp": 0.01039915, + "balance_loss_clip": 1.0507344, + "balance_loss_mlp": 1.0262419, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.5807334829400979, + "language_loss": 0.76305425, + "learning_rate": 2.801513277056671e-06, + "loss": 0.78458774, + "num_input_tokens_seen": 138448090, + "step": 6448, + "time_per_iteration": 4.091294527053833 + }, + { + "auxiliary_loss_clip": 0.01105192, + "auxiliary_loss_mlp": 0.01041867, + "balance_loss_clip": 1.0483439, + "balance_loss_mlp": 1.02711511, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.600851829006414, + "language_loss": 0.7612555, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.78272617, + "num_input_tokens_seen": 138466105, + "step": 6449, + "time_per_iteration": 2.525254011154175 + }, + { + "auxiliary_loss_clip": 0.01106826, + "auxiliary_loss_mlp": 0.00781494, + "balance_loss_clip": 1.05118179, + "balance_loss_mlp": 1.00043046, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.6583720292131146, + "language_loss": 0.78529525, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80417848, + "num_input_tokens_seen": 138485160, + "step": 6450, + "time_per_iteration": 2.5954227447509766 + }, + { + "auxiliary_loss_clip": 0.0114072, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.05285168, + "balance_loss_mlp": 1.02812231, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.367614140461401, + "language_loss": 0.77372855, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79557073, + "num_input_tokens_seen": 138504135, + "step": 6451, + "time_per_iteration": 2.6452584266662598 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.05356872, + "balance_loss_mlp": 1.01838124, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.6468295290363038, + "language_loss": 0.76404059, + "learning_rate": 2.800085758962812e-06, + "loss": 0.78570223, + "num_input_tokens_seen": 138523955, + "step": 6452, + "time_per_iteration": 2.4702606201171875 + }, + { + "auxiliary_loss_clip": 0.01106263, + "auxiliary_loss_mlp": 0.01049216, + "balance_loss_clip": 1.04932165, + "balance_loss_mlp": 1.03461885, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.5025618094129942, + "language_loss": 0.7952947, + "learning_rate": 2.799728803557182e-06, + "loss": 0.81684947, + "num_input_tokens_seen": 138541655, + "step": 6453, + "time_per_iteration": 2.502699851989746 + }, + { + "auxiliary_loss_clip": 0.01134236, + "auxiliary_loss_mlp": 0.01042265, + "balance_loss_clip": 1.05498803, + "balance_loss_mlp": 1.02690494, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 1.92722977745732, + "language_loss": 0.71710193, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73886693, + "num_input_tokens_seen": 138560860, + "step": 6454, + "time_per_iteration": 3.9055261611938477 + }, + { + "auxiliary_loss_clip": 0.01140806, + "auxiliary_loss_mlp": 0.01045622, + "balance_loss_clip": 1.05412269, + "balance_loss_mlp": 1.02949357, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 1.9164206065627434, + "language_loss": 0.77583277, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.79769707, + "num_input_tokens_seen": 138580200, + "step": 6455, + "time_per_iteration": 2.4618237018585205 + }, + { + "auxiliary_loss_clip": 0.01135802, + "auxiliary_loss_mlp": 0.01041106, + "balance_loss_clip": 1.05379725, + "balance_loss_mlp": 1.02568054, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.5190670214443667, + "language_loss": 0.75853848, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78030753, + "num_input_tokens_seen": 138598315, + "step": 6456, + "time_per_iteration": 2.4690232276916504 + }, + { + "auxiliary_loss_clip": 0.0106675, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.04950404, + "balance_loss_mlp": 1.0228467, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.421692107249052, + "language_loss": 0.60350531, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.62455642, + "num_input_tokens_seen": 138615695, + "step": 6457, + "time_per_iteration": 2.6179096698760986 + }, + { + "auxiliary_loss_clip": 0.01138785, + "auxiliary_loss_mlp": 0.01040885, + "balance_loss_clip": 1.05267096, + "balance_loss_mlp": 1.02387357, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 1.9914232306303032, + "language_loss": 0.79878533, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82058203, + "num_input_tokens_seen": 138633180, + "step": 6458, + "time_per_iteration": 2.4611172676086426 + }, + { + "auxiliary_loss_clip": 0.0109541, + "auxiliary_loss_mlp": 0.01041625, + "balance_loss_clip": 1.0564003, + "balance_loss_mlp": 1.0246619, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.8719933802253539, + "language_loss": 0.81520438, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83657479, + "num_input_tokens_seen": 138654785, + "step": 6459, + "time_per_iteration": 2.760714054107666 + }, + { + "auxiliary_loss_clip": 0.0111366, + "auxiliary_loss_mlp": 0.01036654, + "balance_loss_clip": 1.05354345, + "balance_loss_mlp": 1.02229559, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 1.7425532070545693, + "language_loss": 0.61719978, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63870293, + "num_input_tokens_seen": 138673330, + "step": 6460, + "time_per_iteration": 2.542933225631714 + }, + { + "auxiliary_loss_clip": 0.01124402, + "auxiliary_loss_mlp": 0.01037781, + "balance_loss_clip": 1.05248237, + "balance_loss_mlp": 1.0237205, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.6070429279743625, + "language_loss": 0.86187673, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88349855, + "num_input_tokens_seen": 138694185, + "step": 6461, + "time_per_iteration": 2.5143682956695557 + }, + { + "auxiliary_loss_clip": 0.01120896, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.04977512, + "balance_loss_mlp": 1.02313709, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 2.3772618121965214, + "language_loss": 0.71245039, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73404127, + "num_input_tokens_seen": 138714625, + "step": 6462, + "time_per_iteration": 2.55063533782959 + }, + { + "auxiliary_loss_clip": 0.01097755, + "auxiliary_loss_mlp": 0.01038429, + "balance_loss_clip": 1.0486989, + "balance_loss_mlp": 1.02203798, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 1.9631328670580925, + "language_loss": 0.76139438, + "learning_rate": 2.796157583816052e-06, + "loss": 0.78275621, + "num_input_tokens_seen": 138733585, + "step": 6463, + "time_per_iteration": 2.5869412422180176 + }, + { + "auxiliary_loss_clip": 0.01108798, + "auxiliary_loss_mlp": 0.01048596, + "balance_loss_clip": 1.05433822, + "balance_loss_mlp": 1.0315733, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 2.035616722021065, + "language_loss": 0.70290577, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72447973, + "num_input_tokens_seen": 138752335, + "step": 6464, + "time_per_iteration": 2.531560182571411 + }, + { + "auxiliary_loss_clip": 0.0111187, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.05050755, + "balance_loss_mlp": 1.02011502, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 1.9879414636110408, + "language_loss": 0.69344163, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.71490765, + "num_input_tokens_seen": 138768450, + "step": 6465, + "time_per_iteration": 2.548712730407715 + }, + { + "auxiliary_loss_clip": 0.01109177, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.05900884, + "balance_loss_mlp": 1.02955294, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.0372352453655966, + "language_loss": 0.77990234, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80145299, + "num_input_tokens_seen": 138786775, + "step": 6466, + "time_per_iteration": 2.5577075481414795 + }, + { + "auxiliary_loss_clip": 0.01105678, + "auxiliary_loss_mlp": 0.01038848, + "balance_loss_clip": 1.05521452, + "balance_loss_mlp": 1.02298117, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.6696052929186336, + "language_loss": 0.69167703, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71312231, + "num_input_tokens_seen": 138810100, + "step": 6467, + "time_per_iteration": 2.631019353866577 + }, + { + "auxiliary_loss_clip": 0.01107416, + "auxiliary_loss_mlp": 0.01041883, + "balance_loss_clip": 1.05629134, + "balance_loss_mlp": 1.02606392, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.3028955705589227, + "language_loss": 0.83373022, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85522318, + "num_input_tokens_seen": 138825140, + "step": 6468, + "time_per_iteration": 4.14092230796814 + }, + { + "auxiliary_loss_clip": 0.01114825, + "auxiliary_loss_mlp": 0.01038812, + "balance_loss_clip": 1.05312765, + "balance_loss_mlp": 1.02477467, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 1.7753520791543018, + "language_loss": 0.84521353, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86674988, + "num_input_tokens_seen": 138844115, + "step": 6469, + "time_per_iteration": 2.529252767562866 + }, + { + "auxiliary_loss_clip": 0.01100756, + "auxiliary_loss_mlp": 0.01045563, + "balance_loss_clip": 1.05215549, + "balance_loss_mlp": 1.02870631, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.726925257285282, + "language_loss": 0.74894309, + "learning_rate": 2.793655932864273e-06, + "loss": 0.77040625, + "num_input_tokens_seen": 138860860, + "step": 6470, + "time_per_iteration": 2.5785021781921387 + }, + { + "auxiliary_loss_clip": 0.01103773, + "auxiliary_loss_mlp": 0.00781075, + "balance_loss_clip": 1.05552709, + "balance_loss_mlp": 1.00029874, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 4.214227390794968, + "language_loss": 0.74937683, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.76822531, + "num_input_tokens_seen": 138881910, + "step": 6471, + "time_per_iteration": 2.618664503097534 + }, + { + "auxiliary_loss_clip": 0.01088358, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_clip": 1.04638946, + "balance_loss_mlp": 1.0286845, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 2.0881707829933887, + "language_loss": 0.6821146, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70344412, + "num_input_tokens_seen": 138900975, + "step": 6472, + "time_per_iteration": 2.595341205596924 + }, + { + "auxiliary_loss_clip": 0.01108651, + "auxiliary_loss_mlp": 0.01042382, + "balance_loss_clip": 1.05239964, + "balance_loss_mlp": 1.02810693, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 3.259142833680912, + "language_loss": 0.76399755, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.78550792, + "num_input_tokens_seen": 138920795, + "step": 6473, + "time_per_iteration": 2.6023008823394775 + }, + { + "auxiliary_loss_clip": 0.01115878, + "auxiliary_loss_mlp": 0.01046202, + "balance_loss_clip": 1.05290246, + "balance_loss_mlp": 1.03068089, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 3.1960245115036963, + "language_loss": 0.70878744, + "learning_rate": 2.792225755635257e-06, + "loss": 0.73040831, + "num_input_tokens_seen": 138938770, + "step": 6474, + "time_per_iteration": 2.4931039810180664 + }, + { + "auxiliary_loss_clip": 0.01140445, + "auxiliary_loss_mlp": 0.0104398, + "balance_loss_clip": 1.05546832, + "balance_loss_mlp": 1.02992487, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.664907208521305, + "language_loss": 0.68665069, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.70849496, + "num_input_tokens_seen": 138958880, + "step": 6475, + "time_per_iteration": 2.476379871368408 + }, + { + "auxiliary_loss_clip": 0.01116054, + "auxiliary_loss_mlp": 0.01057024, + "balance_loss_clip": 1.05018878, + "balance_loss_mlp": 1.0395236, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 1.838866082114976, + "language_loss": 0.75875616, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78048694, + "num_input_tokens_seen": 138977240, + "step": 6476, + "time_per_iteration": 2.513153314590454 + }, + { + "auxiliary_loss_clip": 0.01040593, + "auxiliary_loss_mlp": 0.01005801, + "balance_loss_clip": 1.02914476, + "balance_loss_mlp": 1.00421512, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.792634032930704, + "language_loss": 0.58271706, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.603181, + "num_input_tokens_seen": 139039035, + "step": 6477, + "time_per_iteration": 3.104630470275879 + }, + { + "auxiliary_loss_clip": 0.01101382, + "auxiliary_loss_mlp": 0.01034712, + "balance_loss_clip": 1.05708194, + "balance_loss_mlp": 1.01863027, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 1.9155789106470094, + "language_loss": 0.78287524, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80423617, + "num_input_tokens_seen": 139055560, + "step": 6478, + "time_per_iteration": 2.5639216899871826 + }, + { + "auxiliary_loss_clip": 0.01120645, + "auxiliary_loss_mlp": 0.01039406, + "balance_loss_clip": 1.04847646, + "balance_loss_mlp": 1.02478468, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 2.7329924339659986, + "language_loss": 0.82455528, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.84615576, + "num_input_tokens_seen": 139071865, + "step": 6479, + "time_per_iteration": 2.476398229598999 + }, + { + "auxiliary_loss_clip": 0.01137805, + "auxiliary_loss_mlp": 0.01037129, + "balance_loss_clip": 1.05492651, + "balance_loss_mlp": 1.02179873, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.6152459053667694, + "language_loss": 0.80182654, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82357591, + "num_input_tokens_seen": 139089640, + "step": 6480, + "time_per_iteration": 2.447568893432617 + }, + { + "auxiliary_loss_clip": 0.01115278, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.05435085, + "balance_loss_mlp": 1.02111816, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.846711800205254, + "language_loss": 0.83541083, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85692275, + "num_input_tokens_seen": 139109365, + "step": 6481, + "time_per_iteration": 2.5538840293884277 + }, + { + "auxiliary_loss_clip": 0.01109702, + "auxiliary_loss_mlp": 0.01043878, + "balance_loss_clip": 1.05272472, + "balance_loss_mlp": 1.02941847, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.8103538037979863, + "language_loss": 0.75200355, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77353936, + "num_input_tokens_seen": 139128260, + "step": 6482, + "time_per_iteration": 2.521601676940918 + }, + { + "auxiliary_loss_clip": 0.01110833, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.05736268, + "balance_loss_mlp": 1.02377033, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 2.1549307360040384, + "language_loss": 0.78413862, + "learning_rate": 2.78900610077756e-06, + "loss": 0.80562425, + "num_input_tokens_seen": 139147315, + "step": 6483, + "time_per_iteration": 4.110833168029785 + }, + { + "auxiliary_loss_clip": 0.01123523, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.05022991, + "balance_loss_mlp": 1.01504731, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.5552564122208712, + "language_loss": 0.80426037, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82580692, + "num_input_tokens_seen": 139167270, + "step": 6484, + "time_per_iteration": 2.541167736053467 + }, + { + "auxiliary_loss_clip": 0.01123246, + "auxiliary_loss_mlp": 0.01057524, + "balance_loss_clip": 1.05400741, + "balance_loss_mlp": 1.04023886, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.5869116457304344, + "language_loss": 0.7794838, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80129147, + "num_input_tokens_seen": 139185970, + "step": 6485, + "time_per_iteration": 2.5067193508148193 + }, + { + "auxiliary_loss_clip": 0.01101376, + "auxiliary_loss_mlp": 0.01043519, + "balance_loss_clip": 1.05369031, + "balance_loss_mlp": 1.02769399, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 2.247660377387827, + "language_loss": 0.85000777, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87145668, + "num_input_tokens_seen": 139203730, + "step": 6486, + "time_per_iteration": 2.6449711322784424 + }, + { + "auxiliary_loss_clip": 0.01120161, + "auxiliary_loss_mlp": 0.01039202, + "balance_loss_clip": 1.05462611, + "balance_loss_mlp": 1.0243839, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 1.7721122150968276, + "language_loss": 0.85527539, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87686908, + "num_input_tokens_seen": 139222560, + "step": 6487, + "time_per_iteration": 4.132507085800171 + }, + { + "auxiliary_loss_clip": 0.01108401, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.04884899, + "balance_loss_mlp": 1.02388561, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.5944794322056242, + "language_loss": 0.72836757, + "learning_rate": 2.787216355829633e-06, + "loss": 0.74984807, + "num_input_tokens_seen": 139242165, + "step": 6488, + "time_per_iteration": 2.520197629928589 + }, + { + "auxiliary_loss_clip": 0.01100471, + "auxiliary_loss_mlp": 0.01049877, + "balance_loss_clip": 1.049016, + "balance_loss_mlp": 1.03297925, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 1.8098311606309596, + "language_loss": 0.68659472, + "learning_rate": 2.786858317231779e-06, + "loss": 0.70809817, + "num_input_tokens_seen": 139262525, + "step": 6489, + "time_per_iteration": 2.5799269676208496 + }, + { + "auxiliary_loss_clip": 0.01112136, + "auxiliary_loss_mlp": 0.01041845, + "balance_loss_clip": 1.05209935, + "balance_loss_mlp": 1.02749252, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.8689281504677135, + "language_loss": 0.80642557, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.82796538, + "num_input_tokens_seen": 139282835, + "step": 6490, + "time_per_iteration": 2.5650408267974854 + }, + { + "auxiliary_loss_clip": 0.01129071, + "auxiliary_loss_mlp": 0.01043803, + "balance_loss_clip": 1.05302286, + "balance_loss_mlp": 1.02905655, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 7.126757245375469, + "language_loss": 0.89458215, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91631091, + "num_input_tokens_seen": 139299490, + "step": 6491, + "time_per_iteration": 2.4644079208374023 + }, + { + "auxiliary_loss_clip": 0.01098775, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_clip": 1.04997873, + "balance_loss_mlp": 1.03259921, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 2.149021042939078, + "language_loss": 0.78666639, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80814028, + "num_input_tokens_seen": 139317865, + "step": 6492, + "time_per_iteration": 3.9474825859069824 + }, + { + "auxiliary_loss_clip": 0.01115923, + "auxiliary_loss_mlp": 0.01048265, + "balance_loss_clip": 1.04912066, + "balance_loss_mlp": 1.03305364, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.768841398628134, + "language_loss": 0.74388903, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76553094, + "num_input_tokens_seen": 139339840, + "step": 6493, + "time_per_iteration": 2.5632822513580322 + }, + { + "auxiliary_loss_clip": 0.01098783, + "auxiliary_loss_mlp": 0.01041936, + "balance_loss_clip": 1.05205441, + "balance_loss_mlp": 1.02641487, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 1.93412131220042, + "language_loss": 0.75895047, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78035772, + "num_input_tokens_seen": 139357555, + "step": 6494, + "time_per_iteration": 2.545347213745117 + }, + { + "auxiliary_loss_clip": 0.01136413, + "auxiliary_loss_mlp": 0.01051021, + "balance_loss_clip": 1.05575025, + "balance_loss_mlp": 1.03424871, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 1.9276652884036773, + "language_loss": 0.74375236, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76562667, + "num_input_tokens_seen": 139374455, + "step": 6495, + "time_per_iteration": 2.448469877243042 + }, + { + "auxiliary_loss_clip": 0.01140368, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_clip": 1.0549382, + "balance_loss_mlp": 1.0329361, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.7121415737141599, + "language_loss": 0.6813221, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70322263, + "num_input_tokens_seen": 139394770, + "step": 6496, + "time_per_iteration": 2.517704963684082 + }, + { + "auxiliary_loss_clip": 0.01031113, + "auxiliary_loss_mlp": 0.01011857, + "balance_loss_clip": 1.03159273, + "balance_loss_mlp": 1.01031899, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6594639655231536, + "language_loss": 0.53945452, + "learning_rate": 2.783992935430775e-06, + "loss": 0.55988425, + "num_input_tokens_seen": 139454760, + "step": 6497, + "time_per_iteration": 3.208434820175171 + }, + { + "auxiliary_loss_clip": 0.01097716, + "auxiliary_loss_mlp": 0.00780889, + "balance_loss_clip": 1.04909003, + "balance_loss_mlp": 1.00040436, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 1.899016782830438, + "language_loss": 0.69353831, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71232438, + "num_input_tokens_seen": 139472645, + "step": 6498, + "time_per_iteration": 2.54939603805542 + }, + { + "auxiliary_loss_clip": 0.01026675, + "auxiliary_loss_mlp": 0.01013012, + "balance_loss_clip": 1.02576888, + "balance_loss_mlp": 1.01147425, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 0.7328872249575259, + "language_loss": 0.51784551, + "learning_rate": 2.783276292417936e-06, + "loss": 0.53824246, + "num_input_tokens_seen": 139536730, + "step": 6499, + "time_per_iteration": 3.161355495452881 + }, + { + "auxiliary_loss_clip": 0.01129759, + "auxiliary_loss_mlp": 0.01046682, + "balance_loss_clip": 1.05390859, + "balance_loss_mlp": 1.03017163, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.8040432597478095, + "language_loss": 0.73826575, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.76003015, + "num_input_tokens_seen": 139557540, + "step": 6500, + "time_per_iteration": 2.543031692504883 + }, + { + "auxiliary_loss_clip": 0.01132785, + "auxiliary_loss_mlp": 0.01036174, + "balance_loss_clip": 1.05787516, + "balance_loss_mlp": 1.02126098, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 1.8015306832255473, + "language_loss": 0.68792999, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.70961958, + "num_input_tokens_seen": 139576875, + "step": 6501, + "time_per_iteration": 2.5217227935791016 + }, + { + "auxiliary_loss_clip": 0.01127765, + "auxiliary_loss_mlp": 0.01040618, + "balance_loss_clip": 1.05407047, + "balance_loss_mlp": 1.02603829, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 1.6761912085091777, + "language_loss": 0.78936589, + "learning_rate": 2.782201105168287e-06, + "loss": 0.8110497, + "num_input_tokens_seen": 139594295, + "step": 6502, + "time_per_iteration": 2.484165906906128 + }, + { + "auxiliary_loss_clip": 0.01112433, + "auxiliary_loss_mlp": 0.01034718, + "balance_loss_clip": 1.05204332, + "balance_loss_mlp": 1.02060986, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.5773023086734197, + "language_loss": 0.80354583, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.82501733, + "num_input_tokens_seen": 139614080, + "step": 6503, + "time_per_iteration": 2.5802114009857178 + }, + { + "auxiliary_loss_clip": 0.01112978, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.05275619, + "balance_loss_mlp": 1.01790631, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.8367335104507985, + "language_loss": 0.71149802, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.7329483, + "num_input_tokens_seen": 139632755, + "step": 6504, + "time_per_iteration": 2.5254671573638916 + }, + { + "auxiliary_loss_clip": 0.01132546, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.04949045, + "balance_loss_mlp": 1.01914382, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 2.0466951744641007, + "language_loss": 0.83055824, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.852229, + "num_input_tokens_seen": 139654205, + "step": 6505, + "time_per_iteration": 2.5168745517730713 + }, + { + "auxiliary_loss_clip": 0.01135296, + "auxiliary_loss_mlp": 0.01040334, + "balance_loss_clip": 1.0532558, + "balance_loss_mlp": 1.02413964, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 2.2712281781685673, + "language_loss": 0.71261525, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73437154, + "num_input_tokens_seen": 139673595, + "step": 6506, + "time_per_iteration": 2.461376667022705 + }, + { + "auxiliary_loss_clip": 0.01109492, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.0481863, + "balance_loss_mlp": 1.02369475, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 1.9701901712593137, + "language_loss": 0.75308907, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77456498, + "num_input_tokens_seen": 139690565, + "step": 6507, + "time_per_iteration": 4.036974906921387 + }, + { + "auxiliary_loss_clip": 0.01055715, + "auxiliary_loss_mlp": 0.01005984, + "balance_loss_clip": 1.02760983, + "balance_loss_mlp": 1.00446975, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7648981873694173, + "language_loss": 0.5654285, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.5860455, + "num_input_tokens_seen": 139749420, + "step": 6508, + "time_per_iteration": 3.1865575313568115 + }, + { + "auxiliary_loss_clip": 0.01126627, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.05416536, + "balance_loss_mlp": 1.02684677, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 3.4845746853788877, + "language_loss": 0.76721835, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78889596, + "num_input_tokens_seen": 139766265, + "step": 6509, + "time_per_iteration": 2.485034227371216 + }, + { + "auxiliary_loss_clip": 0.0110457, + "auxiliary_loss_mlp": 0.01040005, + "balance_loss_clip": 1.04641628, + "balance_loss_mlp": 1.02317309, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 4.915632466718366, + "language_loss": 0.82921797, + "learning_rate": 2.779332635075825e-06, + "loss": 0.85066372, + "num_input_tokens_seen": 139782400, + "step": 6510, + "time_per_iteration": 2.5296103954315186 + }, + { + "auxiliary_loss_clip": 0.0112328, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.04786432, + "balance_loss_mlp": 1.02310789, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 1.718635163670942, + "language_loss": 0.77212, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.79372925, + "num_input_tokens_seen": 139801435, + "step": 6511, + "time_per_iteration": 2.4670605659484863 + }, + { + "auxiliary_loss_clip": 0.01038102, + "auxiliary_loss_mlp": 0.01008457, + "balance_loss_clip": 1.02822948, + "balance_loss_mlp": 1.00683582, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7167740581555223, + "language_loss": 0.57800233, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59846795, + "num_input_tokens_seen": 139869700, + "step": 6512, + "time_per_iteration": 3.1444594860076904 + }, + { + "auxiliary_loss_clip": 0.01138004, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.05317962, + "balance_loss_mlp": 1.01903117, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.6262733595520393, + "language_loss": 0.69811195, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.71983951, + "num_input_tokens_seen": 139890140, + "step": 6513, + "time_per_iteration": 2.490243434906006 + }, + { + "auxiliary_loss_clip": 0.0109577, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.0499866, + "balance_loss_mlp": 1.0202229, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 3.5496955643145305, + "language_loss": 0.75701535, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.77832907, + "num_input_tokens_seen": 139908020, + "step": 6514, + "time_per_iteration": 2.564817428588867 + }, + { + "auxiliary_loss_clip": 0.0109558, + "auxiliary_loss_mlp": 0.010402, + "balance_loss_clip": 1.04680288, + "balance_loss_mlp": 1.02603221, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 1.7788775357522486, + "language_loss": 0.77587825, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79723608, + "num_input_tokens_seen": 139926180, + "step": 6515, + "time_per_iteration": 2.5147740840911865 + }, + { + "auxiliary_loss_clip": 0.0109039, + "auxiliary_loss_mlp": 0.01051151, + "balance_loss_clip": 1.04144812, + "balance_loss_mlp": 1.03686929, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.4401984995091273, + "language_loss": 0.7997092, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.82112467, + "num_input_tokens_seen": 139947420, + "step": 6516, + "time_per_iteration": 2.592247486114502 + }, + { + "auxiliary_loss_clip": 0.01096433, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.05113268, + "balance_loss_mlp": 1.02255774, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 1.8510325151888565, + "language_loss": 0.69974148, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72107768, + "num_input_tokens_seen": 139965800, + "step": 6517, + "time_per_iteration": 2.5574958324432373 + }, + { + "auxiliary_loss_clip": 0.01090437, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_clip": 1.04174542, + "balance_loss_mlp": 1.03275347, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.9062117694662903, + "language_loss": 0.72190541, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74328637, + "num_input_tokens_seen": 139988140, + "step": 6518, + "time_per_iteration": 2.655789375305176 + }, + { + "auxiliary_loss_clip": 0.01121239, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_clip": 1.05038476, + "balance_loss_mlp": 1.02557731, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.6836095626591547, + "language_loss": 0.61795086, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63956726, + "num_input_tokens_seen": 140010060, + "step": 6519, + "time_per_iteration": 2.633744478225708 + }, + { + "auxiliary_loss_clip": 0.01140304, + "auxiliary_loss_mlp": 0.01050012, + "balance_loss_clip": 1.05151081, + "balance_loss_mlp": 1.0342766, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 1.9691830077416366, + "language_loss": 0.67179096, + "learning_rate": 2.775744388563563e-06, + "loss": 0.69369411, + "num_input_tokens_seen": 140029400, + "step": 6520, + "time_per_iteration": 2.4653780460357666 + }, + { + "auxiliary_loss_clip": 0.01131294, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.048944, + "balance_loss_mlp": 1.02567458, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 2.237014665401821, + "language_loss": 0.79070199, + "learning_rate": 2.775385401898104e-06, + "loss": 0.81241578, + "num_input_tokens_seen": 140048940, + "step": 6521, + "time_per_iteration": 2.430631160736084 + }, + { + "auxiliary_loss_clip": 0.0112658, + "auxiliary_loss_mlp": 0.01041369, + "balance_loss_clip": 1.05029273, + "balance_loss_mlp": 1.02429819, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 6.858646826014058, + "language_loss": 0.70052832, + "learning_rate": 2.775026385829952e-06, + "loss": 0.72220778, + "num_input_tokens_seen": 140066380, + "step": 6522, + "time_per_iteration": 2.47709321975708 + }, + { + "auxiliary_loss_clip": 0.01117184, + "auxiliary_loss_mlp": 0.01038855, + "balance_loss_clip": 1.05450475, + "balance_loss_mlp": 1.0239774, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 1.9653563763091224, + "language_loss": 0.76910269, + "learning_rate": 2.774667340372722e-06, + "loss": 0.79066312, + "num_input_tokens_seen": 140085275, + "step": 6523, + "time_per_iteration": 3.921168804168701 + }, + { + "auxiliary_loss_clip": 0.01115287, + "auxiliary_loss_mlp": 0.01045293, + "balance_loss_clip": 1.04932475, + "balance_loss_mlp": 1.03053474, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.5938948050911153, + "language_loss": 0.61597037, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.63757622, + "num_input_tokens_seen": 140105105, + "step": 6524, + "time_per_iteration": 2.606804609298706 + }, + { + "auxiliary_loss_clip": 0.01134417, + "auxiliary_loss_mlp": 0.01043145, + "balance_loss_clip": 1.04970312, + "balance_loss_mlp": 1.02804708, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 2.5350934215436656, + "language_loss": 0.74121141, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76298702, + "num_input_tokens_seen": 140125645, + "step": 6525, + "time_per_iteration": 2.493835926055908 + }, + { + "auxiliary_loss_clip": 0.01115037, + "auxiliary_loss_mlp": 0.0103953, + "balance_loss_clip": 1.0501399, + "balance_loss_mlp": 1.02554703, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 1.7615487035913113, + "language_loss": 0.8092047, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83075047, + "num_input_tokens_seen": 140141925, + "step": 6526, + "time_per_iteration": 4.055907964706421 + }, + { + "auxiliary_loss_clip": 0.01123882, + "auxiliary_loss_mlp": 0.01043302, + "balance_loss_clip": 1.05039608, + "balance_loss_mlp": 1.02857924, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.8158733396960454, + "language_loss": 0.70138216, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.72305405, + "num_input_tokens_seen": 140160965, + "step": 6527, + "time_per_iteration": 2.5016701221466064 + }, + { + "auxiliary_loss_clip": 0.01092681, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.0462116, + "balance_loss_mlp": 1.02025926, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.424224693339598, + "language_loss": 0.81838036, + "learning_rate": 2.772871672726965e-06, + "loss": 0.8396498, + "num_input_tokens_seen": 140177780, + "step": 6528, + "time_per_iteration": 2.5046234130859375 + }, + { + "auxiliary_loss_clip": 0.01116859, + "auxiliary_loss_mlp": 0.01041531, + "balance_loss_clip": 1.0568856, + "balance_loss_mlp": 1.02735066, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 2.7125379964408096, + "language_loss": 0.68586481, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70744872, + "num_input_tokens_seen": 140201660, + "step": 6529, + "time_per_iteration": 2.607579469680786 + }, + { + "auxiliary_loss_clip": 0.01112049, + "auxiliary_loss_mlp": 0.01040741, + "balance_loss_clip": 1.04520893, + "balance_loss_mlp": 1.02486801, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 3.58371214604719, + "language_loss": 0.80261868, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.82414663, + "num_input_tokens_seen": 140218585, + "step": 6530, + "time_per_iteration": 2.5558090209960938 + }, + { + "auxiliary_loss_clip": 0.01121811, + "auxiliary_loss_mlp": 0.01038489, + "balance_loss_clip": 1.04782176, + "balance_loss_mlp": 1.02411819, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.6494161407668892, + "language_loss": 0.75828111, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.7798841, + "num_input_tokens_seen": 140239905, + "step": 6531, + "time_per_iteration": 2.5004968643188477 + }, + { + "auxiliary_loss_clip": 0.01050721, + "auxiliary_loss_mlp": 0.01011403, + "balance_loss_clip": 1.02278042, + "balance_loss_mlp": 1.00973439, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.81233498157799, + "language_loss": 0.60353547, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62415671, + "num_input_tokens_seen": 140293820, + "step": 6532, + "time_per_iteration": 4.386438608169556 + }, + { + "auxiliary_loss_clip": 0.01031195, + "auxiliary_loss_mlp": 0.01012347, + "balance_loss_clip": 1.02303386, + "balance_loss_mlp": 1.01062989, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7752196638997515, + "language_loss": 0.55484897, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57528436, + "num_input_tokens_seen": 140360420, + "step": 6533, + "time_per_iteration": 3.1653661727905273 + }, + { + "auxiliary_loss_clip": 0.01120869, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.05647838, + "balance_loss_mlp": 1.02725852, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 2.382546811672622, + "language_loss": 0.76242679, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.7840538, + "num_input_tokens_seen": 140381950, + "step": 6534, + "time_per_iteration": 2.5985114574432373 + }, + { + "auxiliary_loss_clip": 0.01122753, + "auxiliary_loss_mlp": 0.01045134, + "balance_loss_clip": 1.04712117, + "balance_loss_mlp": 1.02869558, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.2503263078749813, + "language_loss": 0.78035998, + "learning_rate": 2.770356507494851e-06, + "loss": 0.80203885, + "num_input_tokens_seen": 140399410, + "step": 6535, + "time_per_iteration": 2.452486038208008 + }, + { + "auxiliary_loss_clip": 0.01090419, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.04497647, + "balance_loss_mlp": 1.01996052, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 2.006352519518682, + "language_loss": 0.69028825, + "learning_rate": 2.769997081218978e-06, + "loss": 0.71152782, + "num_input_tokens_seen": 140419055, + "step": 6536, + "time_per_iteration": 2.575796604156494 + }, + { + "auxiliary_loss_clip": 0.01101854, + "auxiliary_loss_mlp": 0.01034358, + "balance_loss_clip": 1.0447129, + "balance_loss_mlp": 1.02051759, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 4.177716226111998, + "language_loss": 0.69388711, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71524924, + "num_input_tokens_seen": 140438800, + "step": 6537, + "time_per_iteration": 2.556140899658203 + }, + { + "auxiliary_loss_clip": 0.0111889, + "auxiliary_loss_mlp": 0.01033791, + "balance_loss_clip": 1.04894793, + "balance_loss_mlp": 1.01913452, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.771834606187345, + "language_loss": 0.78728557, + "learning_rate": 2.769278141085763e-06, + "loss": 0.80881238, + "num_input_tokens_seen": 140456880, + "step": 6538, + "time_per_iteration": 2.450085163116455 + }, + { + "auxiliary_loss_clip": 0.01009586, + "auxiliary_loss_mlp": 0.010016, + "balance_loss_clip": 1.02904415, + "balance_loss_mlp": 0.99984735, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.816628933872124, + "language_loss": 0.61938971, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63950157, + "num_input_tokens_seen": 140507510, + "step": 6539, + "time_per_iteration": 2.931554079055786 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01033605, + "balance_loss_clip": 1.04571843, + "balance_loss_mlp": 1.01860213, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 2.1344056449956907, + "language_loss": 0.67710102, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.69845682, + "num_input_tokens_seen": 140528740, + "step": 6540, + "time_per_iteration": 2.6444437503814697 + }, + { + "auxiliary_loss_clip": 0.01112212, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.05143309, + "balance_loss_mlp": 1.02215147, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.8620026591912324, + "language_loss": 0.73013592, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.75162441, + "num_input_tokens_seen": 140547560, + "step": 6541, + "time_per_iteration": 2.5473878383636475 + }, + { + "auxiliary_loss_clip": 0.01048069, + "auxiliary_loss_mlp": 0.01002022, + "balance_loss_clip": 1.02030146, + "balance_loss_mlp": 1.00041246, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8343937002733949, + "language_loss": 0.60369301, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62419391, + "num_input_tokens_seen": 140601175, + "step": 6542, + "time_per_iteration": 2.8705403804779053 + }, + { + "auxiliary_loss_clip": 0.01121389, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.04788744, + "balance_loss_mlp": 1.01934528, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.993297432724563, + "language_loss": 0.82303429, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84458023, + "num_input_tokens_seen": 140622200, + "step": 6543, + "time_per_iteration": 2.5084726810455322 + }, + { + "auxiliary_loss_clip": 0.01104116, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.0417496, + "balance_loss_mlp": 1.02166402, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 2.390278580428873, + "language_loss": 0.68960011, + "learning_rate": 2.767120621015908e-06, + "loss": 0.71100676, + "num_input_tokens_seen": 140643125, + "step": 6544, + "time_per_iteration": 2.6291158199310303 + }, + { + "auxiliary_loss_clip": 0.01115474, + "auxiliary_loss_mlp": 0.01043301, + "balance_loss_clip": 1.0487839, + "balance_loss_mlp": 1.02833974, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 2.0845104738233338, + "language_loss": 0.74928737, + "learning_rate": 2.76676093244553e-06, + "loss": 0.7708751, + "num_input_tokens_seen": 140662500, + "step": 6545, + "time_per_iteration": 2.5977020263671875 + }, + { + "auxiliary_loss_clip": 0.01097345, + "auxiliary_loss_mlp": 0.01037489, + "balance_loss_clip": 1.05116105, + "balance_loss_mlp": 1.02463853, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.4601935141603153, + "language_loss": 0.74341071, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76475906, + "num_input_tokens_seen": 140681960, + "step": 6546, + "time_per_iteration": 2.5544211864471436 + }, + { + "auxiliary_loss_clip": 0.01104731, + "auxiliary_loss_mlp": 0.01036176, + "balance_loss_clip": 1.04617858, + "balance_loss_mlp": 1.02098918, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 2.1419430539161928, + "language_loss": 0.81517369, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83658278, + "num_input_tokens_seen": 140699170, + "step": 6547, + "time_per_iteration": 3.995258331298828 + }, + { + "auxiliary_loss_clip": 0.01113913, + "auxiliary_loss_mlp": 0.00782357, + "balance_loss_clip": 1.04320335, + "balance_loss_mlp": 1.00048542, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 1.9661918536515843, + "language_loss": 0.8396228, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.85858548, + "num_input_tokens_seen": 140714920, + "step": 6548, + "time_per_iteration": 2.4839985370635986 + }, + { + "auxiliary_loss_clip": 0.01119026, + "auxiliary_loss_mlp": 0.00778908, + "balance_loss_clip": 1.04748166, + "balance_loss_mlp": 1.00039697, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.6512087020190493, + "language_loss": 0.72630459, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74528396, + "num_input_tokens_seen": 140734595, + "step": 6549, + "time_per_iteration": 2.5041611194610596 + }, + { + "auxiliary_loss_clip": 0.01075278, + "auxiliary_loss_mlp": 0.01043681, + "balance_loss_clip": 1.04752505, + "balance_loss_mlp": 1.02758777, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.49633102818161, + "language_loss": 0.77705735, + "learning_rate": 2.764962053731699e-06, + "loss": 0.79824692, + "num_input_tokens_seen": 140754050, + "step": 6550, + "time_per_iteration": 2.607832908630371 + }, + { + "auxiliary_loss_clip": 0.01102087, + "auxiliary_loss_mlp": 0.01035312, + "balance_loss_clip": 1.05061722, + "balance_loss_mlp": 1.02126873, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.616340006735574, + "language_loss": 0.81336403, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83473808, + "num_input_tokens_seen": 140771440, + "step": 6551, + "time_per_iteration": 2.641388177871704 + }, + { + "auxiliary_loss_clip": 0.01121725, + "auxiliary_loss_mlp": 0.0104128, + "balance_loss_clip": 1.04776192, + "balance_loss_mlp": 1.02693915, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 3.259718349210249, + "language_loss": 0.79961073, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82124078, + "num_input_tokens_seen": 140786715, + "step": 6552, + "time_per_iteration": 2.4462125301361084 + }, + { + "auxiliary_loss_clip": 0.01133401, + "auxiliary_loss_mlp": 0.01043305, + "balance_loss_clip": 1.04807174, + "balance_loss_mlp": 1.02963209, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 1.8005599734036084, + "language_loss": 0.70912838, + "learning_rate": 2.763882378305003e-06, + "loss": 0.7308954, + "num_input_tokens_seen": 140804950, + "step": 6553, + "time_per_iteration": 2.4213178157806396 + }, + { + "auxiliary_loss_clip": 0.01117236, + "auxiliary_loss_mlp": 0.00780442, + "balance_loss_clip": 1.04669893, + "balance_loss_mlp": 1.00036359, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.7613271038886322, + "language_loss": 0.63947821, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.65845495, + "num_input_tokens_seen": 140822800, + "step": 6554, + "time_per_iteration": 2.555626153945923 + }, + { + "auxiliary_loss_clip": 0.0111222, + "auxiliary_loss_mlp": 0.01042598, + "balance_loss_clip": 1.04856193, + "balance_loss_mlp": 1.02915096, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 2.2067299292816114, + "language_loss": 0.79392719, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81547534, + "num_input_tokens_seen": 140842940, + "step": 6555, + "time_per_iteration": 2.7401034832000732 + }, + { + "auxiliary_loss_clip": 0.01109724, + "auxiliary_loss_mlp": 0.01046557, + "balance_loss_clip": 1.05027008, + "balance_loss_mlp": 1.03089285, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.755992962253531, + "language_loss": 0.72191036, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.74347317, + "num_input_tokens_seen": 140863060, + "step": 6556, + "time_per_iteration": 2.5533435344696045 + }, + { + "auxiliary_loss_clip": 0.01131745, + "auxiliary_loss_mlp": 0.01035966, + "balance_loss_clip": 1.04658437, + "balance_loss_mlp": 1.02146375, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 1.8056273589069547, + "language_loss": 0.83552933, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.8572064, + "num_input_tokens_seen": 140883795, + "step": 6557, + "time_per_iteration": 2.5470852851867676 + }, + { + "auxiliary_loss_clip": 0.01121135, + "auxiliary_loss_mlp": 0.01038003, + "balance_loss_clip": 1.05365872, + "balance_loss_mlp": 1.02355456, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 4.6483105974822605, + "language_loss": 0.80378377, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.8253752, + "num_input_tokens_seen": 140903055, + "step": 6558, + "time_per_iteration": 2.5195353031158447 + }, + { + "auxiliary_loss_clip": 0.01130736, + "auxiliary_loss_mlp": 0.01039972, + "balance_loss_clip": 1.0493741, + "balance_loss_mlp": 1.02654886, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 2.410393946965668, + "language_loss": 0.70968407, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73139119, + "num_input_tokens_seen": 140920685, + "step": 6559, + "time_per_iteration": 2.4308104515075684 + }, + { + "auxiliary_loss_clip": 0.01114565, + "auxiliary_loss_mlp": 0.01040608, + "balance_loss_clip": 1.04794955, + "balance_loss_mlp": 1.02478266, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 2.0233778874113275, + "language_loss": 0.80588782, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82743955, + "num_input_tokens_seen": 140937320, + "step": 6560, + "time_per_iteration": 2.5215976238250732 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01047966, + "balance_loss_clip": 1.05038285, + "balance_loss_mlp": 1.03248119, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 3.2605031137500453, + "language_loss": 0.82868236, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85029972, + "num_input_tokens_seen": 140954855, + "step": 6561, + "time_per_iteration": 2.501711368560791 + }, + { + "auxiliary_loss_clip": 0.01120854, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.04793262, + "balance_loss_mlp": 1.03110456, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 6.709506276750176, + "language_loss": 0.79675525, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.81841815, + "num_input_tokens_seen": 140973250, + "step": 6562, + "time_per_iteration": 3.8790180683135986 + }, + { + "auxiliary_loss_clip": 0.01102312, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_clip": 1.04422569, + "balance_loss_mlp": 1.02959466, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.7997194943664425, + "language_loss": 0.81335747, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83482111, + "num_input_tokens_seen": 140993050, + "step": 6563, + "time_per_iteration": 2.5494046211242676 + }, + { + "auxiliary_loss_clip": 0.01087944, + "auxiliary_loss_mlp": 0.01056544, + "balance_loss_clip": 1.04688919, + "balance_loss_mlp": 1.03816164, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 1.945151223377749, + "language_loss": 0.69761455, + "learning_rate": 2.759921340790127e-06, + "loss": 0.71905941, + "num_input_tokens_seen": 141010815, + "step": 6564, + "time_per_iteration": 2.623878240585327 + }, + { + "auxiliary_loss_clip": 0.01121588, + "auxiliary_loss_mlp": 0.01042688, + "balance_loss_clip": 1.04626822, + "balance_loss_mlp": 1.02765548, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 2.109656260913157, + "language_loss": 0.83131051, + "learning_rate": 2.759561073299676e-06, + "loss": 0.85295331, + "num_input_tokens_seen": 141028720, + "step": 6565, + "time_per_iteration": 3.94061541557312 + }, + { + "auxiliary_loss_clip": 0.01094524, + "auxiliary_loss_mlp": 0.01052742, + "balance_loss_clip": 1.04232252, + "balance_loss_mlp": 1.03635049, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 1.8870232383056431, + "language_loss": 0.83803356, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85950619, + "num_input_tokens_seen": 141046025, + "step": 6566, + "time_per_iteration": 2.5558440685272217 + }, + { + "auxiliary_loss_clip": 0.01138852, + "auxiliary_loss_mlp": 0.01043506, + "balance_loss_clip": 1.05065072, + "balance_loss_mlp": 1.02803826, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 2.113121274456719, + "language_loss": 0.77436614, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79618967, + "num_input_tokens_seen": 141066865, + "step": 6567, + "time_per_iteration": 2.4771363735198975 + }, + { + "auxiliary_loss_clip": 0.01116391, + "auxiliary_loss_mlp": 0.01039846, + "balance_loss_clip": 1.04602408, + "balance_loss_mlp": 1.02592802, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 1.8608254494944847, + "language_loss": 0.80378711, + "learning_rate": 2.758480098067182e-06, + "loss": 0.82534945, + "num_input_tokens_seen": 141084210, + "step": 6568, + "time_per_iteration": 2.453350067138672 + }, + { + "auxiliary_loss_clip": 0.0110668, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.05346894, + "balance_loss_mlp": 1.0207293, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 2.2602159059463336, + "language_loss": 0.84856516, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.86998373, + "num_input_tokens_seen": 141103895, + "step": 6569, + "time_per_iteration": 2.576842784881592 + }, + { + "auxiliary_loss_clip": 0.01078024, + "auxiliary_loss_mlp": 0.01039893, + "balance_loss_clip": 1.04904139, + "balance_loss_mlp": 1.02590406, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 2.4725459224960393, + "language_loss": 0.74268413, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.76386333, + "num_input_tokens_seen": 141124000, + "step": 6570, + "time_per_iteration": 2.6508843898773193 + }, + { + "auxiliary_loss_clip": 0.01098152, + "auxiliary_loss_mlp": 0.01041233, + "balance_loss_clip": 1.04703259, + "balance_loss_mlp": 1.02572429, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 1.6440161041906405, + "language_loss": 0.79828691, + "learning_rate": 2.757398863979922e-06, + "loss": 0.81968069, + "num_input_tokens_seen": 141142535, + "step": 6571, + "time_per_iteration": 2.556485176086426 + }, + { + "auxiliary_loss_clip": 0.01100709, + "auxiliary_loss_mlp": 0.01046489, + "balance_loss_clip": 1.04363871, + "balance_loss_mlp": 1.0315578, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 2.902401852673796, + "language_loss": 0.77756321, + "learning_rate": 2.757038395157997e-06, + "loss": 0.79903519, + "num_input_tokens_seen": 141161575, + "step": 6572, + "time_per_iteration": 3.9507129192352295 + }, + { + "auxiliary_loss_clip": 0.0109538, + "auxiliary_loss_mlp": 0.01037112, + "balance_loss_clip": 1.04432845, + "balance_loss_mlp": 1.02205563, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.8710431089232533, + "language_loss": 0.74703574, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.76836067, + "num_input_tokens_seen": 141181150, + "step": 6573, + "time_per_iteration": 2.5835845470428467 + }, + { + "auxiliary_loss_clip": 0.01119212, + "auxiliary_loss_mlp": 0.01033486, + "balance_loss_clip": 1.0451566, + "balance_loss_mlp": 1.02003312, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.4759302451363958, + "language_loss": 0.67968214, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70120919, + "num_input_tokens_seen": 141206310, + "step": 6574, + "time_per_iteration": 2.7130231857299805 + }, + { + "auxiliary_loss_clip": 0.01074686, + "auxiliary_loss_mlp": 0.01044074, + "balance_loss_clip": 1.04201651, + "balance_loss_mlp": 1.02649093, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 3.2675943286848077, + "language_loss": 0.7123456, + "learning_rate": 2.755956816505072e-06, + "loss": 0.7335332, + "num_input_tokens_seen": 141223925, + "step": 6575, + "time_per_iteration": 2.589233875274658 + }, + { + "auxiliary_loss_clip": 0.01109749, + "auxiliary_loss_mlp": 0.01048385, + "balance_loss_clip": 1.04531336, + "balance_loss_mlp": 1.03338885, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 5.540343104367024, + "language_loss": 0.73342633, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75500768, + "num_input_tokens_seen": 141239010, + "step": 6576, + "time_per_iteration": 2.4674015045166016 + }, + { + "auxiliary_loss_clip": 0.01133807, + "auxiliary_loss_mlp": 0.01040435, + "balance_loss_clip": 1.04938126, + "balance_loss_mlp": 1.02695251, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.803641982528927, + "language_loss": 0.83838224, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.86012465, + "num_input_tokens_seen": 141252255, + "step": 6577, + "time_per_iteration": 2.3930869102478027 + }, + { + "auxiliary_loss_clip": 0.01112223, + "auxiliary_loss_mlp": 0.0104173, + "balance_loss_clip": 1.0488019, + "balance_loss_mlp": 1.02722287, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 3.9301717876429105, + "language_loss": 0.90361416, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92515367, + "num_input_tokens_seen": 141269325, + "step": 6578, + "time_per_iteration": 2.5328989028930664 + }, + { + "auxiliary_loss_clip": 0.01109653, + "auxiliary_loss_mlp": 0.01037953, + "balance_loss_clip": 1.04758704, + "balance_loss_mlp": 1.02148986, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 2.3773874328849103, + "language_loss": 0.777013, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.79848903, + "num_input_tokens_seen": 141288505, + "step": 6579, + "time_per_iteration": 2.5005457401275635 + }, + { + "auxiliary_loss_clip": 0.0108037, + "auxiliary_loss_mlp": 0.01037395, + "balance_loss_clip": 1.0428108, + "balance_loss_mlp": 1.02056313, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 3.861033783015174, + "language_loss": 0.68190831, + "learning_rate": 2.754153612280037e-06, + "loss": 0.7030859, + "num_input_tokens_seen": 141303680, + "step": 6580, + "time_per_iteration": 2.5457286834716797 + }, + { + "auxiliary_loss_clip": 0.01121547, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.05014205, + "balance_loss_mlp": 1.01755881, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.94680891775515, + "language_loss": 0.58864141, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.61017823, + "num_input_tokens_seen": 141324090, + "step": 6581, + "time_per_iteration": 2.5336225032806396 + }, + { + "auxiliary_loss_clip": 0.01110691, + "auxiliary_loss_mlp": 0.01054403, + "balance_loss_clip": 1.04813027, + "balance_loss_mlp": 1.03683114, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 1.904679372310238, + "language_loss": 0.69554043, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.71719134, + "num_input_tokens_seen": 141342235, + "step": 6582, + "time_per_iteration": 2.479390859603882 + }, + { + "auxiliary_loss_clip": 0.01136163, + "auxiliary_loss_mlp": 0.00781241, + "balance_loss_clip": 1.05133271, + "balance_loss_mlp": 1.00052547, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 1.9573352431992082, + "language_loss": 0.76467812, + "learning_rate": 2.753071346464642e-06, + "loss": 0.7838521, + "num_input_tokens_seen": 141361195, + "step": 6583, + "time_per_iteration": 2.428948163986206 + }, + { + "auxiliary_loss_clip": 0.01092609, + "auxiliary_loss_mlp": 0.00780868, + "balance_loss_clip": 1.04816139, + "balance_loss_mlp": 1.00036681, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.5799378433020825, + "language_loss": 0.65845478, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.67718953, + "num_input_tokens_seen": 141378275, + "step": 6584, + "time_per_iteration": 2.684699535369873 + }, + { + "auxiliary_loss_clip": 0.01100661, + "auxiliary_loss_mlp": 0.01041296, + "balance_loss_clip": 1.05132103, + "balance_loss_mlp": 1.02594221, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 2.971986077166489, + "language_loss": 0.72401208, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74543166, + "num_input_tokens_seen": 141396960, + "step": 6585, + "time_per_iteration": 2.7335941791534424 + }, + { + "auxiliary_loss_clip": 0.01103821, + "auxiliary_loss_mlp": 0.01037782, + "balance_loss_clip": 1.04862881, + "balance_loss_mlp": 1.02300572, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.8633783240572797, + "language_loss": 0.73015618, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75157225, + "num_input_tokens_seen": 141417320, + "step": 6586, + "time_per_iteration": 2.601330280303955 + }, + { + "auxiliary_loss_clip": 0.01105916, + "auxiliary_loss_mlp": 0.01038092, + "balance_loss_clip": 1.04563522, + "balance_loss_mlp": 1.02275014, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 1.6526107148820723, + "language_loss": 0.71266347, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73410356, + "num_input_tokens_seen": 141435985, + "step": 6587, + "time_per_iteration": 4.060082197189331 + }, + { + "auxiliary_loss_clip": 0.01023051, + "auxiliary_loss_mlp": 0.01009578, + "balance_loss_clip": 1.03724551, + "balance_loss_mlp": 1.00786138, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9197195775586203, + "language_loss": 0.61113715, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63146347, + "num_input_tokens_seen": 141486075, + "step": 6588, + "time_per_iteration": 2.927520513534546 + }, + { + "auxiliary_loss_clip": 0.01109254, + "auxiliary_loss_mlp": 0.00780004, + "balance_loss_clip": 1.0465219, + "balance_loss_mlp": 1.00038469, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 1.7953702461774679, + "language_loss": 0.81436026, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83325285, + "num_input_tokens_seen": 141505280, + "step": 6589, + "time_per_iteration": 2.522028684616089 + }, + { + "auxiliary_loss_clip": 0.01111805, + "auxiliary_loss_mlp": 0.01036729, + "balance_loss_clip": 1.05218458, + "balance_loss_mlp": 1.02123153, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.1419031378146984, + "language_loss": 0.70527518, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72676057, + "num_input_tokens_seen": 141523930, + "step": 6590, + "time_per_iteration": 2.495187520980835 + }, + { + "auxiliary_loss_clip": 0.01117844, + "auxiliary_loss_mlp": 0.01054115, + "balance_loss_clip": 1.04645967, + "balance_loss_mlp": 1.03762805, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.8623106010420734, + "language_loss": 0.75217807, + "learning_rate": 2.750184048805956e-06, + "loss": 0.77389765, + "num_input_tokens_seen": 141541320, + "step": 6591, + "time_per_iteration": 2.4843149185180664 + }, + { + "auxiliary_loss_clip": 0.01048862, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.04284739, + "balance_loss_mlp": 1.02527642, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.754691271799209, + "language_loss": 0.77838916, + "learning_rate": 2.749823008443152e-06, + "loss": 0.79927844, + "num_input_tokens_seen": 141561880, + "step": 6592, + "time_per_iteration": 2.883657932281494 + }, + { + "auxiliary_loss_clip": 0.01063695, + "auxiliary_loss_mlp": 0.01032304, + "balance_loss_clip": 1.04412723, + "balance_loss_mlp": 1.01759315, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.8108023788810104, + "language_loss": 0.69377315, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71473312, + "num_input_tokens_seen": 141586460, + "step": 6593, + "time_per_iteration": 2.983259439468384 + }, + { + "auxiliary_loss_clip": 0.01073177, + "auxiliary_loss_mlp": 0.01044265, + "balance_loss_clip": 1.04746985, + "balance_loss_mlp": 1.02830315, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 2.51870850873382, + "language_loss": 0.77845556, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.79962993, + "num_input_tokens_seen": 141605955, + "step": 6594, + "time_per_iteration": 2.6494300365448 + }, + { + "auxiliary_loss_clip": 0.01027507, + "auxiliary_loss_mlp": 0.01004509, + "balance_loss_clip": 1.02769697, + "balance_loss_mlp": 1.00285196, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9405468798006485, + "language_loss": 0.6301229, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65044308, + "num_input_tokens_seen": 141673140, + "step": 6595, + "time_per_iteration": 3.1604037284851074 + }, + { + "auxiliary_loss_clip": 0.01099422, + "auxiliary_loss_mlp": 0.01057008, + "balance_loss_clip": 1.04513764, + "balance_loss_mlp": 1.03878045, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 1.9667195804542914, + "language_loss": 0.62965727, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65122163, + "num_input_tokens_seen": 141692955, + "step": 6596, + "time_per_iteration": 2.611915111541748 + }, + { + "auxiliary_loss_clip": 0.01122238, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.04960871, + "balance_loss_mlp": 1.02193379, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 1.9853402788741685, + "language_loss": 0.78626287, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80785638, + "num_input_tokens_seen": 141710680, + "step": 6597, + "time_per_iteration": 2.5686755180358887 + }, + { + "auxiliary_loss_clip": 0.01102558, + "auxiliary_loss_mlp": 0.00781753, + "balance_loss_clip": 1.04617238, + "balance_loss_mlp": 1.0004884, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 3.193178334339659, + "language_loss": 0.67406499, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69290817, + "num_input_tokens_seen": 141729860, + "step": 6598, + "time_per_iteration": 2.564676523208618 + }, + { + "auxiliary_loss_clip": 0.01133768, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.04849374, + "balance_loss_mlp": 1.02300477, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 1.81305539314154, + "language_loss": 0.78855443, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81026077, + "num_input_tokens_seen": 141749060, + "step": 6599, + "time_per_iteration": 2.486008644104004 + }, + { + "auxiliary_loss_clip": 0.01097777, + "auxiliary_loss_mlp": 0.01038264, + "balance_loss_clip": 1.0485661, + "balance_loss_mlp": 1.02155042, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 2.0520961411243817, + "language_loss": 0.72980106, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.75116146, + "num_input_tokens_seen": 141769860, + "step": 6600, + "time_per_iteration": 2.599625587463379 + }, + { + "auxiliary_loss_clip": 0.01089502, + "auxiliary_loss_mlp": 0.01036269, + "balance_loss_clip": 1.04122841, + "balance_loss_mlp": 1.02077162, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 2.219095653058525, + "language_loss": 0.85430801, + "learning_rate": 2.746572367319791e-06, + "loss": 0.87556571, + "num_input_tokens_seen": 141788465, + "step": 6601, + "time_per_iteration": 4.180763483047485 + }, + { + "auxiliary_loss_clip": 0.01107313, + "auxiliary_loss_mlp": 0.01043539, + "balance_loss_clip": 1.05269253, + "balance_loss_mlp": 1.02577651, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 3.6147356417341374, + "language_loss": 0.70571131, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72721988, + "num_input_tokens_seen": 141804955, + "step": 6602, + "time_per_iteration": 2.5319268703460693 + }, + { + "auxiliary_loss_clip": 0.01135308, + "auxiliary_loss_mlp": 0.01046037, + "balance_loss_clip": 1.0495683, + "balance_loss_mlp": 1.03099287, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 3.5647759729673782, + "language_loss": 0.83034277, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85215622, + "num_input_tokens_seen": 141820025, + "step": 6603, + "time_per_iteration": 2.4781274795532227 + }, + { + "auxiliary_loss_clip": 0.01117689, + "auxiliary_loss_mlp": 0.0103665, + "balance_loss_clip": 1.05131912, + "balance_loss_mlp": 1.02187979, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.5875545793168138, + "language_loss": 0.72913915, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.75068259, + "num_input_tokens_seen": 141838735, + "step": 6604, + "time_per_iteration": 4.23973274230957 + }, + { + "auxiliary_loss_clip": 0.01110645, + "auxiliary_loss_mlp": 0.01040741, + "balance_loss_clip": 1.04857552, + "balance_loss_mlp": 1.02578616, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.6029537734591877, + "language_loss": 0.82434875, + "learning_rate": 2.745126901275491e-06, + "loss": 0.84586263, + "num_input_tokens_seen": 141858090, + "step": 6605, + "time_per_iteration": 2.5703976154327393 + }, + { + "auxiliary_loss_clip": 0.0113192, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.04885817, + "balance_loss_mlp": 1.02009559, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.7705986160747211, + "language_loss": 0.74065411, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.76230335, + "num_input_tokens_seen": 141877540, + "step": 6606, + "time_per_iteration": 2.459639549255371 + }, + { + "auxiliary_loss_clip": 0.01100455, + "auxiliary_loss_mlp": 0.01046709, + "balance_loss_clip": 1.05375004, + "balance_loss_mlp": 1.02984059, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 2.814110430591228, + "language_loss": 0.74113244, + "learning_rate": 2.744403998666805e-06, + "loss": 0.762604, + "num_input_tokens_seen": 141897315, + "step": 6607, + "time_per_iteration": 2.5659139156341553 + }, + { + "auxiliary_loss_clip": 0.01127537, + "auxiliary_loss_mlp": 0.01037384, + "balance_loss_clip": 1.05014729, + "balance_loss_mlp": 1.02223301, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.836476567499233, + "language_loss": 0.67803258, + "learning_rate": 2.744042505013797e-06, + "loss": 0.69968182, + "num_input_tokens_seen": 141919580, + "step": 6608, + "time_per_iteration": 2.6807377338409424 + }, + { + "auxiliary_loss_clip": 0.0109982, + "auxiliary_loss_mlp": 0.01057386, + "balance_loss_clip": 1.04531741, + "balance_loss_mlp": 1.03853881, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 1.9161744860273051, + "language_loss": 0.74377704, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76534909, + "num_input_tokens_seen": 141937045, + "step": 6609, + "time_per_iteration": 2.5486488342285156 + }, + { + "auxiliary_loss_clip": 0.01112191, + "auxiliary_loss_mlp": 0.01038726, + "balance_loss_clip": 1.04917586, + "balance_loss_mlp": 1.02391994, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 1.4466355933322355, + "language_loss": 0.71459055, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73609972, + "num_input_tokens_seen": 141956695, + "step": 6610, + "time_per_iteration": 3.907125949859619 + }, + { + "auxiliary_loss_clip": 0.0111372, + "auxiliary_loss_mlp": 0.01036129, + "balance_loss_clip": 1.04481387, + "balance_loss_mlp": 1.02099514, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.457158654125179, + "language_loss": 0.78716707, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80866551, + "num_input_tokens_seen": 141975935, + "step": 6611, + "time_per_iteration": 2.499962568283081 + }, + { + "auxiliary_loss_clip": 0.01121907, + "auxiliary_loss_mlp": 0.01041243, + "balance_loss_clip": 1.04886937, + "balance_loss_mlp": 1.02587116, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.7784321818643263, + "language_loss": 0.79003954, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.81167108, + "num_input_tokens_seen": 141995750, + "step": 6612, + "time_per_iteration": 2.5467748641967773 + }, + { + "auxiliary_loss_clip": 0.0103703, + "auxiliary_loss_mlp": 0.01001899, + "balance_loss_clip": 1.04050636, + "balance_loss_mlp": 1.00023031, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8506277875261542, + "language_loss": 0.64970344, + "learning_rate": 2.742234613810459e-06, + "loss": 0.67009276, + "num_input_tokens_seen": 142057655, + "step": 6613, + "time_per_iteration": 3.0367932319641113 + }, + { + "auxiliary_loss_clip": 0.01103863, + "auxiliary_loss_mlp": 0.01048124, + "balance_loss_clip": 1.04351163, + "balance_loss_mlp": 1.03062427, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.2900786816325533, + "language_loss": 0.7146982, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73621809, + "num_input_tokens_seen": 142076020, + "step": 6614, + "time_per_iteration": 2.5176010131835938 + }, + { + "auxiliary_loss_clip": 0.01119838, + "auxiliary_loss_mlp": 0.01040352, + "balance_loss_clip": 1.04981434, + "balance_loss_mlp": 1.02555847, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.8082722682719605, + "language_loss": 0.81650376, + "learning_rate": 2.741511260213862e-06, + "loss": 0.83810568, + "num_input_tokens_seen": 142093790, + "step": 6615, + "time_per_iteration": 2.4605987071990967 + }, + { + "auxiliary_loss_clip": 0.01096851, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.04800987, + "balance_loss_mlp": 1.02059841, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 1.8931716114162787, + "language_loss": 0.67523056, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69654328, + "num_input_tokens_seen": 142110545, + "step": 6616, + "time_per_iteration": 2.5117897987365723 + }, + { + "auxiliary_loss_clip": 0.01136812, + "auxiliary_loss_mlp": 0.01042749, + "balance_loss_clip": 1.05003119, + "balance_loss_mlp": 1.02751422, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.7451418825292877, + "language_loss": 0.83629727, + "learning_rate": 2.740787794144541e-06, + "loss": 0.8580929, + "num_input_tokens_seen": 142128695, + "step": 6617, + "time_per_iteration": 2.447866916656494 + }, + { + "auxiliary_loss_clip": 0.01128923, + "auxiliary_loss_mlp": 0.01046108, + "balance_loss_clip": 1.04957032, + "balance_loss_mlp": 1.03267884, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.6305996172272927, + "language_loss": 0.72469449, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74644482, + "num_input_tokens_seen": 142148375, + "step": 6618, + "time_per_iteration": 2.441822052001953 + }, + { + "auxiliary_loss_clip": 0.01113, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.05107391, + "balance_loss_mlp": 1.02363658, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.7585627998033422, + "language_loss": 0.65425086, + "learning_rate": 2.740064215712231e-06, + "loss": 0.67578244, + "num_input_tokens_seen": 142169735, + "step": 6619, + "time_per_iteration": 2.589452028274536 + }, + { + "auxiliary_loss_clip": 0.01054838, + "auxiliary_loss_mlp": 0.01012041, + "balance_loss_clip": 1.02767992, + "balance_loss_mlp": 1.01055133, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.770125168478424, + "language_loss": 0.58185697, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60252571, + "num_input_tokens_seen": 142229520, + "step": 6620, + "time_per_iteration": 2.9917900562286377 + }, + { + "auxiliary_loss_clip": 0.01118645, + "auxiliary_loss_mlp": 0.01039357, + "balance_loss_clip": 1.05474937, + "balance_loss_mlp": 1.02673316, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.5417397483991122, + "language_loss": 0.79488957, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81646961, + "num_input_tokens_seen": 142247660, + "step": 6621, + "time_per_iteration": 2.5097386837005615 + }, + { + "auxiliary_loss_clip": 0.01112274, + "auxiliary_loss_mlp": 0.01035196, + "balance_loss_clip": 1.0507772, + "balance_loss_mlp": 1.02140903, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 2.336729786149787, + "language_loss": 0.78002977, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80150449, + "num_input_tokens_seen": 142266990, + "step": 6622, + "time_per_iteration": 2.515047550201416 + }, + { + "auxiliary_loss_clip": 0.01109111, + "auxiliary_loss_mlp": 0.01040469, + "balance_loss_clip": 1.04786563, + "balance_loss_mlp": 1.02566361, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.7888311220018693, + "language_loss": 0.75051713, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77201295, + "num_input_tokens_seen": 142287170, + "step": 6623, + "time_per_iteration": 2.538646936416626 + }, + { + "auxiliary_loss_clip": 0.01090526, + "auxiliary_loss_mlp": 0.01041064, + "balance_loss_clip": 1.04506636, + "balance_loss_mlp": 1.02640748, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 1.84791763193988, + "language_loss": 0.79932201, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.82063794, + "num_input_tokens_seen": 142305405, + "step": 6624, + "time_per_iteration": 2.549023389816284 + }, + { + "auxiliary_loss_clip": 0.0113694, + "auxiliary_loss_mlp": 0.01045234, + "balance_loss_clip": 1.05040383, + "balance_loss_mlp": 1.02869999, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 2.2683640677603765, + "language_loss": 0.84001243, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.86183417, + "num_input_tokens_seen": 142322710, + "step": 6625, + "time_per_iteration": 2.4512014389038086 + }, + { + "auxiliary_loss_clip": 0.01114924, + "auxiliary_loss_mlp": 0.01044645, + "balance_loss_clip": 1.04652548, + "balance_loss_mlp": 1.02949345, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.042845585436398, + "language_loss": 0.86568356, + "learning_rate": 2.737530807925321e-06, + "loss": 0.88727921, + "num_input_tokens_seen": 142338535, + "step": 6626, + "time_per_iteration": 4.124813795089722 + }, + { + "auxiliary_loss_clip": 0.01072079, + "auxiliary_loss_mlp": 0.00786895, + "balance_loss_clip": 1.0424571, + "balance_loss_mlp": 1.00053048, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.4660056425969006, + "language_loss": 0.83491123, + "learning_rate": 2.737168780548417e-06, + "loss": 0.85350096, + "num_input_tokens_seen": 142354570, + "step": 6627, + "time_per_iteration": 2.6017792224884033 + }, + { + "auxiliary_loss_clip": 0.01095136, + "auxiliary_loss_mlp": 0.00781265, + "balance_loss_clip": 1.04747677, + "balance_loss_mlp": 1.0003705, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.4410619093332184, + "language_loss": 0.82819206, + "learning_rate": 2.736806725217998e-06, + "loss": 0.84695613, + "num_input_tokens_seen": 142374395, + "step": 6628, + "time_per_iteration": 2.5999608039855957 + }, + { + "auxiliary_loss_clip": 0.01096111, + "auxiliary_loss_mlp": 0.01062217, + "balance_loss_clip": 1.04787624, + "balance_loss_mlp": 1.04590869, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.75675355518443, + "language_loss": 0.71145236, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.73303562, + "num_input_tokens_seen": 142396040, + "step": 6629, + "time_per_iteration": 2.5573606491088867 + }, + { + "auxiliary_loss_clip": 0.01102075, + "auxiliary_loss_mlp": 0.01041434, + "balance_loss_clip": 1.05218482, + "balance_loss_mlp": 1.02662826, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 2.258349537552333, + "language_loss": 0.80367458, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82510966, + "num_input_tokens_seen": 142415495, + "step": 6630, + "time_per_iteration": 2.555917501449585 + }, + { + "auxiliary_loss_clip": 0.01078079, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.05145931, + "balance_loss_mlp": 1.01675272, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 2.04324257270959, + "language_loss": 0.75332594, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.77441436, + "num_input_tokens_seen": 142431865, + "step": 6631, + "time_per_iteration": 2.584531545639038 + }, + { + "auxiliary_loss_clip": 0.01095692, + "auxiliary_loss_mlp": 0.01039964, + "balance_loss_clip": 1.04227829, + "balance_loss_mlp": 1.02426434, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 2.0187575146494448, + "language_loss": 0.71557987, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73693639, + "num_input_tokens_seen": 142450595, + "step": 6632, + "time_per_iteration": 2.5315539836883545 + }, + { + "auxiliary_loss_clip": 0.01072743, + "auxiliary_loss_mlp": 0.00779309, + "balance_loss_clip": 1.05226088, + "balance_loss_mlp": 1.00037003, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 2.016474769751359, + "language_loss": 0.74948418, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.76800472, + "num_input_tokens_seen": 142466650, + "step": 6633, + "time_per_iteration": 2.5956645011901855 + }, + { + "auxiliary_loss_clip": 0.01110386, + "auxiliary_loss_mlp": 0.01028438, + "balance_loss_clip": 1.05257368, + "balance_loss_mlp": 1.01399565, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.8377277266779293, + "language_loss": 0.81435561, + "learning_rate": 2.7346338069806e-06, + "loss": 0.8357439, + "num_input_tokens_seen": 142486165, + "step": 6634, + "time_per_iteration": 2.5535833835601807 + }, + { + "auxiliary_loss_clip": 0.01106807, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.05012774, + "balance_loss_mlp": 1.01672125, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 1.9211234527245686, + "language_loss": 0.74634415, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.76773691, + "num_input_tokens_seen": 142505035, + "step": 6635, + "time_per_iteration": 2.5204975605010986 + }, + { + "auxiliary_loss_clip": 0.01105685, + "auxiliary_loss_mlp": 0.01042404, + "balance_loss_clip": 1.05301654, + "balance_loss_mlp": 1.02610826, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 2.0421428002067348, + "language_loss": 0.66423506, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68571597, + "num_input_tokens_seen": 142521870, + "step": 6636, + "time_per_iteration": 2.5403892993927 + }, + { + "auxiliary_loss_clip": 0.01119878, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.04939866, + "balance_loss_mlp": 1.02580714, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.8912218013260522, + "language_loss": 0.81945151, + "learning_rate": 2.733546971601763e-06, + "loss": 0.84105319, + "num_input_tokens_seen": 142540455, + "step": 6637, + "time_per_iteration": 2.469973564147949 + }, + { + "auxiliary_loss_clip": 0.01030701, + "auxiliary_loss_mlp": 0.01021751, + "balance_loss_clip": 1.02980566, + "balance_loss_mlp": 1.01992726, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7242787726210856, + "language_loss": 0.53221297, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55273747, + "num_input_tokens_seen": 142599665, + "step": 6638, + "time_per_iteration": 3.1647841930389404 + }, + { + "auxiliary_loss_clip": 0.01111673, + "auxiliary_loss_mlp": 0.00781216, + "balance_loss_clip": 1.04937184, + "balance_loss_mlp": 1.0005455, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.62131870619422, + "language_loss": 0.75691473, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77584362, + "num_input_tokens_seen": 142618845, + "step": 6639, + "time_per_iteration": 2.525906801223755 + }, + { + "auxiliary_loss_clip": 0.0106372, + "auxiliary_loss_mlp": 0.01034442, + "balance_loss_clip": 1.04190624, + "balance_loss_mlp": 1.01967227, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.9420141412304628, + "language_loss": 0.76047993, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.7814616, + "num_input_tokens_seen": 142640885, + "step": 6640, + "time_per_iteration": 4.237142086029053 + }, + { + "auxiliary_loss_clip": 0.01111811, + "auxiliary_loss_mlp": 0.01037385, + "balance_loss_clip": 1.05697155, + "balance_loss_mlp": 1.02273393, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.1770821765039043, + "language_loss": 0.82318127, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84467316, + "num_input_tokens_seen": 142659340, + "step": 6641, + "time_per_iteration": 2.582043170928955 + }, + { + "auxiliary_loss_clip": 0.01137449, + "auxiliary_loss_mlp": 0.01036175, + "balance_loss_clip": 1.0537895, + "balance_loss_mlp": 1.02125573, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 2.7531300080116563, + "language_loss": 0.76595509, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78769124, + "num_input_tokens_seen": 142677085, + "step": 6642, + "time_per_iteration": 2.441835403442383 + }, + { + "auxiliary_loss_clip": 0.01105422, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.04736972, + "balance_loss_mlp": 1.02256823, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.0919617882669947, + "language_loss": 0.72185749, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74329346, + "num_input_tokens_seen": 142694595, + "step": 6643, + "time_per_iteration": 4.0157225131988525 + }, + { + "auxiliary_loss_clip": 0.01122642, + "auxiliary_loss_mlp": 0.01038765, + "balance_loss_clip": 1.04940701, + "balance_loss_mlp": 1.02374506, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 2.0016951826083833, + "language_loss": 0.66016495, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68177909, + "num_input_tokens_seen": 142714175, + "step": 6644, + "time_per_iteration": 2.490168333053589 + }, + { + "auxiliary_loss_clip": 0.01130512, + "auxiliary_loss_mlp": 0.01041611, + "balance_loss_clip": 1.04718375, + "balance_loss_mlp": 1.02623343, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 2.190743890562156, + "language_loss": 0.77978837, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80150962, + "num_input_tokens_seen": 142730955, + "step": 6645, + "time_per_iteration": 2.434622287750244 + }, + { + "auxiliary_loss_clip": 0.0112425, + "auxiliary_loss_mlp": 0.01039844, + "balance_loss_clip": 1.0500524, + "balance_loss_mlp": 1.02447748, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 3.691232157198565, + "language_loss": 0.70211899, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72375995, + "num_input_tokens_seen": 142751200, + "step": 6646, + "time_per_iteration": 2.5049946308135986 + }, + { + "auxiliary_loss_clip": 0.01080454, + "auxiliary_loss_mlp": 0.01039534, + "balance_loss_clip": 1.04006028, + "balance_loss_mlp": 1.02397752, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.8497623653060924, + "language_loss": 0.71266508, + "learning_rate": 2.729922381038513e-06, + "loss": 0.73386502, + "num_input_tokens_seen": 142770170, + "step": 6647, + "time_per_iteration": 2.5725107192993164 + }, + { + "auxiliary_loss_clip": 0.01096039, + "auxiliary_loss_mlp": 0.01036719, + "balance_loss_clip": 1.04980052, + "balance_loss_mlp": 1.02259898, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.4129277051805478, + "language_loss": 0.74096215, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.7622897, + "num_input_tokens_seen": 142792680, + "step": 6648, + "time_per_iteration": 2.672682762145996 + }, + { + "auxiliary_loss_clip": 0.01130589, + "auxiliary_loss_mlp": 0.01039416, + "balance_loss_clip": 1.04773474, + "balance_loss_mlp": 1.02369225, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 1.856092519528155, + "language_loss": 0.66127121, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68297124, + "num_input_tokens_seen": 142810510, + "step": 6649, + "time_per_iteration": 3.888244152069092 + }, + { + "auxiliary_loss_clip": 0.01108572, + "auxiliary_loss_mlp": 0.01046259, + "balance_loss_clip": 1.05330586, + "balance_loss_mlp": 1.03108335, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 2.027830946045437, + "language_loss": 0.75271416, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77426243, + "num_input_tokens_seen": 142832455, + "step": 6650, + "time_per_iteration": 2.654167652130127 + }, + { + "auxiliary_loss_clip": 0.01131014, + "auxiliary_loss_mlp": 0.01044508, + "balance_loss_clip": 1.04837322, + "balance_loss_mlp": 1.02949333, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.9209161741983836, + "language_loss": 0.71953428, + "learning_rate": 2.728471769038975e-06, + "loss": 0.7412895, + "num_input_tokens_seen": 142852590, + "step": 6651, + "time_per_iteration": 2.4429516792297363 + }, + { + "auxiliary_loss_clip": 0.01130772, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.04689503, + "balance_loss_mlp": 1.0267154, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 2.3436522943603797, + "language_loss": 0.73016763, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75189328, + "num_input_tokens_seen": 142870595, + "step": 6652, + "time_per_iteration": 2.436748504638672 + }, + { + "auxiliary_loss_clip": 0.01027953, + "auxiliary_loss_mlp": 0.01009556, + "balance_loss_clip": 1.02820802, + "balance_loss_mlp": 1.00773203, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8709836005837417, + "language_loss": 0.60650909, + "learning_rate": 2.727746297241862e-06, + "loss": 0.6268841, + "num_input_tokens_seen": 142925805, + "step": 6653, + "time_per_iteration": 3.0395047664642334 + }, + { + "auxiliary_loss_clip": 0.01094373, + "auxiliary_loss_mlp": 0.0104134, + "balance_loss_clip": 1.0502435, + "balance_loss_mlp": 1.02726185, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 2.3213759173313733, + "language_loss": 0.6675505, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.68890762, + "num_input_tokens_seen": 142943145, + "step": 6654, + "time_per_iteration": 2.5323715209960938 + }, + { + "auxiliary_loss_clip": 0.0112167, + "auxiliary_loss_mlp": 0.01043371, + "balance_loss_clip": 1.0502696, + "balance_loss_mlp": 1.02997768, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.6906057442861586, + "language_loss": 0.8995176, + "learning_rate": 2.7270207150599e-06, + "loss": 0.92116803, + "num_input_tokens_seen": 142956925, + "step": 6655, + "time_per_iteration": 2.438225269317627 + }, + { + "auxiliary_loss_clip": 0.01108013, + "auxiliary_loss_mlp": 0.01040498, + "balance_loss_clip": 1.05089164, + "balance_loss_mlp": 1.02649724, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.810719438929069, + "language_loss": 0.73355502, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75504017, + "num_input_tokens_seen": 142978040, + "step": 6656, + "time_per_iteration": 2.5788564682006836 + }, + { + "auxiliary_loss_clip": 0.01131779, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_clip": 1.04856646, + "balance_loss_mlp": 1.03165376, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.5329524584930194, + "language_loss": 0.73653388, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75831676, + "num_input_tokens_seen": 142998390, + "step": 6657, + "time_per_iteration": 2.525499105453491 + }, + { + "auxiliary_loss_clip": 0.01133879, + "auxiliary_loss_mlp": 0.01046881, + "balance_loss_clip": 1.050524, + "balance_loss_mlp": 1.03073967, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.5190533321400848, + "language_loss": 0.79468054, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81648815, + "num_input_tokens_seen": 143021505, + "step": 6658, + "time_per_iteration": 2.5454070568084717 + }, + { + "auxiliary_loss_clip": 0.01119766, + "auxiliary_loss_mlp": 0.01044892, + "balance_loss_clip": 1.04688013, + "balance_loss_mlp": 1.03040779, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 1.7312612823251121, + "language_loss": 0.77432811, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79597473, + "num_input_tokens_seen": 143041375, + "step": 6659, + "time_per_iteration": 2.5368685722351074 + }, + { + "auxiliary_loss_clip": 0.01122209, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.04558921, + "balance_loss_mlp": 1.01987743, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.6179689715517191, + "language_loss": 0.72488546, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.7464323, + "num_input_tokens_seen": 143058725, + "step": 6660, + "time_per_iteration": 2.553623676300049 + }, + { + "auxiliary_loss_clip": 0.01103334, + "auxiliary_loss_mlp": 0.01046408, + "balance_loss_clip": 1.04346907, + "balance_loss_mlp": 1.03247237, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.6820260353788583, + "language_loss": 0.7125833, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73408067, + "num_input_tokens_seen": 143076995, + "step": 6661, + "time_per_iteration": 2.567971706390381 + }, + { + "auxiliary_loss_clip": 0.01135723, + "auxiliary_loss_mlp": 0.01044069, + "balance_loss_clip": 1.05219197, + "balance_loss_mlp": 1.02928066, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 2.184894352266078, + "language_loss": 0.75407565, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77587354, + "num_input_tokens_seen": 143096780, + "step": 6662, + "time_per_iteration": 2.50820255279541 + }, + { + "auxiliary_loss_clip": 0.01116856, + "auxiliary_loss_mlp": 0.0103635, + "balance_loss_clip": 1.04464459, + "balance_loss_mlp": 1.02131772, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 2.2137767401481, + "language_loss": 0.66564322, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68717527, + "num_input_tokens_seen": 143112590, + "step": 6663, + "time_per_iteration": 2.4999215602874756 + }, + { + "auxiliary_loss_clip": 0.01114945, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.04527116, + "balance_loss_mlp": 1.0270772, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.2315874424872604, + "language_loss": 0.8558113, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.87738317, + "num_input_tokens_seen": 143130220, + "step": 6664, + "time_per_iteration": 2.479355812072754 + }, + { + "auxiliary_loss_clip": 0.01117311, + "auxiliary_loss_mlp": 0.01036411, + "balance_loss_clip": 1.04655921, + "balance_loss_mlp": 1.02219534, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 2.8341985898513617, + "language_loss": 0.8481704, + "learning_rate": 2.723391152229917e-06, + "loss": 0.86970764, + "num_input_tokens_seen": 143147160, + "step": 6665, + "time_per_iteration": 4.350629806518555 + }, + { + "auxiliary_loss_clip": 0.01120184, + "auxiliary_loss_mlp": 0.01039567, + "balance_loss_clip": 1.05107534, + "balance_loss_mlp": 1.02428436, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 1.9925742816095968, + "language_loss": 0.7828055, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.80440307, + "num_input_tokens_seen": 143164605, + "step": 6666, + "time_per_iteration": 2.490180492401123 + }, + { + "auxiliary_loss_clip": 0.01126563, + "auxiliary_loss_mlp": 0.01038008, + "balance_loss_clip": 1.05405498, + "balance_loss_mlp": 1.02232027, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 1.9076154157456648, + "language_loss": 0.73254615, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75419188, + "num_input_tokens_seen": 143183965, + "step": 6667, + "time_per_iteration": 2.520098924636841 + }, + { + "auxiliary_loss_clip": 0.01117128, + "auxiliary_loss_mlp": 0.01049189, + "balance_loss_clip": 1.04724634, + "balance_loss_mlp": 1.0335722, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.6761447622011802, + "language_loss": 0.75734752, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.77901065, + "num_input_tokens_seen": 143204965, + "step": 6668, + "time_per_iteration": 2.497328996658325 + }, + { + "auxiliary_loss_clip": 0.01101825, + "auxiliary_loss_mlp": 0.01042302, + "balance_loss_clip": 1.05280721, + "balance_loss_mlp": 1.02787185, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 1.902450983840658, + "language_loss": 0.82235324, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84379458, + "num_input_tokens_seen": 143225015, + "step": 6669, + "time_per_iteration": 2.630295753479004 + }, + { + "auxiliary_loss_clip": 0.01029604, + "auxiliary_loss_mlp": 0.01013453, + "balance_loss_clip": 1.0257417, + "balance_loss_mlp": 1.01189184, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.7034528878033735, + "language_loss": 0.53369927, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55412984, + "num_input_tokens_seen": 143294925, + "step": 6670, + "time_per_iteration": 3.251397132873535 + }, + { + "auxiliary_loss_clip": 0.01088583, + "auxiliary_loss_mlp": 0.01040543, + "balance_loss_clip": 1.05083573, + "balance_loss_mlp": 1.0256958, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.6874245516719721, + "language_loss": 0.88898546, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.91027677, + "num_input_tokens_seen": 143314170, + "step": 6671, + "time_per_iteration": 2.6681888103485107 + }, + { + "auxiliary_loss_clip": 0.01126549, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.05147362, + "balance_loss_mlp": 1.01790452, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.7997812242888342, + "language_loss": 0.78999043, + "learning_rate": 2.720848825281736e-06, + "loss": 0.8115918, + "num_input_tokens_seen": 143330050, + "step": 6672, + "time_per_iteration": 2.4631941318511963 + }, + { + "auxiliary_loss_clip": 0.01102385, + "auxiliary_loss_mlp": 0.0104061, + "balance_loss_clip": 1.05147433, + "balance_loss_mlp": 1.02436137, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 2.318610034194685, + "language_loss": 0.63142663, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65285659, + "num_input_tokens_seen": 143348650, + "step": 6673, + "time_per_iteration": 2.563443422317505 + }, + { + "auxiliary_loss_clip": 0.01109867, + "auxiliary_loss_mlp": 0.00781449, + "balance_loss_clip": 1.04955268, + "balance_loss_mlp": 1.00059462, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.4715441637754534, + "language_loss": 0.80085766, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.81977081, + "num_input_tokens_seen": 143370275, + "step": 6674, + "time_per_iteration": 2.5784552097320557 + }, + { + "auxiliary_loss_clip": 0.01093833, + "auxiliary_loss_mlp": 0.01038003, + "balance_loss_clip": 1.05521274, + "balance_loss_mlp": 1.02351892, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 2.254117768661874, + "language_loss": 0.8242929, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84561121, + "num_input_tokens_seen": 143385390, + "step": 6675, + "time_per_iteration": 2.5626611709594727 + }, + { + "auxiliary_loss_clip": 0.01120325, + "auxiliary_loss_mlp": 0.0103848, + "balance_loss_clip": 1.04737735, + "balance_loss_mlp": 1.02301884, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 1.8836120233694171, + "language_loss": 0.93309474, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95468283, + "num_input_tokens_seen": 143404215, + "step": 6676, + "time_per_iteration": 2.5725009441375732 + }, + { + "auxiliary_loss_clip": 0.01126936, + "auxiliary_loss_mlp": 0.01044278, + "balance_loss_clip": 1.04926717, + "balance_loss_mlp": 1.02783895, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 2.204013049579265, + "language_loss": 0.79358971, + "learning_rate": 2.719032057146399e-06, + "loss": 0.8153019, + "num_input_tokens_seen": 143422245, + "step": 6677, + "time_per_iteration": 2.4725842475891113 + }, + { + "auxiliary_loss_clip": 0.01109597, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.05007887, + "balance_loss_mlp": 1.02037358, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 4.491747778177664, + "language_loss": 0.8339361, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85538197, + "num_input_tokens_seen": 143443130, + "step": 6678, + "time_per_iteration": 2.5327706336975098 + }, + { + "auxiliary_loss_clip": 0.01130603, + "auxiliary_loss_mlp": 0.01043355, + "balance_loss_clip": 1.04949188, + "balance_loss_mlp": 1.0285964, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.7074640312171403, + "language_loss": 0.63269246, + "learning_rate": 2.718305158935434e-06, + "loss": 0.65443206, + "num_input_tokens_seen": 143461385, + "step": 6679, + "time_per_iteration": 3.9055604934692383 + }, + { + "auxiliary_loss_clip": 0.01100647, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.04240084, + "balance_loss_mlp": 1.01958656, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.4110187485000514, + "language_loss": 0.78775102, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.80909467, + "num_input_tokens_seen": 143481750, + "step": 6680, + "time_per_iteration": 2.5508954524993896 + }, + { + "auxiliary_loss_clip": 0.01100638, + "auxiliary_loss_mlp": 0.00782179, + "balance_loss_clip": 1.04881072, + "balance_loss_mlp": 1.000723, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 2.0014681735074213, + "language_loss": 0.75938344, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77821159, + "num_input_tokens_seen": 143501540, + "step": 6681, + "time_per_iteration": 2.6590962409973145 + }, + { + "auxiliary_loss_clip": 0.01096568, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.05119503, + "balance_loss_mlp": 1.01788592, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 2.4746721019341646, + "language_loss": 0.64092487, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.6622104, + "num_input_tokens_seen": 143520530, + "step": 6682, + "time_per_iteration": 2.7518725395202637 + }, + { + "auxiliary_loss_clip": 0.01088471, + "auxiliary_loss_mlp": 0.01040784, + "balance_loss_clip": 1.04437995, + "balance_loss_mlp": 1.02623415, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 1.8857859667113326, + "language_loss": 0.73162353, + "learning_rate": 2.716851035765337e-06, + "loss": 0.7529161, + "num_input_tokens_seen": 143540210, + "step": 6683, + "time_per_iteration": 4.166389465332031 + }, + { + "auxiliary_loss_clip": 0.01114909, + "auxiliary_loss_mlp": 0.01043004, + "balance_loss_clip": 1.04488552, + "balance_loss_mlp": 1.02862775, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.8763884860015332, + "language_loss": 0.73361993, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75519902, + "num_input_tokens_seen": 143560940, + "step": 6684, + "time_per_iteration": 2.6261167526245117 + }, + { + "auxiliary_loss_clip": 0.01042318, + "auxiliary_loss_mlp": 0.01000197, + "balance_loss_clip": 1.0264771, + "balance_loss_mlp": 0.99858725, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8112393485888496, + "language_loss": 0.60352218, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62394732, + "num_input_tokens_seen": 143624015, + "step": 6685, + "time_per_iteration": 3.208988904953003 + }, + { + "auxiliary_loss_clip": 0.01120924, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.04682982, + "balance_loss_mlp": 1.02028382, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 1.8011976118730832, + "language_loss": 0.7010318, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72259557, + "num_input_tokens_seen": 143642750, + "step": 6686, + "time_per_iteration": 2.4796061515808105 + }, + { + "auxiliary_loss_clip": 0.01107192, + "auxiliary_loss_mlp": 0.0103576, + "balance_loss_clip": 1.04677486, + "balance_loss_mlp": 1.02219987, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.4748231508764384, + "language_loss": 0.74573278, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.76716232, + "num_input_tokens_seen": 143664515, + "step": 6687, + "time_per_iteration": 3.9491140842437744 + }, + { + "auxiliary_loss_clip": 0.01110744, + "auxiliary_loss_mlp": 0.01040295, + "balance_loss_clip": 1.04970765, + "balance_loss_mlp": 1.02560282, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 1.857146760331339, + "language_loss": 0.70987928, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.73138964, + "num_input_tokens_seen": 143683135, + "step": 6688, + "time_per_iteration": 2.5289018154144287 + }, + { + "auxiliary_loss_clip": 0.01107997, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.04441023, + "balance_loss_mlp": 1.02747464, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.6999003966349653, + "language_loss": 0.64229357, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.66380739, + "num_input_tokens_seen": 143703985, + "step": 6689, + "time_per_iteration": 2.544002056121826 + }, + { + "auxiliary_loss_clip": 0.01122668, + "auxiliary_loss_mlp": 0.01033926, + "balance_loss_clip": 1.04575121, + "balance_loss_mlp": 1.02011561, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 3.7587721596030517, + "language_loss": 0.73429471, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75586069, + "num_input_tokens_seen": 143719245, + "step": 6690, + "time_per_iteration": 2.4491353034973145 + }, + { + "auxiliary_loss_clip": 0.01101103, + "auxiliary_loss_mlp": 0.01040605, + "balance_loss_clip": 1.04558754, + "balance_loss_mlp": 1.0259304, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.5874425848939757, + "language_loss": 0.74814284, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.76955998, + "num_input_tokens_seen": 143739575, + "step": 6691, + "time_per_iteration": 2.5732455253601074 + }, + { + "auxiliary_loss_clip": 0.01113619, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.04882038, + "balance_loss_mlp": 1.0294745, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.5935370926186696, + "language_loss": 0.72116303, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74274027, + "num_input_tokens_seen": 143758515, + "step": 6692, + "time_per_iteration": 2.5111091136932373 + }, + { + "auxiliary_loss_clip": 0.01084489, + "auxiliary_loss_mlp": 0.01043235, + "balance_loss_clip": 1.04383755, + "balance_loss_mlp": 1.02826262, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 2.591399552843746, + "language_loss": 0.83980989, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.86108714, + "num_input_tokens_seen": 143776770, + "step": 6693, + "time_per_iteration": 2.5961759090423584 + }, + { + "auxiliary_loss_clip": 0.01096642, + "auxiliary_loss_mlp": 0.01053168, + "balance_loss_clip": 1.0520277, + "balance_loss_mlp": 1.03730083, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 3.2526115975643615, + "language_loss": 0.71352482, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.7350229, + "num_input_tokens_seen": 143798450, + "step": 6694, + "time_per_iteration": 2.6818301677703857 + }, + { + "auxiliary_loss_clip": 0.01104813, + "auxiliary_loss_mlp": 0.01037581, + "balance_loss_clip": 1.04781175, + "balance_loss_mlp": 1.02300215, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 1.8600502834273631, + "language_loss": 0.67793941, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.69936335, + "num_input_tokens_seen": 143816995, + "step": 6695, + "time_per_iteration": 2.5282375812530518 + }, + { + "auxiliary_loss_clip": 0.01102494, + "auxiliary_loss_mlp": 0.01052118, + "balance_loss_clip": 1.04343271, + "balance_loss_mlp": 1.03664434, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 2.1680844612692387, + "language_loss": 0.79834342, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81988949, + "num_input_tokens_seen": 143842090, + "step": 6696, + "time_per_iteration": 2.906770944595337 + }, + { + "auxiliary_loss_clip": 0.01109538, + "auxiliary_loss_mlp": 0.01060524, + "balance_loss_clip": 1.04939556, + "balance_loss_mlp": 1.04291666, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.9549609447201626, + "language_loss": 0.71250963, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73421025, + "num_input_tokens_seen": 143860800, + "step": 6697, + "time_per_iteration": 2.5132339000701904 + }, + { + "auxiliary_loss_clip": 0.01119169, + "auxiliary_loss_mlp": 0.01046309, + "balance_loss_clip": 1.04585314, + "balance_loss_mlp": 1.03215933, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.217407300482533, + "language_loss": 0.61211228, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63376707, + "num_input_tokens_seen": 143878950, + "step": 6698, + "time_per_iteration": 2.534884214401245 + }, + { + "auxiliary_loss_clip": 0.0111873, + "auxiliary_loss_mlp": 0.01036082, + "balance_loss_clip": 1.04613388, + "balance_loss_mlp": 1.02154469, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 2.018101619600037, + "language_loss": 0.76809037, + "learning_rate": 2.711030202621491e-06, + "loss": 0.78963846, + "num_input_tokens_seen": 143898385, + "step": 6699, + "time_per_iteration": 2.4891510009765625 + }, + { + "auxiliary_loss_clip": 0.01094404, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.04618609, + "balance_loss_mlp": 1.01900864, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.70370951415, + "language_loss": 0.80334651, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82461798, + "num_input_tokens_seen": 143918795, + "step": 6700, + "time_per_iteration": 2.5572965145111084 + }, + { + "auxiliary_loss_clip": 0.01108198, + "auxiliary_loss_mlp": 0.01045003, + "balance_loss_clip": 1.04752052, + "balance_loss_mlp": 1.02858806, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 7.702134534817846, + "language_loss": 0.74977195, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77130389, + "num_input_tokens_seen": 143938245, + "step": 6701, + "time_per_iteration": 2.5856969356536865 + }, + { + "auxiliary_loss_clip": 0.01101938, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.04609704, + "balance_loss_mlp": 1.02711189, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.723180775330377, + "language_loss": 0.65674478, + "learning_rate": 2.709938026276208e-06, + "loss": 0.67817557, + "num_input_tokens_seen": 143960995, + "step": 6702, + "time_per_iteration": 2.593935012817383 + }, + { + "auxiliary_loss_clip": 0.01107197, + "auxiliary_loss_mlp": 0.01044185, + "balance_loss_clip": 1.04788327, + "balance_loss_mlp": 1.02757955, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.3856801781733663, + "language_loss": 0.65720755, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.67872137, + "num_input_tokens_seen": 143979910, + "step": 6703, + "time_per_iteration": 2.533681869506836 + }, + { + "auxiliary_loss_clip": 0.01063159, + "auxiliary_loss_mlp": 0.01041639, + "balance_loss_clip": 1.04492044, + "balance_loss_mlp": 1.02527153, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 2.488705696508732, + "language_loss": 0.8213532, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84240115, + "num_input_tokens_seen": 144000095, + "step": 6704, + "time_per_iteration": 4.207636833190918 + }, + { + "auxiliary_loss_clip": 0.01109442, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.04703069, + "balance_loss_mlp": 1.02396119, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 1.8283043425423917, + "language_loss": 0.73262954, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.7541151, + "num_input_tokens_seen": 144019695, + "step": 6705, + "time_per_iteration": 2.672882318496704 + }, + { + "auxiliary_loss_clip": 0.01116679, + "auxiliary_loss_mlp": 0.01039512, + "balance_loss_clip": 1.04698801, + "balance_loss_mlp": 1.02518964, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.6517358482121471, + "language_loss": 0.66493863, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68650049, + "num_input_tokens_seen": 144038525, + "step": 6706, + "time_per_iteration": 2.5049290657043457 + }, + { + "auxiliary_loss_clip": 0.01120871, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.04816985, + "balance_loss_mlp": 1.02666485, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 2.0635021468332733, + "language_loss": 0.7108019, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73242688, + "num_input_tokens_seen": 144059485, + "step": 6707, + "time_per_iteration": 2.4972105026245117 + }, + { + "auxiliary_loss_clip": 0.01104166, + "auxiliary_loss_mlp": 0.01032348, + "balance_loss_clip": 1.04726863, + "balance_loss_mlp": 1.01788151, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.5557117453978906, + "language_loss": 0.80295378, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82431889, + "num_input_tokens_seen": 144080265, + "step": 6708, + "time_per_iteration": 2.5438332557678223 + }, + { + "auxiliary_loss_clip": 0.01085683, + "auxiliary_loss_mlp": 0.01044265, + "balance_loss_clip": 1.04537582, + "balance_loss_mlp": 1.02883887, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.7372550330463676, + "language_loss": 0.83035129, + "learning_rate": 2.70738867321606e-06, + "loss": 0.85165071, + "num_input_tokens_seen": 144098040, + "step": 6709, + "time_per_iteration": 2.5450377464294434 + }, + { + "auxiliary_loss_clip": 0.011212, + "auxiliary_loss_mlp": 0.01039974, + "balance_loss_clip": 1.05013824, + "balance_loss_mlp": 1.02451265, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 2.5117514533615646, + "language_loss": 0.71436393, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73597568, + "num_input_tokens_seen": 144118265, + "step": 6710, + "time_per_iteration": 2.54963755607605 + }, + { + "auxiliary_loss_clip": 0.0109905, + "auxiliary_loss_mlp": 0.01043183, + "balance_loss_clip": 1.04519176, + "balance_loss_mlp": 1.02741194, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 2.2332810781400374, + "language_loss": 0.85054529, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87196767, + "num_input_tokens_seen": 144133865, + "step": 6711, + "time_per_iteration": 2.516568660736084 + }, + { + "auxiliary_loss_clip": 0.01124082, + "auxiliary_loss_mlp": 0.01042303, + "balance_loss_clip": 1.04878557, + "balance_loss_mlp": 1.02699673, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 1.8854129730208997, + "language_loss": 0.76251304, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78417689, + "num_input_tokens_seen": 144150125, + "step": 6712, + "time_per_iteration": 2.4448399543762207 + }, + { + "auxiliary_loss_clip": 0.01106231, + "auxiliary_loss_mlp": 0.01046268, + "balance_loss_clip": 1.04870403, + "balance_loss_mlp": 1.03063941, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 3.4082330158326393, + "language_loss": 0.78720212, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.80872703, + "num_input_tokens_seen": 144169295, + "step": 6713, + "time_per_iteration": 2.5956132411956787 + }, + { + "auxiliary_loss_clip": 0.01097322, + "auxiliary_loss_mlp": 0.01040068, + "balance_loss_clip": 1.04193521, + "balance_loss_mlp": 1.02355742, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 2.5813930023766614, + "language_loss": 0.88213837, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90351224, + "num_input_tokens_seen": 144185790, + "step": 6714, + "time_per_iteration": 2.5322084426879883 + }, + { + "auxiliary_loss_clip": 0.01120949, + "auxiliary_loss_mlp": 0.01041216, + "balance_loss_clip": 1.04860222, + "balance_loss_mlp": 1.02652383, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 1.59908817497887, + "language_loss": 0.69309819, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71471989, + "num_input_tokens_seen": 144205190, + "step": 6715, + "time_per_iteration": 2.5249485969543457 + }, + { + "auxiliary_loss_clip": 0.01089892, + "auxiliary_loss_mlp": 0.01037214, + "balance_loss_clip": 1.04280674, + "balance_loss_mlp": 1.02188349, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 2.5842866531611755, + "language_loss": 0.77416712, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79543817, + "num_input_tokens_seen": 144222705, + "step": 6716, + "time_per_iteration": 2.5586097240448 + }, + { + "auxiliary_loss_clip": 0.01085549, + "auxiliary_loss_mlp": 0.01040655, + "balance_loss_clip": 1.04570389, + "balance_loss_mlp": 1.02640986, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 1.947454663548841, + "language_loss": 0.76189077, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78315282, + "num_input_tokens_seen": 144239545, + "step": 6717, + "time_per_iteration": 2.6528944969177246 + }, + { + "auxiliary_loss_clip": 0.01030096, + "auxiliary_loss_mlp": 0.01008237, + "balance_loss_clip": 1.02629864, + "balance_loss_mlp": 1.00667524, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.936770430635935, + "language_loss": 0.60840893, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62879223, + "num_input_tokens_seen": 144288145, + "step": 6718, + "time_per_iteration": 2.9946389198303223 + }, + { + "auxiliary_loss_clip": 0.01137786, + "auxiliary_loss_mlp": 0.01040727, + "balance_loss_clip": 1.05001879, + "balance_loss_mlp": 1.02458572, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 2.8892769864438064, + "language_loss": 0.74594271, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.76772785, + "num_input_tokens_seen": 144302315, + "step": 6719, + "time_per_iteration": 3.939028739929199 + }, + { + "auxiliary_loss_clip": 0.01122747, + "auxiliary_loss_mlp": 0.01044003, + "balance_loss_clip": 1.04686594, + "balance_loss_mlp": 1.02765942, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 5.358966357131475, + "language_loss": 0.81123024, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83289778, + "num_input_tokens_seen": 144318990, + "step": 6720, + "time_per_iteration": 2.5216214656829834 + }, + { + "auxiliary_loss_clip": 0.01107203, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.04392111, + "balance_loss_mlp": 1.02103877, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 2.059757900288152, + "language_loss": 0.76788843, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.78932214, + "num_input_tokens_seen": 144335765, + "step": 6721, + "time_per_iteration": 2.573265552520752 + }, + { + "auxiliary_loss_clip": 0.01091952, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.04368651, + "balance_loss_mlp": 1.01860976, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 1.784321893092312, + "language_loss": 0.7242583, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74549651, + "num_input_tokens_seen": 144355825, + "step": 6722, + "time_per_iteration": 2.6603314876556396 + }, + { + "auxiliary_loss_clip": 0.01122743, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.05046678, + "balance_loss_mlp": 1.02050865, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.7873047429226119, + "language_loss": 0.65939409, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.68096721, + "num_input_tokens_seen": 144374320, + "step": 6723, + "time_per_iteration": 3.968855619430542 + }, + { + "auxiliary_loss_clip": 0.01120892, + "auxiliary_loss_mlp": 0.01044781, + "balance_loss_clip": 1.05109167, + "balance_loss_mlp": 1.0287472, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.7189036487074218, + "language_loss": 0.73525512, + "learning_rate": 2.701921353880734e-06, + "loss": 0.75691187, + "num_input_tokens_seen": 144394325, + "step": 6724, + "time_per_iteration": 2.4847002029418945 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.05198383, + "balance_loss_mlp": 1.02138019, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 2.0774876805252265, + "language_loss": 0.74814701, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.76955491, + "num_input_tokens_seen": 144412765, + "step": 6725, + "time_per_iteration": 2.5751242637634277 + }, + { + "auxiliary_loss_clip": 0.01116546, + "auxiliary_loss_mlp": 0.01037012, + "balance_loss_clip": 1.04812193, + "balance_loss_mlp": 1.02079916, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 1.5077160661160969, + "language_loss": 0.76844257, + "learning_rate": 2.701191924463126e-06, + "loss": 0.78997815, + "num_input_tokens_seen": 144435400, + "step": 6726, + "time_per_iteration": 2.690033197402954 + }, + { + "auxiliary_loss_clip": 0.01106944, + "auxiliary_loss_mlp": 0.00786103, + "balance_loss_clip": 1.04338205, + "balance_loss_mlp": 1.00075185, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 2.2395263415220117, + "language_loss": 0.81541961, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83435011, + "num_input_tokens_seen": 144452925, + "step": 6727, + "time_per_iteration": 3.934907913208008 + }, + { + "auxiliary_loss_clip": 0.01132797, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.04883432, + "balance_loss_mlp": 1.02309608, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 3.8150388930413475, + "language_loss": 0.85273778, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87444794, + "num_input_tokens_seen": 144470195, + "step": 6728, + "time_per_iteration": 2.3977558612823486 + }, + { + "auxiliary_loss_clip": 0.01099284, + "auxiliary_loss_mlp": 0.01041777, + "balance_loss_clip": 1.04740667, + "balance_loss_mlp": 1.02613688, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 1.7409312521956264, + "language_loss": 0.81594527, + "learning_rate": 2.700097580951786e-06, + "loss": 0.83735585, + "num_input_tokens_seen": 144490320, + "step": 6729, + "time_per_iteration": 2.5336999893188477 + }, + { + "auxiliary_loss_clip": 0.01108299, + "auxiliary_loss_mlp": 0.01045604, + "balance_loss_clip": 1.04495668, + "balance_loss_mlp": 1.03026152, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 1.7862645912135697, + "language_loss": 0.73428345, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.75582248, + "num_input_tokens_seen": 144508990, + "step": 6730, + "time_per_iteration": 2.502904176712036 + }, + { + "auxiliary_loss_clip": 0.01118905, + "auxiliary_loss_mlp": 0.01039166, + "balance_loss_clip": 1.04645348, + "balance_loss_mlp": 1.02420497, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 1.6460585472078064, + "language_loss": 0.6793983, + "learning_rate": 2.699367885848985e-06, + "loss": 0.70097905, + "num_input_tokens_seen": 144529550, + "step": 6731, + "time_per_iteration": 2.6018362045288086 + }, + { + "auxiliary_loss_clip": 0.01130231, + "auxiliary_loss_mlp": 0.01039838, + "balance_loss_clip": 1.04703283, + "balance_loss_mlp": 1.02606976, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.578485051458404, + "language_loss": 0.73970962, + "learning_rate": 2.699002998510517e-06, + "loss": 0.7614103, + "num_input_tokens_seen": 144549310, + "step": 6732, + "time_per_iteration": 2.4635372161865234 + }, + { + "auxiliary_loss_clip": 0.01104887, + "auxiliary_loss_mlp": 0.00779151, + "balance_loss_clip": 1.04901719, + "balance_loss_mlp": 1.0007081, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.691054782500838, + "language_loss": 0.77398086, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79282129, + "num_input_tokens_seen": 144567430, + "step": 6733, + "time_per_iteration": 2.5098564624786377 + }, + { + "auxiliary_loss_clip": 0.01104209, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_clip": 1.04127192, + "balance_loss_mlp": 1.02857828, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 1.8941000087619972, + "language_loss": 0.76850712, + "learning_rate": 2.698273144328627e-06, + "loss": 0.79000086, + "num_input_tokens_seen": 144585975, + "step": 6734, + "time_per_iteration": 2.517536163330078 + }, + { + "auxiliary_loss_clip": 0.01108878, + "auxiliary_loss_mlp": 0.01034795, + "balance_loss_clip": 1.04727221, + "balance_loss_mlp": 1.02007318, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 2.6393888333075224, + "language_loss": 0.65288723, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.67432392, + "num_input_tokens_seen": 144605225, + "step": 6735, + "time_per_iteration": 2.5013184547424316 + }, + { + "auxiliary_loss_clip": 0.01091314, + "auxiliary_loss_mlp": 0.01044469, + "balance_loss_clip": 1.04169464, + "balance_loss_mlp": 1.02969921, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 1.7424294144976546, + "language_loss": 0.83045322, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85181105, + "num_input_tokens_seen": 144624145, + "step": 6736, + "time_per_iteration": 2.5358588695526123 + }, + { + "auxiliary_loss_clip": 0.01104908, + "auxiliary_loss_mlp": 0.00781837, + "balance_loss_clip": 1.04842365, + "balance_loss_mlp": 1.00063455, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.5400876503599483, + "language_loss": 0.7507937, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.76966119, + "num_input_tokens_seen": 144644470, + "step": 6737, + "time_per_iteration": 2.572504758834839 + }, + { + "auxiliary_loss_clip": 0.01118432, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.04658437, + "balance_loss_mlp": 1.02883673, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 2.3333784580728727, + "language_loss": 0.72047246, + "learning_rate": 2.696813118332519e-06, + "loss": 0.74209106, + "num_input_tokens_seen": 144661055, + "step": 6738, + "time_per_iteration": 2.444162368774414 + }, + { + "auxiliary_loss_clip": 0.01097052, + "auxiliary_loss_mlp": 0.01037657, + "balance_loss_clip": 1.04475713, + "balance_loss_mlp": 1.02419853, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 3.3026246488230897, + "language_loss": 0.7505123, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77185941, + "num_input_tokens_seen": 144677935, + "step": 6739, + "time_per_iteration": 2.508301258087158 + }, + { + "auxiliary_loss_clip": 0.01097614, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.04715931, + "balance_loss_mlp": 1.02684581, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 1.771064342898284, + "language_loss": 0.74170947, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76310205, + "num_input_tokens_seen": 144697725, + "step": 6740, + "time_per_iteration": 2.5997085571289062 + }, + { + "auxiliary_loss_clip": 0.01113022, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.04380965, + "balance_loss_mlp": 1.02146053, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.5343534404931267, + "language_loss": 0.7724849, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79397511, + "num_input_tokens_seen": 144718805, + "step": 6741, + "time_per_iteration": 2.48401141166687 + }, + { + "auxiliary_loss_clip": 0.01132352, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.04718566, + "balance_loss_mlp": 1.02372575, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 1.8612127843438668, + "language_loss": 0.71101618, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.73273265, + "num_input_tokens_seen": 144737105, + "step": 6742, + "time_per_iteration": 2.4408986568450928 + }, + { + "auxiliary_loss_clip": 0.01132724, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.04845786, + "balance_loss_mlp": 1.01891398, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.438868088511654, + "language_loss": 0.72002614, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74168843, + "num_input_tokens_seen": 144751350, + "step": 6743, + "time_per_iteration": 2.446924924850464 + }, + { + "auxiliary_loss_clip": 0.01111678, + "auxiliary_loss_mlp": 0.01039218, + "balance_loss_clip": 1.04622841, + "balance_loss_mlp": 1.0232203, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 2.167322790933498, + "language_loss": 0.70589614, + "learning_rate": 2.694622286918588e-06, + "loss": 0.72740507, + "num_input_tokens_seen": 144770030, + "step": 6744, + "time_per_iteration": 4.078804016113281 + }, + { + "auxiliary_loss_clip": 0.01116931, + "auxiliary_loss_mlp": 0.01038481, + "balance_loss_clip": 1.04648733, + "balance_loss_mlp": 1.02536798, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 1.4332463302033271, + "language_loss": 0.79953337, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82108748, + "num_input_tokens_seen": 144790965, + "step": 6745, + "time_per_iteration": 2.542431354522705 + }, + { + "auxiliary_loss_clip": 0.01111729, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.04902482, + "balance_loss_mlp": 1.02592778, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 2.0193987527663575, + "language_loss": 0.66795981, + "learning_rate": 2.693891798911731e-06, + "loss": 0.68948925, + "num_input_tokens_seen": 144807755, + "step": 6746, + "time_per_iteration": 2.5136756896972656 + }, + { + "auxiliary_loss_clip": 0.01092374, + "auxiliary_loss_mlp": 0.01032865, + "balance_loss_clip": 1.04290533, + "balance_loss_mlp": 1.0188818, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.4792060769599633, + "language_loss": 0.56986809, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59112048, + "num_input_tokens_seen": 144832405, + "step": 6747, + "time_per_iteration": 2.725101947784424 + }, + { + "auxiliary_loss_clip": 0.01097912, + "auxiliary_loss_mlp": 0.01047353, + "balance_loss_clip": 1.04777169, + "balance_loss_mlp": 1.03332162, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.891069407131547, + "language_loss": 0.84469402, + "learning_rate": 2.693161205655089e-06, + "loss": 0.86614668, + "num_input_tokens_seen": 144853890, + "step": 6748, + "time_per_iteration": 2.603438377380371 + }, + { + "auxiliary_loss_clip": 0.01109171, + "auxiliary_loss_mlp": 0.01039241, + "balance_loss_clip": 1.05234528, + "balance_loss_mlp": 1.02412534, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 2.280854913046672, + "language_loss": 0.81659716, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83808124, + "num_input_tokens_seen": 144871395, + "step": 6749, + "time_per_iteration": 2.4738197326660156 + }, + { + "auxiliary_loss_clip": 0.01119067, + "auxiliary_loss_mlp": 0.00780978, + "balance_loss_clip": 1.04691434, + "balance_loss_mlp": 1.0005753, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.509998115935073, + "language_loss": 0.75133002, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77033043, + "num_input_tokens_seen": 144890975, + "step": 6750, + "time_per_iteration": 2.5026350021362305 + }, + { + "auxiliary_loss_clip": 0.01115416, + "auxiliary_loss_mlp": 0.01037429, + "balance_loss_clip": 1.04753256, + "balance_loss_mlp": 1.02189565, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 3.2147575050653114, + "language_loss": 0.73609036, + "learning_rate": 2.692065118669195e-06, + "loss": 0.75761878, + "num_input_tokens_seen": 144908170, + "step": 6751, + "time_per_iteration": 2.523360013961792 + }, + { + "auxiliary_loss_clip": 0.01083855, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_clip": 1.04424715, + "balance_loss_mlp": 1.03084111, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 2.0227360114272765, + "language_loss": 0.66823387, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.68954509, + "num_input_tokens_seen": 144928020, + "step": 6752, + "time_per_iteration": 2.615968704223633 + }, + { + "auxiliary_loss_clip": 0.0108613, + "auxiliary_loss_mlp": 0.01043044, + "balance_loss_clip": 1.04534805, + "balance_loss_mlp": 1.02667701, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 1.8478993916751516, + "language_loss": 0.70669699, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72798872, + "num_input_tokens_seen": 144951240, + "step": 6753, + "time_per_iteration": 2.8644306659698486 + }, + { + "auxiliary_loss_clip": 0.01109066, + "auxiliary_loss_mlp": 0.01041428, + "balance_loss_clip": 1.04358315, + "balance_loss_mlp": 1.02552557, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 2.391145042031159, + "language_loss": 0.72753537, + "learning_rate": 2.690968795494699e-06, + "loss": 0.74904025, + "num_input_tokens_seen": 144969100, + "step": 6754, + "time_per_iteration": 2.5401387214660645 + }, + { + "auxiliary_loss_clip": 0.01098981, + "auxiliary_loss_mlp": 0.01041019, + "balance_loss_clip": 1.04511034, + "balance_loss_mlp": 1.02640414, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.835258596742252, + "language_loss": 0.83031642, + "learning_rate": 2.690603302014844e-06, + "loss": 0.85171652, + "num_input_tokens_seen": 144987065, + "step": 6755, + "time_per_iteration": 2.6172142028808594 + }, + { + "auxiliary_loss_clip": 0.01086666, + "auxiliary_loss_mlp": 0.01042979, + "balance_loss_clip": 1.04614782, + "balance_loss_mlp": 1.02769041, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 11.67173747856701, + "language_loss": 0.70634174, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.72763824, + "num_input_tokens_seen": 145007310, + "step": 6756, + "time_per_iteration": 2.703535795211792 + }, + { + "auxiliary_loss_clip": 0.01070686, + "auxiliary_loss_mlp": 0.00782654, + "balance_loss_clip": 1.03852057, + "balance_loss_mlp": 1.00048876, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 1.9394877841298288, + "language_loss": 0.79264671, + "learning_rate": 2.689872236505755e-06, + "loss": 0.81118011, + "num_input_tokens_seen": 145026210, + "step": 6757, + "time_per_iteration": 2.7721104621887207 + }, + { + "auxiliary_loss_clip": 0.01111468, + "auxiliary_loss_mlp": 0.01035337, + "balance_loss_clip": 1.0498724, + "balance_loss_mlp": 1.02079916, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 1.8159761390951723, + "language_loss": 0.78616112, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.80762923, + "num_input_tokens_seen": 145045475, + "step": 6758, + "time_per_iteration": 4.182952165603638 + }, + { + "auxiliary_loss_clip": 0.01103404, + "auxiliary_loss_mlp": 0.01035245, + "balance_loss_clip": 1.05792308, + "balance_loss_mlp": 1.02048707, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 2.2418628418380924, + "language_loss": 0.88899601, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.91038245, + "num_input_tokens_seen": 145062260, + "step": 6759, + "time_per_iteration": 2.5264053344726562 + }, + { + "auxiliary_loss_clip": 0.01097251, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.04725146, + "balance_loss_mlp": 1.02402496, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 2.022983350207906, + "language_loss": 0.64288557, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66423702, + "num_input_tokens_seen": 145082470, + "step": 6760, + "time_per_iteration": 2.589153289794922 + }, + { + "auxiliary_loss_clip": 0.01120473, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.04492688, + "balance_loss_mlp": 1.02085268, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.4903250569745652, + "language_loss": 0.75091678, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77248514, + "num_input_tokens_seen": 145105685, + "step": 6761, + "time_per_iteration": 2.5732600688934326 + }, + { + "auxiliary_loss_clip": 0.01097223, + "auxiliary_loss_mlp": 0.01040673, + "balance_loss_clip": 1.04708171, + "balance_loss_mlp": 1.0268327, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 1.3933249296493073, + "language_loss": 0.70005345, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72143245, + "num_input_tokens_seen": 145125590, + "step": 6762, + "time_per_iteration": 4.031570911407471 + }, + { + "auxiliary_loss_clip": 0.01115539, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.04954004, + "balance_loss_mlp": 1.02151752, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 1.6379999983540008, + "language_loss": 0.73396373, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75547206, + "num_input_tokens_seen": 145146810, + "step": 6763, + "time_per_iteration": 2.5322766304016113 + }, + { + "auxiliary_loss_clip": 0.01092373, + "auxiliary_loss_mlp": 0.01036707, + "balance_loss_clip": 1.03931093, + "balance_loss_mlp": 1.02124584, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 2.0382991174176825, + "language_loss": 0.69072753, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71201837, + "num_input_tokens_seen": 145163130, + "step": 6764, + "time_per_iteration": 2.5046181678771973 + }, + { + "auxiliary_loss_clip": 0.01104384, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_clip": 1.04416895, + "balance_loss_mlp": 1.0251044, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 3.738310203239723, + "language_loss": 0.90783, + "learning_rate": 2.686946929177557e-06, + "loss": 0.92929268, + "num_input_tokens_seen": 145181420, + "step": 6765, + "time_per_iteration": 4.086086750030518 + }, + { + "auxiliary_loss_clip": 0.01119389, + "auxiliary_loss_mlp": 0.01044828, + "balance_loss_clip": 1.04423738, + "balance_loss_mlp": 1.02860403, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 2.5490672751680523, + "language_loss": 0.78938097, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.81102312, + "num_input_tokens_seen": 145198545, + "step": 6766, + "time_per_iteration": 2.5472724437713623 + }, + { + "auxiliary_loss_clip": 0.0113203, + "auxiliary_loss_mlp": 0.01043321, + "balance_loss_clip": 1.04597545, + "balance_loss_mlp": 1.02865875, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 2.5746886149571213, + "language_loss": 0.76702416, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78877771, + "num_input_tokens_seen": 145215835, + "step": 6767, + "time_per_iteration": 2.4891998767852783 + }, + { + "auxiliary_loss_clip": 0.0111989, + "auxiliary_loss_mlp": 0.01036349, + "balance_loss_clip": 1.04925334, + "balance_loss_mlp": 1.02199066, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 2.392174315669219, + "language_loss": 0.77902377, + "learning_rate": 2.685849508738034e-06, + "loss": 0.80058622, + "num_input_tokens_seen": 145236555, + "step": 6768, + "time_per_iteration": 2.5711441040039062 + }, + { + "auxiliary_loss_clip": 0.01130252, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.04765773, + "balance_loss_mlp": 1.02296495, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 6.985519474943736, + "language_loss": 0.87378234, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.89545465, + "num_input_tokens_seen": 145254595, + "step": 6769, + "time_per_iteration": 2.4241271018981934 + }, + { + "auxiliary_loss_clip": 0.01106708, + "auxiliary_loss_mlp": 0.01046865, + "balance_loss_clip": 1.04947877, + "balance_loss_mlp": 1.03204679, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 1.8099351885380137, + "language_loss": 0.81079865, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83233434, + "num_input_tokens_seen": 145274005, + "step": 6770, + "time_per_iteration": 2.499673366546631 + }, + { + "auxiliary_loss_clip": 0.01133151, + "auxiliary_loss_mlp": 0.01035392, + "balance_loss_clip": 1.04700315, + "balance_loss_mlp": 1.01972246, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.6379753985961771, + "language_loss": 0.8046633, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82634866, + "num_input_tokens_seen": 145294850, + "step": 6771, + "time_per_iteration": 2.4813072681427 + }, + { + "auxiliary_loss_clip": 0.01094538, + "auxiliary_loss_mlp": 0.01039241, + "balance_loss_clip": 1.04192376, + "balance_loss_mlp": 1.02487612, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.9570935841361166, + "language_loss": 0.76295167, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.78428948, + "num_input_tokens_seen": 145317050, + "step": 6772, + "time_per_iteration": 2.610694408416748 + }, + { + "auxiliary_loss_clip": 0.01107709, + "auxiliary_loss_mlp": 0.01043551, + "balance_loss_clip": 1.04483569, + "balance_loss_mlp": 1.02825618, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 2.0974007031919006, + "language_loss": 0.81544852, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83696115, + "num_input_tokens_seen": 145334480, + "step": 6773, + "time_per_iteration": 2.474452018737793 + }, + { + "auxiliary_loss_clip": 0.01029193, + "auxiliary_loss_mlp": 0.01010142, + "balance_loss_clip": 1.01859283, + "balance_loss_mlp": 1.00809121, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.8202424483493639, + "language_loss": 0.6430465, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66343987, + "num_input_tokens_seen": 145388695, + "step": 6774, + "time_per_iteration": 2.9903268814086914 + }, + { + "auxiliary_loss_clip": 0.01088002, + "auxiliary_loss_mlp": 0.01036189, + "balance_loss_clip": 1.04657972, + "balance_loss_mlp": 1.0215143, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 5.616238501864443, + "language_loss": 0.72294199, + "learning_rate": 2.683287951431446e-06, + "loss": 0.7441839, + "num_input_tokens_seen": 145408240, + "step": 6775, + "time_per_iteration": 2.619067907333374 + }, + { + "auxiliary_loss_clip": 0.01105526, + "auxiliary_loss_mlp": 0.00781642, + "balance_loss_clip": 1.0500257, + "balance_loss_mlp": 1.00062776, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.505955170401688, + "language_loss": 0.78103364, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.79990536, + "num_input_tokens_seen": 145428395, + "step": 6776, + "time_per_iteration": 2.5198538303375244 + }, + { + "auxiliary_loss_clip": 0.01124311, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.04883575, + "balance_loss_mlp": 1.02343607, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 3.7393860147042393, + "language_loss": 0.78666294, + "learning_rate": 2.682555844513981e-06, + "loss": 0.80828726, + "num_input_tokens_seen": 145448290, + "step": 6777, + "time_per_iteration": 2.5153489112854004 + }, + { + "auxiliary_loss_clip": 0.01048452, + "auxiliary_loss_mlp": 0.0100044, + "balance_loss_clip": 1.02067411, + "balance_loss_mlp": 0.99881852, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6821857650768077, + "language_loss": 0.53152233, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55201125, + "num_input_tokens_seen": 145509785, + "step": 6778, + "time_per_iteration": 3.06144380569458 + }, + { + "auxiliary_loss_clip": 0.01133149, + "auxiliary_loss_mlp": 0.00781187, + "balance_loss_clip": 1.05039561, + "balance_loss_mlp": 1.0006547, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 10.83828899537372, + "language_loss": 0.82799578, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84713912, + "num_input_tokens_seen": 145528620, + "step": 6779, + "time_per_iteration": 2.4899351596832275 + }, + { + "auxiliary_loss_clip": 0.01121949, + "auxiliary_loss_mlp": 0.0103592, + "balance_loss_clip": 1.04760003, + "balance_loss_mlp": 1.02081001, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.740698952782984, + "language_loss": 0.76231408, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78389275, + "num_input_tokens_seen": 145547775, + "step": 6780, + "time_per_iteration": 2.524460554122925 + }, + { + "auxiliary_loss_clip": 0.0111287, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.0473249, + "balance_loss_mlp": 1.02108359, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 3.5764759776604804, + "language_loss": 0.6638689, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68534243, + "num_input_tokens_seen": 145564465, + "step": 6781, + "time_per_iteration": 2.4606783390045166 + }, + { + "auxiliary_loss_clip": 0.01103892, + "auxiliary_loss_mlp": 0.01037119, + "balance_loss_clip": 1.04271054, + "balance_loss_mlp": 1.02217603, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 1.6642993440774863, + "language_loss": 0.71592498, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73733509, + "num_input_tokens_seen": 145585965, + "step": 6782, + "time_per_iteration": 2.614570140838623 + }, + { + "auxiliary_loss_clip": 0.01121422, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.04620767, + "balance_loss_mlp": 1.02147651, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 2.604165084479001, + "language_loss": 0.81688219, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.83845913, + "num_input_tokens_seen": 145605000, + "step": 6783, + "time_per_iteration": 4.034742593765259 + }, + { + "auxiliary_loss_clip": 0.01115693, + "auxiliary_loss_mlp": 0.01041797, + "balance_loss_clip": 1.04824913, + "balance_loss_mlp": 1.02733707, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 2.338635139601718, + "language_loss": 0.81124896, + "learning_rate": 2.679992655730283e-06, + "loss": 0.83282381, + "num_input_tokens_seen": 145623740, + "step": 6784, + "time_per_iteration": 2.480276346206665 + }, + { + "auxiliary_loss_clip": 0.01103579, + "auxiliary_loss_mlp": 0.01042042, + "balance_loss_clip": 1.04886651, + "balance_loss_mlp": 1.02615094, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 1.6900333881023553, + "language_loss": 0.6562987, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67775494, + "num_input_tokens_seen": 145643515, + "step": 6785, + "time_per_iteration": 2.562033176422119 + }, + { + "auxiliary_loss_clip": 0.01114704, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.04785001, + "balance_loss_mlp": 1.0211122, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 2.2112238733203413, + "language_loss": 0.79561496, + "learning_rate": 2.679260083800989e-06, + "loss": 0.81711406, + "num_input_tokens_seen": 145660890, + "step": 6786, + "time_per_iteration": 2.5156350135803223 + }, + { + "auxiliary_loss_clip": 0.01130101, + "auxiliary_loss_mlp": 0.01041995, + "balance_loss_clip": 1.04809523, + "balance_loss_mlp": 1.02865565, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 2.9816017343525547, + "language_loss": 0.81255054, + "learning_rate": 2.678893759192982e-06, + "loss": 0.83427155, + "num_input_tokens_seen": 145680070, + "step": 6787, + "time_per_iteration": 2.4402942657470703 + }, + { + "auxiliary_loss_clip": 0.01116463, + "auxiliary_loss_mlp": 0.01035059, + "balance_loss_clip": 1.04591656, + "balance_loss_mlp": 1.02112341, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 2.154623328178569, + "language_loss": 0.67375314, + "learning_rate": 2.678527408841255e-06, + "loss": 0.69526839, + "num_input_tokens_seen": 145698010, + "step": 6788, + "time_per_iteration": 2.4657909870147705 + }, + { + "auxiliary_loss_clip": 0.01100165, + "auxiliary_loss_mlp": 0.01052292, + "balance_loss_clip": 1.04128051, + "balance_loss_mlp": 1.03592503, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 2.2213919347055313, + "language_loss": 0.66351539, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68503994, + "num_input_tokens_seen": 145722215, + "step": 6789, + "time_per_iteration": 2.668421983718872 + }, + { + "auxiliary_loss_clip": 0.01087163, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.04419446, + "balance_loss_mlp": 1.02144957, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 2.6742343773623367, + "language_loss": 0.60512549, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62636089, + "num_input_tokens_seen": 145741090, + "step": 6790, + "time_per_iteration": 2.6088404655456543 + }, + { + "auxiliary_loss_clip": 0.01112831, + "auxiliary_loss_mlp": 0.01044929, + "balance_loss_clip": 1.04625297, + "balance_loss_mlp": 1.02938414, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 2.9709121170036177, + "language_loss": 0.70069814, + "learning_rate": 2.677428203462683e-06, + "loss": 0.72227573, + "num_input_tokens_seen": 145754985, + "step": 6791, + "time_per_iteration": 2.420055389404297 + }, + { + "auxiliary_loss_clip": 0.01036943, + "auxiliary_loss_mlp": 0.01006648, + "balance_loss_clip": 1.01898146, + "balance_loss_mlp": 1.0047766, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7432525308828215, + "language_loss": 0.59668648, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61712241, + "num_input_tokens_seen": 145815260, + "step": 6792, + "time_per_iteration": 3.0269646644592285 + }, + { + "auxiliary_loss_clip": 0.0113769, + "auxiliary_loss_mlp": 0.01044926, + "balance_loss_clip": 1.05158436, + "balance_loss_mlp": 1.02917814, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 1.6133853748414597, + "language_loss": 0.80007213, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82189828, + "num_input_tokens_seen": 145832665, + "step": 6793, + "time_per_iteration": 2.5049240589141846 + }, + { + "auxiliary_loss_clip": 0.01123401, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.04950154, + "balance_loss_mlp": 1.02094626, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 2.4871805185265345, + "language_loss": 0.849069, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87066609, + "num_input_tokens_seen": 145850240, + "step": 6794, + "time_per_iteration": 2.5199501514434814 + }, + { + "auxiliary_loss_clip": 0.01103771, + "auxiliary_loss_mlp": 0.0103996, + "balance_loss_clip": 1.0510484, + "balance_loss_mlp": 1.02486801, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.7797124790009153, + "language_loss": 0.80249429, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82393163, + "num_input_tokens_seen": 145869545, + "step": 6795, + "time_per_iteration": 2.541363477706909 + }, + { + "auxiliary_loss_clip": 0.01120738, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.04811001, + "balance_loss_mlp": 1.0228827, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 2.496817675767785, + "language_loss": 0.70359874, + "learning_rate": 2.675595680920792e-06, + "loss": 0.72519666, + "num_input_tokens_seen": 145884025, + "step": 6796, + "time_per_iteration": 2.4370696544647217 + }, + { + "auxiliary_loss_clip": 0.01115903, + "auxiliary_loss_mlp": 0.00785688, + "balance_loss_clip": 1.04545307, + "balance_loss_mlp": 1.00054955, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.9074225685807842, + "language_loss": 0.77661222, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.79562813, + "num_input_tokens_seen": 145903210, + "step": 6797, + "time_per_iteration": 3.9147143363952637 + }, + { + "auxiliary_loss_clip": 0.01121865, + "auxiliary_loss_mlp": 0.01045323, + "balance_loss_clip": 1.04647326, + "balance_loss_mlp": 1.03060055, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 1.864544091296408, + "language_loss": 0.8550806, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87675244, + "num_input_tokens_seen": 145920985, + "step": 6798, + "time_per_iteration": 2.4672651290893555 + }, + { + "auxiliary_loss_clip": 0.01129558, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.0487684, + "balance_loss_mlp": 1.029037, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.75977250088417, + "language_loss": 0.83934957, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86107004, + "num_input_tokens_seen": 145940350, + "step": 6799, + "time_per_iteration": 2.4735164642333984 + }, + { + "auxiliary_loss_clip": 0.01092682, + "auxiliary_loss_mlp": 0.01049116, + "balance_loss_clip": 1.04441452, + "balance_loss_mlp": 1.03223562, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.199441504135298, + "language_loss": 0.83461946, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85603738, + "num_input_tokens_seen": 145957460, + "step": 6800, + "time_per_iteration": 2.5399038791656494 + }, + { + "auxiliary_loss_clip": 0.01118471, + "auxiliary_loss_mlp": 0.01044149, + "balance_loss_clip": 1.04528427, + "balance_loss_mlp": 1.02904499, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 2.060677730535973, + "language_loss": 0.74470127, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.7663275, + "num_input_tokens_seen": 145975285, + "step": 6801, + "time_per_iteration": 4.007338523864746 + }, + { + "auxiliary_loss_clip": 0.01123489, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.04681659, + "balance_loss_mlp": 1.0209074, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 1.9694247857031546, + "language_loss": 0.79792684, + "learning_rate": 2.673395808607861e-06, + "loss": 0.81952173, + "num_input_tokens_seen": 145989150, + "step": 6802, + "time_per_iteration": 2.4637887477874756 + }, + { + "auxiliary_loss_clip": 0.0112443, + "auxiliary_loss_mlp": 0.010441, + "balance_loss_clip": 1.05402744, + "balance_loss_mlp": 1.02677858, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 2.2688490832466446, + "language_loss": 0.75590599, + "learning_rate": 2.673029073767934e-06, + "loss": 0.77759123, + "num_input_tokens_seen": 146006980, + "step": 6803, + "time_per_iteration": 2.4603271484375 + }, + { + "auxiliary_loss_clip": 0.01080649, + "auxiliary_loss_mlp": 0.00781935, + "balance_loss_clip": 1.05126703, + "balance_loss_mlp": 1.0006249, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 1.8476852005988462, + "language_loss": 0.78747362, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80609941, + "num_input_tokens_seen": 146025125, + "step": 6804, + "time_per_iteration": 2.5955843925476074 + }, + { + "auxiliary_loss_clip": 0.01134446, + "auxiliary_loss_mlp": 0.01041001, + "balance_loss_clip": 1.0471946, + "balance_loss_mlp": 1.02679133, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 2.2600783460822447, + "language_loss": 0.74923384, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77098835, + "num_input_tokens_seen": 146044990, + "step": 6805, + "time_per_iteration": 2.506012439727783 + }, + { + "auxiliary_loss_clip": 0.01087327, + "auxiliary_loss_mlp": 0.01039475, + "balance_loss_clip": 1.04278851, + "balance_loss_mlp": 1.02517593, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 1.6944646119723343, + "language_loss": 0.79289067, + "learning_rate": 2.671928716175804e-06, + "loss": 0.81415868, + "num_input_tokens_seen": 146066045, + "step": 6806, + "time_per_iteration": 4.014370679855347 + }, + { + "auxiliary_loss_clip": 0.01123111, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.04847765, + "balance_loss_mlp": 1.01867485, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 1.9779755382343567, + "language_loss": 0.71523511, + "learning_rate": 2.671561879334007e-06, + "loss": 0.73680502, + "num_input_tokens_seen": 146086280, + "step": 6807, + "time_per_iteration": 2.5164060592651367 + }, + { + "auxiliary_loss_clip": 0.01035021, + "auxiliary_loss_mlp": 0.0100319, + "balance_loss_clip": 1.03761744, + "balance_loss_mlp": 1.00131822, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8052325615503361, + "language_loss": 0.5877744, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60815644, + "num_input_tokens_seen": 146148840, + "step": 6808, + "time_per_iteration": 3.168560743331909 + }, + { + "auxiliary_loss_clip": 0.01113157, + "auxiliary_loss_mlp": 0.01043296, + "balance_loss_clip": 1.04834723, + "balance_loss_mlp": 1.02985489, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.7126130736670329, + "language_loss": 0.5461427, + "learning_rate": 2.670828129267242e-06, + "loss": 0.56770718, + "num_input_tokens_seen": 146166195, + "step": 6809, + "time_per_iteration": 2.5635368824005127 + }, + { + "auxiliary_loss_clip": 0.01106558, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.04363465, + "balance_loss_mlp": 1.01626444, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 3.84844718048821, + "language_loss": 0.83285087, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85421944, + "num_input_tokens_seen": 146185045, + "step": 6810, + "time_per_iteration": 2.555453300476074 + }, + { + "auxiliary_loss_clip": 0.0110732, + "auxiliary_loss_mlp": 0.01054044, + "balance_loss_clip": 1.04693758, + "balance_loss_mlp": 1.03758156, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.800160173290119, + "language_loss": 0.77440286, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79601645, + "num_input_tokens_seen": 146204655, + "step": 6811, + "time_per_iteration": 2.519258499145508 + }, + { + "auxiliary_loss_clip": 0.01135762, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.05116558, + "balance_loss_mlp": 1.02082551, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 2.017116237117309, + "language_loss": 0.70444143, + "learning_rate": 2.669727313417857e-06, + "loss": 0.72616255, + "num_input_tokens_seen": 146222000, + "step": 6812, + "time_per_iteration": 2.4367895126342773 + }, + { + "auxiliary_loss_clip": 0.01129585, + "auxiliary_loss_mlp": 0.01045693, + "balance_loss_clip": 1.04680824, + "balance_loss_mlp": 1.03048205, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 1.5981964804432258, + "language_loss": 0.66066158, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68241435, + "num_input_tokens_seen": 146242630, + "step": 6813, + "time_per_iteration": 2.482919931411743 + }, + { + "auxiliary_loss_clip": 0.01115735, + "auxiliary_loss_mlp": 0.0078147, + "balance_loss_clip": 1.04701734, + "balance_loss_mlp": 1.00058103, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 1.742744825893568, + "language_loss": 0.74160999, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.76058209, + "num_input_tokens_seen": 146263070, + "step": 6814, + "time_per_iteration": 2.5411148071289062 + }, + { + "auxiliary_loss_clip": 0.01085947, + "auxiliary_loss_mlp": 0.01037403, + "balance_loss_clip": 1.04473448, + "balance_loss_mlp": 1.02195299, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 2.7555032395666283, + "language_loss": 0.66276145, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68399489, + "num_input_tokens_seen": 146282890, + "step": 6815, + "time_per_iteration": 2.591198205947876 + }, + { + "auxiliary_loss_clip": 0.01123333, + "auxiliary_loss_mlp": 0.01045137, + "balance_loss_clip": 1.05381942, + "balance_loss_mlp": 1.03117776, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.7643079036184601, + "language_loss": 0.77085716, + "learning_rate": 2.668259203471188e-06, + "loss": 0.79254186, + "num_input_tokens_seen": 146301755, + "step": 6816, + "time_per_iteration": 2.5336201190948486 + }, + { + "auxiliary_loss_clip": 0.01116359, + "auxiliary_loss_mlp": 0.0104502, + "balance_loss_clip": 1.04936397, + "balance_loss_mlp": 1.03023863, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.725812859287105, + "language_loss": 0.81641716, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.83803093, + "num_input_tokens_seen": 146316835, + "step": 6817, + "time_per_iteration": 2.478060007095337 + }, + { + "auxiliary_loss_clip": 0.01114567, + "auxiliary_loss_mlp": 0.0104938, + "balance_loss_clip": 1.04725003, + "balance_loss_mlp": 1.03267884, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 1.5527950331632865, + "language_loss": 0.79856968, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82020915, + "num_input_tokens_seen": 146336650, + "step": 6818, + "time_per_iteration": 2.563488245010376 + }, + { + "auxiliary_loss_clip": 0.01108781, + "auxiliary_loss_mlp": 0.01038263, + "balance_loss_clip": 1.05107248, + "balance_loss_mlp": 1.02440512, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.8098489623320553, + "language_loss": 0.66243088, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68390131, + "num_input_tokens_seen": 146357640, + "step": 6819, + "time_per_iteration": 2.6437573432922363 + }, + { + "auxiliary_loss_clip": 0.01111684, + "auxiliary_loss_mlp": 0.01054995, + "balance_loss_clip": 1.04709065, + "balance_loss_mlp": 1.03770959, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 1.7883771663068624, + "language_loss": 0.85209996, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87376678, + "num_input_tokens_seen": 146379325, + "step": 6820, + "time_per_iteration": 2.543962240219116 + }, + { + "auxiliary_loss_clip": 0.01121359, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.04950345, + "balance_loss_mlp": 1.02176499, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.7025431327251896, + "language_loss": 0.70878625, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73035574, + "num_input_tokens_seen": 146398635, + "step": 6821, + "time_per_iteration": 2.51754093170166 + }, + { + "auxiliary_loss_clip": 0.01119075, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.05029726, + "balance_loss_mlp": 1.02385497, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 1.7950452551023268, + "language_loss": 0.74827653, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.7698437, + "num_input_tokens_seen": 146417585, + "step": 6822, + "time_per_iteration": 2.4880762100219727 + }, + { + "auxiliary_loss_clip": 0.01112405, + "auxiliary_loss_mlp": 0.01039864, + "balance_loss_clip": 1.04763472, + "balance_loss_mlp": 1.02564847, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 1.9900496254866813, + "language_loss": 0.75625497, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.77777767, + "num_input_tokens_seen": 146437035, + "step": 6823, + "time_per_iteration": 4.026854038238525 + }, + { + "auxiliary_loss_clip": 0.01094646, + "auxiliary_loss_mlp": 0.0104235, + "balance_loss_clip": 1.05482411, + "balance_loss_mlp": 1.02591145, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 3.237075118042896, + "language_loss": 0.73107296, + "learning_rate": 2.665321768127001e-06, + "loss": 0.7524429, + "num_input_tokens_seen": 146457370, + "step": 6824, + "time_per_iteration": 2.649921178817749 + }, + { + "auxiliary_loss_clip": 0.01104062, + "auxiliary_loss_mlp": 0.0103463, + "balance_loss_clip": 1.04676127, + "balance_loss_mlp": 1.01910877, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 2.013535819794579, + "language_loss": 0.72091246, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.74229944, + "num_input_tokens_seen": 146478105, + "step": 6825, + "time_per_iteration": 2.62996506690979 + }, + { + "auxiliary_loss_clip": 0.01092165, + "auxiliary_loss_mlp": 0.01039713, + "balance_loss_clip": 1.04321694, + "balance_loss_mlp": 1.02593255, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 1.8532623263064159, + "language_loss": 0.84389198, + "learning_rate": 2.664587156721768e-06, + "loss": 0.86521077, + "num_input_tokens_seen": 146497835, + "step": 6826, + "time_per_iteration": 2.5748021602630615 + }, + { + "auxiliary_loss_clip": 0.01111475, + "auxiliary_loss_mlp": 0.00780889, + "balance_loss_clip": 1.05111361, + "balance_loss_mlp": 1.00068212, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.7784447573956905, + "language_loss": 0.67068183, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68960547, + "num_input_tokens_seen": 146517735, + "step": 6827, + "time_per_iteration": 2.554108142852783 + }, + { + "auxiliary_loss_clip": 0.01109366, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.04637957, + "balance_loss_mlp": 1.02181745, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 1.4488205385146289, + "language_loss": 0.7211318, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74257994, + "num_input_tokens_seen": 146537640, + "step": 6828, + "time_per_iteration": 2.5429863929748535 + }, + { + "auxiliary_loss_clip": 0.01107731, + "auxiliary_loss_mlp": 0.01046494, + "balance_loss_clip": 1.04773426, + "balance_loss_mlp": 1.03069854, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 1.7809152318130785, + "language_loss": 0.83131099, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85285318, + "num_input_tokens_seen": 146554695, + "step": 6829, + "time_per_iteration": 2.488375186920166 + }, + { + "auxiliary_loss_clip": 0.011237, + "auxiliary_loss_mlp": 0.01037346, + "balance_loss_clip": 1.05167317, + "balance_loss_mlp": 1.02370203, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.581757220077353, + "language_loss": 0.89711761, + "learning_rate": 2.663117631608206e-06, + "loss": 0.91872811, + "num_input_tokens_seen": 146573740, + "step": 6830, + "time_per_iteration": 2.5170159339904785 + }, + { + "auxiliary_loss_clip": 0.01095824, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.04767013, + "balance_loss_mlp": 1.01859093, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 3.1930033962188964, + "language_loss": 0.65381074, + "learning_rate": 2.662750187431268e-06, + "loss": 0.67509925, + "num_input_tokens_seen": 146592885, + "step": 6831, + "time_per_iteration": 2.6051981449127197 + }, + { + "auxiliary_loss_clip": 0.01131414, + "auxiliary_loss_mlp": 0.01037536, + "balance_loss_clip": 1.04955053, + "balance_loss_mlp": 1.02408361, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 2.0039918311989444, + "language_loss": 0.69997454, + "learning_rate": 2.662382718122776e-06, + "loss": 0.72166395, + "num_input_tokens_seen": 146611995, + "step": 6832, + "time_per_iteration": 2.5538253784179688 + }, + { + "auxiliary_loss_clip": 0.01089778, + "auxiliary_loss_mlp": 0.01041767, + "balance_loss_clip": 1.05111802, + "balance_loss_mlp": 1.02802217, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 9.793294597601932, + "language_loss": 0.74186146, + "learning_rate": 2.662015223696666e-06, + "loss": 0.76317692, + "num_input_tokens_seen": 146628045, + "step": 6833, + "time_per_iteration": 2.6038317680358887 + }, + { + "auxiliary_loss_clip": 0.01091101, + "auxiliary_loss_mlp": 0.01041092, + "balance_loss_clip": 1.04965639, + "balance_loss_mlp": 1.02445078, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 2.1816031308142936, + "language_loss": 0.72526783, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.74658978, + "num_input_tokens_seen": 146648355, + "step": 6834, + "time_per_iteration": 2.6539793014526367 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01047023, + "balance_loss_clip": 1.04798317, + "balance_loss_mlp": 1.03160942, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 7.324863970344533, + "language_loss": 0.71444529, + "learning_rate": 2.661280159547329e-06, + "loss": 0.73616278, + "num_input_tokens_seen": 146668370, + "step": 6835, + "time_per_iteration": 2.513561487197876 + }, + { + "auxiliary_loss_clip": 0.01123213, + "auxiliary_loss_mlp": 0.01038339, + "balance_loss_clip": 1.04667759, + "balance_loss_mlp": 1.02251959, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 2.4589932733904285, + "language_loss": 0.87141824, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89303374, + "num_input_tokens_seen": 146686665, + "step": 6836, + "time_per_iteration": 2.4910826683044434 + }, + { + "auxiliary_loss_clip": 0.01117981, + "auxiliary_loss_mlp": 0.01036777, + "balance_loss_clip": 1.04668963, + "balance_loss_mlp": 1.02231741, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 2.6433016528837747, + "language_loss": 0.68601346, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.70756102, + "num_input_tokens_seen": 146706570, + "step": 6837, + "time_per_iteration": 3.9463694095611572 + }, + { + "auxiliary_loss_clip": 0.01134994, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.04946971, + "balance_loss_mlp": 1.02519011, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 2.049531419755554, + "language_loss": 0.75304806, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77480185, + "num_input_tokens_seen": 146723425, + "step": 6838, + "time_per_iteration": 2.443586826324463 + }, + { + "auxiliary_loss_clip": 0.01101823, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.0553081, + "balance_loss_mlp": 1.02355456, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 1.875918133843088, + "language_loss": 0.82190824, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84331179, + "num_input_tokens_seen": 146741640, + "step": 6839, + "time_per_iteration": 2.5442123413085938 + }, + { + "auxiliary_loss_clip": 0.01130441, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.04734588, + "balance_loss_mlp": 1.0227294, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 2.0867437874415464, + "language_loss": 0.80512846, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82679975, + "num_input_tokens_seen": 146759195, + "step": 6840, + "time_per_iteration": 2.4477274417877197 + }, + { + "auxiliary_loss_clip": 0.011159, + "auxiliary_loss_mlp": 0.01036171, + "balance_loss_clip": 1.04581499, + "balance_loss_mlp": 1.02234328, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 2.3130545989247726, + "language_loss": 0.6775744, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.69909507, + "num_input_tokens_seen": 146774990, + "step": 6841, + "time_per_iteration": 3.951570510864258 + }, + { + "auxiliary_loss_clip": 0.01038084, + "auxiliary_loss_mlp": 0.01019355, + "balance_loss_clip": 1.02013052, + "balance_loss_mlp": 1.01736474, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.7843272174533842, + "language_loss": 0.59658521, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61715961, + "num_input_tokens_seen": 146839610, + "step": 6842, + "time_per_iteration": 3.1396102905273438 + }, + { + "auxiliary_loss_clip": 0.01113206, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.04845285, + "balance_loss_mlp": 1.02174044, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 2.3369334707761804, + "language_loss": 0.69656962, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.71804702, + "num_input_tokens_seen": 146857360, + "step": 6843, + "time_per_iteration": 2.4565579891204834 + }, + { + "auxiliary_loss_clip": 0.01025153, + "auxiliary_loss_mlp": 0.01010739, + "balance_loss_clip": 1.02954125, + "balance_loss_mlp": 1.00868821, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.717875224097716, + "language_loss": 0.53657365, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55693257, + "num_input_tokens_seen": 146917055, + "step": 6844, + "time_per_iteration": 3.0956945419311523 + }, + { + "auxiliary_loss_clip": 0.01122234, + "auxiliary_loss_mlp": 0.01037578, + "balance_loss_clip": 1.05174899, + "balance_loss_mlp": 1.02359509, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 1.897032985240273, + "language_loss": 0.66195053, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68354869, + "num_input_tokens_seen": 146935215, + "step": 6845, + "time_per_iteration": 3.824028253555298 + }, + { + "auxiliary_loss_clip": 0.01130335, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.0501411, + "balance_loss_mlp": 1.02022338, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.9675659451478171, + "language_loss": 0.7003969, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72204161, + "num_input_tokens_seen": 146951970, + "step": 6846, + "time_per_iteration": 2.412722587585449 + }, + { + "auxiliary_loss_clip": 0.01105584, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.0458982, + "balance_loss_mlp": 1.02427244, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.5103001464741244, + "language_loss": 0.64735478, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.66880339, + "num_input_tokens_seen": 146975615, + "step": 6847, + "time_per_iteration": 2.592137336730957 + }, + { + "auxiliary_loss_clip": 0.01108927, + "auxiliary_loss_mlp": 0.01040734, + "balance_loss_clip": 1.04780006, + "balance_loss_mlp": 1.02620268, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.377070319661822, + "language_loss": 0.70429707, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72579366, + "num_input_tokens_seen": 146998855, + "step": 6848, + "time_per_iteration": 2.6282799243927 + }, + { + "auxiliary_loss_clip": 0.01029589, + "auxiliary_loss_mlp": 0.0075655, + "balance_loss_clip": 1.02056909, + "balance_loss_mlp": 1.00036705, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8941631086927507, + "language_loss": 0.56170356, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.57956493, + "num_input_tokens_seen": 147062710, + "step": 6849, + "time_per_iteration": 3.153343439102173 + }, + { + "auxiliary_loss_clip": 0.01111508, + "auxiliary_loss_mlp": 0.01036939, + "balance_loss_clip": 1.04707563, + "balance_loss_mlp": 1.02214503, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.9211328136734587, + "language_loss": 0.75946832, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78095281, + "num_input_tokens_seen": 147086075, + "step": 6850, + "time_per_iteration": 2.6278371810913086 + }, + { + "auxiliary_loss_clip": 0.01080831, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.04253256, + "balance_loss_mlp": 1.0182426, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.5898684870046516, + "language_loss": 0.6808486, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.70197624, + "num_input_tokens_seen": 147107590, + "step": 6851, + "time_per_iteration": 2.6857736110687256 + }, + { + "auxiliary_loss_clip": 0.01100089, + "auxiliary_loss_mlp": 0.01042398, + "balance_loss_clip": 1.05168986, + "balance_loss_mlp": 1.02591109, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.2151801999669156, + "language_loss": 0.79611701, + "learning_rate": 2.655028075792743e-06, + "loss": 0.8175419, + "num_input_tokens_seen": 147123715, + "step": 6852, + "time_per_iteration": 2.525991916656494 + }, + { + "auxiliary_loss_clip": 0.01135331, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.04953957, + "balance_loss_mlp": 1.01938963, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 3.9835065605541167, + "language_loss": 0.77440625, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.79610318, + "num_input_tokens_seen": 147144290, + "step": 6853, + "time_per_iteration": 2.4975764751434326 + }, + { + "auxiliary_loss_clip": 0.0112139, + "auxiliary_loss_mlp": 0.01042117, + "balance_loss_clip": 1.04467249, + "balance_loss_mlp": 1.02539182, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.7850718890903805, + "language_loss": 0.66264111, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.68427622, + "num_input_tokens_seen": 147166340, + "step": 6854, + "time_per_iteration": 2.6150436401367188 + }, + { + "auxiliary_loss_clip": 0.01102044, + "auxiliary_loss_mlp": 0.01038721, + "balance_loss_clip": 1.04406226, + "balance_loss_mlp": 1.02387905, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.683947308609086, + "language_loss": 0.83589876, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85730642, + "num_input_tokens_seen": 147184025, + "step": 6855, + "time_per_iteration": 2.513991355895996 + }, + { + "auxiliary_loss_clip": 0.01114389, + "auxiliary_loss_mlp": 0.01041855, + "balance_loss_clip": 1.04825449, + "balance_loss_mlp": 1.02810979, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.805542440178534, + "language_loss": 0.78878939, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81035185, + "num_input_tokens_seen": 147202730, + "step": 6856, + "time_per_iteration": 2.468108654022217 + }, + { + "auxiliary_loss_clip": 0.01096912, + "auxiliary_loss_mlp": 0.01037183, + "balance_loss_clip": 1.04562914, + "balance_loss_mlp": 1.02274096, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 2.2284761521721355, + "language_loss": 0.79634976, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.81769073, + "num_input_tokens_seen": 147215315, + "step": 6857, + "time_per_iteration": 2.49820613861084 + }, + { + "auxiliary_loss_clip": 0.0111971, + "auxiliary_loss_mlp": 0.00780376, + "balance_loss_clip": 1.04571462, + "balance_loss_mlp": 1.00042379, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 1.7968169973195, + "language_loss": 0.70916259, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72816348, + "num_input_tokens_seen": 147233330, + "step": 6858, + "time_per_iteration": 2.520972490310669 + }, + { + "auxiliary_loss_clip": 0.01119408, + "auxiliary_loss_mlp": 0.01038169, + "balance_loss_clip": 1.04806852, + "balance_loss_mlp": 1.02287459, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.515034524891982, + "language_loss": 0.59044635, + "learning_rate": 2.652451598005391e-06, + "loss": 0.6120221, + "num_input_tokens_seen": 147257780, + "step": 6859, + "time_per_iteration": 2.711524724960327 + }, + { + "auxiliary_loss_clip": 0.01129646, + "auxiliary_loss_mlp": 0.01036666, + "balance_loss_clip": 1.04547048, + "balance_loss_mlp": 1.02229548, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.511960927124497, + "language_loss": 0.74387443, + "learning_rate": 2.652083430674264e-06, + "loss": 0.76553756, + "num_input_tokens_seen": 147276055, + "step": 6860, + "time_per_iteration": 2.414139986038208 + }, + { + "auxiliary_loss_clip": 0.01059392, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.04604316, + "balance_loss_mlp": 1.02060771, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.6147316654856865, + "language_loss": 0.74366379, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76460576, + "num_input_tokens_seen": 147293200, + "step": 6861, + "time_per_iteration": 2.707531452178955 + }, + { + "auxiliary_loss_clip": 0.0110778, + "auxiliary_loss_mlp": 0.0103418, + "balance_loss_clip": 1.04453802, + "balance_loss_mlp": 1.02089369, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 3.733489849073477, + "language_loss": 0.79457325, + "learning_rate": 2.651347021844765e-06, + "loss": 0.81599283, + "num_input_tokens_seen": 147310640, + "step": 6862, + "time_per_iteration": 4.146456718444824 + }, + { + "auxiliary_loss_clip": 0.01103889, + "auxiliary_loss_mlp": 0.01040153, + "balance_loss_clip": 1.04240286, + "balance_loss_mlp": 1.02556133, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.9056803846727441, + "language_loss": 0.76208234, + "learning_rate": 2.650978780374318e-06, + "loss": 0.78352278, + "num_input_tokens_seen": 147329435, + "step": 6863, + "time_per_iteration": 2.5161972045898438 + }, + { + "auxiliary_loss_clip": 0.0103647, + "auxiliary_loss_mlp": 0.01016159, + "balance_loss_clip": 1.01703501, + "balance_loss_mlp": 1.01390612, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.7054920985861795, + "language_loss": 0.52699637, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54752266, + "num_input_tokens_seen": 147385805, + "step": 6864, + "time_per_iteration": 3.039875030517578 + }, + { + "auxiliary_loss_clip": 0.01133347, + "auxiliary_loss_mlp": 0.01038129, + "balance_loss_clip": 1.04771757, + "balance_loss_mlp": 1.02352571, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 1.8709780799061468, + "language_loss": 0.72180188, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.74351662, + "num_input_tokens_seen": 147405160, + "step": 6865, + "time_per_iteration": 2.4795310497283936 + }, + { + "auxiliary_loss_clip": 0.01045312, + "auxiliary_loss_mlp": 0.01005442, + "balance_loss_clip": 1.01778162, + "balance_loss_mlp": 1.00352311, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9262823079780914, + "language_loss": 0.6661731, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68668061, + "num_input_tokens_seen": 147460245, + "step": 6866, + "time_per_iteration": 2.8965084552764893 + }, + { + "auxiliary_loss_clip": 0.0113128, + "auxiliary_loss_mlp": 0.01040308, + "balance_loss_clip": 1.04817832, + "balance_loss_mlp": 1.0264678, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 7.596251769804548, + "language_loss": 0.81392342, + "learning_rate": 2.649505567780375e-06, + "loss": 0.83563924, + "num_input_tokens_seen": 147476200, + "step": 6867, + "time_per_iteration": 2.412855386734009 + }, + { + "auxiliary_loss_clip": 0.01111688, + "auxiliary_loss_mlp": 0.01038673, + "balance_loss_clip": 1.0473367, + "balance_loss_mlp": 1.02418935, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.344763548851761, + "language_loss": 0.77678269, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.79828632, + "num_input_tokens_seen": 147494315, + "step": 6868, + "time_per_iteration": 2.545408010482788 + }, + { + "auxiliary_loss_clip": 0.01035251, + "auxiliary_loss_mlp": 0.01003283, + "balance_loss_clip": 1.0179193, + "balance_loss_mlp": 1.00168538, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8416507599328402, + "language_loss": 0.57832283, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59870815, + "num_input_tokens_seen": 147543665, + "step": 6869, + "time_per_iteration": 2.793835163116455 + }, + { + "auxiliary_loss_clip": 0.01121479, + "auxiliary_loss_mlp": 0.01039643, + "balance_loss_clip": 1.05105662, + "balance_loss_mlp": 1.02542162, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 2.4203577712106656, + "language_loss": 0.75270039, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77431166, + "num_input_tokens_seen": 147564870, + "step": 6870, + "time_per_iteration": 2.525940418243408 + }, + { + "auxiliary_loss_clip": 0.01099528, + "auxiliary_loss_mlp": 0.01044509, + "balance_loss_clip": 1.0469116, + "balance_loss_mlp": 1.02994132, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.8840606484008167, + "language_loss": 0.83470732, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85614765, + "num_input_tokens_seen": 147584840, + "step": 6871, + "time_per_iteration": 2.5591652393341064 + }, + { + "auxiliary_loss_clip": 0.01102989, + "auxiliary_loss_mlp": 0.01042585, + "balance_loss_clip": 1.04874825, + "balance_loss_mlp": 1.02773106, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 3.052052037025225, + "language_loss": 0.68592465, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.70738041, + "num_input_tokens_seen": 147604635, + "step": 6872, + "time_per_iteration": 2.5909264087677 + }, + { + "auxiliary_loss_clip": 0.01115342, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.04845548, + "balance_loss_mlp": 1.02151227, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 2.314904687139901, + "language_loss": 0.75518703, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.77669132, + "num_input_tokens_seen": 147620700, + "step": 6873, + "time_per_iteration": 2.5025999546051025 + }, + { + "auxiliary_loss_clip": 0.01105806, + "auxiliary_loss_mlp": 0.01039508, + "balance_loss_clip": 1.04408526, + "balance_loss_mlp": 1.02441025, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 3.373834690845021, + "language_loss": 0.83095104, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.85240418, + "num_input_tokens_seen": 147639490, + "step": 6874, + "time_per_iteration": 2.5243332386016846 + }, + { + "auxiliary_loss_clip": 0.01097741, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.04524398, + "balance_loss_mlp": 1.02237797, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 3.5868777242022234, + "language_loss": 0.72067875, + "learning_rate": 2.646557961279436e-06, + "loss": 0.74202645, + "num_input_tokens_seen": 147657205, + "step": 6875, + "time_per_iteration": 2.539741039276123 + }, + { + "auxiliary_loss_clip": 0.01100162, + "auxiliary_loss_mlp": 0.01044116, + "balance_loss_clip": 1.04425371, + "balance_loss_mlp": 1.0307405, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 1.7275262805817526, + "language_loss": 0.8286562, + "learning_rate": 2.646189399991154e-06, + "loss": 0.85009897, + "num_input_tokens_seen": 147677005, + "step": 6876, + "time_per_iteration": 2.5516409873962402 + }, + { + "auxiliary_loss_clip": 0.01118112, + "auxiliary_loss_mlp": 0.01042612, + "balance_loss_clip": 1.04779768, + "balance_loss_mlp": 1.02662623, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.867290488704532, + "language_loss": 0.65048403, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.67209125, + "num_input_tokens_seen": 147693435, + "step": 6877, + "time_per_iteration": 3.979604959487915 + }, + { + "auxiliary_loss_clip": 0.01120923, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.04939806, + "balance_loss_mlp": 1.0231142, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 1.7073600963476359, + "language_loss": 0.76321006, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78479373, + "num_input_tokens_seen": 147714000, + "step": 6878, + "time_per_iteration": 2.4900853633880615 + }, + { + "auxiliary_loss_clip": 0.0111938, + "auxiliary_loss_mlp": 0.00780387, + "balance_loss_clip": 1.04768968, + "balance_loss_mlp": 1.00061035, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.9586192486679876, + "language_loss": 0.79771519, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.81671292, + "num_input_tokens_seen": 147731010, + "step": 6879, + "time_per_iteration": 2.499093532562256 + }, + { + "auxiliary_loss_clip": 0.01131647, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.04835415, + "balance_loss_mlp": 1.02406025, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 2.437830586847007, + "language_loss": 0.84900886, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.87071311, + "num_input_tokens_seen": 147750880, + "step": 6880, + "time_per_iteration": 3.98217511177063 + }, + { + "auxiliary_loss_clip": 0.01110479, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.04692888, + "balance_loss_mlp": 1.01825953, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.6984496839794545, + "language_loss": 0.70310867, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72453535, + "num_input_tokens_seen": 147771360, + "step": 6881, + "time_per_iteration": 2.5263254642486572 + }, + { + "auxiliary_loss_clip": 0.01132733, + "auxiliary_loss_mlp": 0.01041213, + "balance_loss_clip": 1.0514462, + "balance_loss_mlp": 1.02792764, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 1.7956613064174538, + "language_loss": 0.81279778, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83453727, + "num_input_tokens_seen": 147787440, + "step": 6882, + "time_per_iteration": 2.41384220123291 + }, + { + "auxiliary_loss_clip": 0.01109979, + "auxiliary_loss_mlp": 0.01048864, + "balance_loss_clip": 1.045892, + "balance_loss_mlp": 1.03203809, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 2.51255159069827, + "language_loss": 0.69163531, + "learning_rate": 2.643608785656077e-06, + "loss": 0.7132237, + "num_input_tokens_seen": 147805720, + "step": 6883, + "time_per_iteration": 2.5016918182373047 + }, + { + "auxiliary_loss_clip": 0.01119999, + "auxiliary_loss_mlp": 0.01037137, + "balance_loss_clip": 1.04570079, + "balance_loss_mlp": 1.02315402, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 1.7909044039815354, + "language_loss": 0.75409132, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77566266, + "num_input_tokens_seen": 147824605, + "step": 6884, + "time_per_iteration": 2.479238271713257 + }, + { + "auxiliary_loss_clip": 0.01098752, + "auxiliary_loss_mlp": 0.0103996, + "balance_loss_clip": 1.04678893, + "balance_loss_mlp": 1.02517772, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.9659131462863113, + "language_loss": 0.75957692, + "learning_rate": 2.642871247413523e-06, + "loss": 0.78096402, + "num_input_tokens_seen": 147845445, + "step": 6885, + "time_per_iteration": 3.9884307384490967 + }, + { + "auxiliary_loss_clip": 0.01134476, + "auxiliary_loss_mlp": 0.01041646, + "balance_loss_clip": 1.04889405, + "balance_loss_mlp": 1.02682853, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 1.7860381625313901, + "language_loss": 0.69672477, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.71848595, + "num_input_tokens_seen": 147865580, + "step": 6886, + "time_per_iteration": 2.4701015949249268 + }, + { + "auxiliary_loss_clip": 0.01134745, + "auxiliary_loss_mlp": 0.00781271, + "balance_loss_clip": 1.05052495, + "balance_loss_mlp": 1.00048041, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 1.634602763770718, + "language_loss": 0.75558698, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77474713, + "num_input_tokens_seen": 147885230, + "step": 6887, + "time_per_iteration": 2.4403183460235596 + }, + { + "auxiliary_loss_clip": 0.01119782, + "auxiliary_loss_mlp": 0.01033987, + "balance_loss_clip": 1.04655313, + "balance_loss_mlp": 1.0194912, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 2.252950200255508, + "language_loss": 0.7068035, + "learning_rate": 2.641764757251592e-06, + "loss": 0.7283411, + "num_input_tokens_seen": 147903035, + "step": 6888, + "time_per_iteration": 2.4574081897735596 + }, + { + "auxiliary_loss_clip": 0.01129023, + "auxiliary_loss_mlp": 0.01039196, + "balance_loss_clip": 1.04647541, + "balance_loss_mlp": 1.02485514, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 2.0109739037418084, + "language_loss": 0.7629323, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.78461444, + "num_input_tokens_seen": 147918745, + "step": 6889, + "time_per_iteration": 2.412194013595581 + }, + { + "auxiliary_loss_clip": 0.01098894, + "auxiliary_loss_mlp": 0.00779221, + "balance_loss_clip": 1.05134892, + "balance_loss_mlp": 1.00052023, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 1.6269660624681963, + "language_loss": 0.80111438, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.81989551, + "num_input_tokens_seen": 147938265, + "step": 6890, + "time_per_iteration": 2.5907135009765625 + }, + { + "auxiliary_loss_clip": 0.01130276, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.04866028, + "balance_loss_mlp": 1.02392554, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 2.0416197143350114, + "language_loss": 0.74387932, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76557249, + "num_input_tokens_seen": 147957320, + "step": 6891, + "time_per_iteration": 2.445692300796509 + }, + { + "auxiliary_loss_clip": 0.01099057, + "auxiliary_loss_mlp": 0.01041642, + "balance_loss_clip": 1.05381966, + "balance_loss_mlp": 1.02567363, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.8622150535220166, + "language_loss": 0.84142864, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86283565, + "num_input_tokens_seen": 147977045, + "step": 6892, + "time_per_iteration": 2.5892539024353027 + }, + { + "auxiliary_loss_clip": 0.01086402, + "auxiliary_loss_mlp": 0.00784395, + "balance_loss_clip": 1.04253769, + "balance_loss_mlp": 1.00042498, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.6545977427559408, + "language_loss": 0.70369792, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72240591, + "num_input_tokens_seen": 147996905, + "step": 6893, + "time_per_iteration": 2.669403314590454 + }, + { + "auxiliary_loss_clip": 0.01131276, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.04904819, + "balance_loss_mlp": 1.02367997, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.7014698466425182, + "language_loss": 0.73125273, + "learning_rate": 2.639551120239279e-06, + "loss": 0.75294256, + "num_input_tokens_seen": 148017875, + "step": 6894, + "time_per_iteration": 2.5094990730285645 + }, + { + "auxiliary_loss_clip": 0.0112331, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.04749334, + "balance_loss_mlp": 1.02210653, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 4.057867439898816, + "language_loss": 0.62856758, + "learning_rate": 2.63918209577416e-06, + "loss": 0.65016395, + "num_input_tokens_seen": 148032300, + "step": 6895, + "time_per_iteration": 2.436445713043213 + }, + { + "auxiliary_loss_clip": 0.01088773, + "auxiliary_loss_mlp": 0.01043946, + "balance_loss_clip": 1.0465337, + "balance_loss_mlp": 1.02785873, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.407846822415996, + "language_loss": 0.70561898, + "learning_rate": 2.638813047071192e-06, + "loss": 0.72694612, + "num_input_tokens_seen": 148053260, + "step": 6896, + "time_per_iteration": 2.5977890491485596 + }, + { + "auxiliary_loss_clip": 0.01131827, + "auxiliary_loss_mlp": 0.01043088, + "balance_loss_clip": 1.04693842, + "balance_loss_mlp": 1.0271734, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 1.7449578867693327, + "language_loss": 0.7262308, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.74797994, + "num_input_tokens_seen": 148072965, + "step": 6897, + "time_per_iteration": 2.5130536556243896 + }, + { + "auxiliary_loss_clip": 0.011201, + "auxiliary_loss_mlp": 0.01045939, + "balance_loss_clip": 1.04983985, + "balance_loss_mlp": 1.03134191, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 5.165906727985434, + "language_loss": 0.84581995, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86748028, + "num_input_tokens_seen": 148093240, + "step": 6898, + "time_per_iteration": 2.565699577331543 + }, + { + "auxiliary_loss_clip": 0.01086493, + "auxiliary_loss_mlp": 0.01037297, + "balance_loss_clip": 1.04412985, + "balance_loss_mlp": 1.02266991, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 3.679275556595443, + "language_loss": 0.74282598, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76406395, + "num_input_tokens_seen": 148110925, + "step": 6899, + "time_per_iteration": 2.6053075790405273 + }, + { + "auxiliary_loss_clip": 0.01099981, + "auxiliary_loss_mlp": 0.01041144, + "balance_loss_clip": 1.04429412, + "balance_loss_mlp": 1.02477646, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.9264751542651841, + "language_loss": 0.75791276, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.779324, + "num_input_tokens_seen": 148130670, + "step": 6900, + "time_per_iteration": 2.637794256210327 + }, + { + "auxiliary_loss_clip": 0.01122624, + "auxiliary_loss_mlp": 0.01043609, + "balance_loss_clip": 1.05397224, + "balance_loss_mlp": 1.0274266, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.603872706754315, + "language_loss": 0.80587411, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82753646, + "num_input_tokens_seen": 148148350, + "step": 6901, + "time_per_iteration": 3.9754788875579834 + }, + { + "auxiliary_loss_clip": 0.01091571, + "auxiliary_loss_mlp": 0.01040404, + "balance_loss_clip": 1.04302406, + "balance_loss_mlp": 1.02482307, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 3.3542542275439207, + "language_loss": 0.69550633, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.71682608, + "num_input_tokens_seen": 148167550, + "step": 6902, + "time_per_iteration": 2.6034622192382812 + }, + { + "auxiliary_loss_clip": 0.01101445, + "auxiliary_loss_mlp": 0.00779586, + "balance_loss_clip": 1.04712367, + "balance_loss_mlp": 1.00047636, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 1.617985622146972, + "language_loss": 0.83935118, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85816151, + "num_input_tokens_seen": 148184740, + "step": 6903, + "time_per_iteration": 2.5755040645599365 + }, + { + "auxiliary_loss_clip": 0.01136257, + "auxiliary_loss_mlp": 0.01042309, + "balance_loss_clip": 1.04837978, + "balance_loss_mlp": 1.0256201, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 2.2719847989769884, + "language_loss": 0.6784395, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.70022517, + "num_input_tokens_seen": 148204605, + "step": 6904, + "time_per_iteration": 2.5378174781799316 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.007819, + "balance_loss_clip": 1.0501256, + "balance_loss_mlp": 1.00047517, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.7336167883817217, + "language_loss": 0.77866113, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79784083, + "num_input_tokens_seen": 148224675, + "step": 6905, + "time_per_iteration": 2.4954354763031006 + }, + { + "auxiliary_loss_clip": 0.01134421, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.04835153, + "balance_loss_mlp": 1.01780629, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 1.7131642341355313, + "language_loss": 0.68399763, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70567185, + "num_input_tokens_seen": 148243375, + "step": 6906, + "time_per_iteration": 2.57139253616333 + }, + { + "auxiliary_loss_clip": 0.01107481, + "auxiliary_loss_mlp": 0.01038188, + "balance_loss_clip": 1.04405451, + "balance_loss_mlp": 1.02378106, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 3.4457274887528033, + "language_loss": 0.67029774, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.6917544, + "num_input_tokens_seen": 148261140, + "step": 6907, + "time_per_iteration": 2.532977342605591 + }, + { + "auxiliary_loss_clip": 0.01104489, + "auxiliary_loss_mlp": 0.01039552, + "balance_loss_clip": 1.05041671, + "balance_loss_mlp": 1.02564025, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 2.839987248662565, + "language_loss": 0.76684988, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.78829026, + "num_input_tokens_seen": 148279655, + "step": 6908, + "time_per_iteration": 2.5449113845825195 + }, + { + "auxiliary_loss_clip": 0.01033618, + "auxiliary_loss_mlp": 0.01023769, + "balance_loss_clip": 1.02272534, + "balance_loss_mlp": 1.02186131, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.7729783321957147, + "language_loss": 0.64796501, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66853887, + "num_input_tokens_seen": 148339005, + "step": 6909, + "time_per_iteration": 3.058894395828247 + }, + { + "auxiliary_loss_clip": 0.01094083, + "auxiliary_loss_mlp": 0.01041668, + "balance_loss_clip": 1.04830432, + "balance_loss_mlp": 1.02698195, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 1.4627788565132864, + "language_loss": 0.8682124, + "learning_rate": 2.633643828093996e-06, + "loss": 0.88956988, + "num_input_tokens_seen": 148358715, + "step": 6910, + "time_per_iteration": 2.5684661865234375 + }, + { + "auxiliary_loss_clip": 0.01042736, + "auxiliary_loss_mlp": 0.01005805, + "balance_loss_clip": 1.02398694, + "balance_loss_mlp": 1.00404024, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.8004967722461451, + "language_loss": 0.62102962, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64151502, + "num_input_tokens_seen": 148417280, + "step": 6911, + "time_per_iteration": 3.057100534439087 + }, + { + "auxiliary_loss_clip": 0.01139207, + "auxiliary_loss_mlp": 0.01043385, + "balance_loss_clip": 1.0504564, + "balance_loss_mlp": 1.02774501, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 2.5583869857770547, + "language_loss": 0.88400793, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90583384, + "num_input_tokens_seen": 148432610, + "step": 6912, + "time_per_iteration": 2.450244903564453 + }, + { + "auxiliary_loss_clip": 0.01117192, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.04829216, + "balance_loss_mlp": 1.01998138, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 3.4664316346325337, + "language_loss": 0.63471687, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65623164, + "num_input_tokens_seen": 148451510, + "step": 6913, + "time_per_iteration": 2.5171003341674805 + }, + { + "auxiliary_loss_clip": 0.0110784, + "auxiliary_loss_mlp": 0.00780608, + "balance_loss_clip": 1.0462265, + "balance_loss_mlp": 1.00056922, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.8507174481818855, + "language_loss": 0.76088929, + "learning_rate": 2.632166041703586e-06, + "loss": 0.77977377, + "num_input_tokens_seen": 148469945, + "step": 6914, + "time_per_iteration": 2.556058645248413 + }, + { + "auxiliary_loss_clip": 0.01083299, + "auxiliary_loss_mlp": 0.01046681, + "balance_loss_clip": 1.04526258, + "balance_loss_mlp": 1.03034949, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 1.863157903098286, + "language_loss": 0.87504339, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89634323, + "num_input_tokens_seen": 148486655, + "step": 6915, + "time_per_iteration": 2.6088123321533203 + }, + { + "auxiliary_loss_clip": 0.01100693, + "auxiliary_loss_mlp": 0.01041515, + "balance_loss_clip": 1.04470396, + "balance_loss_mlp": 1.02670932, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 3.279604821607986, + "language_loss": 0.70928013, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.73070222, + "num_input_tokens_seen": 148505035, + "step": 6916, + "time_per_iteration": 4.042081117630005 + }, + { + "auxiliary_loss_clip": 0.01136185, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.04951644, + "balance_loss_mlp": 1.02092791, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.5555655392325316, + "language_loss": 0.7210297, + "learning_rate": 2.631057450157852e-06, + "loss": 0.74275792, + "num_input_tokens_seen": 148525575, + "step": 6917, + "time_per_iteration": 2.480597972869873 + }, + { + "auxiliary_loss_clip": 0.01104223, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.04511833, + "balance_loss_mlp": 1.0176065, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.6746595619555478, + "language_loss": 0.80973506, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.83109558, + "num_input_tokens_seen": 148547270, + "step": 6918, + "time_per_iteration": 2.548304319381714 + }, + { + "auxiliary_loss_clip": 0.01120825, + "auxiliary_loss_mlp": 0.01038272, + "balance_loss_clip": 1.04917383, + "balance_loss_mlp": 1.02185738, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.5026726073605534, + "language_loss": 0.70249081, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72408181, + "num_input_tokens_seen": 148572100, + "step": 6919, + "time_per_iteration": 4.188501834869385 + }, + { + "auxiliary_loss_clip": 0.01113969, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.04909348, + "balance_loss_mlp": 1.02031934, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 1.9427934511252087, + "language_loss": 0.8113609, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83285928, + "num_input_tokens_seen": 148591245, + "step": 6920, + "time_per_iteration": 2.5086798667907715 + }, + { + "auxiliary_loss_clip": 0.0111461, + "auxiliary_loss_mlp": 0.01038393, + "balance_loss_clip": 1.0496006, + "balance_loss_mlp": 1.02196598, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 3.383705262286925, + "language_loss": 0.6578269, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.67935699, + "num_input_tokens_seen": 148607980, + "step": 6921, + "time_per_iteration": 2.523407459259033 + }, + { + "auxiliary_loss_clip": 0.0111351, + "auxiliary_loss_mlp": 0.01042455, + "balance_loss_clip": 1.04838634, + "balance_loss_mlp": 1.02683878, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 2.323675740873169, + "language_loss": 0.80376196, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82532161, + "num_input_tokens_seen": 148624490, + "step": 6922, + "time_per_iteration": 2.546051263809204 + }, + { + "auxiliary_loss_clip": 0.01109949, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.0475626, + "balance_loss_mlp": 1.01971459, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.6777263311808974, + "language_loss": 0.67567056, + "learning_rate": 2.628839621341247e-06, + "loss": 0.69711757, + "num_input_tokens_seen": 148646490, + "step": 6923, + "time_per_iteration": 2.568012237548828 + }, + { + "auxiliary_loss_clip": 0.01107539, + "auxiliary_loss_mlp": 0.0104569, + "balance_loss_clip": 1.05113339, + "balance_loss_mlp": 1.02884555, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 1.8764601005829085, + "language_loss": 0.75773424, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.77926648, + "num_input_tokens_seen": 148668580, + "step": 6924, + "time_per_iteration": 4.00544810295105 + }, + { + "auxiliary_loss_clip": 0.01136211, + "auxiliary_loss_mlp": 0.01034994, + "balance_loss_clip": 1.04944682, + "balance_loss_mlp": 1.02018249, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 1.821780874797999, + "language_loss": 0.73099667, + "learning_rate": 2.62810015415423e-06, + "loss": 0.75270873, + "num_input_tokens_seen": 148688410, + "step": 6925, + "time_per_iteration": 2.447641134262085 + }, + { + "auxiliary_loss_clip": 0.01111564, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.04607594, + "balance_loss_mlp": 1.02062702, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 2.2840310019202517, + "language_loss": 0.83410847, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.85558075, + "num_input_tokens_seen": 148704855, + "step": 6926, + "time_per_iteration": 2.4807729721069336 + }, + { + "auxiliary_loss_clip": 0.01100061, + "auxiliary_loss_mlp": 0.01040287, + "balance_loss_clip": 1.04845214, + "balance_loss_mlp": 1.02621436, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.7890099189137372, + "language_loss": 0.86445355, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88585705, + "num_input_tokens_seen": 148723065, + "step": 6927, + "time_per_iteration": 2.498136281967163 + }, + { + "auxiliary_loss_clip": 0.01124003, + "auxiliary_loss_mlp": 0.01048476, + "balance_loss_clip": 1.04946661, + "balance_loss_mlp": 1.03206122, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 7.265629352131683, + "language_loss": 0.72617501, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74789977, + "num_input_tokens_seen": 148741780, + "step": 6928, + "time_per_iteration": 2.473869562149048 + }, + { + "auxiliary_loss_clip": 0.0110525, + "auxiliary_loss_mlp": 0.01040068, + "balance_loss_clip": 1.04634714, + "balance_loss_mlp": 1.02463067, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 2.3067875559423783, + "language_loss": 0.78324318, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80469632, + "num_input_tokens_seen": 148759795, + "step": 6929, + "time_per_iteration": 2.536736011505127 + }, + { + "auxiliary_loss_clip": 0.01134162, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.04931951, + "balance_loss_mlp": 1.01871061, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 3.1045167355800722, + "language_loss": 0.7120356, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73370922, + "num_input_tokens_seen": 148778680, + "step": 6930, + "time_per_iteration": 2.448477268218994 + }, + { + "auxiliary_loss_clip": 0.01105095, + "auxiliary_loss_mlp": 0.01037797, + "balance_loss_clip": 1.04519272, + "balance_loss_mlp": 1.02291393, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 1.6677444552946312, + "language_loss": 0.80896932, + "learning_rate": 2.625881181419007e-06, + "loss": 0.8303982, + "num_input_tokens_seen": 148796470, + "step": 6931, + "time_per_iteration": 2.4980952739715576 + }, + { + "auxiliary_loss_clip": 0.01081597, + "auxiliary_loss_mlp": 0.01040586, + "balance_loss_clip": 1.04046607, + "balance_loss_mlp": 1.02506471, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 1.8731220156121153, + "language_loss": 0.7918579, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81307971, + "num_input_tokens_seen": 148815300, + "step": 6932, + "time_per_iteration": 2.6075847148895264 + }, + { + "auxiliary_loss_clip": 0.01109369, + "auxiliary_loss_mlp": 0.00781385, + "balance_loss_clip": 1.04627144, + "balance_loss_mlp": 1.00058055, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 1.849609527209523, + "language_loss": 0.81567705, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83458459, + "num_input_tokens_seen": 148834315, + "step": 6933, + "time_per_iteration": 2.5906825065612793 + }, + { + "auxiliary_loss_clip": 0.01134583, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.04779816, + "balance_loss_mlp": 1.02047968, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 1.7525068943971756, + "language_loss": 0.77066767, + "learning_rate": 2.624771374460121e-06, + "loss": 0.79237843, + "num_input_tokens_seen": 148852420, + "step": 6934, + "time_per_iteration": 2.443828582763672 + }, + { + "auxiliary_loss_clip": 0.01123927, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.04986215, + "balance_loss_mlp": 1.02431989, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 2.082258898943993, + "language_loss": 0.67456186, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69619417, + "num_input_tokens_seen": 148869305, + "step": 6935, + "time_per_iteration": 2.453836679458618 + }, + { + "auxiliary_loss_clip": 0.01108705, + "auxiliary_loss_mlp": 0.01049701, + "balance_loss_clip": 1.04965067, + "balance_loss_mlp": 1.03419209, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.36513002930173, + "language_loss": 0.73441535, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75599939, + "num_input_tokens_seen": 148886395, + "step": 6936, + "time_per_iteration": 2.508643388748169 + }, + { + "auxiliary_loss_clip": 0.01122696, + "auxiliary_loss_mlp": 0.01037135, + "balance_loss_clip": 1.05375099, + "balance_loss_mlp": 1.02284145, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 2.195173742521702, + "language_loss": 0.74289238, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.76449072, + "num_input_tokens_seen": 148905235, + "step": 6937, + "time_per_iteration": 2.46779203414917 + }, + { + "auxiliary_loss_clip": 0.01108235, + "auxiliary_loss_mlp": 0.01038189, + "balance_loss_clip": 1.04613757, + "balance_loss_mlp": 1.02390742, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 1.5494247724462635, + "language_loss": 0.84525204, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.86671627, + "num_input_tokens_seen": 148928130, + "step": 6938, + "time_per_iteration": 2.627161979675293 + }, + { + "auxiliary_loss_clip": 0.01110328, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.04963243, + "balance_loss_mlp": 1.02052522, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 1.862697325452762, + "language_loss": 0.74168986, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76315516, + "num_input_tokens_seen": 148948790, + "step": 6939, + "time_per_iteration": 2.56723952293396 + }, + { + "auxiliary_loss_clip": 0.01122523, + "auxiliary_loss_mlp": 0.0103745, + "balance_loss_clip": 1.04721034, + "balance_loss_mlp": 1.02222669, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.7043008297648115, + "language_loss": 0.74783015, + "learning_rate": 2.622551121253579e-06, + "loss": 0.76942992, + "num_input_tokens_seen": 148967690, + "step": 6940, + "time_per_iteration": 2.5106773376464844 + }, + { + "auxiliary_loss_clip": 0.01134464, + "auxiliary_loss_mlp": 0.01040228, + "balance_loss_clip": 1.04927385, + "balance_loss_mlp": 1.02610159, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 1.7791893003755304, + "language_loss": 0.71677202, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73851895, + "num_input_tokens_seen": 148987150, + "step": 6941, + "time_per_iteration": 4.002891540527344 + }, + { + "auxiliary_loss_clip": 0.01119196, + "auxiliary_loss_mlp": 0.01045681, + "balance_loss_clip": 1.04873562, + "balance_loss_mlp": 1.03097093, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 6.05369539295372, + "language_loss": 0.73365921, + "learning_rate": 2.621810847844104e-06, + "loss": 0.75530803, + "num_input_tokens_seen": 149004895, + "step": 6942, + "time_per_iteration": 2.5120046138763428 + }, + { + "auxiliary_loss_clip": 0.01099202, + "auxiliary_loss_mlp": 0.0104799, + "balance_loss_clip": 1.04620707, + "balance_loss_mlp": 1.03158689, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.233632079264023, + "language_loss": 0.72632354, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74779546, + "num_input_tokens_seen": 149020970, + "step": 6943, + "time_per_iteration": 2.534703254699707 + }, + { + "auxiliary_loss_clip": 0.01104155, + "auxiliary_loss_mlp": 0.00781514, + "balance_loss_clip": 1.04742658, + "balance_loss_mlp": 1.00049627, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 3.892230804015883, + "language_loss": 0.63676119, + "learning_rate": 2.621070480118111e-06, + "loss": 0.65561795, + "num_input_tokens_seen": 149041795, + "step": 6944, + "time_per_iteration": 2.6610875129699707 + }, + { + "auxiliary_loss_clip": 0.01103044, + "auxiliary_loss_mlp": 0.01036088, + "balance_loss_clip": 1.04051852, + "balance_loss_mlp": 1.02123451, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.568357502820188, + "language_loss": 0.70068616, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72207749, + "num_input_tokens_seen": 149063700, + "step": 6945, + "time_per_iteration": 2.588200569152832 + }, + { + "auxiliary_loss_clip": 0.01090885, + "auxiliary_loss_mlp": 0.01059712, + "balance_loss_clip": 1.04033673, + "balance_loss_mlp": 1.04125798, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 1.8706564599300763, + "language_loss": 0.80774587, + "learning_rate": 2.620330018187899e-06, + "loss": 0.82925183, + "num_input_tokens_seen": 149082410, + "step": 6946, + "time_per_iteration": 2.525002956390381 + }, + { + "auxiliary_loss_clip": 0.0112081, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.04875576, + "balance_loss_mlp": 1.02326727, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.324754748408534, + "language_loss": 0.77563268, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79721922, + "num_input_tokens_seen": 149098745, + "step": 6947, + "time_per_iteration": 2.440431833267212 + }, + { + "auxiliary_loss_clip": 0.01132243, + "auxiliary_loss_mlp": 0.01038551, + "balance_loss_clip": 1.04739964, + "balance_loss_mlp": 1.02320313, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 1.827953902713039, + "language_loss": 0.71712148, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73882937, + "num_input_tokens_seen": 149122255, + "step": 6948, + "time_per_iteration": 2.5208487510681152 + }, + { + "auxiliary_loss_clip": 0.01116653, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.04608262, + "balance_loss_mlp": 1.02177548, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 2.0595669206431366, + "language_loss": 0.76783729, + "learning_rate": 2.619219148905362e-06, + "loss": 0.78936458, + "num_input_tokens_seen": 149142845, + "step": 6949, + "time_per_iteration": 2.498838186264038 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01039116, + "balance_loss_clip": 1.05438185, + "balance_loss_mlp": 1.02352357, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.655873345820893, + "language_loss": 0.8224622, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84403628, + "num_input_tokens_seen": 149163375, + "step": 6950, + "time_per_iteration": 2.5499391555786133 + }, + { + "auxiliary_loss_clip": 0.01102274, + "auxiliary_loss_mlp": 0.00778401, + "balance_loss_clip": 1.04957366, + "balance_loss_mlp": 1.00052881, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.2784604725307558, + "language_loss": 0.75988734, + "learning_rate": 2.618478451956007e-06, + "loss": 0.77869409, + "num_input_tokens_seen": 149185610, + "step": 6951, + "time_per_iteration": 2.5702457427978516 + }, + { + "auxiliary_loss_clip": 0.01087046, + "auxiliary_loss_mlp": 0.01038161, + "balance_loss_clip": 1.04530191, + "balance_loss_mlp": 1.02266347, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 2.282615045947952, + "language_loss": 0.73005986, + "learning_rate": 2.61810806829516e-06, + "loss": 0.7513119, + "num_input_tokens_seen": 149203990, + "step": 6952, + "time_per_iteration": 2.5730159282684326 + }, + { + "auxiliary_loss_clip": 0.01118154, + "auxiliary_loss_mlp": 0.01035896, + "balance_loss_clip": 1.05123973, + "balance_loss_mlp": 1.02206779, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 2.5563285985727746, + "language_loss": 0.72188532, + "learning_rate": 2.617737661195593e-06, + "loss": 0.74342585, + "num_input_tokens_seen": 149221385, + "step": 6953, + "time_per_iteration": 2.432617664337158 + }, + { + "auxiliary_loss_clip": 0.0111996, + "auxiliary_loss_mlp": 0.01043807, + "balance_loss_clip": 1.04725587, + "balance_loss_mlp": 1.02848244, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 2.0173959115994116, + "language_loss": 0.76358914, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78522682, + "num_input_tokens_seen": 149241175, + "step": 6954, + "time_per_iteration": 2.4849486351013184 + }, + { + "auxiliary_loss_clip": 0.01096014, + "auxiliary_loss_mlp": 0.01047212, + "balance_loss_clip": 1.04910159, + "balance_loss_mlp": 1.03068948, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 3.811371915243345, + "language_loss": 0.84453934, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86597157, + "num_input_tokens_seen": 149259115, + "step": 6955, + "time_per_iteration": 4.099254608154297 + }, + { + "auxiliary_loss_clip": 0.01119485, + "auxiliary_loss_mlp": 0.01041542, + "balance_loss_clip": 1.04729414, + "balance_loss_mlp": 1.02717149, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.6413647485508327, + "language_loss": 0.83410335, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85571367, + "num_input_tokens_seen": 149278705, + "step": 6956, + "time_per_iteration": 2.52901291847229 + }, + { + "auxiliary_loss_clip": 0.01096285, + "auxiliary_loss_mlp": 0.01043754, + "balance_loss_clip": 1.04440069, + "balance_loss_mlp": 1.02782738, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 4.295605030243614, + "language_loss": 0.71395749, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73535788, + "num_input_tokens_seen": 149294040, + "step": 6957, + "time_per_iteration": 2.5226683616638184 + }, + { + "auxiliary_loss_clip": 0.01100514, + "auxiliary_loss_mlp": 0.01040947, + "balance_loss_clip": 1.04731381, + "balance_loss_mlp": 1.02769089, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 2.190451964270425, + "language_loss": 0.75365013, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77506471, + "num_input_tokens_seen": 149310385, + "step": 6958, + "time_per_iteration": 2.5286049842834473 + }, + { + "auxiliary_loss_clip": 0.01085892, + "auxiliary_loss_mlp": 0.00780956, + "balance_loss_clip": 1.04149771, + "balance_loss_mlp": 1.00051928, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 1.6617535065807698, + "language_loss": 0.77161288, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.79028141, + "num_input_tokens_seen": 149328235, + "step": 6959, + "time_per_iteration": 4.087128400802612 + }, + { + "auxiliary_loss_clip": 0.01091302, + "auxiliary_loss_mlp": 0.00780893, + "balance_loss_clip": 1.04388452, + "balance_loss_mlp": 1.0005213, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 1.5995467341636893, + "language_loss": 0.7676847, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.78640664, + "num_input_tokens_seen": 149347465, + "step": 6960, + "time_per_iteration": 2.5568935871124268 + }, + { + "auxiliary_loss_clip": 0.01100442, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.04928064, + "balance_loss_mlp": 1.01899838, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 1.619156564057309, + "language_loss": 0.75777709, + "learning_rate": 2.614773562290835e-06, + "loss": 0.779109, + "num_input_tokens_seen": 149366685, + "step": 6961, + "time_per_iteration": 2.533149480819702 + }, + { + "auxiliary_loss_clip": 0.01028792, + "auxiliary_loss_mlp": 0.01016932, + "balance_loss_clip": 1.0311954, + "balance_loss_mlp": 1.01504862, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7913556443492341, + "language_loss": 0.54726493, + "learning_rate": 2.61440294487496e-06, + "loss": 0.5677222, + "num_input_tokens_seen": 149422925, + "step": 6962, + "time_per_iteration": 3.0462546348571777 + }, + { + "auxiliary_loss_clip": 0.01122732, + "auxiliary_loss_mlp": 0.01038011, + "balance_loss_clip": 1.05268764, + "balance_loss_mlp": 1.02370596, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.826033613074011, + "language_loss": 0.85417092, + "learning_rate": 2.614032304160864e-06, + "loss": 0.87577832, + "num_input_tokens_seen": 149440820, + "step": 6963, + "time_per_iteration": 3.9030046463012695 + }, + { + "auxiliary_loss_clip": 0.01109083, + "auxiliary_loss_mlp": 0.01036502, + "balance_loss_clip": 1.05098414, + "balance_loss_mlp": 1.02205968, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 2.3553492785089563, + "language_loss": 0.70482987, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.7262857, + "num_input_tokens_seen": 149461060, + "step": 6964, + "time_per_iteration": 2.5178680419921875 + }, + { + "auxiliary_loss_clip": 0.01128239, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.04737139, + "balance_loss_mlp": 1.02964401, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 1.7962499563421348, + "language_loss": 0.71097875, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73269451, + "num_input_tokens_seen": 149483115, + "step": 6965, + "time_per_iteration": 2.568049192428589 + }, + { + "auxiliary_loss_clip": 0.01081456, + "auxiliary_loss_mlp": 0.01042115, + "balance_loss_clip": 1.04459286, + "balance_loss_mlp": 1.02945483, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 2.2946459099771093, + "language_loss": 0.72268355, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74391925, + "num_input_tokens_seen": 149501495, + "step": 6966, + "time_per_iteration": 2.563282012939453 + }, + { + "auxiliary_loss_clip": 0.01125364, + "auxiliary_loss_mlp": 0.01036978, + "balance_loss_clip": 1.04740405, + "balance_loss_mlp": 1.02215993, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 2.357976523983687, + "language_loss": 0.71509814, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73672152, + "num_input_tokens_seen": 149523170, + "step": 6967, + "time_per_iteration": 2.637211322784424 + }, + { + "auxiliary_loss_clip": 0.01037527, + "auxiliary_loss_mlp": 0.010035, + "balance_loss_clip": 1.01884556, + "balance_loss_mlp": 1.00171185, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.6717112826584762, + "language_loss": 0.46230614, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48271644, + "num_input_tokens_seen": 149583955, + "step": 6968, + "time_per_iteration": 3.0281176567077637 + }, + { + "auxiliary_loss_clip": 0.01122998, + "auxiliary_loss_mlp": 0.01041783, + "balance_loss_clip": 1.04694974, + "balance_loss_mlp": 1.02648866, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 1.7006798816450117, + "language_loss": 0.74801433, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.76966214, + "num_input_tokens_seen": 149604440, + "step": 6969, + "time_per_iteration": 2.5222771167755127 + }, + { + "auxiliary_loss_clip": 0.01106045, + "auxiliary_loss_mlp": 0.01042385, + "balance_loss_clip": 1.04410863, + "balance_loss_mlp": 1.02828228, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 1.723762715913979, + "language_loss": 0.80823165, + "learning_rate": 2.611437167992705e-06, + "loss": 0.82971597, + "num_input_tokens_seen": 149623745, + "step": 6970, + "time_per_iteration": 2.5284364223480225 + }, + { + "auxiliary_loss_clip": 0.01121069, + "auxiliary_loss_mlp": 0.01043192, + "balance_loss_clip": 1.04930007, + "balance_loss_mlp": 1.02830231, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 3.2847865182896085, + "language_loss": 0.83322978, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.85487235, + "num_input_tokens_seen": 149643025, + "step": 6971, + "time_per_iteration": 2.4762768745422363 + }, + { + "auxiliary_loss_clip": 0.01110348, + "auxiliary_loss_mlp": 0.01046339, + "balance_loss_clip": 1.05404115, + "balance_loss_mlp": 1.03185475, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 1.8086323819018375, + "language_loss": 0.74655038, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.76811725, + "num_input_tokens_seen": 149660695, + "step": 6972, + "time_per_iteration": 2.495021104812622 + }, + { + "auxiliary_loss_clip": 0.01103883, + "auxiliary_loss_mlp": 0.01037343, + "balance_loss_clip": 1.04231346, + "balance_loss_mlp": 1.02332389, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.450257660934662, + "language_loss": 0.72945988, + "learning_rate": 2.610324618710212e-06, + "loss": 0.75087214, + "num_input_tokens_seen": 149682040, + "step": 6973, + "time_per_iteration": 2.649108409881592 + }, + { + "auxiliary_loss_clip": 0.01102287, + "auxiliary_loss_mlp": 0.01039266, + "balance_loss_clip": 1.05228484, + "balance_loss_mlp": 1.02500916, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 2.3993749528210744, + "language_loss": 0.74854821, + "learning_rate": 2.609953722643489e-06, + "loss": 0.76996374, + "num_input_tokens_seen": 149700855, + "step": 6974, + "time_per_iteration": 2.5366170406341553 + }, + { + "auxiliary_loss_clip": 0.01119882, + "auxiliary_loss_mlp": 0.01036036, + "balance_loss_clip": 1.04604506, + "balance_loss_mlp": 1.02248836, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 1.9511785542419602, + "language_loss": 0.72673351, + "learning_rate": 2.609582803447259e-06, + "loss": 0.74829268, + "num_input_tokens_seen": 149717360, + "step": 6975, + "time_per_iteration": 2.4684128761291504 + }, + { + "auxiliary_loss_clip": 0.01114358, + "auxiliary_loss_mlp": 0.01039973, + "balance_loss_clip": 1.04665709, + "balance_loss_mlp": 1.02564394, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.624215776822937, + "language_loss": 0.80985534, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.83139867, + "num_input_tokens_seen": 149738975, + "step": 6976, + "time_per_iteration": 2.519650459289551 + }, + { + "auxiliary_loss_clip": 0.01101112, + "auxiliary_loss_mlp": 0.01043519, + "balance_loss_clip": 1.04193676, + "balance_loss_mlp": 1.02833772, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 2.1260444680572825, + "language_loss": 0.67130488, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.69275117, + "num_input_tokens_seen": 149757055, + "step": 6977, + "time_per_iteration": 2.49960994720459 + }, + { + "auxiliary_loss_clip": 0.01124528, + "auxiliary_loss_mlp": 0.01041786, + "balance_loss_clip": 1.05165696, + "balance_loss_mlp": 1.02794003, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.53553354397327, + "language_loss": 0.81492996, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.83659309, + "num_input_tokens_seen": 149772885, + "step": 6978, + "time_per_iteration": 2.4438233375549316 + }, + { + "auxiliary_loss_clip": 0.01132048, + "auxiliary_loss_mlp": 0.01045328, + "balance_loss_clip": 1.04643083, + "balance_loss_mlp": 1.03067672, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 2.0938558959832254, + "language_loss": 0.82577419, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84754795, + "num_input_tokens_seen": 149791515, + "step": 6979, + "time_per_iteration": 2.4781923294067383 + }, + { + "auxiliary_loss_clip": 0.01128684, + "auxiliary_loss_mlp": 0.01040022, + "balance_loss_clip": 1.04605126, + "balance_loss_mlp": 1.02642608, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 5.2203606624166525, + "language_loss": 0.83300316, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85469019, + "num_input_tokens_seen": 149807250, + "step": 6980, + "time_per_iteration": 3.9055228233337402 + }, + { + "auxiliary_loss_clip": 0.01132571, + "auxiliary_loss_mlp": 0.01040415, + "balance_loss_clip": 1.04752183, + "balance_loss_mlp": 1.02649164, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 3.498798733843003, + "language_loss": 0.78849888, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81022882, + "num_input_tokens_seen": 149821640, + "step": 6981, + "time_per_iteration": 2.428006172180176 + }, + { + "auxiliary_loss_clip": 0.01096909, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.04721844, + "balance_loss_mlp": 1.02245224, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 1.7121598332115109, + "language_loss": 0.83973098, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86105758, + "num_input_tokens_seen": 149840545, + "step": 6982, + "time_per_iteration": 2.5334160327911377 + }, + { + "auxiliary_loss_clip": 0.01121791, + "auxiliary_loss_mlp": 0.01046114, + "balance_loss_clip": 1.0476836, + "balance_loss_mlp": 1.031106, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 3.0286570796928918, + "language_loss": 0.5672403, + "learning_rate": 2.606614618903214e-06, + "loss": 0.58891934, + "num_input_tokens_seen": 149860375, + "step": 6983, + "time_per_iteration": 2.5334932804107666 + }, + { + "auxiliary_loss_clip": 0.01123665, + "auxiliary_loss_mlp": 0.01041278, + "balance_loss_clip": 1.05256295, + "balance_loss_mlp": 1.02806973, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 2.1037243993159804, + "language_loss": 0.82420158, + "learning_rate": 2.606243492174471e-06, + "loss": 0.845851, + "num_input_tokens_seen": 149877850, + "step": 6984, + "time_per_iteration": 2.462721109390259 + }, + { + "auxiliary_loss_clip": 0.01111757, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.04399335, + "balance_loss_mlp": 1.02083373, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.9911098485002887, + "language_loss": 0.78739554, + "learning_rate": 2.605872342456914e-06, + "loss": 0.80886686, + "num_input_tokens_seen": 149896110, + "step": 6985, + "time_per_iteration": 2.5085654258728027 + }, + { + "auxiliary_loss_clip": 0.01134663, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.04709816, + "balance_loss_mlp": 1.02241325, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 1.6844498006562314, + "language_loss": 0.7806679, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.80239356, + "num_input_tokens_seen": 149916495, + "step": 6986, + "time_per_iteration": 2.5048201084136963 + }, + { + "auxiliary_loss_clip": 0.01106417, + "auxiliary_loss_mlp": 0.01033805, + "balance_loss_clip": 1.04503989, + "balance_loss_mlp": 1.020823, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.5154810047230893, + "language_loss": 0.7214185, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74282074, + "num_input_tokens_seen": 149936445, + "step": 6987, + "time_per_iteration": 2.566237211227417 + }, + { + "auxiliary_loss_clip": 0.01108477, + "auxiliary_loss_mlp": 0.00785181, + "balance_loss_clip": 1.04617953, + "balance_loss_mlp": 1.00070024, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.5218772224077097, + "language_loss": 0.75017524, + "learning_rate": 2.604758755512104e-06, + "loss": 0.76911175, + "num_input_tokens_seen": 149959430, + "step": 6988, + "time_per_iteration": 2.6705777645111084 + }, + { + "auxiliary_loss_clip": 0.01126304, + "auxiliary_loss_mlp": 0.01036708, + "balance_loss_clip": 1.05023587, + "balance_loss_mlp": 1.02147341, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.7405331201844862, + "language_loss": 0.7406528, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76228297, + "num_input_tokens_seen": 149980365, + "step": 6989, + "time_per_iteration": 2.5433945655822754 + }, + { + "auxiliary_loss_clip": 0.01110934, + "auxiliary_loss_mlp": 0.01032128, + "balance_loss_clip": 1.04573822, + "balance_loss_mlp": 1.01712561, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 2.4770121924663635, + "language_loss": 0.70480889, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.7262395, + "num_input_tokens_seen": 149997375, + "step": 6990, + "time_per_iteration": 2.511915922164917 + }, + { + "auxiliary_loss_clip": 0.01041708, + "auxiliary_loss_mlp": 0.0075652, + "balance_loss_clip": 1.02987242, + "balance_loss_mlp": 1.00038612, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8280832047828506, + "language_loss": 0.60451043, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62249267, + "num_input_tokens_seen": 150051230, + "step": 6991, + "time_per_iteration": 2.9654290676116943 + }, + { + "auxiliary_loss_clip": 0.01133885, + "auxiliary_loss_mlp": 0.01041488, + "balance_loss_clip": 1.04772973, + "balance_loss_mlp": 1.02666438, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.5629493686350866, + "language_loss": 0.83244634, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85420001, + "num_input_tokens_seen": 150071135, + "step": 6992, + "time_per_iteration": 2.4844343662261963 + }, + { + "auxiliary_loss_clip": 0.01045351, + "auxiliary_loss_mlp": 0.0101533, + "balance_loss_clip": 1.017308, + "balance_loss_mlp": 1.01363742, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.806555210979196, + "language_loss": 0.6550808, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67568761, + "num_input_tokens_seen": 150125220, + "step": 6993, + "time_per_iteration": 2.9984819889068604 + }, + { + "auxiliary_loss_clip": 0.01135799, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.04811454, + "balance_loss_mlp": 1.02137613, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 2.955028660839501, + "language_loss": 0.8353945, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85713303, + "num_input_tokens_seen": 150142300, + "step": 6994, + "time_per_iteration": 2.417832136154175 + }, + { + "auxiliary_loss_clip": 0.01121627, + "auxiliary_loss_mlp": 0.00780517, + "balance_loss_clip": 1.04833877, + "balance_loss_mlp": 1.0006721, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.6571563038697823, + "language_loss": 0.78126085, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.8002823, + "num_input_tokens_seen": 150161345, + "step": 6995, + "time_per_iteration": 4.003541946411133 + }, + { + "auxiliary_loss_clip": 0.01095457, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.04359353, + "balance_loss_mlp": 1.016886, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.7723091880548114, + "language_loss": 0.79967713, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82093978, + "num_input_tokens_seen": 150182420, + "step": 6996, + "time_per_iteration": 2.5844428539276123 + }, + { + "auxiliary_loss_clip": 0.01112252, + "auxiliary_loss_mlp": 0.00780451, + "balance_loss_clip": 1.04705739, + "balance_loss_mlp": 1.00077581, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 2.0922659579620073, + "language_loss": 0.75813615, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77706313, + "num_input_tokens_seen": 150200175, + "step": 6997, + "time_per_iteration": 2.4965617656707764 + }, + { + "auxiliary_loss_clip": 0.01129742, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.04491234, + "balance_loss_mlp": 1.02569318, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 3.058034678688571, + "language_loss": 0.75705492, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.77876186, + "num_input_tokens_seen": 150217100, + "step": 6998, + "time_per_iteration": 4.010581970214844 + }, + { + "auxiliary_loss_clip": 0.01136258, + "auxiliary_loss_mlp": 0.01040627, + "balance_loss_clip": 1.04998016, + "balance_loss_mlp": 1.02508199, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.7155256128701797, + "language_loss": 0.75757861, + "learning_rate": 2.60067384046869e-06, + "loss": 0.77934754, + "num_input_tokens_seen": 150239830, + "step": 6999, + "time_per_iteration": 2.5063395500183105 + }, + { + "auxiliary_loss_clip": 0.01085549, + "auxiliary_loss_mlp": 0.01041071, + "balance_loss_clip": 1.04307318, + "balance_loss_mlp": 1.02502561, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 2.2116572460795156, + "language_loss": 0.64168584, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66295201, + "num_input_tokens_seen": 150260690, + "step": 7000, + "time_per_iteration": 2.5686557292938232 + }, + { + "auxiliary_loss_clip": 0.0109233, + "auxiliary_loss_mlp": 0.01042109, + "balance_loss_clip": 1.0428561, + "balance_loss_mlp": 1.02645636, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 1.499699939937722, + "language_loss": 0.76391816, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.78526247, + "num_input_tokens_seen": 150279885, + "step": 7001, + "time_per_iteration": 2.5459413528442383 + }, + { + "auxiliary_loss_clip": 0.01094642, + "auxiliary_loss_mlp": 0.00779434, + "balance_loss_clip": 1.04677176, + "balance_loss_mlp": 1.00076318, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.5730565066171538, + "language_loss": 0.86725456, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88599539, + "num_input_tokens_seen": 150297390, + "step": 7002, + "time_per_iteration": 2.545095205307007 + }, + { + "auxiliary_loss_clip": 0.01095739, + "auxiliary_loss_mlp": 0.01040999, + "balance_loss_clip": 1.05076659, + "balance_loss_mlp": 1.02728987, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 1.9345421296286391, + "language_loss": 0.67447054, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.69583797, + "num_input_tokens_seen": 150317390, + "step": 7003, + "time_per_iteration": 4.003057479858398 + }, + { + "auxiliary_loss_clip": 0.01134516, + "auxiliary_loss_mlp": 0.01041695, + "balance_loss_clip": 1.04789472, + "balance_loss_mlp": 1.02593589, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 2.6958554637060295, + "language_loss": 0.77730399, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79906613, + "num_input_tokens_seen": 150337455, + "step": 7004, + "time_per_iteration": 2.462062120437622 + }, + { + "auxiliary_loss_clip": 0.01130362, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.04931521, + "balance_loss_mlp": 1.02596891, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 1.6527431208668997, + "language_loss": 0.68035656, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70207638, + "num_input_tokens_seen": 150355385, + "step": 7005, + "time_per_iteration": 2.495835542678833 + }, + { + "auxiliary_loss_clip": 0.01120949, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.04767489, + "balance_loss_mlp": 1.02077627, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 2.149968868590677, + "language_loss": 0.72413546, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74570292, + "num_input_tokens_seen": 150371750, + "step": 7006, + "time_per_iteration": 2.4424006938934326 + }, + { + "auxiliary_loss_clip": 0.01134017, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.04810774, + "balance_loss_mlp": 1.02083862, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.636514827732688, + "language_loss": 0.70832634, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.73002183, + "num_input_tokens_seen": 150389955, + "step": 7007, + "time_per_iteration": 2.414747953414917 + }, + { + "auxiliary_loss_clip": 0.01107737, + "auxiliary_loss_mlp": 0.00780092, + "balance_loss_clip": 1.04538548, + "balance_loss_mlp": 1.00079703, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 2.179445380989541, + "language_loss": 0.82274294, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84162116, + "num_input_tokens_seen": 150405780, + "step": 7008, + "time_per_iteration": 2.4896562099456787 + }, + { + "auxiliary_loss_clip": 0.01092598, + "auxiliary_loss_mlp": 0.01041462, + "balance_loss_clip": 1.04683399, + "balance_loss_mlp": 1.02666843, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 1.866400442233256, + "language_loss": 0.72094488, + "learning_rate": 2.596957889196831e-06, + "loss": 0.74228549, + "num_input_tokens_seen": 150425615, + "step": 7009, + "time_per_iteration": 2.5699355602264404 + }, + { + "auxiliary_loss_clip": 0.0113198, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.04715109, + "balance_loss_mlp": 1.01640904, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 2.406725092988929, + "language_loss": 0.65845942, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68009424, + "num_input_tokens_seen": 150445765, + "step": 7010, + "time_per_iteration": 2.5000078678131104 + }, + { + "auxiliary_loss_clip": 0.01093206, + "auxiliary_loss_mlp": 0.01035717, + "balance_loss_clip": 1.04275942, + "balance_loss_mlp": 1.02091122, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.7608760034354733, + "language_loss": 0.72880507, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.75009423, + "num_input_tokens_seen": 150464405, + "step": 7011, + "time_per_iteration": 2.5611188411712646 + }, + { + "auxiliary_loss_clip": 0.0103406, + "auxiliary_loss_mlp": 0.01001575, + "balance_loss_clip": 1.0148139, + "balance_loss_mlp": 0.9998585, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.7915828865190134, + "language_loss": 0.54356742, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56392372, + "num_input_tokens_seen": 150520430, + "step": 7012, + "time_per_iteration": 2.940056562423706 + }, + { + "auxiliary_loss_clip": 0.01124357, + "auxiliary_loss_mlp": 0.01038607, + "balance_loss_clip": 1.05098462, + "balance_loss_mlp": 1.02312207, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.3789567970282062, + "language_loss": 0.7862367, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80786633, + "num_input_tokens_seen": 150542610, + "step": 7013, + "time_per_iteration": 2.522308588027954 + }, + { + "auxiliary_loss_clip": 0.01132335, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.0468955, + "balance_loss_mlp": 1.02292848, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 3.174346554746314, + "language_loss": 0.81561089, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83731359, + "num_input_tokens_seen": 150560970, + "step": 7014, + "time_per_iteration": 2.4551942348480225 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.01036404, + "balance_loss_clip": 1.04379153, + "balance_loss_mlp": 1.02124023, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.5234756034824979, + "language_loss": 0.7766856, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.79823685, + "num_input_tokens_seen": 150582615, + "step": 7015, + "time_per_iteration": 2.5232224464416504 + }, + { + "auxiliary_loss_clip": 0.01134726, + "auxiliary_loss_mlp": 0.01040132, + "balance_loss_clip": 1.04919016, + "balance_loss_mlp": 1.02493274, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.3839341621972259, + "language_loss": 0.82198876, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84373736, + "num_input_tokens_seen": 150603640, + "step": 7016, + "time_per_iteration": 2.4854679107666016 + }, + { + "auxiliary_loss_clip": 0.01095579, + "auxiliary_loss_mlp": 0.01038047, + "balance_loss_clip": 1.04128659, + "balance_loss_mlp": 1.02246618, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 1.764979934672909, + "language_loss": 0.67990398, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70124024, + "num_input_tokens_seen": 150622490, + "step": 7017, + "time_per_iteration": 2.56386137008667 + }, + { + "auxiliary_loss_clip": 0.01036056, + "auxiliary_loss_mlp": 0.01006025, + "balance_loss_clip": 1.01816916, + "balance_loss_mlp": 1.00421321, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.6755942941267722, + "language_loss": 0.59386969, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61429048, + "num_input_tokens_seen": 150689545, + "step": 7018, + "time_per_iteration": 3.11570143699646 + }, + { + "auxiliary_loss_clip": 0.01118097, + "auxiliary_loss_mlp": 0.01042412, + "balance_loss_clip": 1.04368627, + "balance_loss_mlp": 1.02777338, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 3.4273149242201306, + "language_loss": 0.74635822, + "learning_rate": 2.593239674255382e-06, + "loss": 0.76796329, + "num_input_tokens_seen": 150707610, + "step": 7019, + "time_per_iteration": 4.05337119102478 + }, + { + "auxiliary_loss_clip": 0.01110424, + "auxiliary_loss_mlp": 0.01038161, + "balance_loss_clip": 1.04910052, + "balance_loss_mlp": 1.02312875, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 1.8779468534226738, + "language_loss": 0.69586068, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71734655, + "num_input_tokens_seen": 150724530, + "step": 7020, + "time_per_iteration": 2.4918153285980225 + }, + { + "auxiliary_loss_clip": 0.01110424, + "auxiliary_loss_mlp": 0.0077968, + "balance_loss_clip": 1.05023146, + "balance_loss_mlp": 1.00068688, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.5272646010664785, + "language_loss": 0.81115627, + "learning_rate": 2.592495760867347e-06, + "loss": 0.83005726, + "num_input_tokens_seen": 150742870, + "step": 7021, + "time_per_iteration": 2.5296127796173096 + }, + { + "auxiliary_loss_clip": 0.01061689, + "auxiliary_loss_mlp": 0.01047092, + "balance_loss_clip": 1.03521252, + "balance_loss_mlp": 1.0299139, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 2.0300055448413423, + "language_loss": 0.70010924, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.72119713, + "num_input_tokens_seen": 150765500, + "step": 7022, + "time_per_iteration": 2.684058904647827 + }, + { + "auxiliary_loss_clip": 0.01114766, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.04623675, + "balance_loss_mlp": 1.02081454, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.532971666102562, + "language_loss": 0.67334104, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69481808, + "num_input_tokens_seen": 150784945, + "step": 7023, + "time_per_iteration": 2.551090717315674 + }, + { + "auxiliary_loss_clip": 0.01101624, + "auxiliary_loss_mlp": 0.01046267, + "balance_loss_clip": 1.04561222, + "balance_loss_mlp": 1.02958989, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.5946197298472449, + "language_loss": 0.69420409, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71568298, + "num_input_tokens_seen": 150803120, + "step": 7024, + "time_per_iteration": 2.5109503269195557 + }, + { + "auxiliary_loss_clip": 0.01132194, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.04916477, + "balance_loss_mlp": 1.02458727, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.545972925183103, + "language_loss": 0.77015126, + "learning_rate": 2.591007664594147e-06, + "loss": 0.79186189, + "num_input_tokens_seen": 150823135, + "step": 7025, + "time_per_iteration": 2.46588134765625 + }, + { + "auxiliary_loss_clip": 0.0109642, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.04195797, + "balance_loss_mlp": 1.02278709, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.6398201589126544, + "language_loss": 0.79448426, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81581777, + "num_input_tokens_seen": 150842070, + "step": 7026, + "time_per_iteration": 2.5096840858459473 + }, + { + "auxiliary_loss_clip": 0.01040925, + "auxiliary_loss_mlp": 0.01002635, + "balance_loss_clip": 1.01338482, + "balance_loss_mlp": 1.00087094, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.720458792962744, + "language_loss": 0.61839724, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.63883281, + "num_input_tokens_seen": 150907450, + "step": 7027, + "time_per_iteration": 3.0937869548797607 + }, + { + "auxiliary_loss_clip": 0.01129948, + "auxiliary_loss_mlp": 0.01038011, + "balance_loss_clip": 1.04816341, + "balance_loss_mlp": 1.02390289, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 2.2386526707986945, + "language_loss": 0.71516478, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.73684442, + "num_input_tokens_seen": 150928040, + "step": 7028, + "time_per_iteration": 2.525692939758301 + }, + { + "auxiliary_loss_clip": 0.01107463, + "auxiliary_loss_mlp": 0.01044651, + "balance_loss_clip": 1.04686856, + "balance_loss_mlp": 1.03051293, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 1.9542589057320434, + "language_loss": 0.82388711, + "learning_rate": 2.589519209743846e-06, + "loss": 0.8454082, + "num_input_tokens_seen": 150945760, + "step": 7029, + "time_per_iteration": 2.502568006515503 + }, + { + "auxiliary_loss_clip": 0.01089692, + "auxiliary_loss_mlp": 0.01041686, + "balance_loss_clip": 1.04548001, + "balance_loss_mlp": 1.02686822, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 1.8740809135413456, + "language_loss": 0.75391281, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77522665, + "num_input_tokens_seen": 150965665, + "step": 7030, + "time_per_iteration": 2.601719379425049 + }, + { + "auxiliary_loss_clip": 0.01130574, + "auxiliary_loss_mlp": 0.01038141, + "balance_loss_clip": 1.04709184, + "balance_loss_mlp": 1.0226078, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 2.911457270918518, + "language_loss": 0.86785567, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88954282, + "num_input_tokens_seen": 150982260, + "step": 7031, + "time_per_iteration": 2.4580729007720947 + }, + { + "auxiliary_loss_clip": 0.01119213, + "auxiliary_loss_mlp": 0.01041544, + "balance_loss_clip": 1.04783309, + "balance_loss_mlp": 1.02614236, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 2.6508975701650144, + "language_loss": 0.7312113, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75281882, + "num_input_tokens_seen": 150999990, + "step": 7032, + "time_per_iteration": 2.441957950592041 + }, + { + "auxiliary_loss_clip": 0.01104509, + "auxiliary_loss_mlp": 0.01047348, + "balance_loss_clip": 1.04571748, + "balance_loss_mlp": 1.03182685, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.8156971282918157, + "language_loss": 0.70275211, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.7242707, + "num_input_tokens_seen": 151021105, + "step": 7033, + "time_per_iteration": 2.545156240463257 + }, + { + "auxiliary_loss_clip": 0.01108391, + "auxiliary_loss_mlp": 0.00780529, + "balance_loss_clip": 1.0460912, + "balance_loss_mlp": 1.00081694, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 1.7850799142955507, + "language_loss": 0.90454197, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92343122, + "num_input_tokens_seen": 151040665, + "step": 7034, + "time_per_iteration": 3.9763941764831543 + }, + { + "auxiliary_loss_clip": 0.01109211, + "auxiliary_loss_mlp": 0.01035198, + "balance_loss_clip": 1.04739738, + "balance_loss_mlp": 1.02180529, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.5523382165834738, + "language_loss": 0.77216977, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79361385, + "num_input_tokens_seen": 151061240, + "step": 7035, + "time_per_iteration": 2.55216121673584 + }, + { + "auxiliary_loss_clip": 0.0112071, + "auxiliary_loss_mlp": 0.01043636, + "balance_loss_clip": 1.05130148, + "balance_loss_mlp": 1.02884185, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 2.309798992560928, + "language_loss": 0.82317448, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84481788, + "num_input_tokens_seen": 151076870, + "step": 7036, + "time_per_iteration": 2.434377908706665 + }, + { + "auxiliary_loss_clip": 0.01107161, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.04973531, + "balance_loss_mlp": 1.02153611, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.8768559665584217, + "language_loss": 0.70756096, + "learning_rate": 2.58654122792447e-06, + "loss": 0.72898126, + "num_input_tokens_seen": 151095110, + "step": 7037, + "time_per_iteration": 4.096523761749268 + }, + { + "auxiliary_loss_clip": 0.01092842, + "auxiliary_loss_mlp": 0.00780723, + "balance_loss_clip": 1.04744625, + "balance_loss_mlp": 1.00078511, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.6041879526490037, + "language_loss": 0.78024894, + "learning_rate": 2.586168879961155e-06, + "loss": 0.79898465, + "num_input_tokens_seen": 151114355, + "step": 7038, + "time_per_iteration": 2.542903184890747 + }, + { + "auxiliary_loss_clip": 0.01099381, + "auxiliary_loss_mlp": 0.01040173, + "balance_loss_clip": 1.05209005, + "balance_loss_mlp": 1.02407932, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.782449632337458, + "language_loss": 0.66909981, + "learning_rate": 2.585796509770259e-06, + "loss": 0.69049537, + "num_input_tokens_seen": 151131505, + "step": 7039, + "time_per_iteration": 2.5385544300079346 + }, + { + "auxiliary_loss_clip": 0.01124858, + "auxiliary_loss_mlp": 0.01037222, + "balance_loss_clip": 1.04780006, + "balance_loss_mlp": 1.02181995, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 2.0749659379945156, + "language_loss": 0.75629586, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.77791667, + "num_input_tokens_seen": 151151555, + "step": 7040, + "time_per_iteration": 2.5092639923095703 + }, + { + "auxiliary_loss_clip": 0.01117366, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.04478776, + "balance_loss_mlp": 1.01674032, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 2.8934410432287816, + "language_loss": 0.64819574, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.6696775, + "num_input_tokens_seen": 151172385, + "step": 7041, + "time_per_iteration": 2.53183913230896 + }, + { + "auxiliary_loss_clip": 0.0110353, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.0432179, + "balance_loss_mlp": 1.02162457, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.6602011850567837, + "language_loss": 0.74279571, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.76419365, + "num_input_tokens_seen": 151194930, + "step": 7042, + "time_per_iteration": 4.075213193893433 + }, + { + "auxiliary_loss_clip": 0.01116514, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.04763913, + "balance_loss_mlp": 1.02030468, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.3212739702433285, + "language_loss": 0.82309818, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84460145, + "num_input_tokens_seen": 151217905, + "step": 7043, + "time_per_iteration": 2.5369794368743896 + }, + { + "auxiliary_loss_clip": 0.01109736, + "auxiliary_loss_mlp": 0.01043069, + "balance_loss_clip": 1.04942727, + "balance_loss_mlp": 1.02643895, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 2.61684692485378, + "language_loss": 0.65084314, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67237121, + "num_input_tokens_seen": 151234580, + "step": 7044, + "time_per_iteration": 2.5147836208343506 + }, + { + "auxiliary_loss_clip": 0.01117595, + "auxiliary_loss_mlp": 0.01049858, + "balance_loss_clip": 1.04667258, + "balance_loss_mlp": 1.03324056, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 10.946918682238584, + "language_loss": 0.75239229, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77406681, + "num_input_tokens_seen": 151254765, + "step": 7045, + "time_per_iteration": 2.614227056503296 + }, + { + "auxiliary_loss_clip": 0.01095938, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_clip": 1.04840612, + "balance_loss_mlp": 1.02882421, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.2923281775833004, + "language_loss": 0.80975091, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.83114624, + "num_input_tokens_seen": 151269045, + "step": 7046, + "time_per_iteration": 2.518573522567749 + }, + { + "auxiliary_loss_clip": 0.0105757, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.05157351, + "balance_loss_mlp": 1.0226692, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 1.7038527068348, + "language_loss": 0.76430357, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.78525376, + "num_input_tokens_seen": 151287530, + "step": 7047, + "time_per_iteration": 2.7427690029144287 + }, + { + "auxiliary_loss_clip": 0.01127697, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.04773748, + "balance_loss_mlp": 1.0176549, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 1.7943457087763806, + "language_loss": 0.68397355, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70556903, + "num_input_tokens_seen": 151308905, + "step": 7048, + "time_per_iteration": 2.6887688636779785 + }, + { + "auxiliary_loss_clip": 0.01117542, + "auxiliary_loss_mlp": 0.01042418, + "balance_loss_clip": 1.04467988, + "balance_loss_mlp": 1.02674162, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 2.82489415526321, + "language_loss": 0.77993679, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80153644, + "num_input_tokens_seen": 151326525, + "step": 7049, + "time_per_iteration": 2.4584426879882812 + }, + { + "auxiliary_loss_clip": 0.01121061, + "auxiliary_loss_mlp": 0.01041631, + "balance_loss_clip": 1.04736269, + "balance_loss_mlp": 1.02705133, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 2.6736336787750883, + "language_loss": 0.82347476, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.84510171, + "num_input_tokens_seen": 151344675, + "step": 7050, + "time_per_iteration": 2.4783289432525635 + }, + { + "auxiliary_loss_clip": 0.01127936, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.04411626, + "balance_loss_mlp": 1.02266407, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 2.100584319432896, + "language_loss": 0.72729993, + "learning_rate": 2.581326338868687e-06, + "loss": 0.74894947, + "num_input_tokens_seen": 151360730, + "step": 7051, + "time_per_iteration": 2.5194263458251953 + }, + { + "auxiliary_loss_clip": 0.0110218, + "auxiliary_loss_mlp": 0.01036642, + "balance_loss_clip": 1.050125, + "balance_loss_mlp": 1.02283192, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 2.2134260328161006, + "language_loss": 0.86363745, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.88502562, + "num_input_tokens_seen": 151380445, + "step": 7052, + "time_per_iteration": 2.570483446121216 + }, + { + "auxiliary_loss_clip": 0.01104269, + "auxiliary_loss_mlp": 0.01045983, + "balance_loss_clip": 1.04411745, + "balance_loss_mlp": 1.03123116, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.3660489114933434, + "language_loss": 0.72444856, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74595106, + "num_input_tokens_seen": 151399325, + "step": 7053, + "time_per_iteration": 2.51125431060791 + }, + { + "auxiliary_loss_clip": 0.01101964, + "auxiliary_loss_mlp": 0.00781451, + "balance_loss_clip": 1.04910827, + "balance_loss_mlp": 1.0008893, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 1.4968753368192804, + "language_loss": 0.81913352, + "learning_rate": 2.580208299200704e-06, + "loss": 0.83796763, + "num_input_tokens_seen": 151417240, + "step": 7054, + "time_per_iteration": 2.567037343978882 + }, + { + "auxiliary_loss_clip": 0.01034205, + "auxiliary_loss_mlp": 0.01004429, + "balance_loss_clip": 1.01749492, + "balance_loss_mlp": 1.00268888, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7799291678130924, + "language_loss": 0.60331607, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62370241, + "num_input_tokens_seen": 151476015, + "step": 7055, + "time_per_iteration": 3.024461030960083 + }, + { + "auxiliary_loss_clip": 0.01130399, + "auxiliary_loss_mlp": 0.01042413, + "balance_loss_clip": 1.04545939, + "balance_loss_mlp": 1.02701116, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.330860002302523, + "language_loss": 0.77200675, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.79373491, + "num_input_tokens_seen": 151492035, + "step": 7056, + "time_per_iteration": 2.4216597080230713 + }, + { + "auxiliary_loss_clip": 0.01123329, + "auxiliary_loss_mlp": 0.01038172, + "balance_loss_clip": 1.04597116, + "balance_loss_mlp": 1.02157855, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 1.9602519938415475, + "language_loss": 0.84517044, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86678547, + "num_input_tokens_seen": 151508970, + "step": 7057, + "time_per_iteration": 2.486682415008545 + }, + { + "auxiliary_loss_clip": 0.01097785, + "auxiliary_loss_mlp": 0.01038333, + "balance_loss_clip": 1.04870701, + "balance_loss_mlp": 1.02265739, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 2.681960374583361, + "language_loss": 0.82836258, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.84972376, + "num_input_tokens_seen": 151525295, + "step": 7058, + "time_per_iteration": 2.5260989665985107 + }, + { + "auxiliary_loss_clip": 0.01103531, + "auxiliary_loss_mlp": 0.0077913, + "balance_loss_clip": 1.04638672, + "balance_loss_mlp": 1.00083756, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 1.7117090552268306, + "language_loss": 0.80273062, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82155722, + "num_input_tokens_seen": 151544435, + "step": 7059, + "time_per_iteration": 4.238690376281738 + }, + { + "auxiliary_loss_clip": 0.0113211, + "auxiliary_loss_mlp": 0.01041788, + "balance_loss_clip": 1.04800606, + "balance_loss_mlp": 1.02577269, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 1.9677490794647015, + "language_loss": 0.69906384, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72080284, + "num_input_tokens_seen": 151559520, + "step": 7060, + "time_per_iteration": 2.420224905014038 + }, + { + "auxiliary_loss_clip": 0.01123373, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.04849577, + "balance_loss_mlp": 1.02305388, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 4.6633146432813914, + "language_loss": 0.76213539, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78374755, + "num_input_tokens_seen": 151579790, + "step": 7061, + "time_per_iteration": 2.510751962661743 + }, + { + "auxiliary_loss_clip": 0.0112188, + "auxiliary_loss_mlp": 0.01041481, + "balance_loss_clip": 1.04802465, + "balance_loss_mlp": 1.02542901, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 4.630994168879641, + "language_loss": 0.72577417, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.74740779, + "num_input_tokens_seen": 151598285, + "step": 7062, + "time_per_iteration": 2.458503246307373 + }, + { + "auxiliary_loss_clip": 0.01109117, + "auxiliary_loss_mlp": 0.01041618, + "balance_loss_clip": 1.05095983, + "balance_loss_mlp": 1.02715755, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 1.7783166678623756, + "language_loss": 0.66123474, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68274206, + "num_input_tokens_seen": 151615430, + "step": 7063, + "time_per_iteration": 2.5226380825042725 + }, + { + "auxiliary_loss_clip": 0.01098735, + "auxiliary_loss_mlp": 0.0078124, + "balance_loss_clip": 1.04432714, + "balance_loss_mlp": 1.00077868, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.4694666270403123, + "language_loss": 0.78475928, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80355906, + "num_input_tokens_seen": 151637030, + "step": 7064, + "time_per_iteration": 2.6192357540130615 + }, + { + "auxiliary_loss_clip": 0.01130016, + "auxiliary_loss_mlp": 0.01040062, + "balance_loss_clip": 1.04532838, + "balance_loss_mlp": 1.02502966, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 3.2561335515832805, + "language_loss": 0.75463516, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.77633595, + "num_input_tokens_seen": 151655745, + "step": 7065, + "time_per_iteration": 2.4437758922576904 + }, + { + "auxiliary_loss_clip": 0.01121414, + "auxiliary_loss_mlp": 0.01039511, + "balance_loss_clip": 1.04916692, + "balance_loss_mlp": 1.02410293, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.3387526655054163, + "language_loss": 0.72418475, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74579406, + "num_input_tokens_seen": 151678040, + "step": 7066, + "time_per_iteration": 2.5250818729400635 + }, + { + "auxiliary_loss_clip": 0.01095854, + "auxiliary_loss_mlp": 0.01037739, + "balance_loss_clip": 1.04514909, + "balance_loss_mlp": 1.02159202, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 2.0691539418961713, + "language_loss": 0.79723775, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.81857371, + "num_input_tokens_seen": 151696410, + "step": 7067, + "time_per_iteration": 2.5404911041259766 + }, + { + "auxiliary_loss_clip": 0.01042766, + "auxiliary_loss_mlp": 0.01001633, + "balance_loss_clip": 1.01562893, + "balance_loss_mlp": 1.00003552, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.9216166699773711, + "language_loss": 0.63457006, + "learning_rate": 2.574988168733022e-06, + "loss": 0.6550141, + "num_input_tokens_seen": 151756365, + "step": 7068, + "time_per_iteration": 2.9407806396484375 + }, + { + "auxiliary_loss_clip": 0.0113161, + "auxiliary_loss_mlp": 0.01038488, + "balance_loss_clip": 1.04774165, + "balance_loss_mlp": 1.02240682, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.700094780340549, + "language_loss": 0.72165012, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74335104, + "num_input_tokens_seen": 151775165, + "step": 7069, + "time_per_iteration": 2.437403917312622 + }, + { + "auxiliary_loss_clip": 0.01136201, + "auxiliary_loss_mlp": 0.01039469, + "balance_loss_clip": 1.05145907, + "balance_loss_mlp": 1.02308941, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 2.670370251942441, + "language_loss": 0.79116774, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81292439, + "num_input_tokens_seen": 151792620, + "step": 7070, + "time_per_iteration": 2.4293549060821533 + }, + { + "auxiliary_loss_clip": 0.0112125, + "auxiliary_loss_mlp": 0.01037855, + "balance_loss_clip": 1.04695702, + "balance_loss_mlp": 1.02191663, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 45.02376744752692, + "language_loss": 0.70409358, + "learning_rate": 2.573869012032795e-06, + "loss": 0.72568458, + "num_input_tokens_seen": 151812850, + "step": 7071, + "time_per_iteration": 2.5299460887908936 + }, + { + "auxiliary_loss_clip": 0.01133347, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.04991555, + "balance_loss_mlp": 1.01883852, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 9.331347904362977, + "language_loss": 0.71556354, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73723537, + "num_input_tokens_seen": 151831785, + "step": 7072, + "time_per_iteration": 2.473864793777466 + }, + { + "auxiliary_loss_clip": 0.01096161, + "auxiliary_loss_mlp": 0.01040401, + "balance_loss_clip": 1.04851127, + "balance_loss_mlp": 1.0254643, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 1.895969068993367, + "language_loss": 0.81184423, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83320987, + "num_input_tokens_seen": 151853885, + "step": 7073, + "time_per_iteration": 2.5788235664367676 + }, + { + "auxiliary_loss_clip": 0.0111732, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.04770613, + "balance_loss_mlp": 1.01872182, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.4102307628013486, + "language_loss": 0.9079954, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.92949867, + "num_input_tokens_seen": 151871780, + "step": 7074, + "time_per_iteration": 3.983189582824707 + }, + { + "auxiliary_loss_clip": 0.01125723, + "auxiliary_loss_mlp": 0.00782498, + "balance_loss_clip": 1.0488553, + "balance_loss_mlp": 1.00078988, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 1.600015080473096, + "language_loss": 0.64208722, + "learning_rate": 2.572376498508805e-06, + "loss": 0.66116941, + "num_input_tokens_seen": 151891600, + "step": 7075, + "time_per_iteration": 2.486799716949463 + }, + { + "auxiliary_loss_clip": 0.01092974, + "auxiliary_loss_mlp": 0.0103249, + "balance_loss_clip": 1.04426444, + "balance_loss_mlp": 1.01876903, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 2.8866823670495636, + "language_loss": 0.73531663, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75657129, + "num_input_tokens_seen": 151911330, + "step": 7076, + "time_per_iteration": 4.039397716522217 + }, + { + "auxiliary_loss_clip": 0.01108035, + "auxiliary_loss_mlp": 0.01040814, + "balance_loss_clip": 1.04345119, + "balance_loss_mlp": 1.02543616, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 2.0215300774947838, + "language_loss": 0.78862739, + "learning_rate": 2.571630111462766e-06, + "loss": 0.81011593, + "num_input_tokens_seen": 151930355, + "step": 7077, + "time_per_iteration": 2.543027400970459 + }, + { + "auxiliary_loss_clip": 0.01103618, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.04430056, + "balance_loss_mlp": 1.02122521, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 4.003226773207532, + "language_loss": 0.7322917, + "learning_rate": 2.571256885418265e-06, + "loss": 0.7536695, + "num_input_tokens_seen": 151949695, + "step": 7078, + "time_per_iteration": 2.522005319595337 + }, + { + "auxiliary_loss_clip": 0.01103645, + "auxiliary_loss_mlp": 0.01037519, + "balance_loss_clip": 1.0489161, + "balance_loss_mlp": 1.02455497, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.936884868443915, + "language_loss": 0.79662192, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.81803358, + "num_input_tokens_seen": 151967640, + "step": 7079, + "time_per_iteration": 2.487330675125122 + }, + { + "auxiliary_loss_clip": 0.01120852, + "auxiliary_loss_mlp": 0.01035182, + "balance_loss_clip": 1.05071163, + "balance_loss_mlp": 1.02164626, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.4912510140758597, + "language_loss": 0.7225408, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74410117, + "num_input_tokens_seen": 151994020, + "step": 7080, + "time_per_iteration": 2.8209829330444336 + }, + { + "auxiliary_loss_clip": 0.01129347, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.04653573, + "balance_loss_mlp": 1.02248549, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 1.97564605463084, + "language_loss": 0.80782604, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82948345, + "num_input_tokens_seen": 152013415, + "step": 7081, + "time_per_iteration": 3.916973829269409 + }, + { + "auxiliary_loss_clip": 0.01100815, + "auxiliary_loss_mlp": 0.01030123, + "balance_loss_clip": 1.04287434, + "balance_loss_mlp": 1.01616406, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.9863181284936817, + "language_loss": 0.81554019, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.83684957, + "num_input_tokens_seen": 152030860, + "step": 7082, + "time_per_iteration": 2.512493133544922 + }, + { + "auxiliary_loss_clip": 0.01119939, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.04851341, + "balance_loss_mlp": 1.02450609, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 2.1694292873079895, + "language_loss": 0.6995787, + "learning_rate": 2.569390430547065e-06, + "loss": 0.72117352, + "num_input_tokens_seen": 152050395, + "step": 7083, + "time_per_iteration": 2.5017831325531006 + }, + { + "auxiliary_loss_clip": 0.01031524, + "auxiliary_loss_mlp": 0.01007798, + "balance_loss_clip": 1.01348877, + "balance_loss_mlp": 1.00611675, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.881210705030514, + "language_loss": 0.67146111, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69185424, + "num_input_tokens_seen": 152113555, + "step": 7084, + "time_per_iteration": 3.1527950763702393 + }, + { + "auxiliary_loss_clip": 0.01118197, + "auxiliary_loss_mlp": 0.01041768, + "balance_loss_clip": 1.0469687, + "balance_loss_mlp": 1.02636003, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 1.9124322854048963, + "language_loss": 0.783687, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80528665, + "num_input_tokens_seen": 152131575, + "step": 7085, + "time_per_iteration": 2.4379377365112305 + }, + { + "auxiliary_loss_clip": 0.01121253, + "auxiliary_loss_mlp": 0.01042588, + "balance_loss_clip": 1.04702044, + "balance_loss_mlp": 1.02541578, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 2.3857983060845744, + "language_loss": 0.75483322, + "learning_rate": 2.568270298414995e-06, + "loss": 0.77647161, + "num_input_tokens_seen": 152149435, + "step": 7086, + "time_per_iteration": 2.4265029430389404 + }, + { + "auxiliary_loss_clip": 0.01108985, + "auxiliary_loss_mlp": 0.01037319, + "balance_loss_clip": 1.04624915, + "balance_loss_mlp": 1.0221076, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 1.952829171237505, + "language_loss": 0.80622965, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82769269, + "num_input_tokens_seen": 152166860, + "step": 7087, + "time_per_iteration": 2.481015205383301 + }, + { + "auxiliary_loss_clip": 0.01110783, + "auxiliary_loss_mlp": 0.01035787, + "balance_loss_clip": 1.04674864, + "balance_loss_mlp": 1.02046287, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.804487796641318, + "language_loss": 0.6560086, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.67747426, + "num_input_tokens_seen": 152187475, + "step": 7088, + "time_per_iteration": 2.5184409618377686 + }, + { + "auxiliary_loss_clip": 0.01077463, + "auxiliary_loss_mlp": 0.01040205, + "balance_loss_clip": 1.03862381, + "balance_loss_mlp": 1.02465987, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 4.782395633907585, + "language_loss": 0.6808238, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70200044, + "num_input_tokens_seen": 152207235, + "step": 7089, + "time_per_iteration": 2.607224464416504 + }, + { + "auxiliary_loss_clip": 0.01086944, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.04452717, + "balance_loss_mlp": 1.0215863, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 2.5016072681167834, + "language_loss": 0.7323159, + "learning_rate": 2.566776487287525e-06, + "loss": 0.75354987, + "num_input_tokens_seen": 152224240, + "step": 7090, + "time_per_iteration": 2.532832622528076 + }, + { + "auxiliary_loss_clip": 0.01110058, + "auxiliary_loss_mlp": 0.01043104, + "balance_loss_clip": 1.04389811, + "balance_loss_mlp": 1.0285244, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.9273784236242213, + "language_loss": 0.75429499, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77582663, + "num_input_tokens_seen": 152242595, + "step": 7091, + "time_per_iteration": 2.5733482837677 + }, + { + "auxiliary_loss_clip": 0.01080808, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.04741907, + "balance_loss_mlp": 1.0196414, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 1.699265706528013, + "language_loss": 0.82313281, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84426463, + "num_input_tokens_seen": 152260840, + "step": 7092, + "time_per_iteration": 2.5769457817077637 + }, + { + "auxiliary_loss_clip": 0.0110929, + "auxiliary_loss_mlp": 0.01040079, + "balance_loss_clip": 1.04366517, + "balance_loss_mlp": 1.02584505, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.6584814566357837, + "language_loss": 0.73983282, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76132649, + "num_input_tokens_seen": 152280580, + "step": 7093, + "time_per_iteration": 2.5841691493988037 + }, + { + "auxiliary_loss_clip": 0.01121264, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.04641366, + "balance_loss_mlp": 1.021451, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 2.572522942248605, + "language_loss": 0.70245433, + "learning_rate": 2.565282332284532e-06, + "loss": 0.72403538, + "num_input_tokens_seen": 152298455, + "step": 7094, + "time_per_iteration": 2.5163991451263428 + }, + { + "auxiliary_loss_clip": 0.01097141, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.04626799, + "balance_loss_mlp": 1.02480149, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.7689261661939943, + "language_loss": 0.81812012, + "learning_rate": 2.564908739909464e-06, + "loss": 0.83948839, + "num_input_tokens_seen": 152316995, + "step": 7095, + "time_per_iteration": 2.539215326309204 + }, + { + "auxiliary_loss_clip": 0.01130079, + "auxiliary_loss_mlp": 0.01048945, + "balance_loss_clip": 1.04638827, + "balance_loss_mlp": 1.0340085, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 2.4602016784496623, + "language_loss": 0.80310768, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82489789, + "num_input_tokens_seen": 152334800, + "step": 7096, + "time_per_iteration": 2.4364917278289795 + }, + { + "auxiliary_loss_clip": 0.01121418, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.0464921, + "balance_loss_mlp": 1.02444577, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 2.1041830368516834, + "language_loss": 0.65071547, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67231971, + "num_input_tokens_seen": 152355175, + "step": 7097, + "time_per_iteration": 2.5105414390563965 + }, + { + "auxiliary_loss_clip": 0.01100117, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.04744303, + "balance_loss_mlp": 1.01698637, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.7061610645190561, + "language_loss": 0.74320984, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76452649, + "num_input_tokens_seen": 152377245, + "step": 7098, + "time_per_iteration": 4.178308486938477 + }, + { + "auxiliary_loss_clip": 0.01119002, + "auxiliary_loss_mlp": 0.01033809, + "balance_loss_clip": 1.04687071, + "balance_loss_mlp": 1.02019501, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.641541113231235, + "language_loss": 0.75148773, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77301586, + "num_input_tokens_seen": 152396985, + "step": 7099, + "time_per_iteration": 2.5426313877105713 + }, + { + "auxiliary_loss_clip": 0.01109447, + "auxiliary_loss_mlp": 0.01048503, + "balance_loss_clip": 1.04380202, + "balance_loss_mlp": 1.03302383, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 1.9943523019498997, + "language_loss": 0.82119036, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.84276986, + "num_input_tokens_seen": 152415590, + "step": 7100, + "time_per_iteration": 2.5460097789764404 + }, + { + "auxiliary_loss_clip": 0.01108435, + "auxiliary_loss_mlp": 0.01033947, + "balance_loss_clip": 1.04470778, + "balance_loss_mlp": 1.02004695, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.407871376643185, + "language_loss": 0.81986845, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84129226, + "num_input_tokens_seen": 152436735, + "step": 7101, + "time_per_iteration": 2.5534653663635254 + }, + { + "auxiliary_loss_clip": 0.01135267, + "auxiliary_loss_mlp": 0.01032949, + "balance_loss_clip": 1.04911387, + "balance_loss_mlp": 1.0174396, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 1.8905542157791828, + "language_loss": 0.7309984, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.75268054, + "num_input_tokens_seen": 152455685, + "step": 7102, + "time_per_iteration": 2.4251291751861572 + }, + { + "auxiliary_loss_clip": 0.0111659, + "auxiliary_loss_mlp": 0.0103444, + "balance_loss_clip": 1.04566467, + "balance_loss_mlp": 1.02091527, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 2.01681928193611, + "language_loss": 0.82775199, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.84926224, + "num_input_tokens_seen": 152473500, + "step": 7103, + "time_per_iteration": 2.4450972080230713 + }, + { + "auxiliary_loss_clip": 0.01104746, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.04067349, + "balance_loss_mlp": 1.02529001, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 2.668400190533394, + "language_loss": 0.74015683, + "learning_rate": 2.561545446271294e-06, + "loss": 0.76162744, + "num_input_tokens_seen": 152491320, + "step": 7104, + "time_per_iteration": 2.4822041988372803 + }, + { + "auxiliary_loss_clip": 0.01115969, + "auxiliary_loss_mlp": 0.01035103, + "balance_loss_clip": 1.04423547, + "balance_loss_mlp": 1.0210247, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 2.3114710408377044, + "language_loss": 0.7544415, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77595222, + "num_input_tokens_seen": 152511970, + "step": 7105, + "time_per_iteration": 2.5541746616363525 + }, + { + "auxiliary_loss_clip": 0.0113275, + "auxiliary_loss_mlp": 0.01036354, + "balance_loss_clip": 1.04870462, + "balance_loss_mlp": 1.02253747, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 2.2479870136011475, + "language_loss": 0.76901793, + "learning_rate": 2.560797813088819e-06, + "loss": 0.7907089, + "num_input_tokens_seen": 152530515, + "step": 7106, + "time_per_iteration": 2.4363245964050293 + }, + { + "auxiliary_loss_clip": 0.01108927, + "auxiliary_loss_mlp": 0.0103521, + "balance_loss_clip": 1.04718518, + "balance_loss_mlp": 1.02195418, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.9634956609413639, + "language_loss": 0.80191803, + "learning_rate": 2.560423964592229e-06, + "loss": 0.82335943, + "num_input_tokens_seen": 152549295, + "step": 7107, + "time_per_iteration": 2.5244390964508057 + }, + { + "auxiliary_loss_clip": 0.01083517, + "auxiliary_loss_mlp": 0.01039635, + "balance_loss_clip": 1.04426312, + "balance_loss_mlp": 1.02515686, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.465573662420796, + "language_loss": 0.68384868, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70508015, + "num_input_tokens_seen": 152570725, + "step": 7108, + "time_per_iteration": 2.6255691051483154 + }, + { + "auxiliary_loss_clip": 0.01109131, + "auxiliary_loss_mlp": 0.01043354, + "balance_loss_clip": 1.04964805, + "balance_loss_mlp": 1.02949619, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.6325127718317087, + "language_loss": 0.71462917, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73615396, + "num_input_tokens_seen": 152588950, + "step": 7109, + "time_per_iteration": 2.511302947998047 + }, + { + "auxiliary_loss_clip": 0.01118557, + "auxiliary_loss_mlp": 0.01043181, + "balance_loss_clip": 1.04465747, + "balance_loss_mlp": 1.02636075, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 1.8664224427423821, + "language_loss": 0.64652658, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66814399, + "num_input_tokens_seen": 152608965, + "step": 7110, + "time_per_iteration": 2.531064510345459 + }, + { + "auxiliary_loss_clip": 0.01131917, + "auxiliary_loss_mlp": 0.00780117, + "balance_loss_clip": 1.04840386, + "balance_loss_mlp": 1.00058007, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 1.6007013621136277, + "language_loss": 0.76335973, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78248006, + "num_input_tokens_seen": 152630220, + "step": 7111, + "time_per_iteration": 2.488471031188965 + }, + { + "auxiliary_loss_clip": 0.01101109, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.04703474, + "balance_loss_mlp": 1.02127433, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 2.1938578994932265, + "language_loss": 0.73317111, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75454199, + "num_input_tokens_seen": 152648835, + "step": 7112, + "time_per_iteration": 2.5484273433685303 + }, + { + "auxiliary_loss_clip": 0.01104388, + "auxiliary_loss_mlp": 0.01040777, + "balance_loss_clip": 1.04456162, + "balance_loss_mlp": 1.02681124, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.6017104710202277, + "language_loss": 0.7152971, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.73674875, + "num_input_tokens_seen": 152668375, + "step": 7113, + "time_per_iteration": 4.062620401382446 + }, + { + "auxiliary_loss_clip": 0.01122916, + "auxiliary_loss_mlp": 0.01044838, + "balance_loss_clip": 1.04904234, + "balance_loss_mlp": 1.03041387, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 1.592887607028624, + "language_loss": 0.61744201, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.63911951, + "num_input_tokens_seen": 152689725, + "step": 7114, + "time_per_iteration": 2.515331745147705 + }, + { + "auxiliary_loss_clip": 0.0112731, + "auxiliary_loss_mlp": 0.01046496, + "balance_loss_clip": 1.04936743, + "balance_loss_mlp": 1.0291152, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 2.201831532610376, + "language_loss": 0.64477634, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.6665144, + "num_input_tokens_seen": 152709375, + "step": 7115, + "time_per_iteration": 2.5014820098876953 + }, + { + "auxiliary_loss_clip": 0.01104124, + "auxiliary_loss_mlp": 0.01038654, + "balance_loss_clip": 1.04214513, + "balance_loss_mlp": 1.02465248, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.663826416108933, + "language_loss": 0.73717123, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.75859892, + "num_input_tokens_seen": 152727510, + "step": 7116, + "time_per_iteration": 4.058711767196655 + }, + { + "auxiliary_loss_clip": 0.01101544, + "auxiliary_loss_mlp": 0.01045626, + "balance_loss_clip": 1.03990829, + "balance_loss_mlp": 1.03204811, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 3.007272999653295, + "language_loss": 0.69425583, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.71572751, + "num_input_tokens_seen": 152746670, + "step": 7117, + "time_per_iteration": 2.5920538902282715 + }, + { + "auxiliary_loss_clip": 0.0110727, + "auxiliary_loss_mlp": 0.01041758, + "balance_loss_clip": 1.04839313, + "balance_loss_mlp": 1.02780461, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 3.416812978657067, + "language_loss": 0.69756895, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.71905929, + "num_input_tokens_seen": 152760545, + "step": 7118, + "time_per_iteration": 2.4586241245269775 + }, + { + "auxiliary_loss_clip": 0.01091047, + "auxiliary_loss_mlp": 0.01049553, + "balance_loss_clip": 1.04290235, + "balance_loss_mlp": 1.03457987, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 2.1591020243998655, + "language_loss": 0.74633425, + "learning_rate": 2.55593612908444e-06, + "loss": 0.76774025, + "num_input_tokens_seen": 152780970, + "step": 7119, + "time_per_iteration": 2.637329339981079 + }, + { + "auxiliary_loss_clip": 0.01064699, + "auxiliary_loss_mlp": 0.01040009, + "balance_loss_clip": 1.03756654, + "balance_loss_mlp": 1.02535248, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 2.055208562086449, + "language_loss": 0.75091487, + "learning_rate": 2.555562005426573e-06, + "loss": 0.77196193, + "num_input_tokens_seen": 152798475, + "step": 7120, + "time_per_iteration": 2.5865657329559326 + }, + { + "auxiliary_loss_clip": 0.01109359, + "auxiliary_loss_mlp": 0.00779835, + "balance_loss_clip": 1.04936242, + "balance_loss_mlp": 1.00053847, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 1.5470922314398188, + "language_loss": 0.7686367, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.78752863, + "num_input_tokens_seen": 152817555, + "step": 7121, + "time_per_iteration": 3.890044689178467 + }, + { + "auxiliary_loss_clip": 0.01105768, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.04497695, + "balance_loss_mlp": 1.02693439, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 2.1306862861106675, + "language_loss": 0.85333538, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87479758, + "num_input_tokens_seen": 152836295, + "step": 7122, + "time_per_iteration": 2.5046377182006836 + }, + { + "auxiliary_loss_clip": 0.01079465, + "auxiliary_loss_mlp": 0.01038589, + "balance_loss_clip": 1.04287601, + "balance_loss_mlp": 1.0240159, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 1.967756999887785, + "language_loss": 0.81182301, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83300352, + "num_input_tokens_seen": 152854950, + "step": 7123, + "time_per_iteration": 2.5636823177337646 + }, + { + "auxiliary_loss_clip": 0.01087729, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.0468024, + "balance_loss_mlp": 1.02347386, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 1.7542661447942427, + "language_loss": 0.8104291, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83168101, + "num_input_tokens_seen": 152873995, + "step": 7124, + "time_per_iteration": 2.5359160900115967 + }, + { + "auxiliary_loss_clip": 0.01117935, + "auxiliary_loss_mlp": 0.01040243, + "balance_loss_clip": 1.04501891, + "balance_loss_mlp": 1.02552676, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 1.7666017151653113, + "language_loss": 0.80552745, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82710922, + "num_input_tokens_seen": 152892925, + "step": 7125, + "time_per_iteration": 2.487060546875 + }, + { + "auxiliary_loss_clip": 0.01127126, + "auxiliary_loss_mlp": 0.00779422, + "balance_loss_clip": 1.04698956, + "balance_loss_mlp": 1.00045776, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 2.5415102840707635, + "language_loss": 0.74496853, + "learning_rate": 2.553316821569659e-06, + "loss": 0.76403403, + "num_input_tokens_seen": 152910935, + "step": 7126, + "time_per_iteration": 2.4192819595336914 + }, + { + "auxiliary_loss_clip": 0.01117533, + "auxiliary_loss_mlp": 0.01038897, + "balance_loss_clip": 1.04680729, + "balance_loss_mlp": 1.02425253, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 1.5501432027250233, + "language_loss": 0.8125847, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.834149, + "num_input_tokens_seen": 152931030, + "step": 7127, + "time_per_iteration": 2.4940602779388428 + }, + { + "auxiliary_loss_clip": 0.01082421, + "auxiliary_loss_mlp": 0.01041541, + "balance_loss_clip": 1.04444814, + "balance_loss_mlp": 1.027915, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 2.9926436638257368, + "language_loss": 0.76205194, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78329158, + "num_input_tokens_seen": 152948085, + "step": 7128, + "time_per_iteration": 2.5423121452331543 + }, + { + "auxiliary_loss_clip": 0.01085559, + "auxiliary_loss_mlp": 0.01037472, + "balance_loss_clip": 1.04483628, + "balance_loss_mlp": 1.02265453, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 2.1240524178518294, + "language_loss": 0.73728806, + "learning_rate": 2.552193946194937e-06, + "loss": 0.75851834, + "num_input_tokens_seen": 152966265, + "step": 7129, + "time_per_iteration": 2.581592321395874 + }, + { + "auxiliary_loss_clip": 0.01121463, + "auxiliary_loss_mlp": 0.00778924, + "balance_loss_clip": 1.04983664, + "balance_loss_mlp": 1.00052667, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 1.7594477837636069, + "language_loss": 0.77991295, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79891682, + "num_input_tokens_seen": 152986775, + "step": 7130, + "time_per_iteration": 2.514195442199707 + }, + { + "auxiliary_loss_clip": 0.01111629, + "auxiliary_loss_mlp": 0.01041724, + "balance_loss_clip": 1.04988182, + "balance_loss_mlp": 1.02713835, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 2.3116707291556216, + "language_loss": 0.73131543, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75284892, + "num_input_tokens_seen": 153003595, + "step": 7131, + "time_per_iteration": 2.466663122177124 + }, + { + "auxiliary_loss_clip": 0.01109411, + "auxiliary_loss_mlp": 0.0103626, + "balance_loss_clip": 1.04560542, + "balance_loss_mlp": 1.02150762, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 2.137611574686312, + "language_loss": 0.77516377, + "learning_rate": 2.551070882366973e-06, + "loss": 0.79662049, + "num_input_tokens_seen": 153021960, + "step": 7132, + "time_per_iteration": 2.501575469970703 + }, + { + "auxiliary_loss_clip": 0.01090159, + "auxiliary_loss_mlp": 0.00780077, + "balance_loss_clip": 1.04483008, + "balance_loss_mlp": 1.00045168, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 1.6941658033101528, + "language_loss": 0.78345025, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80215263, + "num_input_tokens_seen": 153042110, + "step": 7133, + "time_per_iteration": 2.5812296867370605 + }, + { + "auxiliary_loss_clip": 0.01109849, + "auxiliary_loss_mlp": 0.01038899, + "balance_loss_clip": 1.0528276, + "balance_loss_mlp": 1.0246892, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 17.999556839080583, + "language_loss": 0.75121832, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77270579, + "num_input_tokens_seen": 153058925, + "step": 7134, + "time_per_iteration": 2.471390724182129 + }, + { + "auxiliary_loss_clip": 0.01111628, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.04393983, + "balance_loss_mlp": 1.03155267, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 1.9264591706045995, + "language_loss": 0.84276354, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.86433911, + "num_input_tokens_seen": 153078070, + "step": 7135, + "time_per_iteration": 2.448888063430786 + }, + { + "auxiliary_loss_clip": 0.01060284, + "auxiliary_loss_mlp": 0.01042411, + "balance_loss_clip": 1.04071283, + "balance_loss_mlp": 1.02656841, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 1.9299140533246029, + "language_loss": 0.7462635, + "learning_rate": 2.549573171442666e-06, + "loss": 0.76729047, + "num_input_tokens_seen": 153096680, + "step": 7136, + "time_per_iteration": 2.6424412727355957 + }, + { + "auxiliary_loss_clip": 0.01115234, + "auxiliary_loss_mlp": 0.01036935, + "balance_loss_clip": 1.0466907, + "balance_loss_mlp": 1.02264762, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 1.7770092628634977, + "language_loss": 0.79014635, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81166804, + "num_input_tokens_seen": 153113305, + "step": 7137, + "time_per_iteration": 2.4507315158843994 + }, + { + "auxiliary_loss_clip": 0.01133429, + "auxiliary_loss_mlp": 0.01037986, + "balance_loss_clip": 1.04976439, + "balance_loss_mlp": 1.02313232, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 1.8828215214768769, + "language_loss": 0.75954247, + "learning_rate": 2.548824190884499e-06, + "loss": 0.78125656, + "num_input_tokens_seen": 153132735, + "step": 7138, + "time_per_iteration": 3.9417521953582764 + }, + { + "auxiliary_loss_clip": 0.01033566, + "auxiliary_loss_mlp": 0.01004452, + "balance_loss_clip": 1.02453351, + "balance_loss_mlp": 1.00261629, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.7794904139659236, + "language_loss": 0.56150192, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58188206, + "num_input_tokens_seen": 153187925, + "step": 7139, + "time_per_iteration": 2.969416618347168 + }, + { + "auxiliary_loss_clip": 0.0112393, + "auxiliary_loss_mlp": 0.00777823, + "balance_loss_clip": 1.04519069, + "balance_loss_mlp": 1.00042713, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 1.7804064975941194, + "language_loss": 0.80907667, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.82809424, + "num_input_tokens_seen": 153206990, + "step": 7140, + "time_per_iteration": 2.4567558765411377 + }, + { + "auxiliary_loss_clip": 0.01115095, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.04416215, + "balance_loss_mlp": 1.01860988, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 2.096692416533438, + "language_loss": 0.82083333, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84231603, + "num_input_tokens_seen": 153222345, + "step": 7141, + "time_per_iteration": 2.434901714324951 + }, + { + "auxiliary_loss_clip": 0.01120919, + "auxiliary_loss_mlp": 0.01039681, + "balance_loss_clip": 1.04854441, + "balance_loss_mlp": 1.02488732, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 2.082200198162827, + "language_loss": 0.86542022, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88702619, + "num_input_tokens_seen": 153240570, + "step": 7142, + "time_per_iteration": 2.4969534873962402 + }, + { + "auxiliary_loss_clip": 0.01102801, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.0474875, + "balance_loss_mlp": 1.01734161, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 1.9181170026551755, + "language_loss": 0.78702366, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80836046, + "num_input_tokens_seen": 153259575, + "step": 7143, + "time_per_iteration": 2.5576553344726562 + }, + { + "auxiliary_loss_clip": 0.01081355, + "auxiliary_loss_mlp": 0.0103728, + "balance_loss_clip": 1.04276347, + "balance_loss_mlp": 1.02296281, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 1.9919648915063244, + "language_loss": 0.77369165, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.79487801, + "num_input_tokens_seen": 153276650, + "step": 7144, + "time_per_iteration": 2.561828136444092 + }, + { + "auxiliary_loss_clip": 0.01101528, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.04602337, + "balance_loss_mlp": 1.01844811, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 1.623607293394533, + "language_loss": 0.73617709, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75751573, + "num_input_tokens_seen": 153298025, + "step": 7145, + "time_per_iteration": 2.606219530105591 + }, + { + "auxiliary_loss_clip": 0.01118978, + "auxiliary_loss_mlp": 0.01037694, + "balance_loss_clip": 1.04700685, + "balance_loss_mlp": 1.02299547, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 2.0029913042171477, + "language_loss": 0.78992057, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81148732, + "num_input_tokens_seen": 153315775, + "step": 7146, + "time_per_iteration": 2.4606730937957764 + }, + { + "auxiliary_loss_clip": 0.01112618, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.04432285, + "balance_loss_mlp": 1.01998723, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 2.7251340106680106, + "language_loss": 0.83086556, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85232198, + "num_input_tokens_seen": 153332765, + "step": 7147, + "time_per_iteration": 2.450446844100952 + }, + { + "auxiliary_loss_clip": 0.01120296, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.05348325, + "balance_loss_mlp": 1.02603245, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 2.1448492649928586, + "language_loss": 0.87377024, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89539003, + "num_input_tokens_seen": 153350760, + "step": 7148, + "time_per_iteration": 2.4892637729644775 + }, + { + "auxiliary_loss_clip": 0.01100697, + "auxiliary_loss_mlp": 0.01038895, + "balance_loss_clip": 1.04177332, + "balance_loss_mlp": 1.02489436, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 2.176773042044082, + "language_loss": 0.78180081, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80319673, + "num_input_tokens_seen": 153370765, + "step": 7149, + "time_per_iteration": 2.612532615661621 + }, + { + "auxiliary_loss_clip": 0.0108576, + "auxiliary_loss_mlp": 0.0103958, + "balance_loss_clip": 1.03931451, + "balance_loss_mlp": 1.025388, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 2.128892728693792, + "language_loss": 0.79746407, + "learning_rate": 2.544328563349256e-06, + "loss": 0.81871742, + "num_input_tokens_seen": 153390725, + "step": 7150, + "time_per_iteration": 2.57448410987854 + }, + { + "auxiliary_loss_clip": 0.01123813, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.04944313, + "balance_loss_mlp": 1.02499342, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.8445945678075282, + "language_loss": 0.75044966, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.77210563, + "num_input_tokens_seen": 153408010, + "step": 7151, + "time_per_iteration": 2.438249349594116 + }, + { + "auxiliary_loss_clip": 0.01088444, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.04386306, + "balance_loss_mlp": 1.02103055, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 1.8339674771657526, + "language_loss": 0.70429051, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72553855, + "num_input_tokens_seen": 153426865, + "step": 7152, + "time_per_iteration": 3.9456872940063477 + }, + { + "auxiliary_loss_clip": 0.01107972, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.04675257, + "balance_loss_mlp": 1.02641177, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 2.1239687999154784, + "language_loss": 0.71435702, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73583663, + "num_input_tokens_seen": 153449410, + "step": 7153, + "time_per_iteration": 2.6223840713500977 + }, + { + "auxiliary_loss_clip": 0.01114183, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.04270363, + "balance_loss_mlp": 1.021842, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 2.280231701044002, + "language_loss": 0.78367519, + "learning_rate": 2.542829359113276e-06, + "loss": 0.8051818, + "num_input_tokens_seen": 153467910, + "step": 7154, + "time_per_iteration": 2.4479007720947266 + }, + { + "auxiliary_loss_clip": 0.01098892, + "auxiliary_loss_mlp": 0.01042889, + "balance_loss_clip": 1.04622602, + "balance_loss_mlp": 1.02842319, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.4407181583331976, + "language_loss": 0.78749537, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80891317, + "num_input_tokens_seen": 153487100, + "step": 7155, + "time_per_iteration": 3.9989013671875 + }, + { + "auxiliary_loss_clip": 0.01104735, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.05031705, + "balance_loss_mlp": 1.01851726, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 1.9141615273567647, + "language_loss": 0.88655144, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.9079119, + "num_input_tokens_seen": 153505565, + "step": 7156, + "time_per_iteration": 2.515187978744507 + }, + { + "auxiliary_loss_clip": 0.01129914, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.04615617, + "balance_loss_mlp": 1.01993906, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 2.76668972904153, + "language_loss": 0.82761341, + "learning_rate": 2.541704739753042e-06, + "loss": 0.84925812, + "num_input_tokens_seen": 153526130, + "step": 7157, + "time_per_iteration": 2.473417043685913 + }, + { + "auxiliary_loss_clip": 0.01132886, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.0483768, + "balance_loss_mlp": 1.02413344, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 6.980221064793525, + "language_loss": 0.72052145, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.74224079, + "num_input_tokens_seen": 153546370, + "step": 7158, + "time_per_iteration": 2.4527482986450195 + }, + { + "auxiliary_loss_clip": 0.0111947, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.04822922, + "balance_loss_mlp": 1.02108312, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 1.8468868761344444, + "language_loss": 0.82541609, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.84695935, + "num_input_tokens_seen": 153562800, + "step": 7159, + "time_per_iteration": 2.4290552139282227 + }, + { + "auxiliary_loss_clip": 0.01101427, + "auxiliary_loss_mlp": 0.01039074, + "balance_loss_clip": 1.04212427, + "balance_loss_mlp": 1.02452409, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.2220007238649107, + "language_loss": 0.83211046, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85351545, + "num_input_tokens_seen": 153578395, + "step": 7160, + "time_per_iteration": 3.897012233734131 + }, + { + "auxiliary_loss_clip": 0.01122104, + "auxiliary_loss_mlp": 0.01040387, + "balance_loss_clip": 1.04722536, + "balance_loss_mlp": 1.0240072, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 2.8350768444532752, + "language_loss": 0.77053744, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.7921623, + "num_input_tokens_seen": 153596880, + "step": 7161, + "time_per_iteration": 2.4806694984436035 + }, + { + "auxiliary_loss_clip": 0.01116427, + "auxiliary_loss_mlp": 0.01040722, + "balance_loss_clip": 1.04426229, + "balance_loss_mlp": 1.02663755, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 2.80960473604888, + "language_loss": 0.72624731, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.74781883, + "num_input_tokens_seen": 153616570, + "step": 7162, + "time_per_iteration": 2.492805004119873 + }, + { + "auxiliary_loss_clip": 0.01022237, + "auxiliary_loss_mlp": 0.00754588, + "balance_loss_clip": 1.02801156, + "balance_loss_mlp": 0.99997443, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.7937010461282311, + "language_loss": 0.58995211, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.60772038, + "num_input_tokens_seen": 153671450, + "step": 7163, + "time_per_iteration": 2.9852495193481445 + }, + { + "auxiliary_loss_clip": 0.01100238, + "auxiliary_loss_mlp": 0.01044228, + "balance_loss_clip": 1.04244232, + "balance_loss_mlp": 1.02958918, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.7603454775638265, + "language_loss": 0.78976119, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81120586, + "num_input_tokens_seen": 153691405, + "step": 7164, + "time_per_iteration": 2.5678822994232178 + }, + { + "auxiliary_loss_clip": 0.0113158, + "auxiliary_loss_mlp": 0.01039905, + "balance_loss_clip": 1.04709136, + "balance_loss_mlp": 1.02574313, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 2.7940600540378453, + "language_loss": 0.67517149, + "learning_rate": 2.538704852009177e-06, + "loss": 0.69688636, + "num_input_tokens_seen": 153711555, + "step": 7165, + "time_per_iteration": 2.488884210586548 + }, + { + "auxiliary_loss_clip": 0.01101288, + "auxiliary_loss_mlp": 0.00779097, + "balance_loss_clip": 1.04759121, + "balance_loss_mlp": 1.00047231, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 1.9716531865651963, + "language_loss": 0.75027514, + "learning_rate": 2.538329773967034e-06, + "loss": 0.76907891, + "num_input_tokens_seen": 153730095, + "step": 7166, + "time_per_iteration": 2.4989306926727295 + }, + { + "auxiliary_loss_clip": 0.01117487, + "auxiliary_loss_mlp": 0.01039947, + "balance_loss_clip": 1.04856324, + "balance_loss_mlp": 1.02700138, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.7616236128331557, + "language_loss": 0.71817386, + "learning_rate": 2.537954675511372e-06, + "loss": 0.73974824, + "num_input_tokens_seen": 153749320, + "step": 7167, + "time_per_iteration": 2.504978895187378 + }, + { + "auxiliary_loss_clip": 0.01098391, + "auxiliary_loss_mlp": 0.00778329, + "balance_loss_clip": 1.0453335, + "balance_loss_mlp": 1.00047278, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.5724568969377686, + "language_loss": 0.78450567, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80327284, + "num_input_tokens_seen": 153767825, + "step": 7168, + "time_per_iteration": 2.5035605430603027 + }, + { + "auxiliary_loss_clip": 0.01108006, + "auxiliary_loss_mlp": 0.01042038, + "balance_loss_clip": 1.04771519, + "balance_loss_mlp": 1.02781677, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 4.499416214994332, + "language_loss": 0.82291162, + "learning_rate": 2.537204417416387e-06, + "loss": 0.84441209, + "num_input_tokens_seen": 153785350, + "step": 7169, + "time_per_iteration": 2.494555711746216 + }, + { + "auxiliary_loss_clip": 0.0103579, + "auxiliary_loss_mlp": 0.01004026, + "balance_loss_clip": 1.0273993, + "balance_loss_mlp": 1.00227404, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.6754312289469387, + "language_loss": 0.60763836, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.62803656, + "num_input_tokens_seen": 153856400, + "step": 7170, + "time_per_iteration": 3.2370963096618652 + }, + { + "auxiliary_loss_clip": 0.01129692, + "auxiliary_loss_mlp": 0.01035651, + "balance_loss_clip": 1.04833651, + "balance_loss_mlp": 1.02265096, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 1.7423948214497913, + "language_loss": 0.75919896, + "learning_rate": 2.536454077838021e-06, + "loss": 0.78085244, + "num_input_tokens_seen": 153875230, + "step": 7171, + "time_per_iteration": 2.4311628341674805 + }, + { + "auxiliary_loss_clip": 0.01115834, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.04662871, + "balance_loss_mlp": 1.02140927, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.544954225560454, + "language_loss": 0.7761761, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.79767358, + "num_input_tokens_seen": 153894740, + "step": 7172, + "time_per_iteration": 2.51578950881958 + }, + { + "auxiliary_loss_clip": 0.01102616, + "auxiliary_loss_mlp": 0.01042884, + "balance_loss_clip": 1.04358816, + "balance_loss_mlp": 1.02656412, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 2.016645057138269, + "language_loss": 0.76671708, + "learning_rate": 2.535703656890086e-06, + "loss": 0.78817213, + "num_input_tokens_seen": 153913230, + "step": 7173, + "time_per_iteration": 2.4966161251068115 + }, + { + "auxiliary_loss_clip": 0.01127634, + "auxiliary_loss_mlp": 0.00780245, + "balance_loss_clip": 1.04745042, + "balance_loss_mlp": 1.00049305, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.5059034224675634, + "language_loss": 0.77093428, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79001307, + "num_input_tokens_seen": 153933250, + "step": 7174, + "time_per_iteration": 2.450321912765503 + }, + { + "auxiliary_loss_clip": 0.01128131, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.04561687, + "balance_loss_mlp": 1.01789045, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.554030993148229, + "language_loss": 0.82573807, + "learning_rate": 2.534953154686407e-06, + "loss": 0.84734714, + "num_input_tokens_seen": 153951325, + "step": 7175, + "time_per_iteration": 2.4106922149658203 + }, + { + "auxiliary_loss_clip": 0.01086602, + "auxiliary_loss_mlp": 0.01043572, + "balance_loss_clip": 1.04132485, + "balance_loss_mlp": 1.02675176, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.3538147843222568, + "language_loss": 0.74314576, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.76444751, + "num_input_tokens_seen": 153966975, + "step": 7176, + "time_per_iteration": 2.492126941680908 + }, + { + "auxiliary_loss_clip": 0.01117977, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.04352689, + "balance_loss_mlp": 1.0169611, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.514629071549164, + "language_loss": 0.73331255, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75479865, + "num_input_tokens_seen": 153986695, + "step": 7177, + "time_per_iteration": 4.00244927406311 + }, + { + "auxiliary_loss_clip": 0.01111802, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_clip": 1.04440165, + "balance_loss_mlp": 1.02561903, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 2.2558414960625153, + "language_loss": 0.81665111, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83818728, + "num_input_tokens_seen": 154004710, + "step": 7178, + "time_per_iteration": 2.5097849369049072 + }, + { + "auxiliary_loss_clip": 0.01105965, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.04832137, + "balance_loss_mlp": 1.01895094, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.5164727892095171, + "language_loss": 0.84171247, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86309159, + "num_input_tokens_seen": 154024320, + "step": 7179, + "time_per_iteration": 2.6127145290374756 + }, + { + "auxiliary_loss_clip": 0.01102444, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.0432148, + "balance_loss_mlp": 1.02092814, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 1.6566574930233895, + "language_loss": 0.74918044, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77055323, + "num_input_tokens_seen": 154041755, + "step": 7180, + "time_per_iteration": 2.5559639930725098 + }, + { + "auxiliary_loss_clip": 0.01101613, + "auxiliary_loss_mlp": 0.00785044, + "balance_loss_clip": 1.03988314, + "balance_loss_mlp": 1.00059271, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 4.514301068237331, + "language_loss": 0.82053876, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.83940542, + "num_input_tokens_seen": 154056775, + "step": 7181, + "time_per_iteration": 2.5661885738372803 + }, + { + "auxiliary_loss_clip": 0.01105987, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.04545355, + "balance_loss_mlp": 1.02170777, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.7104805483655017, + "language_loss": 0.88954604, + "learning_rate": 2.532325758728165e-06, + "loss": 0.91097111, + "num_input_tokens_seen": 154075015, + "step": 7182, + "time_per_iteration": 2.5032787322998047 + }, + { + "auxiliary_loss_clip": 0.01115947, + "auxiliary_loss_mlp": 0.00779386, + "balance_loss_clip": 1.04697204, + "balance_loss_mlp": 1.00043821, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.879262195897856, + "language_loss": 0.75888377, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.7778371, + "num_input_tokens_seen": 154095170, + "step": 7183, + "time_per_iteration": 2.490638494491577 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.04608822, + "balance_loss_mlp": 1.02127361, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 1.7007323800601635, + "language_loss": 0.77920949, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.80072534, + "num_input_tokens_seen": 154116895, + "step": 7184, + "time_per_iteration": 2.509918212890625 + }, + { + "auxiliary_loss_clip": 0.01098756, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.04287148, + "balance_loss_mlp": 1.02314818, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 1.8111503488414973, + "language_loss": 0.73346591, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75482011, + "num_input_tokens_seen": 154138395, + "step": 7185, + "time_per_iteration": 2.5916073322296143 + }, + { + "auxiliary_loss_clip": 0.01112436, + "auxiliary_loss_mlp": 0.01036461, + "balance_loss_clip": 1.04592013, + "balance_loss_mlp": 1.02161956, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.661489532205599, + "language_loss": 0.75400358, + "learning_rate": 2.530823945207421e-06, + "loss": 0.77549255, + "num_input_tokens_seen": 154156775, + "step": 7186, + "time_per_iteration": 2.5305213928222656 + }, + { + "auxiliary_loss_clip": 0.010935, + "auxiliary_loss_mlp": 0.01034065, + "balance_loss_clip": 1.04527819, + "balance_loss_mlp": 1.0208391, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 4.904865724403923, + "language_loss": 0.76084262, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78211832, + "num_input_tokens_seen": 154177500, + "step": 7187, + "time_per_iteration": 2.5524895191192627 + }, + { + "auxiliary_loss_clip": 0.01034113, + "auxiliary_loss_mlp": 0.01006595, + "balance_loss_clip": 1.04104233, + "balance_loss_mlp": 1.0043416, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8608421489307789, + "language_loss": 0.68175691, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70216399, + "num_input_tokens_seen": 154237110, + "step": 7188, + "time_per_iteration": 3.131662368774414 + }, + { + "auxiliary_loss_clip": 0.01095599, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.04179502, + "balance_loss_mlp": 1.02249479, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 1.8879329949950778, + "language_loss": 0.78210878, + "learning_rate": 2.529697373663614e-06, + "loss": 0.8034206, + "num_input_tokens_seen": 154253910, + "step": 7189, + "time_per_iteration": 2.4850757122039795 + }, + { + "auxiliary_loss_clip": 0.0107738, + "auxiliary_loss_mlp": 0.01042908, + "balance_loss_clip": 1.04228806, + "balance_loss_mlp": 1.02809632, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 2.1385384312385916, + "language_loss": 0.71347225, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73467517, + "num_input_tokens_seen": 154274770, + "step": 7190, + "time_per_iteration": 2.593230962753296 + }, + { + "auxiliary_loss_clip": 0.01103127, + "auxiliary_loss_mlp": 0.01039494, + "balance_loss_clip": 1.04344964, + "balance_loss_mlp": 1.02610111, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 2.028640899652496, + "language_loss": 0.79837728, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81980348, + "num_input_tokens_seen": 154295035, + "step": 7191, + "time_per_iteration": 4.077075481414795 + }, + { + "auxiliary_loss_clip": 0.0108405, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.04711568, + "balance_loss_mlp": 1.0195297, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.6725770855588489, + "language_loss": 0.74923658, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.77040565, + "num_input_tokens_seen": 154314905, + "step": 7192, + "time_per_iteration": 2.581423282623291 + }, + { + "auxiliary_loss_clip": 0.01084829, + "auxiliary_loss_mlp": 0.01050028, + "balance_loss_clip": 1.04268312, + "balance_loss_mlp": 1.03455496, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 2.2682258481032602, + "language_loss": 0.79102111, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.8123697, + "num_input_tokens_seen": 154331740, + "step": 7193, + "time_per_iteration": 2.548276662826538 + }, + { + "auxiliary_loss_clip": 0.01107015, + "auxiliary_loss_mlp": 0.01042236, + "balance_loss_clip": 1.04359746, + "balance_loss_mlp": 1.02755523, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.803332284789359, + "language_loss": 0.7571733, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.77866578, + "num_input_tokens_seen": 154348740, + "step": 7194, + "time_per_iteration": 4.103496074676514 + }, + { + "auxiliary_loss_clip": 0.01126518, + "auxiliary_loss_mlp": 0.0103892, + "balance_loss_clip": 1.04589176, + "balance_loss_mlp": 1.02487683, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 3.207497049902051, + "language_loss": 0.59703779, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.61869216, + "num_input_tokens_seen": 154368835, + "step": 7195, + "time_per_iteration": 2.491837501525879 + }, + { + "auxiliary_loss_clip": 0.01105465, + "auxiliary_loss_mlp": 0.01036463, + "balance_loss_clip": 1.0426687, + "balance_loss_mlp": 1.0217942, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 1.9727916818375648, + "language_loss": 0.65442336, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67584264, + "num_input_tokens_seen": 154384620, + "step": 7196, + "time_per_iteration": 2.4696807861328125 + }, + { + "auxiliary_loss_clip": 0.01131396, + "auxiliary_loss_mlp": 0.01040642, + "balance_loss_clip": 1.04666328, + "balance_loss_mlp": 1.02584767, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 2.0524688794435733, + "language_loss": 0.7249316, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74665201, + "num_input_tokens_seen": 154402865, + "step": 7197, + "time_per_iteration": 2.528085708618164 + }, + { + "auxiliary_loss_clip": 0.01112615, + "auxiliary_loss_mlp": 0.01041837, + "balance_loss_clip": 1.04498696, + "balance_loss_mlp": 1.02771688, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.8407083224623892, + "language_loss": 0.72727787, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.74882245, + "num_input_tokens_seen": 154423625, + "step": 7198, + "time_per_iteration": 2.5038504600524902 + }, + { + "auxiliary_loss_clip": 0.01089088, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.04123473, + "balance_loss_mlp": 1.0209167, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.3373536004593631, + "language_loss": 0.8129276, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83416092, + "num_input_tokens_seen": 154444775, + "step": 7199, + "time_per_iteration": 4.0557475090026855 + }, + { + "auxiliary_loss_clip": 0.01109761, + "auxiliary_loss_mlp": 0.0103636, + "balance_loss_clip": 1.04511404, + "balance_loss_mlp": 1.02336597, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 2.3126594020734808, + "language_loss": 0.68792456, + "learning_rate": 2.525565067625286e-06, + "loss": 0.70938575, + "num_input_tokens_seen": 154460815, + "step": 7200, + "time_per_iteration": 2.5594918727874756 + }, + { + "auxiliary_loss_clip": 0.01103628, + "auxiliary_loss_mlp": 0.00780602, + "balance_loss_clip": 1.04294443, + "balance_loss_mlp": 1.0003891, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 1.9324958311827347, + "language_loss": 0.87574375, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89458609, + "num_input_tokens_seen": 154479145, + "step": 7201, + "time_per_iteration": 2.4827356338500977 + }, + { + "auxiliary_loss_clip": 0.01080852, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.04677403, + "balance_loss_mlp": 1.02905083, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 2.540194195581278, + "language_loss": 0.64408892, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66535103, + "num_input_tokens_seen": 154498905, + "step": 7202, + "time_per_iteration": 2.6214239597320557 + }, + { + "auxiliary_loss_clip": 0.01079414, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.04552805, + "balance_loss_mlp": 1.02145433, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 2.4466011185082484, + "language_loss": 0.8172664, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.83840096, + "num_input_tokens_seen": 154517270, + "step": 7203, + "time_per_iteration": 2.5821778774261475 + }, + { + "auxiliary_loss_clip": 0.01095151, + "auxiliary_loss_mlp": 0.01049224, + "balance_loss_clip": 1.04136062, + "balance_loss_mlp": 1.03482938, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.7712705461385125, + "language_loss": 0.80897826, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83042204, + "num_input_tokens_seen": 154535945, + "step": 7204, + "time_per_iteration": 2.5799343585968018 + }, + { + "auxiliary_loss_clip": 0.01103635, + "auxiliary_loss_mlp": 0.01040825, + "balance_loss_clip": 1.04403734, + "balance_loss_mlp": 1.02748561, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 1.9993305467410274, + "language_loss": 0.73579204, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.7572366, + "num_input_tokens_seen": 154554935, + "step": 7205, + "time_per_iteration": 2.5016913414001465 + }, + { + "auxiliary_loss_clip": 0.01126396, + "auxiliary_loss_mlp": 0.00778979, + "balance_loss_clip": 1.04778624, + "balance_loss_mlp": 1.00051284, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 2.7106995012310895, + "language_loss": 0.75403374, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.7730875, + "num_input_tokens_seen": 154576065, + "step": 7206, + "time_per_iteration": 2.5230202674865723 + }, + { + "auxiliary_loss_clip": 0.01077305, + "auxiliary_loss_mlp": 0.0103493, + "balance_loss_clip": 1.04586089, + "balance_loss_mlp": 1.02092886, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 1.6272516368420298, + "language_loss": 0.78678489, + "learning_rate": 2.522934161574342e-06, + "loss": 0.80790722, + "num_input_tokens_seen": 154595110, + "step": 7207, + "time_per_iteration": 2.5679337978363037 + }, + { + "auxiliary_loss_clip": 0.01100246, + "auxiliary_loss_mlp": 0.01038452, + "balance_loss_clip": 1.04669833, + "balance_loss_mlp": 1.02431941, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 2.20139443541928, + "language_loss": 0.80591416, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.82730114, + "num_input_tokens_seen": 154612255, + "step": 7208, + "time_per_iteration": 2.5103607177734375 + }, + { + "auxiliary_loss_clip": 0.01105781, + "auxiliary_loss_mlp": 0.01038089, + "balance_loss_clip": 1.04411995, + "balance_loss_mlp": 1.02469623, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.0933092509378053, + "language_loss": 0.70624757, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72768629, + "num_input_tokens_seen": 154630440, + "step": 7209, + "time_per_iteration": 2.4819984436035156 + }, + { + "auxiliary_loss_clip": 0.01113062, + "auxiliary_loss_mlp": 0.01038429, + "balance_loss_clip": 1.04277754, + "balance_loss_mlp": 1.02394509, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.9733196446088752, + "language_loss": 0.81437153, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83588642, + "num_input_tokens_seen": 154652515, + "step": 7210, + "time_per_iteration": 2.543444871902466 + }, + { + "auxiliary_loss_clip": 0.01107793, + "auxiliary_loss_mlp": 0.01039492, + "balance_loss_clip": 1.0496726, + "balance_loss_mlp": 1.02679074, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 2.023787735933818, + "language_loss": 0.8205834, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.84205627, + "num_input_tokens_seen": 154670965, + "step": 7211, + "time_per_iteration": 2.496434211730957 + }, + { + "auxiliary_loss_clip": 0.01112603, + "auxiliary_loss_mlp": 0.01040295, + "balance_loss_clip": 1.04116547, + "balance_loss_mlp": 1.02802265, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 2.333146734379567, + "language_loss": 0.74695587, + "learning_rate": 2.521054347790029e-06, + "loss": 0.76848483, + "num_input_tokens_seen": 154689980, + "step": 7212, + "time_per_iteration": 2.475800037384033 + }, + { + "auxiliary_loss_clip": 0.01105097, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.04781961, + "balance_loss_mlp": 1.02260232, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 7.29469994186403, + "language_loss": 0.76736021, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.7887612, + "num_input_tokens_seen": 154706570, + "step": 7213, + "time_per_iteration": 2.472480058670044 + }, + { + "auxiliary_loss_clip": 0.01116281, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.04622746, + "balance_loss_mlp": 1.02453494, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.454344461685204, + "language_loss": 0.65143222, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67297083, + "num_input_tokens_seen": 154725210, + "step": 7214, + "time_per_iteration": 2.4536261558532715 + }, + { + "auxiliary_loss_clip": 0.01097637, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.04067123, + "balance_loss_mlp": 1.02418303, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 2.5921646538576444, + "language_loss": 0.71862286, + "learning_rate": 2.519926222304191e-06, + "loss": 0.73996818, + "num_input_tokens_seen": 154745945, + "step": 7215, + "time_per_iteration": 2.541747808456421 + }, + { + "auxiliary_loss_clip": 0.01096647, + "auxiliary_loss_mlp": 0.01043788, + "balance_loss_clip": 1.04160142, + "balance_loss_mlp": 1.029423, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 1.8454101761425423, + "language_loss": 0.74801028, + "learning_rate": 2.519550141025255e-06, + "loss": 0.76941454, + "num_input_tokens_seen": 154763580, + "step": 7216, + "time_per_iteration": 3.9477851390838623 + }, + { + "auxiliary_loss_clip": 0.01109555, + "auxiliary_loss_mlp": 0.01044751, + "balance_loss_clip": 1.0467546, + "balance_loss_mlp": 1.02934313, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 2.4184040060530627, + "language_loss": 0.75188792, + "learning_rate": 2.519174040044927e-06, + "loss": 0.77343094, + "num_input_tokens_seen": 154776825, + "step": 7217, + "time_per_iteration": 2.4844110012054443 + }, + { + "auxiliary_loss_clip": 0.01091338, + "auxiliary_loss_mlp": 0.01044088, + "balance_loss_clip": 1.04121685, + "balance_loss_mlp": 1.02925813, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 1.914771213793383, + "language_loss": 0.74162996, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.76298428, + "num_input_tokens_seen": 154794025, + "step": 7218, + "time_per_iteration": 2.507075786590576 + }, + { + "auxiliary_loss_clip": 0.01103519, + "auxiliary_loss_mlp": 0.01035894, + "balance_loss_clip": 1.04908192, + "balance_loss_mlp": 1.02267385, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 1.6988578395875236, + "language_loss": 0.68548, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.70687413, + "num_input_tokens_seen": 154813105, + "step": 7219, + "time_per_iteration": 2.479979991912842 + }, + { + "auxiliary_loss_clip": 0.01097486, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.04312849, + "balance_loss_mlp": 1.02218032, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 1.5694292466574586, + "language_loss": 0.77531886, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79665327, + "num_input_tokens_seen": 154833525, + "step": 7220, + "time_per_iteration": 2.5818774700164795 + }, + { + "auxiliary_loss_clip": 0.0106365, + "auxiliary_loss_mlp": 0.0103579, + "balance_loss_clip": 1.04299927, + "balance_loss_mlp": 1.02169895, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 2.2115165737403415, + "language_loss": 0.69860631, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71960068, + "num_input_tokens_seen": 154853090, + "step": 7221, + "time_per_iteration": 2.6013259887695312 + }, + { + "auxiliary_loss_clip": 0.01118239, + "auxiliary_loss_mlp": 0.01034065, + "balance_loss_clip": 1.04474533, + "balance_loss_mlp": 1.02091074, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 1.7015124375859485, + "language_loss": 0.64996606, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67148906, + "num_input_tokens_seen": 154872055, + "step": 7222, + "time_per_iteration": 2.470268487930298 + }, + { + "auxiliary_loss_clip": 0.01092541, + "auxiliary_loss_mlp": 0.01031822, + "balance_loss_clip": 1.04309642, + "balance_loss_mlp": 1.01730812, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 2.669617111932012, + "language_loss": 0.72933233, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.7505759, + "num_input_tokens_seen": 154886645, + "step": 7223, + "time_per_iteration": 2.501446485519409 + }, + { + "auxiliary_loss_clip": 0.01127551, + "auxiliary_loss_mlp": 0.01031738, + "balance_loss_clip": 1.04519105, + "balance_loss_mlp": 1.01742673, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.8642201455385627, + "language_loss": 0.93894333, + "learning_rate": 2.516540782741694e-06, + "loss": 0.96053618, + "num_input_tokens_seen": 154906775, + "step": 7224, + "time_per_iteration": 2.475437641143799 + }, + { + "auxiliary_loss_clip": 0.01085635, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.03810942, + "balance_loss_mlp": 1.02584887, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.5050672656305801, + "language_loss": 0.61159754, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63284457, + "num_input_tokens_seen": 154926990, + "step": 7225, + "time_per_iteration": 2.5793683528900146 + }, + { + "auxiliary_loss_clip": 0.01096394, + "auxiliary_loss_mlp": 0.00780553, + "balance_loss_clip": 1.03950882, + "balance_loss_mlp": 1.00047171, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 2.455104814685729, + "language_loss": 0.7785762, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79734564, + "num_input_tokens_seen": 154946210, + "step": 7226, + "time_per_iteration": 2.486825466156006 + }, + { + "auxiliary_loss_clip": 0.0111351, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.04519105, + "balance_loss_mlp": 1.0197798, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.7104585621803408, + "language_loss": 0.84673649, + "learning_rate": 2.515411949802964e-06, + "loss": 0.86819744, + "num_input_tokens_seen": 154964995, + "step": 7227, + "time_per_iteration": 2.4584248065948486 + }, + { + "auxiliary_loss_clip": 0.01111056, + "auxiliary_loss_mlp": 0.01036461, + "balance_loss_clip": 1.04253209, + "balance_loss_mlp": 1.02244246, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 1.88419961609791, + "language_loss": 0.76820588, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.78968108, + "num_input_tokens_seen": 154984775, + "step": 7228, + "time_per_iteration": 2.5051658153533936 + }, + { + "auxiliary_loss_clip": 0.01082609, + "auxiliary_loss_mlp": 0.01035657, + "balance_loss_clip": 1.04567432, + "balance_loss_mlp": 1.02181041, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.5463500280521363, + "language_loss": 0.80534226, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82652497, + "num_input_tokens_seen": 155008125, + "step": 7229, + "time_per_iteration": 2.6687495708465576 + }, + { + "auxiliary_loss_clip": 0.011141, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.04239523, + "balance_loss_mlp": 1.02595973, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 1.9258073099443347, + "language_loss": 0.81824028, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.83978128, + "num_input_tokens_seen": 155027885, + "step": 7230, + "time_per_iteration": 3.937934398651123 + }, + { + "auxiliary_loss_clip": 0.01114128, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.04431033, + "balance_loss_mlp": 1.02710032, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.2260280719808643, + "language_loss": 0.77135527, + "learning_rate": 2.513906565661973e-06, + "loss": 0.7929157, + "num_input_tokens_seen": 155043375, + "step": 7231, + "time_per_iteration": 2.432590961456299 + }, + { + "auxiliary_loss_clip": 0.01077878, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.0398711, + "balance_loss_mlp": 1.02126813, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.4891674071467658, + "language_loss": 0.69051492, + "learning_rate": 2.513530170872575e-06, + "loss": 0.71162748, + "num_input_tokens_seen": 155062930, + "step": 7232, + "time_per_iteration": 2.557446241378784 + }, + { + "auxiliary_loss_clip": 0.01094262, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.04506516, + "balance_loss_mlp": 1.01940227, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.7352503254131553, + "language_loss": 0.7231344, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74441653, + "num_input_tokens_seen": 155084980, + "step": 7233, + "time_per_iteration": 2.645695924758911 + }, + { + "auxiliary_loss_clip": 0.01075956, + "auxiliary_loss_mlp": 0.01043472, + "balance_loss_clip": 1.04777014, + "balance_loss_mlp": 1.02780783, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 1.5304496919919008, + "language_loss": 0.74301863, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.76421291, + "num_input_tokens_seen": 155107260, + "step": 7234, + "time_per_iteration": 4.165289878845215 + }, + { + "auxiliary_loss_clip": 0.01108276, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.04680109, + "balance_loss_mlp": 1.02236986, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 1.7843350052442994, + "language_loss": 0.58712631, + "learning_rate": 2.512400869722782e-06, + "loss": 0.60858464, + "num_input_tokens_seen": 155126720, + "step": 7235, + "time_per_iteration": 2.537017345428467 + }, + { + "auxiliary_loss_clip": 0.0106525, + "auxiliary_loss_mlp": 0.010425, + "balance_loss_clip": 1.03848767, + "balance_loss_mlp": 1.02597737, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.6702721439118315, + "language_loss": 0.77638149, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79745901, + "num_input_tokens_seen": 155148640, + "step": 7236, + "time_per_iteration": 2.6366660594940186 + }, + { + "auxiliary_loss_clip": 0.01121096, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.04345322, + "balance_loss_mlp": 1.01944923, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.659100760721636, + "language_loss": 0.81151772, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83306646, + "num_input_tokens_seen": 155165870, + "step": 7237, + "time_per_iteration": 2.404252290725708 + }, + { + "auxiliary_loss_clip": 0.01110226, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.04030418, + "balance_loss_mlp": 1.02026379, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 1.534652355035656, + "language_loss": 0.63268888, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65413213, + "num_input_tokens_seen": 155185315, + "step": 7238, + "time_per_iteration": 2.4693045616149902 + }, + { + "auxiliary_loss_clip": 0.01091159, + "auxiliary_loss_mlp": 0.00778978, + "balance_loss_clip": 1.04445338, + "balance_loss_mlp": 1.00052118, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.827548628988089, + "language_loss": 0.86406875, + "learning_rate": 2.510894862898928e-06, + "loss": 0.88277006, + "num_input_tokens_seen": 155205790, + "step": 7239, + "time_per_iteration": 3.9772517681121826 + }, + { + "auxiliary_loss_clip": 0.01105467, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.04437113, + "balance_loss_mlp": 1.01679039, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.6836919222518107, + "language_loss": 0.72626364, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74763048, + "num_input_tokens_seen": 155226475, + "step": 7240, + "time_per_iteration": 2.5231378078460693 + }, + { + "auxiliary_loss_clip": 0.01095564, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.04603219, + "balance_loss_mlp": 1.01633215, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 2.5215527875494526, + "language_loss": 0.81538147, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.83664644, + "num_input_tokens_seen": 155247110, + "step": 7241, + "time_per_iteration": 2.5594851970672607 + }, + { + "auxiliary_loss_clip": 0.01097741, + "auxiliary_loss_mlp": 0.00779895, + "balance_loss_clip": 1.0453186, + "balance_loss_mlp": 1.00038552, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 2.9326736794998296, + "language_loss": 0.79662323, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81539959, + "num_input_tokens_seen": 155261335, + "step": 7242, + "time_per_iteration": 2.500187397003174 + }, + { + "auxiliary_loss_clip": 0.01100312, + "auxiliary_loss_mlp": 0.01036074, + "balance_loss_clip": 1.0383172, + "balance_loss_mlp": 1.02080321, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 2.0909783691234862, + "language_loss": 0.67949593, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70085979, + "num_input_tokens_seen": 155278510, + "step": 7243, + "time_per_iteration": 2.4748518466949463 + }, + { + "auxiliary_loss_clip": 0.01065679, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.04131079, + "balance_loss_mlp": 1.02162433, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.6444986182082046, + "language_loss": 0.81538534, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.8363924, + "num_input_tokens_seen": 155296450, + "step": 7244, + "time_per_iteration": 2.537485361099243 + }, + { + "auxiliary_loss_clip": 0.0107076, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.0454545, + "balance_loss_mlp": 1.01844144, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.8300912951278487, + "language_loss": 0.73413563, + "learning_rate": 2.508635271753234e-06, + "loss": 0.75515807, + "num_input_tokens_seen": 155316080, + "step": 7245, + "time_per_iteration": 2.6184165477752686 + }, + { + "auxiliary_loss_clip": 0.0106739, + "auxiliary_loss_mlp": 0.01039745, + "balance_loss_clip": 1.03944814, + "balance_loss_mlp": 1.02639341, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.8985258021126525, + "language_loss": 0.76555175, + "learning_rate": 2.508258605639389e-06, + "loss": 0.78662312, + "num_input_tokens_seen": 155336765, + "step": 7246, + "time_per_iteration": 2.6640572547912598 + }, + { + "auxiliary_loss_clip": 0.01112078, + "auxiliary_loss_mlp": 0.01043819, + "balance_loss_clip": 1.0411675, + "balance_loss_mlp": 1.0293591, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 1.869045590822481, + "language_loss": 0.85237765, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87393659, + "num_input_tokens_seen": 155356440, + "step": 7247, + "time_per_iteration": 2.4715144634246826 + }, + { + "auxiliary_loss_clip": 0.01124636, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.04438591, + "balance_loss_mlp": 1.02369499, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.6119135454312, + "language_loss": 0.72713339, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74874747, + "num_input_tokens_seen": 155377070, + "step": 7248, + "time_per_iteration": 2.459683895111084 + }, + { + "auxiliary_loss_clip": 0.01113583, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.04512525, + "balance_loss_mlp": 1.02583957, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 2.037246115868522, + "language_loss": 0.872751, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.89428043, + "num_input_tokens_seen": 155398415, + "step": 7249, + "time_per_iteration": 2.503263473510742 + }, + { + "auxiliary_loss_clip": 0.01107391, + "auxiliary_loss_mlp": 0.01042811, + "balance_loss_clip": 1.04487181, + "balance_loss_mlp": 1.02991295, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.8033557542512961, + "language_loss": 0.82048744, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84198946, + "num_input_tokens_seen": 155415625, + "step": 7250, + "time_per_iteration": 2.503113031387329 + }, + { + "auxiliary_loss_clip": 0.01119168, + "auxiliary_loss_mlp": 0.01037338, + "balance_loss_clip": 1.04822445, + "balance_loss_mlp": 1.02342629, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 2.083087080451989, + "language_loss": 0.85019279, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.87175786, + "num_input_tokens_seen": 155435505, + "step": 7251, + "time_per_iteration": 2.522047996520996 + }, + { + "auxiliary_loss_clip": 0.01104252, + "auxiliary_loss_mlp": 0.01042258, + "balance_loss_clip": 1.03884149, + "balance_loss_mlp": 1.02712464, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 2.482847742382891, + "language_loss": 0.6916467, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71311176, + "num_input_tokens_seen": 155455425, + "step": 7252, + "time_per_iteration": 2.4645156860351562 + }, + { + "auxiliary_loss_clip": 0.01098279, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.0425272, + "balance_loss_mlp": 1.03206623, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.6886400725370967, + "language_loss": 0.83795828, + "learning_rate": 2.505621403992348e-06, + "loss": 0.8594135, + "num_input_tokens_seen": 155474250, + "step": 7253, + "time_per_iteration": 2.463794708251953 + }, + { + "auxiliary_loss_clip": 0.01112808, + "auxiliary_loss_mlp": 0.01036557, + "balance_loss_clip": 1.04424143, + "balance_loss_mlp": 1.02219212, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.5296157897363296, + "language_loss": 0.70073152, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72222519, + "num_input_tokens_seen": 155494685, + "step": 7254, + "time_per_iteration": 2.4601075649261475 + }, + { + "auxiliary_loss_clip": 0.01102294, + "auxiliary_loss_mlp": 0.01038938, + "balance_loss_clip": 1.04414737, + "balance_loss_mlp": 1.02521133, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 2.1134123642519507, + "language_loss": 0.81273037, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.83414268, + "num_input_tokens_seen": 155513040, + "step": 7255, + "time_per_iteration": 2.487736701965332 + }, + { + "auxiliary_loss_clip": 0.01126029, + "auxiliary_loss_mlp": 0.01040968, + "balance_loss_clip": 1.04475093, + "balance_loss_mlp": 1.02761626, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 2.05498844946981, + "language_loss": 0.7762081, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79787809, + "num_input_tokens_seen": 155530100, + "step": 7256, + "time_per_iteration": 4.003371715545654 + }, + { + "auxiliary_loss_clip": 0.01127189, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.04710674, + "balance_loss_mlp": 1.02527881, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.5133126248782194, + "language_loss": 0.7588138, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78047717, + "num_input_tokens_seen": 155549375, + "step": 7257, + "time_per_iteration": 2.451117992401123 + }, + { + "auxiliary_loss_clip": 0.01113491, + "auxiliary_loss_mlp": 0.0104308, + "balance_loss_clip": 1.04292655, + "balance_loss_mlp": 1.02851272, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 1.704823929190092, + "language_loss": 0.72896326, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75052893, + "num_input_tokens_seen": 155569395, + "step": 7258, + "time_per_iteration": 2.5371828079223633 + }, + { + "auxiliary_loss_clip": 0.01103414, + "auxiliary_loss_mlp": 0.01040655, + "balance_loss_clip": 1.04353106, + "balance_loss_mlp": 1.02639127, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 1.9482157634651032, + "language_loss": 0.76754731, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.78898799, + "num_input_tokens_seen": 155589090, + "step": 7259, + "time_per_iteration": 2.563990592956543 + }, + { + "auxiliary_loss_clip": 0.01043755, + "auxiliary_loss_mlp": 0.01014022, + "balance_loss_clip": 1.04263604, + "balance_loss_mlp": 1.01257348, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7491514044295201, + "language_loss": 0.56998307, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59056085, + "num_input_tokens_seen": 155648660, + "step": 7260, + "time_per_iteration": 3.088968515396118 + }, + { + "auxiliary_loss_clip": 0.01105893, + "auxiliary_loss_mlp": 0.01047022, + "balance_loss_clip": 1.04157162, + "balance_loss_mlp": 1.0321449, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 1.758104440595379, + "language_loss": 0.70444947, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.72597861, + "num_input_tokens_seen": 155669945, + "step": 7261, + "time_per_iteration": 2.618824005126953 + }, + { + "auxiliary_loss_clip": 0.01081, + "auxiliary_loss_mlp": 0.01053237, + "balance_loss_clip": 1.04215765, + "balance_loss_mlp": 1.03695321, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 2.197286928080164, + "language_loss": 0.69565028, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71699274, + "num_input_tokens_seen": 155688555, + "step": 7262, + "time_per_iteration": 2.6216323375701904 + }, + { + "auxiliary_loss_clip": 0.01061022, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.04111087, + "balance_loss_mlp": 1.02121162, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.73310222176065, + "language_loss": 0.79625678, + "learning_rate": 2.501852344559726e-06, + "loss": 0.81719863, + "num_input_tokens_seen": 155705370, + "step": 7263, + "time_per_iteration": 2.5983102321624756 + }, + { + "auxiliary_loss_clip": 0.01089229, + "auxiliary_loss_mlp": 0.01041251, + "balance_loss_clip": 1.04483199, + "balance_loss_mlp": 1.02672529, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 1.7810351052885307, + "language_loss": 0.7499876, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77129239, + "num_input_tokens_seen": 155721890, + "step": 7264, + "time_per_iteration": 2.5068604946136475 + }, + { + "auxiliary_loss_clip": 0.01073137, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.04126549, + "balance_loss_mlp": 1.02032113, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 2.5998977008800757, + "language_loss": 0.61657113, + "learning_rate": 2.501098303852298e-06, + "loss": 0.63765144, + "num_input_tokens_seen": 155743970, + "step": 7265, + "time_per_iteration": 2.7361130714416504 + }, + { + "auxiliary_loss_clip": 0.01102073, + "auxiliary_loss_mlp": 0.0102951, + "balance_loss_clip": 1.04396904, + "balance_loss_mlp": 1.01603341, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 2.679074512464639, + "language_loss": 0.72844809, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.74976391, + "num_input_tokens_seen": 155761830, + "step": 7266, + "time_per_iteration": 2.4735536575317383 + }, + { + "auxiliary_loss_clip": 0.01106697, + "auxiliary_loss_mlp": 0.01034891, + "balance_loss_clip": 1.04917741, + "balance_loss_mlp": 1.02094316, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 2.0707563572758527, + "language_loss": 0.82096767, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.8423835, + "num_input_tokens_seen": 155779610, + "step": 7267, + "time_per_iteration": 2.502269983291626 + }, + { + "auxiliary_loss_clip": 0.01122528, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.04411006, + "balance_loss_mlp": 1.01867962, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 1.9967055747391593, + "language_loss": 0.74467951, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76622206, + "num_input_tokens_seen": 155798765, + "step": 7268, + "time_per_iteration": 2.4386916160583496 + }, + { + "auxiliary_loss_clip": 0.01131741, + "auxiliary_loss_mlp": 0.01040879, + "balance_loss_clip": 1.04757023, + "balance_loss_mlp": 1.02519131, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 2.1340095618082566, + "language_loss": 0.79893339, + "learning_rate": 2.499589994531454e-06, + "loss": 0.82065952, + "num_input_tokens_seen": 155817750, + "step": 7269, + "time_per_iteration": 2.390974998474121 + }, + { + "auxiliary_loss_clip": 0.01104243, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.04422092, + "balance_loss_mlp": 1.02280521, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 1.8105555673436726, + "language_loss": 0.7486136, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77001679, + "num_input_tokens_seen": 155836490, + "step": 7270, + "time_per_iteration": 4.022635221481323 + }, + { + "auxiliary_loss_clip": 0.01070513, + "auxiliary_loss_mlp": 0.0104319, + "balance_loss_clip": 1.04037714, + "balance_loss_mlp": 1.02737081, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 1.8224703537919171, + "language_loss": 0.79839915, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81953621, + "num_input_tokens_seen": 155856225, + "step": 7271, + "time_per_iteration": 2.61909556388855 + }, + { + "auxiliary_loss_clip": 0.01043349, + "auxiliary_loss_mlp": 0.01004791, + "balance_loss_clip": 1.02684236, + "balance_loss_mlp": 1.00315785, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.6942206909742875, + "language_loss": 0.5490306, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56951201, + "num_input_tokens_seen": 155916770, + "step": 7272, + "time_per_iteration": 3.1283254623413086 + }, + { + "auxiliary_loss_clip": 0.01131256, + "auxiliary_loss_mlp": 0.01043742, + "balance_loss_clip": 1.04762149, + "balance_loss_mlp": 1.02842951, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 1.7125933174966201, + "language_loss": 0.69866234, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72041237, + "num_input_tokens_seen": 155936490, + "step": 7273, + "time_per_iteration": 3.929950714111328 + }, + { + "auxiliary_loss_clip": 0.01103359, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.04174614, + "balance_loss_mlp": 1.02858245, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 6.509251949961693, + "language_loss": 0.75568384, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77716208, + "num_input_tokens_seen": 155957595, + "step": 7274, + "time_per_iteration": 2.6457765102386475 + }, + { + "auxiliary_loss_clip": 0.01111628, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.04707801, + "balance_loss_mlp": 1.01617861, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.9329767634335815, + "language_loss": 0.80324972, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82465404, + "num_input_tokens_seen": 155975710, + "step": 7275, + "time_per_iteration": 2.469566583633423 + }, + { + "auxiliary_loss_clip": 0.01102748, + "auxiliary_loss_mlp": 0.0103892, + "balance_loss_clip": 1.04584312, + "balance_loss_mlp": 1.02522242, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 1.9932469698183266, + "language_loss": 0.80653465, + "learning_rate": 2.496949724407266e-06, + "loss": 0.82795131, + "num_input_tokens_seen": 155993090, + "step": 7276, + "time_per_iteration": 2.472358465194702 + }, + { + "auxiliary_loss_clip": 0.01111454, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.04671395, + "balance_loss_mlp": 1.01578569, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 1.853037625190953, + "language_loss": 0.72928655, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75071084, + "num_input_tokens_seen": 156013685, + "step": 7277, + "time_per_iteration": 2.590235471725464 + }, + { + "auxiliary_loss_clip": 0.0110507, + "auxiliary_loss_mlp": 0.00780045, + "balance_loss_clip": 1.0447135, + "balance_loss_mlp": 1.00044477, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 1.9485137201096994, + "language_loss": 0.72801095, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.74686217, + "num_input_tokens_seen": 156034300, + "step": 7278, + "time_per_iteration": 3.962329864501953 + }, + { + "auxiliary_loss_clip": 0.0109506, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.04738903, + "balance_loss_mlp": 1.02400565, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.478005576571578, + "language_loss": 0.66115737, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68248057, + "num_input_tokens_seen": 156053805, + "step": 7279, + "time_per_iteration": 2.536228656768799 + }, + { + "auxiliary_loss_clip": 0.01133892, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.04884195, + "balance_loss_mlp": 1.02075887, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 2.556640414296135, + "language_loss": 0.81811297, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.83980209, + "num_input_tokens_seen": 156073295, + "step": 7280, + "time_per_iteration": 2.4695138931274414 + }, + { + "auxiliary_loss_clip": 0.01107472, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.04795957, + "balance_loss_mlp": 1.01895905, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.4914545354328272, + "language_loss": 0.76844597, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.7898497, + "num_input_tokens_seen": 156094540, + "step": 7281, + "time_per_iteration": 2.5748491287231445 + }, + { + "auxiliary_loss_clip": 0.01103373, + "auxiliary_loss_mlp": 0.01042709, + "balance_loss_clip": 1.04372108, + "balance_loss_mlp": 1.02934611, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 1.810284217245962, + "language_loss": 0.75774646, + "learning_rate": 2.494685900612569e-06, + "loss": 0.77920735, + "num_input_tokens_seen": 156114070, + "step": 7282, + "time_per_iteration": 2.5431978702545166 + }, + { + "auxiliary_loss_clip": 0.01087281, + "auxiliary_loss_mlp": 0.01038513, + "balance_loss_clip": 1.04137027, + "balance_loss_mlp": 1.02429104, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 1.8543776395714529, + "language_loss": 0.85150117, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.8727591, + "num_input_tokens_seen": 156132130, + "step": 7283, + "time_per_iteration": 2.5573885440826416 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.04545999, + "balance_loss_mlp": 1.02288592, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 2.038670997223886, + "language_loss": 0.8011359, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82258469, + "num_input_tokens_seen": 156150820, + "step": 7284, + "time_per_iteration": 2.544701099395752 + }, + { + "auxiliary_loss_clip": 0.01116917, + "auxiliary_loss_mlp": 0.01041248, + "balance_loss_clip": 1.04492426, + "balance_loss_mlp": 1.02761602, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 1.6925831417137542, + "language_loss": 0.80147398, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82305562, + "num_input_tokens_seen": 156170125, + "step": 7285, + "time_per_iteration": 2.456341028213501 + }, + { + "auxiliary_loss_clip": 0.01117018, + "auxiliary_loss_mlp": 0.01034204, + "balance_loss_clip": 1.04551744, + "balance_loss_mlp": 1.02017283, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 1.932815664041476, + "language_loss": 0.74780321, + "learning_rate": 2.493176309387897e-06, + "loss": 0.76931548, + "num_input_tokens_seen": 156187320, + "step": 7286, + "time_per_iteration": 2.468938112258911 + }, + { + "auxiliary_loss_clip": 0.01090977, + "auxiliary_loss_mlp": 0.01034217, + "balance_loss_clip": 1.03973556, + "balance_loss_mlp": 1.01991224, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.8014074964097782, + "language_loss": 0.73113275, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75238466, + "num_input_tokens_seen": 156207455, + "step": 7287, + "time_per_iteration": 2.7029037475585938 + }, + { + "auxiliary_loss_clip": 0.01105371, + "auxiliary_loss_mlp": 0.01043665, + "balance_loss_clip": 1.04564357, + "balance_loss_mlp": 1.02948546, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 2.1549986436989905, + "language_loss": 0.82409155, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84558189, + "num_input_tokens_seen": 156226560, + "step": 7288, + "time_per_iteration": 2.4816482067108154 + }, + { + "auxiliary_loss_clip": 0.01095623, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.04582286, + "balance_loss_mlp": 1.02268577, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.9211463676909428, + "language_loss": 0.84195626, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86328101, + "num_input_tokens_seen": 156246740, + "step": 7289, + "time_per_iteration": 2.590319871902466 + }, + { + "auxiliary_loss_clip": 0.01097482, + "auxiliary_loss_mlp": 0.0105122, + "balance_loss_clip": 1.03958738, + "balance_loss_mlp": 1.03491235, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.5060230459071875, + "language_loss": 0.78151369, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80300075, + "num_input_tokens_seen": 156266440, + "step": 7290, + "time_per_iteration": 2.5732994079589844 + }, + { + "auxiliary_loss_clip": 0.01126145, + "auxiliary_loss_mlp": 0.01040295, + "balance_loss_clip": 1.04680085, + "balance_loss_mlp": 1.02674675, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 2.103722422563632, + "language_loss": 0.77926874, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80093312, + "num_input_tokens_seen": 156286900, + "step": 7291, + "time_per_iteration": 2.493504285812378 + }, + { + "auxiliary_loss_clip": 0.01100392, + "auxiliary_loss_mlp": 0.01035272, + "balance_loss_clip": 1.05050325, + "balance_loss_mlp": 1.02137208, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.6885125187763503, + "language_loss": 0.65329361, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67465019, + "num_input_tokens_seen": 156307690, + "step": 7292, + "time_per_iteration": 2.721545696258545 + }, + { + "auxiliary_loss_clip": 0.01110605, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.04102206, + "balance_loss_mlp": 1.02219272, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.5928076296883005, + "language_loss": 0.74056971, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76204187, + "num_input_tokens_seen": 156326620, + "step": 7293, + "time_per_iteration": 2.5464227199554443 + }, + { + "auxiliary_loss_clip": 0.01098819, + "auxiliary_loss_mlp": 0.01040405, + "balance_loss_clip": 1.0486362, + "balance_loss_mlp": 1.02522933, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 2.005482030120731, + "language_loss": 0.78409743, + "learning_rate": 2.490156230192516e-06, + "loss": 0.80548972, + "num_input_tokens_seen": 156345495, + "step": 7294, + "time_per_iteration": 2.661733388900757 + }, + { + "auxiliary_loss_clip": 0.01089764, + "auxiliary_loss_mlp": 0.01039255, + "balance_loss_clip": 1.04424274, + "balance_loss_mlp": 1.02519441, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.6759650591116693, + "language_loss": 0.7288357, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.75012594, + "num_input_tokens_seen": 156363155, + "step": 7295, + "time_per_iteration": 4.035998582839966 + }, + { + "auxiliary_loss_clip": 0.01088152, + "auxiliary_loss_mlp": 0.01044988, + "balance_loss_clip": 1.04275894, + "balance_loss_mlp": 1.02860844, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 3.254878087464753, + "language_loss": 0.75352472, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77485609, + "num_input_tokens_seen": 156380940, + "step": 7296, + "time_per_iteration": 2.537792921066284 + }, + { + "auxiliary_loss_clip": 0.01115194, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.04521585, + "balance_loss_mlp": 1.02065229, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 1.5317230134082649, + "language_loss": 0.69248015, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71397364, + "num_input_tokens_seen": 156400415, + "step": 7297, + "time_per_iteration": 2.5195367336273193 + }, + { + "auxiliary_loss_clip": 0.01108053, + "auxiliary_loss_mlp": 0.01033856, + "balance_loss_clip": 1.04205513, + "balance_loss_mlp": 1.02040875, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 1.6936677151270487, + "language_loss": 0.69997448, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72139359, + "num_input_tokens_seen": 156421120, + "step": 7298, + "time_per_iteration": 2.5501668453216553 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.01026437, + "balance_loss_clip": 1.0445236, + "balance_loss_mlp": 1.01230514, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 1.750499748774975, + "language_loss": 0.72640073, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74780536, + "num_input_tokens_seen": 156441535, + "step": 7299, + "time_per_iteration": 2.5462875366210938 + }, + { + "auxiliary_loss_clip": 0.01098883, + "auxiliary_loss_mlp": 0.00783278, + "balance_loss_clip": 1.04285097, + "balance_loss_mlp": 1.00035167, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.797918214232166, + "language_loss": 0.7732687, + "learning_rate": 2.487890389750719e-06, + "loss": 0.7920903, + "num_input_tokens_seen": 156462015, + "step": 7300, + "time_per_iteration": 2.547029495239258 + }, + { + "auxiliary_loss_clip": 0.01106082, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.04652655, + "balance_loss_mlp": 1.01936531, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.6958822172396877, + "language_loss": 0.7040922, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.72548604, + "num_input_tokens_seen": 156482165, + "step": 7301, + "time_per_iteration": 2.5809998512268066 + }, + { + "auxiliary_loss_clip": 0.01082772, + "auxiliary_loss_mlp": 0.01043323, + "balance_loss_clip": 1.04281366, + "balance_loss_mlp": 1.02664542, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 2.201665532323431, + "language_loss": 0.7079078, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72916877, + "num_input_tokens_seen": 156503170, + "step": 7302, + "time_per_iteration": 2.672480821609497 + }, + { + "auxiliary_loss_clip": 0.01107633, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.04830885, + "balance_loss_mlp": 1.02400637, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 1.6269635235413604, + "language_loss": 0.82364225, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84509075, + "num_input_tokens_seen": 156523005, + "step": 7303, + "time_per_iteration": 2.6150705814361572 + }, + { + "auxiliary_loss_clip": 0.01114823, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.04378009, + "balance_loss_mlp": 1.03130746, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 2.175584083376173, + "language_loss": 0.68228114, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.70390338, + "num_input_tokens_seen": 156544440, + "step": 7304, + "time_per_iteration": 2.655294895172119 + }, + { + "auxiliary_loss_clip": 0.01104693, + "auxiliary_loss_mlp": 0.00778121, + "balance_loss_clip": 1.04345083, + "balance_loss_mlp": 1.00029528, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.5205730464781386, + "language_loss": 0.78020775, + "learning_rate": 2.486001680477873e-06, + "loss": 0.79903591, + "num_input_tokens_seen": 156565410, + "step": 7305, + "time_per_iteration": 2.6812400817871094 + }, + { + "auxiliary_loss_clip": 0.01100604, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.04301882, + "balance_loss_mlp": 1.02095914, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.7055214998676933, + "language_loss": 0.68638855, + "learning_rate": 2.485623883278308e-06, + "loss": 0.70773983, + "num_input_tokens_seen": 156584210, + "step": 7306, + "time_per_iteration": 2.560183525085449 + }, + { + "auxiliary_loss_clip": 0.01086377, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.04064417, + "balance_loss_mlp": 1.01749623, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.4901529595755107, + "language_loss": 0.62667692, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64786363, + "num_input_tokens_seen": 156602730, + "step": 7307, + "time_per_iteration": 2.5512866973876953 + }, + { + "auxiliary_loss_clip": 0.01130179, + "auxiliary_loss_mlp": 0.01036397, + "balance_loss_clip": 1.04654598, + "balance_loss_mlp": 1.02181196, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 2.1445652986672843, + "language_loss": 0.72268158, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.74434733, + "num_input_tokens_seen": 156619405, + "step": 7308, + "time_per_iteration": 2.4331259727478027 + }, + { + "auxiliary_loss_clip": 0.01109528, + "auxiliary_loss_mlp": 0.01037104, + "balance_loss_clip": 1.04438043, + "balance_loss_mlp": 1.02275717, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.693015722787813, + "language_loss": 0.7705338, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.79200017, + "num_input_tokens_seen": 156638165, + "step": 7309, + "time_per_iteration": 3.978128671646118 + }, + { + "auxiliary_loss_clip": 0.01110764, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.04376364, + "balance_loss_mlp": 1.01910412, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 4.552343019514933, + "language_loss": 0.70929062, + "learning_rate": 2.484112510474251e-06, + "loss": 0.7307173, + "num_input_tokens_seen": 156658845, + "step": 7310, + "time_per_iteration": 2.4901347160339355 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.00780658, + "balance_loss_clip": 1.04510963, + "balance_loss_mlp": 1.00040448, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 2.4617688344423136, + "language_loss": 0.76026487, + "learning_rate": 2.483734621343429e-06, + "loss": 0.7790966, + "num_input_tokens_seen": 156677275, + "step": 7311, + "time_per_iteration": 2.501823902130127 + }, + { + "auxiliary_loss_clip": 0.01118708, + "auxiliary_loss_mlp": 0.01037447, + "balance_loss_clip": 1.04677248, + "balance_loss_mlp": 1.02380943, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 2.265122529590959, + "language_loss": 0.81270635, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83426785, + "num_input_tokens_seen": 156695815, + "step": 7312, + "time_per_iteration": 3.9488027095794678 + }, + { + "auxiliary_loss_clip": 0.01098757, + "auxiliary_loss_mlp": 0.01036721, + "balance_loss_clip": 1.04597354, + "balance_loss_mlp": 1.02298212, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 2.1019982366425336, + "language_loss": 0.85299438, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87434918, + "num_input_tokens_seen": 156714385, + "step": 7313, + "time_per_iteration": 2.4950966835021973 + }, + { + "auxiliary_loss_clip": 0.01105814, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.04298103, + "balance_loss_mlp": 1.02287853, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 1.7919626362390757, + "language_loss": 0.67638624, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69781363, + "num_input_tokens_seen": 156732615, + "step": 7314, + "time_per_iteration": 2.493191719055176 + }, + { + "auxiliary_loss_clip": 0.01108873, + "auxiliary_loss_mlp": 0.01032396, + "balance_loss_clip": 1.04471397, + "balance_loss_mlp": 1.01770377, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 1.7500839463036402, + "language_loss": 0.76690662, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.78831929, + "num_input_tokens_seen": 156750920, + "step": 7315, + "time_per_iteration": 2.491286039352417 + }, + { + "auxiliary_loss_clip": 0.01108656, + "auxiliary_loss_mlp": 0.01031504, + "balance_loss_clip": 1.05070591, + "balance_loss_mlp": 1.01758075, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.241843512482412, + "language_loss": 0.74112892, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76253057, + "num_input_tokens_seen": 156768520, + "step": 7316, + "time_per_iteration": 2.527630567550659 + }, + { + "auxiliary_loss_clip": 0.01098125, + "auxiliary_loss_mlp": 0.01038344, + "balance_loss_clip": 1.05410266, + "balance_loss_mlp": 1.02476585, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 3.047986610446467, + "language_loss": 0.64892709, + "learning_rate": 2.481466901851506e-06, + "loss": 0.67029172, + "num_input_tokens_seen": 156788700, + "step": 7317, + "time_per_iteration": 2.5482101440429688 + }, + { + "auxiliary_loss_clip": 0.01105344, + "auxiliary_loss_mlp": 0.01036811, + "balance_loss_clip": 1.04803061, + "balance_loss_mlp": 1.02324533, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 1.8278757121032123, + "language_loss": 0.79833174, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.81975329, + "num_input_tokens_seen": 156806470, + "step": 7318, + "time_per_iteration": 3.9569506645202637 + }, + { + "auxiliary_loss_clip": 0.01084321, + "auxiliary_loss_mlp": 0.01045033, + "balance_loss_clip": 1.03623414, + "balance_loss_mlp": 1.02995348, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 1.7772251787041773, + "language_loss": 0.79594809, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81724167, + "num_input_tokens_seen": 156825895, + "step": 7319, + "time_per_iteration": 2.5495314598083496 + }, + { + "auxiliary_loss_clip": 0.01112029, + "auxiliary_loss_mlp": 0.01042019, + "balance_loss_clip": 1.04169929, + "balance_loss_mlp": 1.02676058, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 1.6570425640362587, + "language_loss": 0.7952441, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.81678462, + "num_input_tokens_seen": 156845990, + "step": 7320, + "time_per_iteration": 2.5076701641082764 + }, + { + "auxiliary_loss_clip": 0.0110075, + "auxiliary_loss_mlp": 0.01044196, + "balance_loss_clip": 1.05411148, + "balance_loss_mlp": 1.03044534, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.5363984332301428, + "language_loss": 0.69840831, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.71985781, + "num_input_tokens_seen": 156866685, + "step": 7321, + "time_per_iteration": 2.5886785984039307 + }, + { + "auxiliary_loss_clip": 0.01019583, + "auxiliary_loss_mlp": 0.0101002, + "balance_loss_clip": 1.02904594, + "balance_loss_mlp": 1.00779045, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.880390373981287, + "language_loss": 0.56963587, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.58993196, + "num_input_tokens_seen": 156923450, + "step": 7322, + "time_per_iteration": 3.2433388233184814 + }, + { + "auxiliary_loss_clip": 0.01078075, + "auxiliary_loss_mlp": 0.01041419, + "balance_loss_clip": 1.0379951, + "balance_loss_mlp": 1.02744734, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.5472580185489562, + "language_loss": 0.76113957, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78233445, + "num_input_tokens_seen": 156944795, + "step": 7323, + "time_per_iteration": 2.5991501808166504 + }, + { + "auxiliary_loss_clip": 0.01120311, + "auxiliary_loss_mlp": 0.01042188, + "balance_loss_clip": 1.04692435, + "balance_loss_mlp": 1.02820516, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 1.6885249266942626, + "language_loss": 0.80653596, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82816094, + "num_input_tokens_seen": 156962755, + "step": 7324, + "time_per_iteration": 2.481616497039795 + }, + { + "auxiliary_loss_clip": 0.01021277, + "auxiliary_loss_mlp": 0.01001638, + "balance_loss_clip": 1.01926303, + "balance_loss_mlp": 0.99989712, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.6658345951915031, + "language_loss": 0.54528856, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56551772, + "num_input_tokens_seen": 157028095, + "step": 7325, + "time_per_iteration": 3.1213717460632324 + }, + { + "auxiliary_loss_clip": 0.01128693, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.0495739, + "balance_loss_mlp": 1.01729774, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.5129454818657009, + "language_loss": 0.69588077, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.71746624, + "num_input_tokens_seen": 157048365, + "step": 7326, + "time_per_iteration": 2.45877742767334 + }, + { + "auxiliary_loss_clip": 0.01091676, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.04543436, + "balance_loss_mlp": 1.01711011, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.4628200259794795, + "language_loss": 0.76577544, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78699934, + "num_input_tokens_seen": 157069130, + "step": 7327, + "time_per_iteration": 2.558234453201294 + }, + { + "auxiliary_loss_clip": 0.01099124, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.04070497, + "balance_loss_mlp": 1.02119923, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 1.9349471629044612, + "language_loss": 0.83975917, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86109877, + "num_input_tokens_seen": 157084940, + "step": 7328, + "time_per_iteration": 2.4465718269348145 + }, + { + "auxiliary_loss_clip": 0.01102835, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.04459321, + "balance_loss_mlp": 1.01858795, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 1.941149409743413, + "language_loss": 0.77415317, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.79550731, + "num_input_tokens_seen": 157102770, + "step": 7329, + "time_per_iteration": 2.512193441390991 + }, + { + "auxiliary_loss_clip": 0.01111993, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.04106569, + "balance_loss_mlp": 1.02279711, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.6197043154974513, + "language_loss": 0.73467219, + "learning_rate": 2.476551258977278e-06, + "loss": 0.7561599, + "num_input_tokens_seen": 157122035, + "step": 7330, + "time_per_iteration": 2.469244956970215 + }, + { + "auxiliary_loss_clip": 0.01102653, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.04402149, + "balance_loss_mlp": 1.02194285, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 2.0347689522686827, + "language_loss": 0.74311113, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.7644881, + "num_input_tokens_seen": 157142800, + "step": 7331, + "time_per_iteration": 2.5303609371185303 + }, + { + "auxiliary_loss_clip": 0.01077767, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.04224634, + "balance_loss_mlp": 1.02068913, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.5687032943683485, + "language_loss": 0.7653321, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78644699, + "num_input_tokens_seen": 157163295, + "step": 7332, + "time_per_iteration": 2.6017582416534424 + }, + { + "auxiliary_loss_clip": 0.01101881, + "auxiliary_loss_mlp": 0.0103687, + "balance_loss_clip": 1.04669118, + "balance_loss_mlp": 1.0242219, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 3.9172851648740314, + "language_loss": 0.74053741, + "learning_rate": 2.475416445004285e-06, + "loss": 0.76192492, + "num_input_tokens_seen": 157180890, + "step": 7333, + "time_per_iteration": 2.45817232131958 + }, + { + "auxiliary_loss_clip": 0.01091209, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.04902339, + "balance_loss_mlp": 1.0221343, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 1.6621994184765787, + "language_loss": 0.79535592, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81662226, + "num_input_tokens_seen": 157200580, + "step": 7334, + "time_per_iteration": 4.100895404815674 + }, + { + "auxiliary_loss_clip": 0.0110063, + "auxiliary_loss_mlp": 0.01040967, + "balance_loss_clip": 1.04376543, + "balance_loss_mlp": 1.02456331, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 5.365373546220552, + "language_loss": 0.75592685, + "learning_rate": 2.47465981219252e-06, + "loss": 0.7773428, + "num_input_tokens_seen": 157218345, + "step": 7335, + "time_per_iteration": 2.550220251083374 + }, + { + "auxiliary_loss_clip": 0.01102583, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.04434037, + "balance_loss_mlp": 1.02256012, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 7.362578332030411, + "language_loss": 0.72509891, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74649483, + "num_input_tokens_seen": 157234395, + "step": 7336, + "time_per_iteration": 2.458341598510742 + }, + { + "auxiliary_loss_clip": 0.01119981, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.04496765, + "balance_loss_mlp": 1.02827001, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 2.9996142868461564, + "language_loss": 0.62408245, + "learning_rate": 2.473903107384165e-06, + "loss": 0.64570487, + "num_input_tokens_seen": 157254805, + "step": 7337, + "time_per_iteration": 2.4955618381500244 + }, + { + "auxiliary_loss_clip": 0.01029552, + "auxiliary_loss_mlp": 0.00754849, + "balance_loss_clip": 1.0197047, + "balance_loss_mlp": 1.00017381, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7526836896564908, + "language_loss": 0.5268724, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54471636, + "num_input_tokens_seen": 157317870, + "step": 7338, + "time_per_iteration": 3.1075491905212402 + }, + { + "auxiliary_loss_clip": 0.01107673, + "auxiliary_loss_mlp": 0.01046002, + "balance_loss_clip": 1.04226542, + "balance_loss_mlp": 1.03027225, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 2.328864247038946, + "language_loss": 0.70623076, + "learning_rate": 2.473146330693997e-06, + "loss": 0.72776747, + "num_input_tokens_seen": 157336505, + "step": 7339, + "time_per_iteration": 2.5015106201171875 + }, + { + "auxiliary_loss_clip": 0.01058385, + "auxiliary_loss_mlp": 0.01044746, + "balance_loss_clip": 1.04081607, + "balance_loss_mlp": 1.03079832, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.458443779380013, + "language_loss": 0.69880772, + "learning_rate": 2.472767915429105e-06, + "loss": 0.71983898, + "num_input_tokens_seen": 157354995, + "step": 7340, + "time_per_iteration": 2.6063058376312256 + }, + { + "auxiliary_loss_clip": 0.01030626, + "auxiliary_loss_mlp": 0.01011866, + "balance_loss_clip": 1.02357531, + "balance_loss_mlp": 1.01006591, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.9052352485266629, + "language_loss": 0.63996935, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66039419, + "num_input_tokens_seen": 157404260, + "step": 7341, + "time_per_iteration": 2.8914239406585693 + }, + { + "auxiliary_loss_clip": 0.01091933, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.04396009, + "balance_loss_mlp": 1.02180696, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 1.9377918147102626, + "language_loss": 0.74124849, + "learning_rate": 2.47201103113145e-06, + "loss": 0.76252067, + "num_input_tokens_seen": 157423045, + "step": 7342, + "time_per_iteration": 2.652505397796631 + }, + { + "auxiliary_loss_clip": 0.01123044, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.04274654, + "balance_loss_mlp": 1.02261436, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 2.9855443217673967, + "language_loss": 0.80074692, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.82234508, + "num_input_tokens_seen": 157441815, + "step": 7343, + "time_per_iteration": 2.478198528289795 + }, + { + "auxiliary_loss_clip": 0.01089343, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.04375744, + "balance_loss_mlp": 1.02326179, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 2.2504184633842153, + "language_loss": 0.76605344, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.78731406, + "num_input_tokens_seen": 157460470, + "step": 7344, + "time_per_iteration": 2.560039520263672 + }, + { + "auxiliary_loss_clip": 0.01033695, + "auxiliary_loss_mlp": 0.01001499, + "balance_loss_clip": 1.02327013, + "balance_loss_mlp": 0.99973488, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.7924446426198899, + "language_loss": 0.63785177, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65820372, + "num_input_tokens_seen": 157512655, + "step": 7345, + "time_per_iteration": 2.8360722064971924 + }, + { + "auxiliary_loss_clip": 0.01129132, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.04650903, + "balance_loss_mlp": 1.02365315, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 1.734625390823603, + "language_loss": 0.86256069, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88423073, + "num_input_tokens_seen": 157533700, + "step": 7346, + "time_per_iteration": 2.47867751121521 + }, + { + "auxiliary_loss_clip": 0.0111744, + "auxiliary_loss_mlp": 0.01042241, + "balance_loss_clip": 1.04513907, + "balance_loss_mlp": 1.02745879, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 1.8570177506547891, + "language_loss": 0.80496168, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82655847, + "num_input_tokens_seen": 157551105, + "step": 7347, + "time_per_iteration": 2.4459195137023926 + }, + { + "auxiliary_loss_clip": 0.01102333, + "auxiliary_loss_mlp": 0.01038885, + "balance_loss_clip": 1.04626942, + "balance_loss_mlp": 1.02480686, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 2.259711969993193, + "language_loss": 0.83068711, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.8520993, + "num_input_tokens_seen": 157568285, + "step": 7348, + "time_per_iteration": 2.4711074829101562 + }, + { + "auxiliary_loss_clip": 0.0111919, + "auxiliary_loss_mlp": 0.01038347, + "balance_loss_clip": 1.04707468, + "balance_loss_mlp": 1.02422118, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 2.0840821203748194, + "language_loss": 0.70312738, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72470278, + "num_input_tokens_seen": 157590405, + "step": 7349, + "time_per_iteration": 3.962186336517334 + }, + { + "auxiliary_loss_clip": 0.01096549, + "auxiliary_loss_mlp": 0.01038808, + "balance_loss_clip": 1.03915644, + "balance_loss_mlp": 1.02392459, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.8132141100043968, + "language_loss": 0.74797559, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76932913, + "num_input_tokens_seen": 157607420, + "step": 7350, + "time_per_iteration": 2.478203296661377 + }, + { + "auxiliary_loss_clip": 0.0112692, + "auxiliary_loss_mlp": 0.01038805, + "balance_loss_clip": 1.04586542, + "balance_loss_mlp": 1.02500081, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 2.326809763966881, + "language_loss": 0.81292391, + "learning_rate": 2.468604167463827e-06, + "loss": 0.83458114, + "num_input_tokens_seen": 157624990, + "step": 7351, + "time_per_iteration": 2.396293878555298 + }, + { + "auxiliary_loss_clip": 0.01077942, + "auxiliary_loss_mlp": 0.0077909, + "balance_loss_clip": 1.03540492, + "balance_loss_mlp": 1.00027609, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.512283231674438, + "language_loss": 0.73503619, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75360644, + "num_input_tokens_seen": 157645300, + "step": 7352, + "time_per_iteration": 4.042862176895142 + }, + { + "auxiliary_loss_clip": 0.01102268, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.04897082, + "balance_loss_mlp": 1.01814628, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 1.8273717280001682, + "language_loss": 0.87281221, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89415389, + "num_input_tokens_seen": 157664060, + "step": 7353, + "time_per_iteration": 2.5156404972076416 + }, + { + "auxiliary_loss_clip": 0.01125978, + "auxiliary_loss_mlp": 0.01039351, + "balance_loss_clip": 1.04479265, + "balance_loss_mlp": 1.02660775, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 2.0263567838023238, + "language_loss": 0.76115656, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.78280985, + "num_input_tokens_seen": 157680905, + "step": 7354, + "time_per_iteration": 2.3846020698547363 + }, + { + "auxiliary_loss_clip": 0.01087412, + "auxiliary_loss_mlp": 0.01035002, + "balance_loss_clip": 1.04579318, + "balance_loss_mlp": 1.02270567, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 2.1123216774263813, + "language_loss": 0.64881313, + "learning_rate": 2.467089543204268e-06, + "loss": 0.67003721, + "num_input_tokens_seen": 157701980, + "step": 7355, + "time_per_iteration": 2.7508575916290283 + }, + { + "auxiliary_loss_clip": 0.01128725, + "auxiliary_loss_mlp": 0.01038746, + "balance_loss_clip": 1.04458117, + "balance_loss_mlp": 1.02393389, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.9831887661607386, + "language_loss": 0.7757358, + "learning_rate": 2.466710842823274e-06, + "loss": 0.79741049, + "num_input_tokens_seen": 157720555, + "step": 7356, + "time_per_iteration": 2.4200618267059326 + }, + { + "auxiliary_loss_clip": 0.01106089, + "auxiliary_loss_mlp": 0.00779193, + "balance_loss_clip": 1.04433823, + "balance_loss_mlp": 1.00049448, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.7867538709127326, + "language_loss": 0.77008951, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.7889424, + "num_input_tokens_seen": 157739160, + "step": 7357, + "time_per_iteration": 3.891491174697876 + }, + { + "auxiliary_loss_clip": 0.01098971, + "auxiliary_loss_mlp": 0.01039062, + "balance_loss_clip": 1.04089606, + "balance_loss_mlp": 1.02469087, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.478945947182144, + "language_loss": 0.73470062, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75608093, + "num_input_tokens_seen": 157760020, + "step": 7358, + "time_per_iteration": 2.5545096397399902 + }, + { + "auxiliary_loss_clip": 0.01098792, + "auxiliary_loss_mlp": 0.01034233, + "balance_loss_clip": 1.04277372, + "balance_loss_mlp": 1.02113795, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.783394638599039, + "language_loss": 0.76056635, + "learning_rate": 2.465574635551405e-06, + "loss": 0.78189659, + "num_input_tokens_seen": 157780435, + "step": 7359, + "time_per_iteration": 2.55068302154541 + }, + { + "auxiliary_loss_clip": 0.01099392, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.0422405, + "balance_loss_mlp": 1.02342057, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 3.7081526664293283, + "language_loss": 0.69930732, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72067535, + "num_input_tokens_seen": 157799420, + "step": 7360, + "time_per_iteration": 2.516235589981079 + }, + { + "auxiliary_loss_clip": 0.01101295, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.04622173, + "balance_loss_mlp": 1.02051806, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.4045788110095008, + "language_loss": 0.69786596, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71921653, + "num_input_tokens_seen": 157817025, + "step": 7361, + "time_per_iteration": 2.494055986404419 + }, + { + "auxiliary_loss_clip": 0.01098812, + "auxiliary_loss_mlp": 0.01038408, + "balance_loss_clip": 1.0393337, + "balance_loss_mlp": 1.02381659, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 3.086216147850676, + "language_loss": 0.82785028, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84922248, + "num_input_tokens_seen": 157834345, + "step": 7362, + "time_per_iteration": 2.476386070251465 + }, + { + "auxiliary_loss_clip": 0.01096838, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.04394102, + "balance_loss_mlp": 1.02304626, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.6664206905304468, + "language_loss": 0.74645936, + "learning_rate": 2.464059445424366e-06, + "loss": 0.76780391, + "num_input_tokens_seen": 157852290, + "step": 7363, + "time_per_iteration": 2.5147767066955566 + }, + { + "auxiliary_loss_clip": 0.01002518, + "auxiliary_loss_mlp": 0.01006771, + "balance_loss_clip": 1.01637316, + "balance_loss_mlp": 1.00482774, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.7032729616778146, + "language_loss": 0.55673271, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57682562, + "num_input_tokens_seen": 157923060, + "step": 7364, + "time_per_iteration": 3.2337987422943115 + }, + { + "auxiliary_loss_clip": 0.01101574, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.04147792, + "balance_loss_mlp": 1.02017796, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 1.7356152099139504, + "language_loss": 0.74325848, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76460099, + "num_input_tokens_seen": 157944110, + "step": 7365, + "time_per_iteration": 2.5726191997528076 + }, + { + "auxiliary_loss_clip": 0.01095043, + "auxiliary_loss_mlp": 0.01040694, + "balance_loss_clip": 1.04050422, + "balance_loss_mlp": 1.02689528, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 1.7332591960569674, + "language_loss": 0.74290287, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76426023, + "num_input_tokens_seen": 157964295, + "step": 7366, + "time_per_iteration": 2.5203309059143066 + }, + { + "auxiliary_loss_clip": 0.01105276, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.04596543, + "balance_loss_mlp": 1.02034378, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 2.0536787923199147, + "language_loss": 0.73165083, + "learning_rate": 2.46254397374245e-06, + "loss": 0.75304252, + "num_input_tokens_seen": 157983970, + "step": 7367, + "time_per_iteration": 2.5362255573272705 + }, + { + "auxiliary_loss_clip": 0.01124118, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.04419637, + "balance_loss_mlp": 1.02648818, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.4654091177210944, + "language_loss": 0.73998916, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.7616266, + "num_input_tokens_seen": 158006515, + "step": 7368, + "time_per_iteration": 2.517159938812256 + }, + { + "auxiliary_loss_clip": 0.01095104, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.04062831, + "balance_loss_mlp": 1.02024555, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 1.822790947186882, + "language_loss": 0.79977334, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82105422, + "num_input_tokens_seen": 158025565, + "step": 7369, + "time_per_iteration": 2.496257781982422 + }, + { + "auxiliary_loss_clip": 0.01090428, + "auxiliary_loss_mlp": 0.01031678, + "balance_loss_clip": 1.04473519, + "balance_loss_mlp": 1.01922655, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 1.8789636850167173, + "language_loss": 0.71959198, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74081314, + "num_input_tokens_seen": 158045620, + "step": 7370, + "time_per_iteration": 2.5592143535614014 + }, + { + "auxiliary_loss_clip": 0.01123236, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.04324007, + "balance_loss_mlp": 1.02509952, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.7521150162984038, + "language_loss": 0.70467985, + "learning_rate": 2.461028221425126e-06, + "loss": 0.72629309, + "num_input_tokens_seen": 158063505, + "step": 7371, + "time_per_iteration": 2.451524019241333 + }, + { + "auxiliary_loss_clip": 0.01112007, + "auxiliary_loss_mlp": 0.01029025, + "balance_loss_clip": 1.04371536, + "balance_loss_mlp": 1.01761055, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.261065877205708, + "language_loss": 0.68299776, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70440805, + "num_input_tokens_seen": 158080335, + "step": 7372, + "time_per_iteration": 2.4690585136413574 + }, + { + "auxiliary_loss_clip": 0.01092136, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.03963315, + "balance_loss_mlp": 1.01908898, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 1.9586846058956988, + "language_loss": 0.83407092, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.85532057, + "num_input_tokens_seen": 158098955, + "step": 7373, + "time_per_iteration": 2.526289701461792 + }, + { + "auxiliary_loss_clip": 0.01041315, + "auxiliary_loss_mlp": 0.01002992, + "balance_loss_clip": 1.02356219, + "balance_loss_mlp": 1.00134659, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 1.1946461537355055, + "language_loss": 0.5528065, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57324952, + "num_input_tokens_seen": 158164110, + "step": 7374, + "time_per_iteration": 4.661163330078125 + }, + { + "auxiliary_loss_clip": 0.01083263, + "auxiliary_loss_mlp": 0.0103929, + "balance_loss_clip": 1.05207729, + "balance_loss_mlp": 1.02567053, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 2.7604697065604635, + "language_loss": 0.82704169, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.8482672, + "num_input_tokens_seen": 158179850, + "step": 7375, + "time_per_iteration": 2.5398643016815186 + }, + { + "auxiliary_loss_clip": 0.01123387, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.04388726, + "balance_loss_mlp": 1.01972938, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 3.5839772424707568, + "language_loss": 0.83981627, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86137295, + "num_input_tokens_seen": 158196590, + "step": 7376, + "time_per_iteration": 2.3868114948272705 + }, + { + "auxiliary_loss_clip": 0.01100282, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.04340625, + "balance_loss_mlp": 1.01710153, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.740612038523753, + "language_loss": 0.76983047, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79113114, + "num_input_tokens_seen": 158216355, + "step": 7377, + "time_per_iteration": 2.505617618560791 + }, + { + "auxiliary_loss_clip": 0.01102741, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.04129338, + "balance_loss_mlp": 1.01673234, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 1.961709963325953, + "language_loss": 0.75932419, + "learning_rate": 2.458374982357057e-06, + "loss": 0.78064167, + "num_input_tokens_seen": 158235825, + "step": 7378, + "time_per_iteration": 2.476017475128174 + }, + { + "auxiliary_loss_clip": 0.01099786, + "auxiliary_loss_mlp": 0.01047426, + "balance_loss_clip": 1.04241157, + "balance_loss_mlp": 1.03346658, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 1.945251185408431, + "language_loss": 0.68934572, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71081781, + "num_input_tokens_seen": 158254230, + "step": 7379, + "time_per_iteration": 2.4718594551086426 + }, + { + "auxiliary_loss_clip": 0.0106237, + "auxiliary_loss_mlp": 0.01043214, + "balance_loss_clip": 1.03708935, + "balance_loss_mlp": 1.028808, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 1.6536790749253159, + "language_loss": 0.73056394, + "learning_rate": 2.457616757401656e-06, + "loss": 0.75161982, + "num_input_tokens_seen": 158273400, + "step": 7380, + "time_per_iteration": 2.626441717147827 + }, + { + "auxiliary_loss_clip": 0.01102972, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.04419112, + "balance_loss_mlp": 1.01741552, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.6899735194445624, + "language_loss": 0.65032524, + "learning_rate": 2.457237618887458e-06, + "loss": 0.67165804, + "num_input_tokens_seen": 158296840, + "step": 7381, + "time_per_iteration": 2.589956521987915 + }, + { + "auxiliary_loss_clip": 0.01110706, + "auxiliary_loss_mlp": 0.01037061, + "balance_loss_clip": 1.0427773, + "balance_loss_mlp": 1.02438879, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.0727716758984527, + "language_loss": 0.79856622, + "learning_rate": 2.456858463034763e-06, + "loss": 0.82004392, + "num_input_tokens_seen": 158314935, + "step": 7382, + "time_per_iteration": 2.450237274169922 + }, + { + "auxiliary_loss_clip": 0.01117446, + "auxiliary_loss_mlp": 0.01038704, + "balance_loss_clip": 1.04709101, + "balance_loss_mlp": 1.0258292, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 1.9774461463736153, + "language_loss": 0.64917576, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67073727, + "num_input_tokens_seen": 158334620, + "step": 7383, + "time_per_iteration": 2.546325922012329 + }, + { + "auxiliary_loss_clip": 0.01099479, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.04017758, + "balance_loss_mlp": 1.02014267, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 2.2325777150650126, + "language_loss": 0.75568295, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.77701533, + "num_input_tokens_seen": 158350550, + "step": 7384, + "time_per_iteration": 2.486177682876587 + }, + { + "auxiliary_loss_clip": 0.01125869, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.04486203, + "balance_loss_mlp": 1.02051306, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.6090712045690179, + "language_loss": 0.81114626, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83274209, + "num_input_tokens_seen": 158369555, + "step": 7385, + "time_per_iteration": 2.4531946182250977 + }, + { + "auxiliary_loss_clip": 0.01080672, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.04163766, + "balance_loss_mlp": 1.02376437, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.7743281054673248, + "language_loss": 0.81503797, + "learning_rate": 2.455341666526582e-06, + "loss": 0.8362273, + "num_input_tokens_seen": 158388045, + "step": 7386, + "time_per_iteration": 2.620579481124878 + }, + { + "auxiliary_loss_clip": 0.01088895, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.04342341, + "balance_loss_mlp": 1.02170992, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.8890154923347353, + "language_loss": 0.69754934, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.7188037, + "num_input_tokens_seen": 158410115, + "step": 7387, + "time_per_iteration": 2.6970021724700928 + }, + { + "auxiliary_loss_clip": 0.01061506, + "auxiliary_loss_mlp": 0.01039101, + "balance_loss_clip": 1.04460216, + "balance_loss_mlp": 1.02555871, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 1.9777000813026369, + "language_loss": 0.71979582, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.74080193, + "num_input_tokens_seen": 158427765, + "step": 7388, + "time_per_iteration": 4.00110936164856 + }, + { + "auxiliary_loss_clip": 0.01114538, + "auxiliary_loss_mlp": 0.01035662, + "balance_loss_clip": 1.04390073, + "balance_loss_mlp": 1.022066, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.7759945579404048, + "language_loss": 0.68852884, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.71003079, + "num_input_tokens_seen": 158446375, + "step": 7389, + "time_per_iteration": 2.4803273677825928 + }, + { + "auxiliary_loss_clip": 0.01115139, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.04594898, + "balance_loss_mlp": 1.018857, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 1.8083019852105195, + "language_loss": 0.74475825, + "learning_rate": 2.453824593752788e-06, + "loss": 0.76622957, + "num_input_tokens_seen": 158467260, + "step": 7390, + "time_per_iteration": 2.608499526977539 + }, + { + "auxiliary_loss_clip": 0.01109314, + "auxiliary_loss_mlp": 0.01040447, + "balance_loss_clip": 1.0432415, + "balance_loss_mlp": 1.02645731, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 1.8048656095732978, + "language_loss": 0.81909251, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.84059012, + "num_input_tokens_seen": 158486720, + "step": 7391, + "time_per_iteration": 2.5246548652648926 + }, + { + "auxiliary_loss_clip": 0.01095481, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.04234242, + "balance_loss_mlp": 1.02394938, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.7776990593929272, + "language_loss": 0.73812807, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.75946271, + "num_input_tokens_seen": 158502530, + "step": 7392, + "time_per_iteration": 4.051312446594238 + }, + { + "auxiliary_loss_clip": 0.0111088, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.04232061, + "balance_loss_mlp": 1.02003455, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 1.7394918635086285, + "language_loss": 0.79516864, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.8166036, + "num_input_tokens_seen": 158522715, + "step": 7393, + "time_per_iteration": 2.498372793197632 + }, + { + "auxiliary_loss_clip": 0.01115861, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.04165721, + "balance_loss_mlp": 1.02075434, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 1.8331746943224898, + "language_loss": 0.80670738, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.82821125, + "num_input_tokens_seen": 158543615, + "step": 7394, + "time_per_iteration": 2.5601606369018555 + }, + { + "auxiliary_loss_clip": 0.01097463, + "auxiliary_loss_mlp": 0.01039303, + "balance_loss_clip": 1.04047453, + "balance_loss_mlp": 1.02690554, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 2.3917007237592425, + "language_loss": 0.8010217, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.82238936, + "num_input_tokens_seen": 158560330, + "step": 7395, + "time_per_iteration": 2.4618873596191406 + }, + { + "auxiliary_loss_clip": 0.01099769, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.0415045, + "balance_loss_mlp": 1.02418661, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 3.051764392964541, + "language_loss": 0.6852771, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70664805, + "num_input_tokens_seen": 158579735, + "step": 7396, + "time_per_iteration": 3.93245005607605 + }, + { + "auxiliary_loss_clip": 0.0111315, + "auxiliary_loss_mlp": 0.00779643, + "balance_loss_clip": 1.04296374, + "balance_loss_mlp": 1.00035346, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 1.951467157782047, + "language_loss": 0.80618966, + "learning_rate": 2.451169054403126e-06, + "loss": 0.82511759, + "num_input_tokens_seen": 158597075, + "step": 7397, + "time_per_iteration": 2.466862201690674 + }, + { + "auxiliary_loss_clip": 0.01116232, + "auxiliary_loss_mlp": 0.01037026, + "balance_loss_clip": 1.04642034, + "balance_loss_mlp": 1.02456272, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.7503940747155373, + "language_loss": 0.67622918, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69776177, + "num_input_tokens_seen": 158616650, + "step": 7398, + "time_per_iteration": 2.5334320068359375 + }, + { + "auxiliary_loss_clip": 0.01090169, + "auxiliary_loss_mlp": 0.01036003, + "balance_loss_clip": 1.04024386, + "balance_loss_mlp": 1.02315211, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 1.7940538035569982, + "language_loss": 0.69856828, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71983004, + "num_input_tokens_seen": 158634515, + "step": 7399, + "time_per_iteration": 2.5130016803741455 + }, + { + "auxiliary_loss_clip": 0.01091365, + "auxiliary_loss_mlp": 0.0103683, + "balance_loss_clip": 1.04077363, + "balance_loss_mlp": 1.02369905, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 2.147953908707231, + "language_loss": 0.72729778, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.74857974, + "num_input_tokens_seen": 158653760, + "step": 7400, + "time_per_iteration": 2.535672187805176 + }, + { + "auxiliary_loss_clip": 0.01081299, + "auxiliary_loss_mlp": 0.00776877, + "balance_loss_clip": 1.04990649, + "balance_loss_mlp": 1.00020194, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.7685916505936221, + "language_loss": 0.84973413, + "learning_rate": 2.449651226645422e-06, + "loss": 0.86831588, + "num_input_tokens_seen": 158672190, + "step": 7401, + "time_per_iteration": 2.586402177810669 + }, + { + "auxiliary_loss_clip": 0.01098988, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.0438571, + "balance_loss_mlp": 1.0231663, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 1.5441722249890586, + "language_loss": 0.83214319, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85348487, + "num_input_tokens_seen": 158694115, + "step": 7402, + "time_per_iteration": 2.5475118160247803 + }, + { + "auxiliary_loss_clip": 0.01103029, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.04487109, + "balance_loss_mlp": 1.02009308, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 2.079601150070983, + "language_loss": 0.77088624, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.79224694, + "num_input_tokens_seen": 158711000, + "step": 7403, + "time_per_iteration": 2.4940638542175293 + }, + { + "auxiliary_loss_clip": 0.0104739, + "auxiliary_loss_mlp": 0.01018231, + "balance_loss_clip": 1.04569221, + "balance_loss_mlp": 1.01700902, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.748764052286312, + "language_loss": 0.60064816, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62130439, + "num_input_tokens_seen": 158769675, + "step": 7404, + "time_per_iteration": 3.060100793838501 + }, + { + "auxiliary_loss_clip": 0.01103804, + "auxiliary_loss_mlp": 0.01041265, + "balance_loss_clip": 1.04292357, + "balance_loss_mlp": 1.02616131, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 2.20137912332326, + "language_loss": 0.8224113, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.84386206, + "num_input_tokens_seen": 158788215, + "step": 7405, + "time_per_iteration": 2.5513672828674316 + }, + { + "auxiliary_loss_clip": 0.01101465, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.04431152, + "balance_loss_mlp": 1.01502872, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 1.8176815572389537, + "language_loss": 0.75048757, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77178317, + "num_input_tokens_seen": 158809090, + "step": 7406, + "time_per_iteration": 2.5301108360290527 + }, + { + "auxiliary_loss_clip": 0.01091907, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.04590106, + "balance_loss_mlp": 1.02105093, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.6095945701078018, + "language_loss": 0.65624666, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67749977, + "num_input_tokens_seen": 158828320, + "step": 7407, + "time_per_iteration": 2.5865471363067627 + }, + { + "auxiliary_loss_clip": 0.01097398, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.04223478, + "balance_loss_mlp": 1.01929283, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.5769513801376065, + "language_loss": 0.68093967, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.70223612, + "num_input_tokens_seen": 158847040, + "step": 7408, + "time_per_iteration": 2.496399402618408 + }, + { + "auxiliary_loss_clip": 0.01122315, + "auxiliary_loss_mlp": 0.01033662, + "balance_loss_clip": 1.04208899, + "balance_loss_mlp": 1.01959586, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 1.4924887535197462, + "language_loss": 0.72197676, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74353647, + "num_input_tokens_seen": 158870490, + "step": 7409, + "time_per_iteration": 2.600153684616089 + }, + { + "auxiliary_loss_clip": 0.01103261, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.04372728, + "balance_loss_mlp": 1.01989532, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 2.7166210687825716, + "language_loss": 0.65209377, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67346549, + "num_input_tokens_seen": 158889920, + "step": 7410, + "time_per_iteration": 2.4965384006500244 + }, + { + "auxiliary_loss_clip": 0.01104178, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.04499507, + "balance_loss_mlp": 1.01926029, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 2.2675856716742797, + "language_loss": 0.74195671, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76333427, + "num_input_tokens_seen": 158909580, + "step": 7411, + "time_per_iteration": 2.5080151557922363 + }, + { + "auxiliary_loss_clip": 0.01061009, + "auxiliary_loss_mlp": 0.0103027, + "balance_loss_clip": 1.04540336, + "balance_loss_mlp": 1.01765752, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 1.749443200605132, + "language_loss": 0.78582776, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.80674058, + "num_input_tokens_seen": 158924600, + "step": 7412, + "time_per_iteration": 2.566983222961426 + }, + { + "auxiliary_loss_clip": 0.01107374, + "auxiliary_loss_mlp": 0.01033729, + "balance_loss_clip": 1.04416895, + "balance_loss_mlp": 1.02026415, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 1.984263640466176, + "language_loss": 0.79943687, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82084799, + "num_input_tokens_seen": 158939345, + "step": 7413, + "time_per_iteration": 4.02654767036438 + }, + { + "auxiliary_loss_clip": 0.0111267, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.04532135, + "balance_loss_mlp": 1.01707101, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 2.4151517978272286, + "language_loss": 0.76025409, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.78167892, + "num_input_tokens_seen": 158955855, + "step": 7414, + "time_per_iteration": 2.436734914779663 + }, + { + "auxiliary_loss_clip": 0.01094985, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.04088283, + "balance_loss_mlp": 1.02889287, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.5234157172265164, + "language_loss": 0.83745098, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85883158, + "num_input_tokens_seen": 158976315, + "step": 7415, + "time_per_iteration": 2.527027130126953 + }, + { + "auxiliary_loss_clip": 0.01122004, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.04265177, + "balance_loss_mlp": 1.02073801, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 2.333313454342538, + "language_loss": 0.84262311, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86417937, + "num_input_tokens_seen": 158996725, + "step": 7416, + "time_per_iteration": 2.441528797149658 + }, + { + "auxiliary_loss_clip": 0.01090118, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.04070854, + "balance_loss_mlp": 1.01897502, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 1.5264634782439412, + "language_loss": 0.8139444, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83516967, + "num_input_tokens_seen": 159017255, + "step": 7417, + "time_per_iteration": 2.57515811920166 + }, + { + "auxiliary_loss_clip": 0.01099953, + "auxiliary_loss_mlp": 0.01040494, + "balance_loss_clip": 1.03847671, + "balance_loss_mlp": 1.02699351, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 2.348623846453406, + "language_loss": 0.80957711, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83098155, + "num_input_tokens_seen": 159035010, + "step": 7418, + "time_per_iteration": 2.505669355392456 + }, + { + "auxiliary_loss_clip": 0.01116768, + "auxiliary_loss_mlp": 0.00778812, + "balance_loss_clip": 1.04462004, + "balance_loss_mlp": 1.00021482, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 3.0806651758634738, + "language_loss": 0.77391982, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79287565, + "num_input_tokens_seen": 159055345, + "step": 7419, + "time_per_iteration": 2.509140729904175 + }, + { + "auxiliary_loss_clip": 0.01091012, + "auxiliary_loss_mlp": 0.01036917, + "balance_loss_clip": 1.04103684, + "balance_loss_mlp": 1.02404821, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.5537120484804359, + "language_loss": 0.72436237, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74564165, + "num_input_tokens_seen": 159074225, + "step": 7420, + "time_per_iteration": 2.5210092067718506 + }, + { + "auxiliary_loss_clip": 0.01100788, + "auxiliary_loss_mlp": 0.0103302, + "balance_loss_clip": 1.04031086, + "balance_loss_mlp": 1.01882267, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.7763408766242208, + "language_loss": 0.751436, + "learning_rate": 2.442058014084156e-06, + "loss": 0.7727741, + "num_input_tokens_seen": 159095415, + "step": 7421, + "time_per_iteration": 2.5545620918273926 + }, + { + "auxiliary_loss_clip": 0.01059486, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.03795147, + "balance_loss_mlp": 1.02201819, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 1.8070512348723462, + "language_loss": 0.76015913, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78110456, + "num_input_tokens_seen": 159114615, + "step": 7422, + "time_per_iteration": 2.590484380722046 + }, + { + "auxiliary_loss_clip": 0.01121541, + "auxiliary_loss_mlp": 0.01034857, + "balance_loss_clip": 1.04300487, + "balance_loss_mlp": 1.02146995, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.4377836225357665, + "language_loss": 0.65120733, + "learning_rate": 2.441298322143784e-06, + "loss": 0.67277128, + "num_input_tokens_seen": 159134370, + "step": 7423, + "time_per_iteration": 2.4433107376098633 + }, + { + "auxiliary_loss_clip": 0.01095455, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.04190779, + "balance_loss_mlp": 1.01749158, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.7596825915252032, + "language_loss": 0.79097366, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.81221831, + "num_input_tokens_seen": 159152540, + "step": 7424, + "time_per_iteration": 2.492323160171509 + }, + { + "auxiliary_loss_clip": 0.01106315, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.04206324, + "balance_loss_mlp": 1.01960564, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.412372674312779, + "language_loss": 0.80225664, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82363719, + "num_input_tokens_seen": 159173425, + "step": 7425, + "time_per_iteration": 2.5207817554473877 + }, + { + "auxiliary_loss_clip": 0.01109881, + "auxiliary_loss_mlp": 0.01031162, + "balance_loss_clip": 1.04254878, + "balance_loss_mlp": 1.01910412, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 1.525879525654377, + "language_loss": 0.77031749, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.7917279, + "num_input_tokens_seen": 159191210, + "step": 7426, + "time_per_iteration": 2.5012996196746826 + }, + { + "auxiliary_loss_clip": 0.01100911, + "auxiliary_loss_mlp": 0.00778355, + "balance_loss_clip": 1.04678321, + "balance_loss_mlp": 1.00030255, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.944490639252725, + "language_loss": 0.64855599, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66734862, + "num_input_tokens_seen": 159211755, + "step": 7427, + "time_per_iteration": 2.560192584991455 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01039862, + "balance_loss_clip": 1.04344857, + "balance_loss_mlp": 1.02698183, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 1.7399976657191378, + "language_loss": 0.75546229, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77695036, + "num_input_tokens_seen": 159230315, + "step": 7428, + "time_per_iteration": 3.9477922916412354 + }, + { + "auxiliary_loss_clip": 0.0109274, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.04347587, + "balance_loss_mlp": 1.02313566, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 1.8352057090602685, + "language_loss": 0.77683175, + "learning_rate": 2.439018845165806e-06, + "loss": 0.79812443, + "num_input_tokens_seen": 159249810, + "step": 7429, + "time_per_iteration": 2.5031819343566895 + }, + { + "auxiliary_loss_clip": 0.01114096, + "auxiliary_loss_mlp": 0.0103293, + "balance_loss_clip": 1.0437727, + "balance_loss_mlp": 1.01956677, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 1.6359288096938767, + "language_loss": 0.91359019, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93506038, + "num_input_tokens_seen": 159271715, + "step": 7430, + "time_per_iteration": 2.4827888011932373 + }, + { + "auxiliary_loss_clip": 0.01102973, + "auxiliary_loss_mlp": 0.00779997, + "balance_loss_clip": 1.04262221, + "balance_loss_mlp": 1.00037456, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 1.6951995363261632, + "language_loss": 0.79975492, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.81858456, + "num_input_tokens_seen": 159290690, + "step": 7431, + "time_per_iteration": 4.027876615524292 + }, + { + "auxiliary_loss_clip": 0.01104965, + "auxiliary_loss_mlp": 0.01032228, + "balance_loss_clip": 1.04443061, + "balance_loss_mlp": 1.01853657, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 2.292120671117939, + "language_loss": 0.79935062, + "learning_rate": 2.437878881739204e-06, + "loss": 0.82072252, + "num_input_tokens_seen": 159309400, + "step": 7432, + "time_per_iteration": 2.4936106204986572 + }, + { + "auxiliary_loss_clip": 0.01095104, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.04743552, + "balance_loss_mlp": 1.02105594, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 2.3136445966178307, + "language_loss": 0.76764476, + "learning_rate": 2.437498860702301e-06, + "loss": 0.78893447, + "num_input_tokens_seen": 159327425, + "step": 7433, + "time_per_iteration": 2.534353256225586 + }, + { + "auxiliary_loss_clip": 0.01103159, + "auxiliary_loss_mlp": 0.01033822, + "balance_loss_clip": 1.04140091, + "balance_loss_mlp": 1.02253318, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 3.0154499176475573, + "language_loss": 0.7742697, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79563951, + "num_input_tokens_seen": 159345805, + "step": 7434, + "time_per_iteration": 2.52014422416687 + }, + { + "auxiliary_loss_clip": 0.01115585, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.04476595, + "balance_loss_mlp": 1.01803207, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.9897547983976258, + "language_loss": 0.64762592, + "learning_rate": 2.436738768872905e-06, + "loss": 0.66909033, + "num_input_tokens_seen": 159364595, + "step": 7435, + "time_per_iteration": 3.8409712314605713 + }, + { + "auxiliary_loss_clip": 0.01107704, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.04612088, + "balance_loss_mlp": 1.01598632, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.6609322021416497, + "language_loss": 0.83416349, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.85553777, + "num_input_tokens_seen": 159385265, + "step": 7436, + "time_per_iteration": 2.5278351306915283 + }, + { + "auxiliary_loss_clip": 0.01076942, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.0422442, + "balance_loss_mlp": 1.03416514, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 1.883079935102035, + "language_loss": 0.79630762, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81758583, + "num_input_tokens_seen": 159405080, + "step": 7437, + "time_per_iteration": 2.601297378540039 + }, + { + "auxiliary_loss_clip": 0.01082022, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.04680979, + "balance_loss_mlp": 1.02317739, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.7609142816114693, + "language_loss": 0.71676767, + "learning_rate": 2.435598506956009e-06, + "loss": 0.73795277, + "num_input_tokens_seen": 159424595, + "step": 7438, + "time_per_iteration": 2.600729465484619 + }, + { + "auxiliary_loss_clip": 0.01085865, + "auxiliary_loss_mlp": 0.01036888, + "balance_loss_clip": 1.04625297, + "balance_loss_mlp": 1.02251685, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 1.7559953323279265, + "language_loss": 0.67166215, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.69288969, + "num_input_tokens_seen": 159443865, + "step": 7439, + "time_per_iteration": 2.7334353923797607 + }, + { + "auxiliary_loss_clip": 0.01099654, + "auxiliary_loss_mlp": 0.0104031, + "balance_loss_clip": 1.03704429, + "balance_loss_mlp": 1.02461052, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.7043845310091406, + "language_loss": 0.73962396, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.76102358, + "num_input_tokens_seen": 159464525, + "step": 7440, + "time_per_iteration": 2.5409910678863525 + }, + { + "auxiliary_loss_clip": 0.01069711, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.03219664, + "balance_loss_mlp": 1.03019476, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 1.6162957352026488, + "language_loss": 0.74253678, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76368511, + "num_input_tokens_seen": 159486385, + "step": 7441, + "time_per_iteration": 2.636291265487671 + }, + { + "auxiliary_loss_clip": 0.01091614, + "auxiliary_loss_mlp": 0.01039833, + "balance_loss_clip": 1.04315531, + "balance_loss_mlp": 1.02551615, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 1.9439671246101031, + "language_loss": 0.74633169, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.76764613, + "num_input_tokens_seen": 159503880, + "step": 7442, + "time_per_iteration": 2.578293561935425 + }, + { + "auxiliary_loss_clip": 0.01127351, + "auxiliary_loss_mlp": 0.0104056, + "balance_loss_clip": 1.04329598, + "balance_loss_mlp": 1.02574253, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 1.7362277057123887, + "language_loss": 0.7470094, + "learning_rate": 2.433697740261273e-06, + "loss": 0.7686885, + "num_input_tokens_seen": 159522980, + "step": 7443, + "time_per_iteration": 2.523381471633911 + }, + { + "auxiliary_loss_clip": 0.0109848, + "auxiliary_loss_mlp": 0.01032546, + "balance_loss_clip": 1.0360359, + "balance_loss_mlp": 1.01741791, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 1.9390878958010875, + "language_loss": 0.77586704, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.79717731, + "num_input_tokens_seen": 159543340, + "step": 7444, + "time_per_iteration": 2.5071663856506348 + }, + { + "auxiliary_loss_clip": 0.01107163, + "auxiliary_loss_mlp": 0.01036126, + "balance_loss_clip": 1.04454076, + "balance_loss_mlp": 1.02254832, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 2.43970698245181, + "language_loss": 0.8488735, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87030637, + "num_input_tokens_seen": 159558210, + "step": 7445, + "time_per_iteration": 2.4538416862487793 + }, + { + "auxiliary_loss_clip": 0.0108859, + "auxiliary_loss_mlp": 0.01040618, + "balance_loss_clip": 1.03978157, + "balance_loss_mlp": 1.02323735, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 2.677767542940449, + "language_loss": 0.64522934, + "learning_rate": 2.432557082778765e-06, + "loss": 0.66652143, + "num_input_tokens_seen": 159577920, + "step": 7446, + "time_per_iteration": 2.5486795902252197 + }, + { + "auxiliary_loss_clip": 0.01034041, + "auxiliary_loss_mlp": 0.01006715, + "balance_loss_clip": 1.01715994, + "balance_loss_mlp": 1.00503433, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.740111409335383, + "language_loss": 0.50257242, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52297997, + "num_input_tokens_seen": 159632295, + "step": 7447, + "time_per_iteration": 2.9006893634796143 + }, + { + "auxiliary_loss_clip": 0.01040112, + "auxiliary_loss_mlp": 0.01002255, + "balance_loss_clip": 1.0134958, + "balance_loss_mlp": 1.00069332, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7680947165111931, + "language_loss": 0.59307826, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61350191, + "num_input_tokens_seen": 159698435, + "step": 7448, + "time_per_iteration": 3.0580410957336426 + }, + { + "auxiliary_loss_clip": 0.0109373, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.04360151, + "balance_loss_mlp": 1.02011883, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.5537829113477568, + "language_loss": 0.59161294, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61287904, + "num_input_tokens_seen": 159722150, + "step": 7449, + "time_per_iteration": 2.753838062286377 + }, + { + "auxiliary_loss_clip": 0.01095503, + "auxiliary_loss_mlp": 0.01033135, + "balance_loss_clip": 1.04079247, + "balance_loss_mlp": 1.01998651, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 2.0852447752055205, + "language_loss": 0.80163652, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.82292295, + "num_input_tokens_seen": 159740550, + "step": 7450, + "time_per_iteration": 2.498547077178955 + }, + { + "auxiliary_loss_clip": 0.01123478, + "auxiliary_loss_mlp": 0.01042788, + "balance_loss_clip": 1.0428642, + "balance_loss_mlp": 1.02899575, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 2.299741484972217, + "language_loss": 0.79978806, + "learning_rate": 2.430655659114697e-06, + "loss": 0.82145071, + "num_input_tokens_seen": 159758245, + "step": 7451, + "time_per_iteration": 2.41340708732605 + }, + { + "auxiliary_loss_clip": 0.01013019, + "auxiliary_loss_mlp": 0.01009475, + "balance_loss_clip": 1.02588534, + "balance_loss_mlp": 1.00806189, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 0.8351045018837013, + "language_loss": 0.6285699, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64879483, + "num_input_tokens_seen": 159826790, + "step": 7452, + "time_per_iteration": 4.7902631759643555 + }, + { + "auxiliary_loss_clip": 0.01125212, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_clip": 1.04340279, + "balance_loss_mlp": 1.02404737, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 1.6516939748416761, + "language_loss": 0.62726045, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64889866, + "num_input_tokens_seen": 159845805, + "step": 7453, + "time_per_iteration": 2.434351921081543 + }, + { + "auxiliary_loss_clip": 0.01020795, + "auxiliary_loss_mlp": 0.01013732, + "balance_loss_clip": 1.01206458, + "balance_loss_mlp": 1.01235509, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7504129476702113, + "language_loss": 0.57061797, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59096324, + "num_input_tokens_seen": 159898860, + "step": 7454, + "time_per_iteration": 2.9136834144592285 + }, + { + "auxiliary_loss_clip": 0.01095302, + "auxiliary_loss_mlp": 0.01042728, + "balance_loss_clip": 1.03628302, + "balance_loss_mlp": 1.02759397, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.2346498173846103, + "language_loss": 0.7478888, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.76926911, + "num_input_tokens_seen": 159911555, + "step": 7455, + "time_per_iteration": 2.443641185760498 + }, + { + "auxiliary_loss_clip": 0.01099865, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.04166305, + "balance_loss_mlp": 1.01938665, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.8724257419574024, + "language_loss": 0.76034057, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78166014, + "num_input_tokens_seen": 159931470, + "step": 7456, + "time_per_iteration": 2.6007626056671143 + }, + { + "auxiliary_loss_clip": 0.01123618, + "auxiliary_loss_mlp": 0.01038507, + "balance_loss_clip": 1.04459691, + "balance_loss_mlp": 1.02550077, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 1.8068944081378056, + "language_loss": 0.76381278, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78543401, + "num_input_tokens_seen": 159946115, + "step": 7457, + "time_per_iteration": 2.4398937225341797 + }, + { + "auxiliary_loss_clip": 0.01108057, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.03798497, + "balance_loss_mlp": 1.01879597, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 2.014636549120564, + "language_loss": 0.68312776, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.70454091, + "num_input_tokens_seen": 159963915, + "step": 7458, + "time_per_iteration": 2.4175593852996826 + }, + { + "auxiliary_loss_clip": 0.0108897, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.04172671, + "balance_loss_mlp": 1.02072215, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.7377317985207243, + "language_loss": 0.72216821, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74340874, + "num_input_tokens_seen": 159982140, + "step": 7459, + "time_per_iteration": 2.5133631229400635 + }, + { + "auxiliary_loss_clip": 0.01104291, + "auxiliary_loss_mlp": 0.01038125, + "balance_loss_clip": 1.0382483, + "balance_loss_mlp": 1.02387929, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 1.6012171764803422, + "language_loss": 0.6967923, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71821648, + "num_input_tokens_seen": 160002280, + "step": 7460, + "time_per_iteration": 2.459606409072876 + }, + { + "auxiliary_loss_clip": 0.0112067, + "auxiliary_loss_mlp": 0.01034623, + "balance_loss_clip": 1.0401665, + "balance_loss_mlp": 1.02040124, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 1.9053820999796245, + "language_loss": 0.77226985, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79382277, + "num_input_tokens_seen": 160020260, + "step": 7461, + "time_per_iteration": 2.408339262008667 + }, + { + "auxiliary_loss_clip": 0.01121093, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.03941834, + "balance_loss_mlp": 1.02123094, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.7259290022693672, + "language_loss": 0.68130392, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.70285952, + "num_input_tokens_seen": 160040240, + "step": 7462, + "time_per_iteration": 2.4678659439086914 + }, + { + "auxiliary_loss_clip": 0.01037129, + "auxiliary_loss_mlp": 0.01002354, + "balance_loss_clip": 1.01087582, + "balance_loss_mlp": 1.0007571, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7515846073681473, + "language_loss": 0.54495025, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56534511, + "num_input_tokens_seen": 160093865, + "step": 7463, + "time_per_iteration": 3.0417518615722656 + }, + { + "auxiliary_loss_clip": 0.01110878, + "auxiliary_loss_mlp": 0.01035604, + "balance_loss_clip": 1.03960323, + "balance_loss_mlp": 1.02159095, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 1.7338965341720134, + "language_loss": 0.75798905, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.77945393, + "num_input_tokens_seen": 160113590, + "step": 7464, + "time_per_iteration": 2.493772506713867 + }, + { + "auxiliary_loss_clip": 0.01109311, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.04237986, + "balance_loss_mlp": 1.01878738, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.9997618314235523, + "language_loss": 0.7378391, + "learning_rate": 2.425329506653441e-06, + "loss": 0.75923812, + "num_input_tokens_seen": 160131795, + "step": 7465, + "time_per_iteration": 2.4381861686706543 + }, + { + "auxiliary_loss_clip": 0.01107686, + "auxiliary_loss_mlp": 0.01037087, + "balance_loss_clip": 1.04495084, + "balance_loss_mlp": 1.02166104, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 1.9353109980224201, + "language_loss": 0.79850852, + "learning_rate": 2.424948945758966e-06, + "loss": 0.81995618, + "num_input_tokens_seen": 160150635, + "step": 7466, + "time_per_iteration": 2.542755603790283 + }, + { + "auxiliary_loss_clip": 0.01103641, + "auxiliary_loss_mlp": 0.01038779, + "balance_loss_clip": 1.04284525, + "balance_loss_mlp": 1.02492726, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 2.371646741072556, + "language_loss": 0.80361688, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82504112, + "num_input_tokens_seen": 160168615, + "step": 7467, + "time_per_iteration": 3.8947694301605225 + }, + { + "auxiliary_loss_clip": 0.01077642, + "auxiliary_loss_mlp": 0.01030673, + "balance_loss_clip": 1.04906106, + "balance_loss_mlp": 1.01801908, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 2.183075209882206, + "language_loss": 0.74816823, + "learning_rate": 2.424187775642129e-06, + "loss": 0.76925147, + "num_input_tokens_seen": 160187295, + "step": 7468, + "time_per_iteration": 2.558886766433716 + }, + { + "auxiliary_loss_clip": 0.01089943, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.03621411, + "balance_loss_mlp": 1.02089274, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 1.932261449362449, + "language_loss": 0.7088058, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.73003757, + "num_input_tokens_seen": 160205115, + "step": 7469, + "time_per_iteration": 2.467020273208618 + }, + { + "auxiliary_loss_clip": 0.01112257, + "auxiliary_loss_mlp": 0.01035972, + "balance_loss_clip": 1.04633403, + "balance_loss_mlp": 1.02224493, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 1.8559708461774143, + "language_loss": 0.71681845, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.73830068, + "num_input_tokens_seen": 160222580, + "step": 7470, + "time_per_iteration": 2.463486433029175 + }, + { + "auxiliary_loss_clip": 0.01085203, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.04208684, + "balance_loss_mlp": 1.02079356, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 2.083399251515554, + "language_loss": 0.77184129, + "learning_rate": 2.423045899863634e-06, + "loss": 0.79304719, + "num_input_tokens_seen": 160241520, + "step": 7471, + "time_per_iteration": 4.015499830245972 + }, + { + "auxiliary_loss_clip": 0.01120712, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.04142356, + "balance_loss_mlp": 1.02476692, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 2.11595384401638, + "language_loss": 0.70360184, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72518718, + "num_input_tokens_seen": 160261815, + "step": 7472, + "time_per_iteration": 2.442748785018921 + }, + { + "auxiliary_loss_clip": 0.01034128, + "auxiliary_loss_mlp": 0.01001649, + "balance_loss_clip": 1.01663554, + "balance_loss_mlp": 1.00019455, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7341762660418961, + "language_loss": 0.61668235, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63704014, + "num_input_tokens_seen": 160317070, + "step": 7473, + "time_per_iteration": 3.012693405151367 + }, + { + "auxiliary_loss_clip": 0.01121773, + "auxiliary_loss_mlp": 0.00779165, + "balance_loss_clip": 1.04169416, + "balance_loss_mlp": 1.00036144, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 1.9530243361295574, + "language_loss": 0.77435905, + "learning_rate": 2.421903879707657e-06, + "loss": 0.79336846, + "num_input_tokens_seen": 160334980, + "step": 7474, + "time_per_iteration": 2.425788402557373 + }, + { + "auxiliary_loss_clip": 0.01077646, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.04231215, + "balance_loss_mlp": 1.02542114, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.629176486553012, + "language_loss": 0.72145104, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74261677, + "num_input_tokens_seen": 160354500, + "step": 7475, + "time_per_iteration": 3.9379189014434814 + }, + { + "auxiliary_loss_clip": 0.01076335, + "auxiliary_loss_mlp": 0.01042128, + "balance_loss_clip": 1.03964019, + "balance_loss_mlp": 1.02765584, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 2.058583120957327, + "language_loss": 0.76878244, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.78996706, + "num_input_tokens_seen": 160373650, + "step": 7476, + "time_per_iteration": 2.6181185245513916 + }, + { + "auxiliary_loss_clip": 0.01117175, + "auxiliary_loss_mlp": 0.0078062, + "balance_loss_clip": 1.04305279, + "balance_loss_mlp": 1.0004338, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 1.9366719239566363, + "language_loss": 0.71785808, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.73683596, + "num_input_tokens_seen": 160393430, + "step": 7477, + "time_per_iteration": 2.473512649536133 + }, + { + "auxiliary_loss_clip": 0.01100154, + "auxiliary_loss_mlp": 0.01046795, + "balance_loss_clip": 1.03864169, + "balance_loss_mlp": 1.03144062, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 1.8979118252747034, + "language_loss": 0.67582357, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.69729304, + "num_input_tokens_seen": 160410545, + "step": 7478, + "time_per_iteration": 2.4742422103881836 + }, + { + "auxiliary_loss_clip": 0.01093799, + "auxiliary_loss_mlp": 0.01038108, + "balance_loss_clip": 1.03977609, + "balance_loss_mlp": 1.02538228, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 2.22392600370011, + "language_loss": 0.89302641, + "learning_rate": 2.420000193000779e-06, + "loss": 0.9143455, + "num_input_tokens_seen": 160428105, + "step": 7479, + "time_per_iteration": 2.4722163677215576 + }, + { + "auxiliary_loss_clip": 0.01070385, + "auxiliary_loss_mlp": 0.01041839, + "balance_loss_clip": 1.04458797, + "balance_loss_mlp": 1.02758765, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 3.286922536868595, + "language_loss": 0.75418794, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77531016, + "num_input_tokens_seen": 160448815, + "step": 7480, + "time_per_iteration": 2.624246597290039 + }, + { + "auxiliary_loss_clip": 0.01091425, + "auxiliary_loss_mlp": 0.0103896, + "balance_loss_clip": 1.03972089, + "balance_loss_mlp": 1.02438092, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 2.1629639721553193, + "language_loss": 0.79404837, + "learning_rate": 2.419238606731815e-06, + "loss": 0.8153522, + "num_input_tokens_seen": 160465940, + "step": 7481, + "time_per_iteration": 2.5264313220977783 + }, + { + "auxiliary_loss_clip": 0.01100749, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.04373431, + "balance_loss_mlp": 1.01921427, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.801890155727204, + "language_loss": 0.68516529, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70650136, + "num_input_tokens_seen": 160486710, + "step": 7482, + "time_per_iteration": 2.6089296340942383 + }, + { + "auxiliary_loss_clip": 0.0111353, + "auxiliary_loss_mlp": 0.01045994, + "balance_loss_clip": 1.0430088, + "balance_loss_mlp": 1.03256559, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.0109199571506218, + "language_loss": 0.85364318, + "learning_rate": 2.418476956872571e-06, + "loss": 0.87523848, + "num_input_tokens_seen": 160503405, + "step": 7483, + "time_per_iteration": 2.436757802963257 + }, + { + "auxiliary_loss_clip": 0.01093986, + "auxiliary_loss_mlp": 0.01044459, + "balance_loss_clip": 1.03851902, + "balance_loss_mlp": 1.02956998, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.6880377736211905, + "language_loss": 0.80846095, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82984543, + "num_input_tokens_seen": 160525080, + "step": 7484, + "time_per_iteration": 2.56852126121521 + }, + { + "auxiliary_loss_clip": 0.01072702, + "auxiliary_loss_mlp": 0.01034394, + "balance_loss_clip": 1.03500056, + "balance_loss_mlp": 1.01872945, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 2.3806270833418663, + "language_loss": 0.74548113, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.76655209, + "num_input_tokens_seen": 160540895, + "step": 7485, + "time_per_iteration": 2.586812973022461 + }, + { + "auxiliary_loss_clip": 0.01025545, + "auxiliary_loss_mlp": 0.01004698, + "balance_loss_clip": 1.01717877, + "balance_loss_mlp": 1.00315988, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7898637289789762, + "language_loss": 0.58647758, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60677999, + "num_input_tokens_seen": 160598270, + "step": 7486, + "time_per_iteration": 3.0938220024108887 + }, + { + "auxiliary_loss_clip": 0.01107114, + "auxiliary_loss_mlp": 0.01043448, + "balance_loss_clip": 1.03992414, + "balance_loss_mlp": 1.02863073, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 3.7790014647313237, + "language_loss": 0.83444607, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85595167, + "num_input_tokens_seen": 160614720, + "step": 7487, + "time_per_iteration": 2.4665470123291016 + }, + { + "auxiliary_loss_clip": 0.01121775, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.04265904, + "balance_loss_mlp": 1.02178884, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.7585786728713841, + "language_loss": 0.77232778, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79389465, + "num_input_tokens_seen": 160635170, + "step": 7488, + "time_per_iteration": 2.4908084869384766 + }, + { + "auxiliary_loss_clip": 0.01121929, + "auxiliary_loss_mlp": 0.01041321, + "balance_loss_clip": 1.04565811, + "balance_loss_mlp": 1.02661085, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 2.315229117399177, + "language_loss": 0.71779406, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.73942655, + "num_input_tokens_seen": 160654490, + "step": 7489, + "time_per_iteration": 2.524132490158081 + }, + { + "auxiliary_loss_clip": 0.01103394, + "auxiliary_loss_mlp": 0.01039038, + "balance_loss_clip": 1.04106927, + "balance_loss_mlp": 1.0229094, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.736638084246929, + "language_loss": 0.70206702, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.72349131, + "num_input_tokens_seen": 160669400, + "step": 7490, + "time_per_iteration": 2.450669050216675 + }, + { + "auxiliary_loss_clip": 0.01030657, + "auxiliary_loss_mlp": 0.01008296, + "balance_loss_clip": 1.0284338, + "balance_loss_mlp": 1.00665081, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 0.7366978426760195, + "language_loss": 0.56754798, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58793753, + "num_input_tokens_seen": 160733820, + "step": 7491, + "time_per_iteration": 3.025160312652588 + }, + { + "auxiliary_loss_clip": 0.01105903, + "auxiliary_loss_mlp": 0.01036816, + "balance_loss_clip": 1.0420661, + "balance_loss_mlp": 1.02370882, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 2.1550522288567064, + "language_loss": 0.79218334, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81361055, + "num_input_tokens_seen": 160753175, + "step": 7492, + "time_per_iteration": 3.95871639251709 + }, + { + "auxiliary_loss_clip": 0.01092661, + "auxiliary_loss_mlp": 0.00783804, + "balance_loss_clip": 1.0408268, + "balance_loss_mlp": 1.00051463, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.244980852457121, + "language_loss": 0.92439836, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.94316304, + "num_input_tokens_seen": 160768310, + "step": 7493, + "time_per_iteration": 2.493799924850464 + }, + { + "auxiliary_loss_clip": 0.01031714, + "auxiliary_loss_mlp": 0.01008115, + "balance_loss_clip": 1.01467824, + "balance_loss_mlp": 1.00657678, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.7955110854529097, + "language_loss": 0.62895423, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64935255, + "num_input_tokens_seen": 160827370, + "step": 7494, + "time_per_iteration": 3.070996046066284 + }, + { + "auxiliary_loss_clip": 0.01122332, + "auxiliary_loss_mlp": 0.01034175, + "balance_loss_clip": 1.04363203, + "balance_loss_mlp": 1.02095437, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.4144645472113941, + "language_loss": 0.82266665, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.84423172, + "num_input_tokens_seen": 160849140, + "step": 7495, + "time_per_iteration": 2.432504415512085 + }, + { + "auxiliary_loss_clip": 0.01108741, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.04080701, + "balance_loss_mlp": 1.02938151, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.6315275102964029, + "language_loss": 0.85779488, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87932992, + "num_input_tokens_seen": 160871280, + "step": 7496, + "time_per_iteration": 2.619209051132202 + }, + { + "auxiliary_loss_clip": 0.01124139, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.04410958, + "balance_loss_mlp": 1.01653433, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 5.1253791515831395, + "language_loss": 0.76010633, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.781654, + "num_input_tokens_seen": 160888625, + "step": 7497, + "time_per_iteration": 2.4165964126586914 + }, + { + "auxiliary_loss_clip": 0.010918, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.04015613, + "balance_loss_mlp": 1.01690197, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 2.5405046897806436, + "language_loss": 0.74969578, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77091765, + "num_input_tokens_seen": 160907040, + "step": 7498, + "time_per_iteration": 2.4971654415130615 + }, + { + "auxiliary_loss_clip": 0.011245, + "auxiliary_loss_mlp": 0.01039905, + "balance_loss_clip": 1.04329145, + "balance_loss_mlp": 1.02575445, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 3.773752200569021, + "language_loss": 0.70602381, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72766781, + "num_input_tokens_seen": 160927115, + "step": 7499, + "time_per_iteration": 2.419475793838501 + }, + { + "auxiliary_loss_clip": 0.01088395, + "auxiliary_loss_mlp": 0.01036089, + "balance_loss_clip": 1.04738426, + "balance_loss_mlp": 1.02176595, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 2.250860925588834, + "language_loss": 0.77332747, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79457223, + "num_input_tokens_seen": 160944405, + "step": 7500, + "time_per_iteration": 2.5733535289764404 + }, + { + "auxiliary_loss_clip": 0.01081917, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.04339266, + "balance_loss_mlp": 1.01857066, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 2.051298093338359, + "language_loss": 0.62648916, + "learning_rate": 2.411619265641992e-06, + "loss": 0.64763153, + "num_input_tokens_seen": 160961345, + "step": 7501, + "time_per_iteration": 2.5564258098602295 + }, + { + "auxiliary_loss_clip": 0.01126596, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.04409122, + "balance_loss_mlp": 1.02159822, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 2.0099628403898544, + "language_loss": 0.84799457, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86962616, + "num_input_tokens_seen": 160977330, + "step": 7502, + "time_per_iteration": 2.405575752258301 + }, + { + "auxiliary_loss_clip": 0.01098522, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.04354572, + "balance_loss_mlp": 1.01831985, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.5087683990280718, + "language_loss": 0.79810429, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.81939995, + "num_input_tokens_seen": 160997280, + "step": 7503, + "time_per_iteration": 2.528036594390869 + }, + { + "auxiliary_loss_clip": 0.01097636, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.04417408, + "balance_loss_mlp": 1.02049828, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 2.424899800482965, + "language_loss": 0.80633658, + "learning_rate": 2.410475823155484e-06, + "loss": 0.82765502, + "num_input_tokens_seen": 161014235, + "step": 7504, + "time_per_iteration": 2.4694888591766357 + }, + { + "auxiliary_loss_clip": 0.01082513, + "auxiliary_loss_mlp": 0.0103359, + "balance_loss_clip": 1.03867185, + "balance_loss_mlp": 1.02115655, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 1.609916702038616, + "language_loss": 0.6351918, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.65635282, + "num_input_tokens_seen": 161032360, + "step": 7505, + "time_per_iteration": 2.5524582862854004 + }, + { + "auxiliary_loss_clip": 0.01017904, + "auxiliary_loss_mlp": 0.01008178, + "balance_loss_clip": 1.02632558, + "balance_loss_mlp": 1.00642538, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8297149209906687, + "language_loss": 0.58827025, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60853106, + "num_input_tokens_seen": 161091360, + "step": 7506, + "time_per_iteration": 4.647017955780029 + }, + { + "auxiliary_loss_clip": 0.01075738, + "auxiliary_loss_mlp": 0.0103658, + "balance_loss_clip": 1.04235888, + "balance_loss_mlp": 1.02266812, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 1.8743697008351863, + "language_loss": 0.79626894, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81739217, + "num_input_tokens_seen": 161110825, + "step": 7507, + "time_per_iteration": 2.5538179874420166 + }, + { + "auxiliary_loss_clip": 0.01085021, + "auxiliary_loss_mlp": 0.01031822, + "balance_loss_clip": 1.04107726, + "balance_loss_mlp": 1.01680183, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.6841695339926999, + "language_loss": 0.74446523, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76563364, + "num_input_tokens_seen": 161130685, + "step": 7508, + "time_per_iteration": 2.5634734630584717 + }, + { + "auxiliary_loss_clip": 0.01109818, + "auxiliary_loss_mlp": 0.01032425, + "balance_loss_clip": 1.04420304, + "balance_loss_mlp": 1.01975918, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 1.8879327606449778, + "language_loss": 0.79393047, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81535292, + "num_input_tokens_seen": 161147555, + "step": 7509, + "time_per_iteration": 2.444042682647705 + }, + { + "auxiliary_loss_clip": 0.01124048, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.04577148, + "balance_loss_mlp": 1.01769233, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.7231615769042323, + "language_loss": 0.72999251, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75154048, + "num_input_tokens_seen": 161166255, + "step": 7510, + "time_per_iteration": 2.4709312915802 + }, + { + "auxiliary_loss_clip": 0.01122494, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.0415591, + "balance_loss_mlp": 1.02310109, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 2.7564712938500837, + "language_loss": 0.76955426, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79115641, + "num_input_tokens_seen": 161184720, + "step": 7511, + "time_per_iteration": 3.9222888946533203 + }, + { + "auxiliary_loss_clip": 0.01111076, + "auxiliary_loss_mlp": 0.01036638, + "balance_loss_clip": 1.04187405, + "balance_loss_mlp": 1.02257705, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.59096520843139, + "language_loss": 0.78556317, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.80704033, + "num_input_tokens_seen": 161204360, + "step": 7512, + "time_per_iteration": 2.4657061100006104 + }, + { + "auxiliary_loss_clip": 0.01090547, + "auxiliary_loss_mlp": 0.01040596, + "balance_loss_clip": 1.0393306, + "balance_loss_mlp": 1.02544451, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 2.0512776220140747, + "language_loss": 0.8709873, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.8922987, + "num_input_tokens_seen": 161223575, + "step": 7513, + "time_per_iteration": 2.553302764892578 + }, + { + "auxiliary_loss_clip": 0.01104592, + "auxiliary_loss_mlp": 0.01033133, + "balance_loss_clip": 1.04219675, + "balance_loss_mlp": 1.02097356, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 1.5177706769934292, + "language_loss": 0.67273688, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69411409, + "num_input_tokens_seen": 161243805, + "step": 7514, + "time_per_iteration": 2.509502410888672 + }, + { + "auxiliary_loss_clip": 0.01113581, + "auxiliary_loss_mlp": 0.01035642, + "balance_loss_clip": 1.04875112, + "balance_loss_mlp": 1.01989412, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 2.1272076316192225, + "language_loss": 0.69413865, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71563089, + "num_input_tokens_seen": 161261450, + "step": 7515, + "time_per_iteration": 3.860330820083618 + }, + { + "auxiliary_loss_clip": 0.01111971, + "auxiliary_loss_mlp": 0.01040617, + "balance_loss_clip": 1.0407294, + "balance_loss_mlp": 1.02496481, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 2.6964015656600764, + "language_loss": 0.81683689, + "learning_rate": 2.405900656236963e-06, + "loss": 0.83836275, + "num_input_tokens_seen": 161276965, + "step": 7516, + "time_per_iteration": 2.442112445831299 + }, + { + "auxiliary_loss_clip": 0.01120592, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.04351187, + "balance_loss_mlp": 1.02032888, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.6929469300878077, + "language_loss": 0.65991533, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.68146384, + "num_input_tokens_seen": 161295375, + "step": 7517, + "time_per_iteration": 2.443448066711426 + }, + { + "auxiliary_loss_clip": 0.01088544, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.04214501, + "balance_loss_mlp": 1.01804936, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 2.037439413757184, + "language_loss": 0.62887859, + "learning_rate": 2.405137912257333e-06, + "loss": 0.65006852, + "num_input_tokens_seen": 161313010, + "step": 7518, + "time_per_iteration": 2.543884038925171 + }, + { + "auxiliary_loss_clip": 0.01111131, + "auxiliary_loss_mlp": 0.01037641, + "balance_loss_clip": 1.0433352, + "balance_loss_mlp": 1.02374697, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.4262133354573627, + "language_loss": 0.59580767, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61729538, + "num_input_tokens_seen": 161336690, + "step": 7519, + "time_per_iteration": 2.711259603500366 + }, + { + "auxiliary_loss_clip": 0.01115062, + "auxiliary_loss_mlp": 0.01044411, + "balance_loss_clip": 1.04710245, + "balance_loss_mlp": 1.03104186, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.4160625891884422, + "language_loss": 0.72403282, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74562752, + "num_input_tokens_seen": 161357845, + "step": 7520, + "time_per_iteration": 2.480398178100586 + }, + { + "auxiliary_loss_clip": 0.01104557, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.04458034, + "balance_loss_mlp": 1.02404881, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 2.2930042975632268, + "language_loss": 0.75488091, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77629769, + "num_input_tokens_seen": 161375160, + "step": 7521, + "time_per_iteration": 2.471125364303589 + }, + { + "auxiliary_loss_clip": 0.01106033, + "auxiliary_loss_mlp": 0.0103923, + "balance_loss_clip": 1.04348087, + "balance_loss_mlp": 1.02460265, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 1.7125048596944972, + "language_loss": 0.67588717, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.69733983, + "num_input_tokens_seen": 161393690, + "step": 7522, + "time_per_iteration": 2.492868661880493 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01039339, + "balance_loss_clip": 1.04080176, + "balance_loss_mlp": 1.02543962, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.4683046531959978, + "language_loss": 0.60794938, + "learning_rate": 2.403230783711134e-06, + "loss": 0.62942612, + "num_input_tokens_seen": 161415015, + "step": 7523, + "time_per_iteration": 2.5103330612182617 + }, + { + "auxiliary_loss_clip": 0.01118312, + "auxiliary_loss_mlp": 0.01043234, + "balance_loss_clip": 1.04516423, + "balance_loss_mlp": 1.02807045, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 23.124287210228616, + "language_loss": 0.78099716, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80261254, + "num_input_tokens_seen": 161432940, + "step": 7524, + "time_per_iteration": 2.444391965866089 + }, + { + "auxiliary_loss_clip": 0.01084354, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.04662097, + "balance_loss_mlp": 1.02709067, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.6713596860813127, + "language_loss": 0.63748527, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65873802, + "num_input_tokens_seen": 161452215, + "step": 7525, + "time_per_iteration": 2.5909364223480225 + }, + { + "auxiliary_loss_clip": 0.01110694, + "auxiliary_loss_mlp": 0.01034013, + "balance_loss_clip": 1.04255676, + "balance_loss_mlp": 1.0212276, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.5766175974592036, + "language_loss": 0.78959244, + "learning_rate": 2.402086322981083e-06, + "loss": 0.81103945, + "num_input_tokens_seen": 161469520, + "step": 7526, + "time_per_iteration": 2.4379687309265137 + }, + { + "auxiliary_loss_clip": 0.01098846, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.04260051, + "balance_loss_mlp": 1.02202559, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 1.629559212437546, + "language_loss": 0.80808002, + "learning_rate": 2.40170480555747e-06, + "loss": 0.82942593, + "num_input_tokens_seen": 161487335, + "step": 7527, + "time_per_iteration": 2.5064995288848877 + }, + { + "auxiliary_loss_clip": 0.01088592, + "auxiliary_loss_mlp": 0.01030745, + "balance_loss_clip": 1.04329419, + "balance_loss_mlp": 1.01674962, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.7108113074836837, + "language_loss": 0.65256554, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67375892, + "num_input_tokens_seen": 161510095, + "step": 7528, + "time_per_iteration": 2.6027109622955322 + }, + { + "auxiliary_loss_clip": 0.0110038, + "auxiliary_loss_mlp": 0.01035815, + "balance_loss_clip": 1.04132056, + "balance_loss_mlp": 1.0219152, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 1.646816209353776, + "language_loss": 0.75394434, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.77530628, + "num_input_tokens_seen": 161528725, + "step": 7529, + "time_per_iteration": 2.5063347816467285 + }, + { + "auxiliary_loss_clip": 0.01123366, + "auxiliary_loss_mlp": 0.01039122, + "balance_loss_clip": 1.0439626, + "balance_loss_mlp": 1.0252645, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 1.9543922833043381, + "language_loss": 0.72577953, + "learning_rate": 2.400560161948384e-06, + "loss": 0.74740446, + "num_input_tokens_seen": 161547195, + "step": 7530, + "time_per_iteration": 2.406508684158325 + }, + { + "auxiliary_loss_clip": 0.0109163, + "auxiliary_loss_mlp": 0.01033581, + "balance_loss_clip": 1.04272509, + "balance_loss_mlp": 1.02080202, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.838108027663247, + "language_loss": 0.76146203, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78271413, + "num_input_tokens_seen": 161565565, + "step": 7531, + "time_per_iteration": 4.129749059677124 + }, + { + "auxiliary_loss_clip": 0.01118368, + "auxiliary_loss_mlp": 0.01039035, + "balance_loss_clip": 1.04233956, + "balance_loss_mlp": 1.02468801, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.8743751397763833, + "language_loss": 0.67353439, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69510853, + "num_input_tokens_seen": 161586630, + "step": 7532, + "time_per_iteration": 2.4666249752044678 + }, + { + "auxiliary_loss_clip": 0.01112716, + "auxiliary_loss_mlp": 0.0103871, + "balance_loss_clip": 1.04558706, + "balance_loss_mlp": 1.02531695, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 2.1945081019139265, + "language_loss": 0.7885986, + "learning_rate": 2.399415381635768e-06, + "loss": 0.81011289, + "num_input_tokens_seen": 161603815, + "step": 7533, + "time_per_iteration": 2.4396181106567383 + }, + { + "auxiliary_loss_clip": 0.01100216, + "auxiliary_loss_mlp": 0.01038979, + "balance_loss_clip": 1.04401064, + "balance_loss_mlp": 1.02310038, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 1.7727795652100578, + "language_loss": 0.83250678, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85389876, + "num_input_tokens_seen": 161622900, + "step": 7534, + "time_per_iteration": 2.527519941329956 + }, + { + "auxiliary_loss_clip": 0.01097693, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.04630589, + "balance_loss_mlp": 1.02059579, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 1.6717215001507684, + "language_loss": 0.76548016, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78680956, + "num_input_tokens_seen": 161641700, + "step": 7535, + "time_per_iteration": 2.498878002166748 + }, + { + "auxiliary_loss_clip": 0.01086885, + "auxiliary_loss_mlp": 0.0103744, + "balance_loss_clip": 1.04752493, + "balance_loss_mlp": 1.02407122, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.5470888849147504, + "language_loss": 0.80535346, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82659668, + "num_input_tokens_seen": 161661955, + "step": 7536, + "time_per_iteration": 2.5497677326202393 + }, + { + "auxiliary_loss_clip": 0.01095826, + "auxiliary_loss_mlp": 0.01034765, + "balance_loss_clip": 1.04147065, + "balance_loss_mlp": 1.02040005, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 1.73490968321375, + "language_loss": 0.7625525, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.78385842, + "num_input_tokens_seen": 161679245, + "step": 7537, + "time_per_iteration": 2.5032029151916504 + }, + { + "auxiliary_loss_clip": 0.01114605, + "auxiliary_loss_mlp": 0.01035005, + "balance_loss_clip": 1.04200971, + "balance_loss_mlp": 1.02213657, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 2.317055672001456, + "language_loss": 0.75962782, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.78112388, + "num_input_tokens_seen": 161698795, + "step": 7538, + "time_per_iteration": 2.492122173309326 + }, + { + "auxiliary_loss_clip": 0.01035503, + "auxiliary_loss_mlp": 0.0100319, + "balance_loss_clip": 1.01839375, + "balance_loss_mlp": 1.00173569, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.798303727303294, + "language_loss": 0.62364924, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64403617, + "num_input_tokens_seen": 161761980, + "step": 7539, + "time_per_iteration": 3.084768533706665 + }, + { + "auxiliary_loss_clip": 0.01124016, + "auxiliary_loss_mlp": 0.01043125, + "balance_loss_clip": 1.04476261, + "balance_loss_mlp": 1.02923095, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 1.7872215787691217, + "language_loss": 0.66013455, + "learning_rate": 2.396743698142872e-06, + "loss": 0.68180597, + "num_input_tokens_seen": 161779455, + "step": 7540, + "time_per_iteration": 2.509305477142334 + }, + { + "auxiliary_loss_clip": 0.01105836, + "auxiliary_loss_mlp": 0.01040729, + "balance_loss_clip": 1.04199815, + "balance_loss_mlp": 1.02469516, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 1.835272239833714, + "language_loss": 0.84985226, + "learning_rate": 2.396361968778424e-06, + "loss": 0.87131792, + "num_input_tokens_seen": 161798980, + "step": 7541, + "time_per_iteration": 2.538823127746582 + }, + { + "auxiliary_loss_clip": 0.01101749, + "auxiliary_loss_mlp": 0.01034233, + "balance_loss_clip": 1.04164529, + "balance_loss_mlp": 1.02117968, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.702235787428781, + "language_loss": 0.76688516, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78824496, + "num_input_tokens_seen": 161819745, + "step": 7542, + "time_per_iteration": 2.672499418258667 + }, + { + "auxiliary_loss_clip": 0.01102476, + "auxiliary_loss_mlp": 0.01029683, + "balance_loss_clip": 1.04744768, + "balance_loss_mlp": 1.01551485, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 1.840247743865596, + "language_loss": 0.8045283, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82584989, + "num_input_tokens_seen": 161838575, + "step": 7543, + "time_per_iteration": 2.6146442890167236 + }, + { + "auxiliary_loss_clip": 0.0111462, + "auxiliary_loss_mlp": 0.00779886, + "balance_loss_clip": 1.04310548, + "balance_loss_mlp": 1.00046587, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 11.37937731509017, + "language_loss": 0.7643708, + "learning_rate": 2.395216690562469e-06, + "loss": 0.7833159, + "num_input_tokens_seen": 161858590, + "step": 7544, + "time_per_iteration": 2.570359230041504 + }, + { + "auxiliary_loss_clip": 0.01097756, + "auxiliary_loss_mlp": 0.01039605, + "balance_loss_clip": 1.04653692, + "balance_loss_mlp": 1.02659917, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 2.645918617553996, + "language_loss": 0.75253117, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77390474, + "num_input_tokens_seen": 161878390, + "step": 7545, + "time_per_iteration": 4.080491304397583 + }, + { + "auxiliary_loss_clip": 0.0110226, + "auxiliary_loss_mlp": 0.01037232, + "balance_loss_clip": 1.0417937, + "balance_loss_mlp": 1.02312338, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.760690743065322, + "language_loss": 0.72473699, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74613196, + "num_input_tokens_seen": 161898610, + "step": 7546, + "time_per_iteration": 2.5718677043914795 + }, + { + "auxiliary_loss_clip": 0.01107094, + "auxiliary_loss_mlp": 0.01036552, + "balance_loss_clip": 1.04315495, + "balance_loss_mlp": 1.02106655, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.539856859744317, + "language_loss": 0.7559182, + "learning_rate": 2.394071277466609e-06, + "loss": 0.77735466, + "num_input_tokens_seen": 161918210, + "step": 7547, + "time_per_iteration": 2.5004220008850098 + }, + { + "auxiliary_loss_clip": 0.01118823, + "auxiliary_loss_mlp": 0.01037278, + "balance_loss_clip": 1.04489255, + "balance_loss_mlp": 1.02223337, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 2.054653891714358, + "language_loss": 0.69869989, + "learning_rate": 2.393689443195573e-06, + "loss": 0.72026086, + "num_input_tokens_seen": 161936950, + "step": 7548, + "time_per_iteration": 2.453732967376709 + }, + { + "auxiliary_loss_clip": 0.01124196, + "auxiliary_loss_mlp": 0.01045731, + "balance_loss_clip": 1.04310775, + "balance_loss_mlp": 1.03170609, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 2.1485393829451804, + "language_loss": 0.72910249, + "learning_rate": 2.393307593995794e-06, + "loss": 0.7508018, + "num_input_tokens_seen": 161955550, + "step": 7549, + "time_per_iteration": 2.461073398590088 + }, + { + "auxiliary_loss_clip": 0.01092096, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.04302192, + "balance_loss_mlp": 1.01747704, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.4560552269932194, + "language_loss": 0.64901048, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67023599, + "num_input_tokens_seen": 161976760, + "step": 7550, + "time_per_iteration": 4.101134538650513 + }, + { + "auxiliary_loss_clip": 0.01109723, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.04729152, + "balance_loss_mlp": 1.02518296, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.7699471947087655, + "language_loss": 0.68827891, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.7097621, + "num_input_tokens_seen": 161996120, + "step": 7551, + "time_per_iteration": 2.4743993282318115 + }, + { + "auxiliary_loss_clip": 0.01112356, + "auxiliary_loss_mlp": 0.0103705, + "balance_loss_clip": 1.03997648, + "balance_loss_mlp": 1.02244735, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.642940863009197, + "language_loss": 0.79248714, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81398118, + "num_input_tokens_seen": 162011125, + "step": 7552, + "time_per_iteration": 2.4208531379699707 + }, + { + "auxiliary_loss_clip": 0.01036779, + "auxiliary_loss_mlp": 0.01010388, + "balance_loss_clip": 1.02083743, + "balance_loss_mlp": 1.00905252, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8220409942957582, + "language_loss": 0.57754254, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59801424, + "num_input_tokens_seen": 162068705, + "step": 7553, + "time_per_iteration": 3.0266740322113037 + }, + { + "auxiliary_loss_clip": 0.01065156, + "auxiliary_loss_mlp": 0.01036299, + "balance_loss_clip": 1.04476094, + "balance_loss_mlp": 1.02332258, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.4066251792439148, + "language_loss": 0.76656735, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78758192, + "num_input_tokens_seen": 162089655, + "step": 7554, + "time_per_iteration": 4.1149351596832275 + }, + { + "auxiliary_loss_clip": 0.01109562, + "auxiliary_loss_mlp": 0.01036605, + "balance_loss_clip": 1.04764056, + "balance_loss_mlp": 1.02072668, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 6.41682289176561, + "language_loss": 0.76807392, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.78953564, + "num_input_tokens_seen": 162108465, + "step": 7555, + "time_per_iteration": 2.487901210784912 + }, + { + "auxiliary_loss_clip": 0.01063719, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.04631615, + "balance_loss_mlp": 1.02194512, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.865665016350912, + "language_loss": 0.72651672, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74750626, + "num_input_tokens_seen": 162129910, + "step": 7556, + "time_per_iteration": 2.7140212059020996 + }, + { + "auxiliary_loss_clip": 0.01129547, + "auxiliary_loss_mlp": 0.01039422, + "balance_loss_clip": 1.04553676, + "balance_loss_mlp": 1.02486658, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 1.9797540698943292, + "language_loss": 0.63036835, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65205801, + "num_input_tokens_seen": 162148840, + "step": 7557, + "time_per_iteration": 2.4443199634552 + }, + { + "auxiliary_loss_clip": 0.01024521, + "auxiliary_loss_mlp": 0.01001542, + "balance_loss_clip": 1.01690841, + "balance_loss_mlp": 0.99983686, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6837974876839762, + "language_loss": 0.57679534, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59705597, + "num_input_tokens_seen": 162208500, + "step": 7558, + "time_per_iteration": 2.9904539585113525 + }, + { + "auxiliary_loss_clip": 0.01120502, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_clip": 1.04816854, + "balance_loss_mlp": 1.02314591, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 3.242357916128512, + "language_loss": 0.56361258, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.58520496, + "num_input_tokens_seen": 162224650, + "step": 7559, + "time_per_iteration": 2.465625524520874 + }, + { + "auxiliary_loss_clip": 0.01115988, + "auxiliary_loss_mlp": 0.00779525, + "balance_loss_clip": 1.05086672, + "balance_loss_mlp": 1.00061989, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 1.8264880543228594, + "language_loss": 0.71851468, + "learning_rate": 2.389106271642792e-06, + "loss": 0.73746985, + "num_input_tokens_seen": 162242930, + "step": 7560, + "time_per_iteration": 2.461829662322998 + }, + { + "auxiliary_loss_clip": 0.01047202, + "auxiliary_loss_mlp": 0.01041552, + "balance_loss_clip": 1.03608692, + "balance_loss_mlp": 1.02548218, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 2.669366991689714, + "language_loss": 0.69576907, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.71665663, + "num_input_tokens_seen": 162261455, + "step": 7561, + "time_per_iteration": 2.6033096313476562 + }, + { + "auxiliary_loss_clip": 0.01097881, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.04211104, + "balance_loss_mlp": 1.02247584, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.6669292153203221, + "language_loss": 0.85268688, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.87401211, + "num_input_tokens_seen": 162279725, + "step": 7562, + "time_per_iteration": 2.5177881717681885 + }, + { + "auxiliary_loss_clip": 0.01109115, + "auxiliary_loss_mlp": 0.01035965, + "balance_loss_clip": 1.04082513, + "balance_loss_mlp": 1.02197564, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.9587842211385882, + "language_loss": 0.89187497, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91332573, + "num_input_tokens_seen": 162297865, + "step": 7563, + "time_per_iteration": 2.525893449783325 + }, + { + "auxiliary_loss_clip": 0.01124207, + "auxiliary_loss_mlp": 0.00780552, + "balance_loss_clip": 1.04347825, + "balance_loss_mlp": 1.00053978, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.719919518799681, + "language_loss": 0.71337575, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73242331, + "num_input_tokens_seen": 162316010, + "step": 7564, + "time_per_iteration": 2.5211448669433594 + }, + { + "auxiliary_loss_clip": 0.01114168, + "auxiliary_loss_mlp": 0.01036877, + "balance_loss_clip": 1.04181719, + "balance_loss_mlp": 1.02283418, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 2.311429801775531, + "language_loss": 0.67834437, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.69985485, + "num_input_tokens_seen": 162336115, + "step": 7565, + "time_per_iteration": 2.5118207931518555 + }, + { + "auxiliary_loss_clip": 0.01086813, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.04487944, + "balance_loss_mlp": 1.02237678, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 1.866346705177876, + "language_loss": 0.80017614, + "learning_rate": 2.386813887534922e-06, + "loss": 0.82140934, + "num_input_tokens_seen": 162355705, + "step": 7566, + "time_per_iteration": 2.630465507507324 + }, + { + "auxiliary_loss_clip": 0.0108829, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.04112422, + "balance_loss_mlp": 1.02059257, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 2.1663479091303213, + "language_loss": 0.73896515, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.76021582, + "num_input_tokens_seen": 162374055, + "step": 7567, + "time_per_iteration": 2.528552293777466 + }, + { + "auxiliary_loss_clip": 0.01093138, + "auxiliary_loss_mlp": 0.0104234, + "balance_loss_clip": 1.04312599, + "balance_loss_mlp": 1.02766562, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 2.568535315003128, + "language_loss": 0.81133008, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83268487, + "num_input_tokens_seen": 162393560, + "step": 7568, + "time_per_iteration": 2.598558187484741 + }, + { + "auxiliary_loss_clip": 0.01120763, + "auxiliary_loss_mlp": 0.01047862, + "balance_loss_clip": 1.04469836, + "balance_loss_mlp": 1.03153038, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 2.9770890244183352, + "language_loss": 0.79489708, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.81658328, + "num_input_tokens_seen": 162413170, + "step": 7569, + "time_per_iteration": 2.471222400665283 + }, + { + "auxiliary_loss_clip": 0.0111709, + "auxiliary_loss_mlp": 0.01037357, + "balance_loss_clip": 1.04394722, + "balance_loss_mlp": 1.0209775, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.432305574424415, + "language_loss": 0.75077105, + "learning_rate": 2.385285337909412e-06, + "loss": 0.77231544, + "num_input_tokens_seen": 162434080, + "step": 7570, + "time_per_iteration": 2.518326759338379 + }, + { + "auxiliary_loss_clip": 0.01104612, + "auxiliary_loss_mlp": 0.01042559, + "balance_loss_clip": 1.04658914, + "balance_loss_mlp": 1.02826619, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 1.7535155161317622, + "language_loss": 0.74725997, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.76873159, + "num_input_tokens_seen": 162455445, + "step": 7571, + "time_per_iteration": 4.0284035205841064 + }, + { + "auxiliary_loss_clip": 0.01109337, + "auxiliary_loss_mlp": 0.0103446, + "balance_loss_clip": 1.04286051, + "balance_loss_mlp": 1.0200479, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.7490205864577855, + "language_loss": 0.81316864, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83460665, + "num_input_tokens_seen": 162474940, + "step": 7572, + "time_per_iteration": 2.474083185195923 + }, + { + "auxiliary_loss_clip": 0.01109837, + "auxiliary_loss_mlp": 0.010406, + "balance_loss_clip": 1.0434444, + "balance_loss_mlp": 1.0241375, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 2.0407475068506757, + "language_loss": 0.72825837, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.74976271, + "num_input_tokens_seen": 162493340, + "step": 7573, + "time_per_iteration": 2.5655441284179688 + }, + { + "auxiliary_loss_clip": 0.01115354, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.04295063, + "balance_loss_mlp": 1.01970434, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 1.8845202158229495, + "language_loss": 0.74774373, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76925433, + "num_input_tokens_seen": 162514360, + "step": 7574, + "time_per_iteration": 2.5353031158447266 + }, + { + "auxiliary_loss_clip": 0.01113562, + "auxiliary_loss_mlp": 0.01033872, + "balance_loss_clip": 1.0429337, + "balance_loss_mlp": 1.01957929, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.8419756519492223, + "language_loss": 0.71082532, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73229963, + "num_input_tokens_seen": 162535240, + "step": 7575, + "time_per_iteration": 2.5036027431488037 + }, + { + "auxiliary_loss_clip": 0.0110097, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.04184151, + "balance_loss_mlp": 1.02116466, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.9963001652977832, + "language_loss": 0.73111844, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75248528, + "num_input_tokens_seen": 162553880, + "step": 7576, + "time_per_iteration": 2.516406297683716 + }, + { + "auxiliary_loss_clip": 0.01120873, + "auxiliary_loss_mlp": 0.0103711, + "balance_loss_clip": 1.04263604, + "balance_loss_mlp": 1.02267981, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.7245960177125468, + "language_loss": 0.66361767, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68519747, + "num_input_tokens_seen": 162574485, + "step": 7577, + "time_per_iteration": 2.4498884677886963 + }, + { + "auxiliary_loss_clip": 0.01098209, + "auxiliary_loss_mlp": 0.01045177, + "balance_loss_clip": 1.04148161, + "balance_loss_mlp": 1.02870238, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 2.300334233926273, + "language_loss": 0.73941839, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76085222, + "num_input_tokens_seen": 162595130, + "step": 7578, + "time_per_iteration": 2.548083543777466 + }, + { + "auxiliary_loss_clip": 0.01072278, + "auxiliary_loss_mlp": 0.00784426, + "balance_loss_clip": 1.03831363, + "balance_loss_mlp": 1.00050449, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.993241518222632, + "language_loss": 0.70498788, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72355485, + "num_input_tokens_seen": 162615720, + "step": 7579, + "time_per_iteration": 2.625309467315674 + }, + { + "auxiliary_loss_clip": 0.01108593, + "auxiliary_loss_mlp": 0.01035421, + "balance_loss_clip": 1.03931451, + "balance_loss_mlp": 1.02097917, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.7030774508292277, + "language_loss": 0.78370619, + "learning_rate": 2.381462943170627e-06, + "loss": 0.8051464, + "num_input_tokens_seen": 162635825, + "step": 7580, + "time_per_iteration": 2.5265908241271973 + }, + { + "auxiliary_loss_clip": 0.01124365, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.04528785, + "balance_loss_mlp": 1.01871896, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 4.796985925368635, + "language_loss": 0.68983597, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71141362, + "num_input_tokens_seen": 162659130, + "step": 7581, + "time_per_iteration": 2.6166999340057373 + }, + { + "auxiliary_loss_clip": 0.01105372, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.03838491, + "balance_loss_mlp": 1.01991594, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.6158801476649949, + "language_loss": 0.73150122, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75290507, + "num_input_tokens_seen": 162681665, + "step": 7582, + "time_per_iteration": 2.5567643642425537 + }, + { + "auxiliary_loss_clip": 0.01125221, + "auxiliary_loss_mlp": 0.01048628, + "balance_loss_clip": 1.04298759, + "balance_loss_mlp": 1.03229654, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.935609701639594, + "language_loss": 0.73082054, + "learning_rate": 2.380315942019729e-06, + "loss": 0.75255901, + "num_input_tokens_seen": 162702040, + "step": 7583, + "time_per_iteration": 2.4380552768707275 + }, + { + "auxiliary_loss_clip": 0.01112481, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.04499435, + "balance_loss_mlp": 1.02103996, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 2.5074721443730517, + "language_loss": 0.72654474, + "learning_rate": 2.379933579440195e-06, + "loss": 0.7480334, + "num_input_tokens_seen": 162722375, + "step": 7584, + "time_per_iteration": 2.5095834732055664 + }, + { + "auxiliary_loss_clip": 0.01086821, + "auxiliary_loss_mlp": 0.01037447, + "balance_loss_clip": 1.04094815, + "balance_loss_mlp": 1.02218819, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.5474086313720838, + "language_loss": 0.68020195, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70144463, + "num_input_tokens_seen": 162746095, + "step": 7585, + "time_per_iteration": 4.12816309928894 + }, + { + "auxiliary_loss_clip": 0.01123357, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.04438818, + "balance_loss_mlp": 1.01849771, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.5270352702582672, + "language_loss": 0.76512885, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78668535, + "num_input_tokens_seen": 162766330, + "step": 7586, + "time_per_iteration": 2.4325199127197266 + }, + { + "auxiliary_loss_clip": 0.01102695, + "auxiliary_loss_mlp": 0.01029909, + "balance_loss_clip": 1.04226482, + "balance_loss_mlp": 1.01659298, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1.7575742331997588, + "language_loss": 0.78412271, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80544877, + "num_input_tokens_seen": 162784755, + "step": 7587, + "time_per_iteration": 2.534221649169922 + }, + { + "auxiliary_loss_clip": 0.0109962, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.03820074, + "balance_loss_mlp": 1.02895808, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 2.386510353225004, + "language_loss": 0.69398034, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71541893, + "num_input_tokens_seen": 162803850, + "step": 7588, + "time_per_iteration": 2.498286247253418 + }, + { + "auxiliary_loss_clip": 0.01106893, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.04369891, + "balance_loss_mlp": 1.01696682, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.9480858120220543, + "language_loss": 0.79700845, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81837654, + "num_input_tokens_seen": 162820775, + "step": 7589, + "time_per_iteration": 2.4894003868103027 + }, + { + "auxiliary_loss_clip": 0.01112106, + "auxiliary_loss_mlp": 0.01037345, + "balance_loss_clip": 1.04136038, + "balance_loss_mlp": 1.02227664, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 2.256385715304069, + "language_loss": 0.62414372, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64563823, + "num_input_tokens_seen": 162839695, + "step": 7590, + "time_per_iteration": 3.985278844833374 + }, + { + "auxiliary_loss_clip": 0.01094724, + "auxiliary_loss_mlp": 0.01039178, + "balance_loss_clip": 1.03801429, + "balance_loss_mlp": 1.02542102, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 1.921258525113599, + "language_loss": 0.72790337, + "learning_rate": 2.377256638796135e-06, + "loss": 0.74924231, + "num_input_tokens_seen": 162856095, + "step": 7591, + "time_per_iteration": 2.4939022064208984 + }, + { + "auxiliary_loss_clip": 0.01107249, + "auxiliary_loss_mlp": 0.01040438, + "balance_loss_clip": 1.04612565, + "balance_loss_mlp": 1.0250721, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 2.4815360360041243, + "language_loss": 0.77086091, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.79233778, + "num_input_tokens_seen": 162874070, + "step": 7592, + "time_per_iteration": 2.518474578857422 + }, + { + "auxiliary_loss_clip": 0.01096944, + "auxiliary_loss_mlp": 0.01035129, + "balance_loss_clip": 1.03901899, + "balance_loss_mlp": 1.01947737, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 2.1728725025601627, + "language_loss": 0.69516885, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71648955, + "num_input_tokens_seen": 162891000, + "step": 7593, + "time_per_iteration": 3.890216827392578 + }, + { + "auxiliary_loss_clip": 0.01103224, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.03843164, + "balance_loss_mlp": 1.02088547, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 1.887112374554184, + "language_loss": 0.84039712, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86176187, + "num_input_tokens_seen": 162910120, + "step": 7594, + "time_per_iteration": 2.499398708343506 + }, + { + "auxiliary_loss_clip": 0.01032871, + "auxiliary_loss_mlp": 0.00755578, + "balance_loss_clip": 1.01479006, + "balance_loss_mlp": 1.00031686, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.8008103104082888, + "language_loss": 0.52741969, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54530418, + "num_input_tokens_seen": 162963720, + "step": 7595, + "time_per_iteration": 3.1023900508880615 + }, + { + "auxiliary_loss_clip": 0.01090597, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.03978348, + "balance_loss_mlp": 1.02086115, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.2439111517112043, + "language_loss": 0.87312752, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89438456, + "num_input_tokens_seen": 162975760, + "step": 7596, + "time_per_iteration": 2.510800361633301 + }, + { + "auxiliary_loss_clip": 0.01116068, + "auxiliary_loss_mlp": 0.01044321, + "balance_loss_clip": 1.04468703, + "balance_loss_mlp": 1.03037965, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 2.867818666583026, + "language_loss": 0.77646005, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79806387, + "num_input_tokens_seen": 162994865, + "step": 7597, + "time_per_iteration": 2.4651427268981934 + }, + { + "auxiliary_loss_clip": 0.01110483, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.04184175, + "balance_loss_mlp": 1.02052951, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 2.1478239177012473, + "language_loss": 0.78213549, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80358595, + "num_input_tokens_seen": 163014730, + "step": 7598, + "time_per_iteration": 2.477238416671753 + }, + { + "auxiliary_loss_clip": 0.01118957, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.0415355, + "balance_loss_mlp": 1.01790309, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 2.5317587120604514, + "language_loss": 0.7103104, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73180807, + "num_input_tokens_seen": 163033405, + "step": 7599, + "time_per_iteration": 2.5275461673736572 + }, + { + "auxiliary_loss_clip": 0.01085631, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.0384835, + "balance_loss_mlp": 1.02247238, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 1.893583276158663, + "language_loss": 0.69393855, + "learning_rate": 2.373813828660544e-06, + "loss": 0.7151531, + "num_input_tokens_seen": 163051400, + "step": 7600, + "time_per_iteration": 2.5535104274749756 + }, + { + "auxiliary_loss_clip": 0.01060586, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.0394181, + "balance_loss_mlp": 1.02521265, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 2.2120185753372823, + "language_loss": 0.78818494, + "learning_rate": 2.373431223132319e-06, + "loss": 0.80917132, + "num_input_tokens_seen": 163069250, + "step": 7601, + "time_per_iteration": 2.6111669540405273 + }, + { + "auxiliary_loss_clip": 0.01095073, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.03957129, + "balance_loss_mlp": 1.02769661, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 2.0575737368830183, + "language_loss": 0.7155323, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73689097, + "num_input_tokens_seen": 163091755, + "step": 7602, + "time_per_iteration": 2.682992696762085 + }, + { + "auxiliary_loss_clip": 0.01109186, + "auxiliary_loss_mlp": 0.01038011, + "balance_loss_clip": 1.04061747, + "balance_loss_mlp": 1.02153647, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 1.7204540783764266, + "language_loss": 0.73410845, + "learning_rate": 2.372665969608729e-06, + "loss": 0.75558043, + "num_input_tokens_seen": 163111600, + "step": 7603, + "time_per_iteration": 2.5132944583892822 + }, + { + "auxiliary_loss_clip": 0.01109483, + "auxiliary_loss_mlp": 0.01040765, + "balance_loss_clip": 1.0415895, + "balance_loss_mlp": 1.02598286, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.9320929730787009, + "language_loss": 0.83150053, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85300303, + "num_input_tokens_seen": 163127350, + "step": 7604, + "time_per_iteration": 2.475741386413574 + }, + { + "auxiliary_loss_clip": 0.01102119, + "auxiliary_loss_mlp": 0.0103793, + "balance_loss_clip": 1.04648006, + "balance_loss_mlp": 1.0229212, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 2.219710782528166, + "language_loss": 0.86220431, + "learning_rate": 2.371900659559016e-06, + "loss": 0.88360476, + "num_input_tokens_seen": 163145855, + "step": 7605, + "time_per_iteration": 2.513615131378174 + }, + { + "auxiliary_loss_clip": 0.01078126, + "auxiliary_loss_mlp": 0.01039992, + "balance_loss_clip": 1.04033566, + "balance_loss_mlp": 1.02575302, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 1.8860542186039966, + "language_loss": 0.73351085, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75469208, + "num_input_tokens_seen": 163163830, + "step": 7606, + "time_per_iteration": 2.5480751991271973 + }, + { + "auxiliary_loss_clip": 0.01090917, + "auxiliary_loss_mlp": 0.0104291, + "balance_loss_clip": 1.04065013, + "balance_loss_mlp": 1.02874207, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 2.139901354136758, + "language_loss": 0.80528855, + "learning_rate": 2.371135293099262e-06, + "loss": 0.82662678, + "num_input_tokens_seen": 163180700, + "step": 7607, + "time_per_iteration": 2.5063130855560303 + }, + { + "auxiliary_loss_clip": 0.01084582, + "auxiliary_loss_mlp": 0.01042003, + "balance_loss_clip": 1.04405499, + "balance_loss_mlp": 1.02808523, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 1.8398030745881089, + "language_loss": 0.804699, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.82596487, + "num_input_tokens_seen": 163199450, + "step": 7608, + "time_per_iteration": 2.5438485145568848 + }, + { + "auxiliary_loss_clip": 0.01097053, + "auxiliary_loss_mlp": 0.01043782, + "balance_loss_clip": 1.03772628, + "balance_loss_mlp": 1.02953053, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 1.773253320256977, + "language_loss": 0.68148255, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70289093, + "num_input_tokens_seen": 163217875, + "step": 7609, + "time_per_iteration": 3.887852191925049 + }, + { + "auxiliary_loss_clip": 0.01092339, + "auxiliary_loss_mlp": 0.01042358, + "balance_loss_clip": 1.04262376, + "balance_loss_mlp": 1.02832103, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 2.3060403455152345, + "language_loss": 0.80577654, + "learning_rate": 2.369987137894757e-06, + "loss": 0.82712352, + "num_input_tokens_seen": 163237430, + "step": 7610, + "time_per_iteration": 2.5418035984039307 + }, + { + "auxiliary_loss_clip": 0.0111112, + "auxiliary_loss_mlp": 0.01036951, + "balance_loss_clip": 1.04113483, + "balance_loss_mlp": 1.02318215, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 3.0262571643503797, + "language_loss": 0.82415348, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84563422, + "num_input_tokens_seen": 163253905, + "step": 7611, + "time_per_iteration": 2.479814291000366 + }, + { + "auxiliary_loss_clip": 0.0111703, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.04569507, + "balance_loss_mlp": 1.01835585, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 3.3717711200718554, + "language_loss": 0.7382772, + "learning_rate": 2.369221630917819e-06, + "loss": 0.75978011, + "num_input_tokens_seen": 163274285, + "step": 7612, + "time_per_iteration": 2.583319902420044 + }, + { + "auxiliary_loss_clip": 0.01097697, + "auxiliary_loss_mlp": 0.01039869, + "balance_loss_clip": 1.04010785, + "balance_loss_mlp": 1.02550411, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 1.859836291598016, + "language_loss": 0.84993351, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87130916, + "num_input_tokens_seen": 163293150, + "step": 7613, + "time_per_iteration": 2.4994022846221924 + }, + { + "auxiliary_loss_clip": 0.01085083, + "auxiliary_loss_mlp": 0.01036192, + "balance_loss_clip": 1.03820157, + "balance_loss_mlp": 1.02230418, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 2.6001955835083446, + "language_loss": 0.75768441, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77889717, + "num_input_tokens_seen": 163310065, + "step": 7614, + "time_per_iteration": 2.496830463409424 + }, + { + "auxiliary_loss_clip": 0.01118784, + "auxiliary_loss_mlp": 0.0103813, + "balance_loss_clip": 1.0420258, + "balance_loss_mlp": 1.02487969, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.5029086276786168, + "language_loss": 0.74824518, + "learning_rate": 2.368073265481791e-06, + "loss": 0.76981431, + "num_input_tokens_seen": 163329415, + "step": 7615, + "time_per_iteration": 2.4439029693603516 + }, + { + "auxiliary_loss_clip": 0.01028115, + "auxiliary_loss_mlp": 0.01004018, + "balance_loss_clip": 1.0210197, + "balance_loss_mlp": 1.00249243, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7730498030483784, + "language_loss": 0.57625067, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59657192, + "num_input_tokens_seen": 163385875, + "step": 7616, + "time_per_iteration": 3.0323126316070557 + }, + { + "auxiliary_loss_clip": 0.01091981, + "auxiliary_loss_mlp": 0.00779717, + "balance_loss_clip": 1.03839326, + "balance_loss_mlp": 1.0003612, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.524997928750695, + "language_loss": 0.70678467, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.72550166, + "num_input_tokens_seen": 163405170, + "step": 7617, + "time_per_iteration": 2.500277042388916 + }, + { + "auxiliary_loss_clip": 0.01123339, + "auxiliary_loss_mlp": 0.01039369, + "balance_loss_clip": 1.04440713, + "balance_loss_mlp": 1.02490926, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 2.0348658398563626, + "language_loss": 0.76472461, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78635168, + "num_input_tokens_seen": 163423155, + "step": 7618, + "time_per_iteration": 2.4245309829711914 + }, + { + "auxiliary_loss_clip": 0.01093776, + "auxiliary_loss_mlp": 0.01044922, + "balance_loss_clip": 1.04804826, + "balance_loss_mlp": 1.0317378, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 1.7255599422388759, + "language_loss": 0.76930565, + "learning_rate": 2.366541916231585e-06, + "loss": 0.79069257, + "num_input_tokens_seen": 163442450, + "step": 7619, + "time_per_iteration": 2.5506694316864014 + }, + { + "auxiliary_loss_clip": 0.01120816, + "auxiliary_loss_mlp": 0.01038523, + "balance_loss_clip": 1.0447439, + "balance_loss_mlp": 1.02635741, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 1.796054910028527, + "language_loss": 0.71924335, + "learning_rate": 2.366159044134473e-06, + "loss": 0.74083674, + "num_input_tokens_seen": 163459810, + "step": 7620, + "time_per_iteration": 2.4233477115631104 + }, + { + "auxiliary_loss_clip": 0.01097422, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.04227901, + "balance_loss_mlp": 1.02305603, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 1.759545975441291, + "language_loss": 0.78348863, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.80482244, + "num_input_tokens_seen": 163482970, + "step": 7621, + "time_per_iteration": 2.6874279975891113 + }, + { + "auxiliary_loss_clip": 0.01035084, + "auxiliary_loss_mlp": 0.01000999, + "balance_loss_clip": 1.01822782, + "balance_loss_mlp": 0.99960995, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.8489350154784928, + "language_loss": 0.64928365, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.66964447, + "num_input_tokens_seen": 163545330, + "step": 7622, + "time_per_iteration": 3.059861183166504 + }, + { + "auxiliary_loss_clip": 0.01111271, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.04675508, + "balance_loss_mlp": 1.01976538, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 1.899511622566929, + "language_loss": 0.79587841, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81732714, + "num_input_tokens_seen": 163564620, + "step": 7623, + "time_per_iteration": 2.533909320831299 + }, + { + "auxiliary_loss_clip": 0.01074914, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.03943574, + "balance_loss_mlp": 1.02385116, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 2.4243167794572624, + "language_loss": 0.71060133, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.73172504, + "num_input_tokens_seen": 163581010, + "step": 7624, + "time_per_iteration": 4.0195581912994385 + }, + { + "auxiliary_loss_clip": 0.01094531, + "auxiliary_loss_mlp": 0.01038477, + "balance_loss_clip": 1.03956532, + "balance_loss_mlp": 1.02447021, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 7.955745285680336, + "language_loss": 0.73743856, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75876862, + "num_input_tokens_seen": 163599955, + "step": 7625, + "time_per_iteration": 2.518192768096924 + }, + { + "auxiliary_loss_clip": 0.01106424, + "auxiliary_loss_mlp": 0.01043363, + "balance_loss_clip": 1.04212689, + "balance_loss_mlp": 1.03028035, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 2.6631522645627874, + "language_loss": 0.78342927, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80492717, + "num_input_tokens_seen": 163618545, + "step": 7626, + "time_per_iteration": 2.4639222621917725 + }, + { + "auxiliary_loss_clip": 0.01123299, + "auxiliary_loss_mlp": 0.01042706, + "balance_loss_clip": 1.04276729, + "balance_loss_mlp": 1.02861536, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.6409499178442963, + "language_loss": 0.8489368, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.87059677, + "num_input_tokens_seen": 163636055, + "step": 7627, + "time_per_iteration": 2.406721353530884 + }, + { + "auxiliary_loss_clip": 0.01125719, + "auxiliary_loss_mlp": 0.01041182, + "balance_loss_clip": 1.04302716, + "balance_loss_mlp": 1.0268594, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 3.2837270995649814, + "language_loss": 0.69154364, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71321261, + "num_input_tokens_seen": 163657485, + "step": 7628, + "time_per_iteration": 2.489763021469116 + }, + { + "auxiliary_loss_clip": 0.01107128, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.03861666, + "balance_loss_mlp": 1.01794302, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.7756363309723233, + "language_loss": 0.78367478, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.80505943, + "num_input_tokens_seen": 163676030, + "step": 7629, + "time_per_iteration": 2.485192060470581 + }, + { + "auxiliary_loss_clip": 0.01105645, + "auxiliary_loss_mlp": 0.0104452, + "balance_loss_clip": 1.03956175, + "balance_loss_mlp": 1.02852821, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.035286142012631, + "language_loss": 0.7906009, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.8121025, + "num_input_tokens_seen": 163694490, + "step": 7630, + "time_per_iteration": 4.0359578132629395 + }, + { + "auxiliary_loss_clip": 0.01101948, + "auxiliary_loss_mlp": 0.01037562, + "balance_loss_clip": 1.04121006, + "balance_loss_mlp": 1.02341187, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 2.029311168976595, + "language_loss": 0.7205984, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.74199349, + "num_input_tokens_seen": 163717035, + "step": 7631, + "time_per_iteration": 2.5975546836853027 + }, + { + "auxiliary_loss_clip": 0.01086605, + "auxiliary_loss_mlp": 0.01045709, + "balance_loss_clip": 1.04021287, + "balance_loss_mlp": 1.0298183, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.400356074023285, + "language_loss": 0.71869516, + "learning_rate": 2.361563500108531e-06, + "loss": 0.74001837, + "num_input_tokens_seen": 163734525, + "step": 7632, + "time_per_iteration": 2.4960620403289795 + }, + { + "auxiliary_loss_clip": 0.01075802, + "auxiliary_loss_mlp": 0.00781821, + "balance_loss_clip": 1.03988254, + "balance_loss_mlp": 1.00044727, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 2.8992772304512413, + "language_loss": 0.69552135, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.71409756, + "num_input_tokens_seen": 163752860, + "step": 7633, + "time_per_iteration": 4.068222761154175 + }, + { + "auxiliary_loss_clip": 0.01111828, + "auxiliary_loss_mlp": 0.0103914, + "balance_loss_clip": 1.04209614, + "balance_loss_mlp": 1.02550864, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.8652834920280292, + "language_loss": 0.80687439, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.82838404, + "num_input_tokens_seen": 163772495, + "step": 7634, + "time_per_iteration": 2.4886765480041504 + }, + { + "auxiliary_loss_clip": 0.01114734, + "auxiliary_loss_mlp": 0.00781102, + "balance_loss_clip": 1.04538596, + "balance_loss_mlp": 1.00052059, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.8326697307249347, + "language_loss": 0.81829113, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.83724952, + "num_input_tokens_seen": 163791475, + "step": 7635, + "time_per_iteration": 2.4990153312683105 + }, + { + "auxiliary_loss_clip": 0.01099695, + "auxiliary_loss_mlp": 0.0104364, + "balance_loss_clip": 1.04263043, + "balance_loss_mlp": 1.02974057, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.5107526005923477, + "language_loss": 0.6481123, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.66954565, + "num_input_tokens_seen": 163812995, + "step": 7636, + "time_per_iteration": 2.6315524578094482 + }, + { + "auxiliary_loss_clip": 0.01109499, + "auxiliary_loss_mlp": 0.01031084, + "balance_loss_clip": 1.04767251, + "balance_loss_mlp": 1.01777458, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.8467055887784556, + "language_loss": 0.80332768, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82473356, + "num_input_tokens_seen": 163833945, + "step": 7637, + "time_per_iteration": 2.5195417404174805 + }, + { + "auxiliary_loss_clip": 0.01096086, + "auxiliary_loss_mlp": 0.01039386, + "balance_loss_clip": 1.03910017, + "balance_loss_mlp": 1.02318573, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.3975116753515227, + "language_loss": 0.75356376, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77491856, + "num_input_tokens_seen": 163853885, + "step": 7638, + "time_per_iteration": 2.5007665157318115 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.04335999, + "balance_loss_mlp": 1.02115905, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 1.771201009757169, + "language_loss": 0.7380985, + "learning_rate": 2.358881852733989e-06, + "loss": 0.7595551, + "num_input_tokens_seen": 163871855, + "step": 7639, + "time_per_iteration": 2.446972608566284 + }, + { + "auxiliary_loss_clip": 0.01123423, + "auxiliary_loss_mlp": 0.01037206, + "balance_loss_clip": 1.04358661, + "balance_loss_mlp": 1.02297235, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 1.9595757208520168, + "language_loss": 0.67938924, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70099556, + "num_input_tokens_seen": 163891450, + "step": 7640, + "time_per_iteration": 2.4566917419433594 + }, + { + "auxiliary_loss_clip": 0.01098005, + "auxiliary_loss_mlp": 0.01038591, + "balance_loss_clip": 1.03978658, + "balance_loss_mlp": 1.02408361, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 1.6789077048425565, + "language_loss": 0.75706345, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77842939, + "num_input_tokens_seen": 163909345, + "step": 7641, + "time_per_iteration": 2.470895528793335 + }, + { + "auxiliary_loss_clip": 0.0109964, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.04176128, + "balance_loss_mlp": 1.01916337, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 2.127089513303056, + "language_loss": 0.74726647, + "learning_rate": 2.357732370864668e-06, + "loss": 0.76860178, + "num_input_tokens_seen": 163926940, + "step": 7642, + "time_per_iteration": 2.4845476150512695 + }, + { + "auxiliary_loss_clip": 0.01043363, + "auxiliary_loss_mlp": 0.01004348, + "balance_loss_clip": 1.03341746, + "balance_loss_mlp": 1.00260758, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8739521755678222, + "language_loss": 0.58215296, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60263008, + "num_input_tokens_seen": 163977785, + "step": 7643, + "time_per_iteration": 2.7946183681488037 + }, + { + "auxiliary_loss_clip": 0.01117133, + "auxiliary_loss_mlp": 0.01037844, + "balance_loss_clip": 1.04251337, + "balance_loss_mlp": 1.0232048, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 1.749584841231574, + "language_loss": 0.92775559, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.94930542, + "num_input_tokens_seen": 163996630, + "step": 7644, + "time_per_iteration": 2.47755765914917 + }, + { + "auxiliary_loss_clip": 0.01114058, + "auxiliary_loss_mlp": 0.01039509, + "balance_loss_clip": 1.04673195, + "balance_loss_mlp": 1.02530479, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 5.1027922683815365, + "language_loss": 0.8316651, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.85320079, + "num_input_tokens_seen": 164013190, + "step": 7645, + "time_per_iteration": 2.450662612915039 + }, + { + "auxiliary_loss_clip": 0.01007578, + "auxiliary_loss_mlp": 0.01001975, + "balance_loss_clip": 1.01634026, + "balance_loss_mlp": 1.00051486, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7592174176711771, + "language_loss": 0.59846008, + "learning_rate": 2.356199538526593e-06, + "loss": 0.61855567, + "num_input_tokens_seen": 164074030, + "step": 7646, + "time_per_iteration": 3.022493839263916 + }, + { + "auxiliary_loss_clip": 0.01106469, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.03951585, + "balance_loss_mlp": 1.01963949, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.6553968579671858, + "language_loss": 0.72618198, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74758983, + "num_input_tokens_seen": 164095515, + "step": 7647, + "time_per_iteration": 2.5245108604431152 + }, + { + "auxiliary_loss_clip": 0.01090259, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.03884232, + "balance_loss_mlp": 1.02122569, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 5.606308109720163, + "language_loss": 0.66506296, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.68631643, + "num_input_tokens_seen": 164117270, + "step": 7648, + "time_per_iteration": 2.5852718353271484 + }, + { + "auxiliary_loss_clip": 0.01111018, + "auxiliary_loss_mlp": 0.01036913, + "balance_loss_clip": 1.03947616, + "balance_loss_mlp": 1.02257192, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.6153567615534419, + "language_loss": 0.78714037, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.80861968, + "num_input_tokens_seen": 164137850, + "step": 7649, + "time_per_iteration": 3.988504648208618 + }, + { + "auxiliary_loss_clip": 0.01067396, + "auxiliary_loss_mlp": 0.01041463, + "balance_loss_clip": 1.04375482, + "balance_loss_mlp": 1.02730107, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 1.7394382828838577, + "language_loss": 0.69444352, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.71553212, + "num_input_tokens_seen": 164157960, + "step": 7650, + "time_per_iteration": 2.64648175239563 + }, + { + "auxiliary_loss_clip": 0.01118844, + "auxiliary_loss_mlp": 0.01045813, + "balance_loss_clip": 1.04571939, + "balance_loss_mlp": 1.02942133, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 2.3486385142325163, + "language_loss": 0.84182167, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86346823, + "num_input_tokens_seen": 164174590, + "step": 7651, + "time_per_iteration": 2.440020799636841 + }, + { + "auxiliary_loss_clip": 0.01099405, + "auxiliary_loss_mlp": 0.0077957, + "balance_loss_clip": 1.04061317, + "balance_loss_mlp": 1.00045824, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 8.481953755876454, + "language_loss": 0.75068349, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.7694732, + "num_input_tokens_seen": 164192935, + "step": 7652, + "time_per_iteration": 2.4802939891815186 + }, + { + "auxiliary_loss_clip": 0.01079389, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.0403949, + "balance_loss_mlp": 1.01832843, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 2.204974243521645, + "language_loss": 0.76101905, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.78213918, + "num_input_tokens_seen": 164213160, + "step": 7653, + "time_per_iteration": 2.612837553024292 + }, + { + "auxiliary_loss_clip": 0.01083534, + "auxiliary_loss_mlp": 0.01039271, + "balance_loss_clip": 1.04457855, + "balance_loss_mlp": 1.02330875, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 3.381655296479119, + "language_loss": 0.66104233, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68227035, + "num_input_tokens_seen": 164229330, + "step": 7654, + "time_per_iteration": 2.587829113006592 + }, + { + "auxiliary_loss_clip": 0.01096282, + "auxiliary_loss_mlp": 0.01038783, + "balance_loss_clip": 1.03745198, + "balance_loss_mlp": 1.02480578, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.8666869587765864, + "language_loss": 0.79528129, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81663197, + "num_input_tokens_seen": 164248240, + "step": 7655, + "time_per_iteration": 2.525848865509033 + }, + { + "auxiliary_loss_clip": 0.01085174, + "auxiliary_loss_mlp": 0.01038929, + "balance_loss_clip": 1.04177523, + "balance_loss_mlp": 1.02501774, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 2.1138525591970905, + "language_loss": 0.67696035, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69820136, + "num_input_tokens_seen": 164268020, + "step": 7656, + "time_per_iteration": 2.57490611076355 + }, + { + "auxiliary_loss_clip": 0.01100897, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.03897309, + "balance_loss_mlp": 1.02442646, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 2.3538244759353475, + "language_loss": 0.81416398, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83555686, + "num_input_tokens_seen": 164287305, + "step": 7657, + "time_per_iteration": 2.5690674781799316 + }, + { + "auxiliary_loss_clip": 0.01122593, + "auxiliary_loss_mlp": 0.0077899, + "balance_loss_clip": 1.0423665, + "balance_loss_mlp": 1.00045049, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 2.6792071522192478, + "language_loss": 0.70576596, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72478175, + "num_input_tokens_seen": 164306835, + "step": 7658, + "time_per_iteration": 2.466050148010254 + }, + { + "auxiliary_loss_clip": 0.01033228, + "auxiliary_loss_mlp": 0.01002702, + "balance_loss_clip": 1.01598668, + "balance_loss_mlp": 1.00127125, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9557654356930045, + "language_loss": 0.62100649, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64136577, + "num_input_tokens_seen": 164367095, + "step": 7659, + "time_per_iteration": 3.1374683380126953 + }, + { + "auxiliary_loss_clip": 0.01073426, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.03738809, + "balance_loss_mlp": 1.0307219, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 1.5297098665153788, + "language_loss": 0.68649298, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70769227, + "num_input_tokens_seen": 164388895, + "step": 7660, + "time_per_iteration": 2.640430450439453 + }, + { + "auxiliary_loss_clip": 0.01107949, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.03792596, + "balance_loss_mlp": 1.02903128, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 1.7751995635976716, + "language_loss": 0.77043074, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.79194927, + "num_input_tokens_seen": 164409080, + "step": 7661, + "time_per_iteration": 2.491727352142334 + }, + { + "auxiliary_loss_clip": 0.01107139, + "auxiliary_loss_mlp": 0.01049647, + "balance_loss_clip": 1.04219484, + "balance_loss_mlp": 1.03469825, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.9937069676802455, + "language_loss": 0.74534738, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.7669152, + "num_input_tokens_seen": 164427585, + "step": 7662, + "time_per_iteration": 2.516362190246582 + }, + { + "auxiliary_loss_clip": 0.01100758, + "auxiliary_loss_mlp": 0.01045509, + "balance_loss_clip": 1.04066443, + "balance_loss_mlp": 1.02898681, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 3.9711369018770624, + "language_loss": 0.79490048, + "learning_rate": 2.349682601310998e-06, + "loss": 0.81636316, + "num_input_tokens_seen": 164438455, + "step": 7663, + "time_per_iteration": 2.4393093585968018 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.04268026, + "balance_loss_mlp": 1.02159679, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 2.411489562113859, + "language_loss": 0.73871523, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.7601679, + "num_input_tokens_seen": 164456830, + "step": 7664, + "time_per_iteration": 3.8895270824432373 + }, + { + "auxiliary_loss_clip": 0.01093317, + "auxiliary_loss_mlp": 0.0103832, + "balance_loss_clip": 1.04425085, + "balance_loss_mlp": 1.02436066, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 1.5378576906901074, + "language_loss": 0.72251213, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74382854, + "num_input_tokens_seen": 164475375, + "step": 7665, + "time_per_iteration": 2.5199215412139893 + }, + { + "auxiliary_loss_clip": 0.01098067, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.04510045, + "balance_loss_mlp": 1.01899147, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 1.8686095350657959, + "language_loss": 0.77913129, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80044007, + "num_input_tokens_seen": 164492040, + "step": 7666, + "time_per_iteration": 2.526829719543457 + }, + { + "auxiliary_loss_clip": 0.01081066, + "auxiliary_loss_mlp": 0.01035761, + "balance_loss_clip": 1.03913999, + "balance_loss_mlp": 1.02045465, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.4160544698388207, + "language_loss": 0.73956859, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76073688, + "num_input_tokens_seen": 164513665, + "step": 7667, + "time_per_iteration": 2.6453518867492676 + }, + { + "auxiliary_loss_clip": 0.01076656, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.04089642, + "balance_loss_mlp": 1.02011931, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 4.305075359708667, + "language_loss": 0.76235133, + "learning_rate": 2.347765122572676e-06, + "loss": 0.78345454, + "num_input_tokens_seen": 164533890, + "step": 7668, + "time_per_iteration": 2.5899415016174316 + }, + { + "auxiliary_loss_clip": 0.0107155, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.04094672, + "balance_loss_mlp": 1.01884139, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 1.596844066313816, + "language_loss": 0.78170836, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80274135, + "num_input_tokens_seen": 164553815, + "step": 7669, + "time_per_iteration": 4.096314191818237 + }, + { + "auxiliary_loss_clip": 0.01107746, + "auxiliary_loss_mlp": 0.01040519, + "balance_loss_clip": 1.03939021, + "balance_loss_mlp": 1.02488482, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 1.7111482128719642, + "language_loss": 0.82598394, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84746659, + "num_input_tokens_seen": 164573125, + "step": 7670, + "time_per_iteration": 2.4968903064727783 + }, + { + "auxiliary_loss_clip": 0.01109217, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.03988171, + "balance_loss_mlp": 1.02024007, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 2.1771898253355775, + "language_loss": 0.63283569, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.65426874, + "num_input_tokens_seen": 164592575, + "step": 7671, + "time_per_iteration": 3.8968353271484375 + }, + { + "auxiliary_loss_clip": 0.01024578, + "auxiliary_loss_mlp": 0.01001076, + "balance_loss_clip": 1.01578128, + "balance_loss_mlp": 0.99947876, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.7072083756509039, + "language_loss": 0.55877107, + "learning_rate": 2.346230902123583e-06, + "loss": 0.57902759, + "num_input_tokens_seen": 164659795, + "step": 7672, + "time_per_iteration": 3.1728005409240723 + }, + { + "auxiliary_loss_clip": 0.01114834, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.04475117, + "balance_loss_mlp": 1.02344942, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.71874594433668, + "language_loss": 0.70598829, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.72750413, + "num_input_tokens_seen": 164678735, + "step": 7673, + "time_per_iteration": 2.444202423095703 + }, + { + "auxiliary_loss_clip": 0.01097102, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.03872538, + "balance_loss_mlp": 1.01746702, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.8000150144620481, + "language_loss": 0.71097887, + "learning_rate": 2.345463713066195e-06, + "loss": 0.73226655, + "num_input_tokens_seen": 164700885, + "step": 7674, + "time_per_iteration": 2.611957550048828 + }, + { + "auxiliary_loss_clip": 0.01098285, + "auxiliary_loss_mlp": 0.01035903, + "balance_loss_clip": 1.03952503, + "balance_loss_mlp": 1.02172887, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 1.592859079062197, + "language_loss": 0.65929741, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.68063927, + "num_input_tokens_seen": 164726960, + "step": 7675, + "time_per_iteration": 2.6786997318267822 + }, + { + "auxiliary_loss_clip": 0.01044421, + "auxiliary_loss_mlp": 0.01002532, + "balance_loss_clip": 1.0184325, + "balance_loss_mlp": 1.00097084, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.9586391155027576, + "language_loss": 0.58593059, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60640013, + "num_input_tokens_seen": 164788525, + "step": 7676, + "time_per_iteration": 3.029369592666626 + }, + { + "auxiliary_loss_clip": 0.01015084, + "auxiliary_loss_mlp": 0.01005077, + "balance_loss_clip": 1.01666164, + "balance_loss_mlp": 1.00344419, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7895024377107702, + "language_loss": 0.62669086, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64689255, + "num_input_tokens_seen": 164843525, + "step": 7677, + "time_per_iteration": 2.9540648460388184 + }, + { + "auxiliary_loss_clip": 0.01098794, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.04188514, + "balance_loss_mlp": 1.02108705, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 3.6124689756458936, + "language_loss": 0.76056033, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78189027, + "num_input_tokens_seen": 164859895, + "step": 7678, + "time_per_iteration": 2.475675344467163 + }, + { + "auxiliary_loss_clip": 0.01127557, + "auxiliary_loss_mlp": 0.01040392, + "balance_loss_clip": 1.0472939, + "balance_loss_mlp": 1.02578855, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 2.0280489884169683, + "language_loss": 0.66832101, + "learning_rate": 2.343545511426974e-06, + "loss": 0.69000053, + "num_input_tokens_seen": 164878030, + "step": 7679, + "time_per_iteration": 2.494292974472046 + }, + { + "auxiliary_loss_clip": 0.01091428, + "auxiliary_loss_mlp": 0.0103755, + "balance_loss_clip": 1.04185367, + "balance_loss_mlp": 1.02441955, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 2.155163828432475, + "language_loss": 0.69760829, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.71889806, + "num_input_tokens_seen": 164895710, + "step": 7680, + "time_per_iteration": 2.639162302017212 + }, + { + "auxiliary_loss_clip": 0.01131248, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.04818118, + "balance_loss_mlp": 1.02822959, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 2.2134570871153976, + "language_loss": 0.63862538, + "learning_rate": 2.342778139478487e-06, + "loss": 0.66036963, + "num_input_tokens_seen": 164913365, + "step": 7681, + "time_per_iteration": 2.4935998916625977 + }, + { + "auxiliary_loss_clip": 0.01109396, + "auxiliary_loss_mlp": 0.01033166, + "balance_loss_clip": 1.04067969, + "balance_loss_mlp": 1.01988614, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 1.8536646292117, + "language_loss": 0.67299104, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69441664, + "num_input_tokens_seen": 164931620, + "step": 7682, + "time_per_iteration": 2.4859325885772705 + }, + { + "auxiliary_loss_clip": 0.01084756, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_clip": 1.04037285, + "balance_loss_mlp": 1.02963746, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.184337612902785, + "language_loss": 0.74024159, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76152468, + "num_input_tokens_seen": 164950905, + "step": 7683, + "time_per_iteration": 2.6243410110473633 + }, + { + "auxiliary_loss_clip": 0.01124054, + "auxiliary_loss_mlp": 0.01039986, + "balance_loss_clip": 1.04524469, + "balance_loss_mlp": 1.02574623, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.7733957636145066, + "language_loss": 0.76682961, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78846997, + "num_input_tokens_seen": 164970950, + "step": 7684, + "time_per_iteration": 2.455580711364746 + }, + { + "auxiliary_loss_clip": 0.01131201, + "auxiliary_loss_mlp": 0.01038106, + "balance_loss_clip": 1.04648972, + "balance_loss_mlp": 1.02341318, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 3.113952523032617, + "language_loss": 0.79413033, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.81582344, + "num_input_tokens_seen": 164989855, + "step": 7685, + "time_per_iteration": 2.3994948863983154 + }, + { + "auxiliary_loss_clip": 0.01083362, + "auxiliary_loss_mlp": 0.01045977, + "balance_loss_clip": 1.04356837, + "balance_loss_mlp": 1.03025913, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 2.0748845364000834, + "language_loss": 0.66612947, + "learning_rate": 2.340859482393731e-06, + "loss": 0.68742293, + "num_input_tokens_seen": 165012290, + "step": 7686, + "time_per_iteration": 2.6369543075561523 + }, + { + "auxiliary_loss_clip": 0.01104221, + "auxiliary_loss_mlp": 0.0078059, + "balance_loss_clip": 1.04336786, + "balance_loss_mlp": 1.00048923, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 2.4565478102401492, + "language_loss": 0.74279046, + "learning_rate": 2.340475712142296e-06, + "loss": 0.76163852, + "num_input_tokens_seen": 165030810, + "step": 7687, + "time_per_iteration": 2.5247082710266113 + }, + { + "auxiliary_loss_clip": 0.01069857, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.05193615, + "balance_loss_mlp": 1.02112436, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.544230088050338, + "language_loss": 0.74246597, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76352453, + "num_input_tokens_seen": 165050205, + "step": 7688, + "time_per_iteration": 4.141162872314453 + }, + { + "auxiliary_loss_clip": 0.01074456, + "auxiliary_loss_mlp": 0.00780623, + "balance_loss_clip": 1.03788328, + "balance_loss_mlp": 1.00028324, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 4.704400123471425, + "language_loss": 0.78730637, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.80585712, + "num_input_tokens_seen": 165069370, + "step": 7689, + "time_per_iteration": 2.5856616497039795 + }, + { + "auxiliary_loss_clip": 0.01114472, + "auxiliary_loss_mlp": 0.01045323, + "balance_loss_clip": 1.04035258, + "balance_loss_mlp": 1.02962291, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 2.8013337989851923, + "language_loss": 0.57310426, + "learning_rate": 2.339324323980964e-06, + "loss": 0.59470224, + "num_input_tokens_seen": 165089610, + "step": 7690, + "time_per_iteration": 2.527355909347534 + }, + { + "auxiliary_loss_clip": 0.01115112, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.0435071, + "balance_loss_mlp": 1.02269661, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 2.56538123207184, + "language_loss": 0.82565618, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.8471784, + "num_input_tokens_seen": 165109050, + "step": 7691, + "time_per_iteration": 2.4671213626861572 + }, + { + "auxiliary_loss_clip": 0.01102725, + "auxiliary_loss_mlp": 0.01030162, + "balance_loss_clip": 1.04268539, + "balance_loss_mlp": 1.01644731, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.4291741262704467, + "language_loss": 0.75580072, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77712959, + "num_input_tokens_seen": 165130130, + "step": 7692, + "time_per_iteration": 2.5232343673706055 + }, + { + "auxiliary_loss_clip": 0.0109319, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_clip": 1.04317403, + "balance_loss_mlp": 1.02614808, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 2.4253401497523335, + "language_loss": 0.74202842, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76337522, + "num_input_tokens_seen": 165152685, + "step": 7693, + "time_per_iteration": 2.647761344909668 + }, + { + "auxiliary_loss_clip": 0.01086206, + "auxiliary_loss_mlp": 0.01045192, + "balance_loss_clip": 1.0450573, + "balance_loss_mlp": 1.02979028, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.9100915145078026, + "language_loss": 0.85512698, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87644088, + "num_input_tokens_seen": 165173315, + "step": 7694, + "time_per_iteration": 2.528076648712158 + }, + { + "auxiliary_loss_clip": 0.01107782, + "auxiliary_loss_mlp": 0.01041599, + "balance_loss_clip": 1.04567516, + "balance_loss_mlp": 1.02725828, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 2.056622628064956, + "language_loss": 0.78794277, + "learning_rate": 2.337405086561902e-06, + "loss": 0.80943656, + "num_input_tokens_seen": 165192395, + "step": 7695, + "time_per_iteration": 2.5444164276123047 + }, + { + "auxiliary_loss_clip": 0.0110948, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.04198992, + "balance_loss_mlp": 1.02088368, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.7696190404808583, + "language_loss": 0.72341919, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74486351, + "num_input_tokens_seen": 165211355, + "step": 7696, + "time_per_iteration": 2.4516961574554443 + }, + { + "auxiliary_loss_clip": 0.01105691, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.0444777, + "balance_loss_mlp": 1.02676749, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.5542168062109696, + "language_loss": 0.6930232, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71450013, + "num_input_tokens_seen": 165229380, + "step": 7697, + "time_per_iteration": 2.485318183898926 + }, + { + "auxiliary_loss_clip": 0.01125476, + "auxiliary_loss_mlp": 0.01036896, + "balance_loss_clip": 1.04598153, + "balance_loss_mlp": 1.0223887, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.9779228548993057, + "language_loss": 0.85067391, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.87229759, + "num_input_tokens_seen": 165247200, + "step": 7698, + "time_per_iteration": 2.4322900772094727 + }, + { + "auxiliary_loss_clip": 0.0112387, + "auxiliary_loss_mlp": 0.01035617, + "balance_loss_clip": 1.04403472, + "balance_loss_mlp": 1.02112746, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 1.8591260631241289, + "language_loss": 0.71036339, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73195827, + "num_input_tokens_seen": 165265825, + "step": 7699, + "time_per_iteration": 2.4310288429260254 + }, + { + "auxiliary_loss_clip": 0.01071456, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.04041767, + "balance_loss_mlp": 1.02067876, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 1.972663072105181, + "language_loss": 0.71590841, + "learning_rate": 2.335485529281996e-06, + "loss": 0.73697871, + "num_input_tokens_seen": 165284380, + "step": 7700, + "time_per_iteration": 2.6167988777160645 + }, + { + "auxiliary_loss_clip": 0.01122034, + "auxiliary_loss_mlp": 0.00780339, + "balance_loss_clip": 1.04268837, + "balance_loss_mlp": 1.00036407, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 2.124558752429578, + "language_loss": 0.72033584, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.73935956, + "num_input_tokens_seen": 165300320, + "step": 7701, + "time_per_iteration": 2.4216365814208984 + }, + { + "auxiliary_loss_clip": 0.01084379, + "auxiliary_loss_mlp": 0.01042054, + "balance_loss_clip": 1.04218924, + "balance_loss_mlp": 1.02677107, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 2.5574880148620465, + "language_loss": 0.6516906, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.67295492, + "num_input_tokens_seen": 165318130, + "step": 7702, + "time_per_iteration": 4.212126970291138 + }, + { + "auxiliary_loss_clip": 0.01097482, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.04119468, + "balance_loss_mlp": 1.01844478, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 1.887002982200029, + "language_loss": 0.73373967, + "learning_rate": 2.33433364213785e-06, + "loss": 0.7550385, + "num_input_tokens_seen": 165336225, + "step": 7703, + "time_per_iteration": 2.5137526988983154 + }, + { + "auxiliary_loss_clip": 0.01106406, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.04390168, + "balance_loss_mlp": 1.02242708, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 3.99658027729135, + "language_loss": 0.68535274, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.70680308, + "num_input_tokens_seen": 165355005, + "step": 7704, + "time_per_iteration": 2.5516374111175537 + }, + { + "auxiliary_loss_clip": 0.01115475, + "auxiliary_loss_mlp": 0.01032119, + "balance_loss_clip": 1.04436517, + "balance_loss_mlp": 1.01716995, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 2.21522460951648, + "language_loss": 0.81071055, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83218646, + "num_input_tokens_seen": 165374910, + "step": 7705, + "time_per_iteration": 2.517411231994629 + }, + { + "auxiliary_loss_clip": 0.01112638, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.0423013, + "balance_loss_mlp": 1.02003646, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.8651038119254382, + "language_loss": 0.775065, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79653358, + "num_input_tokens_seen": 165392590, + "step": 7706, + "time_per_iteration": 2.4429917335510254 + }, + { + "auxiliary_loss_clip": 0.01094107, + "auxiliary_loss_mlp": 0.01033133, + "balance_loss_clip": 1.04241538, + "balance_loss_mlp": 1.01912642, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 1.7625667045836078, + "language_loss": 0.70074666, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.72201908, + "num_input_tokens_seen": 165411195, + "step": 7707, + "time_per_iteration": 2.4987781047821045 + }, + { + "auxiliary_loss_clip": 0.01104644, + "auxiliary_loss_mlp": 0.01041914, + "balance_loss_clip": 1.0399133, + "balance_loss_mlp": 1.02583313, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 5.217806903242108, + "language_loss": 0.61280143, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63426703, + "num_input_tokens_seen": 165430150, + "step": 7708, + "time_per_iteration": 2.6305813789367676 + }, + { + "auxiliary_loss_clip": 0.01088968, + "auxiliary_loss_mlp": 0.01038136, + "balance_loss_clip": 1.04459023, + "balance_loss_mlp": 1.02358067, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 1.9814662002630778, + "language_loss": 0.77466273, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.79593372, + "num_input_tokens_seen": 165450595, + "step": 7709, + "time_per_iteration": 4.063988208770752 + }, + { + "auxiliary_loss_clip": 0.01128758, + "auxiliary_loss_mlp": 0.01038698, + "balance_loss_clip": 1.04677415, + "balance_loss_mlp": 1.02348733, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 1.7255861041420266, + "language_loss": 0.77373171, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79540634, + "num_input_tokens_seen": 165469515, + "step": 7710, + "time_per_iteration": 2.417494297027588 + }, + { + "auxiliary_loss_clip": 0.01117666, + "auxiliary_loss_mlp": 0.01038398, + "balance_loss_clip": 1.04303455, + "balance_loss_mlp": 1.02204275, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 2.0770216466348623, + "language_loss": 0.7309227, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75248337, + "num_input_tokens_seen": 165488125, + "step": 7711, + "time_per_iteration": 3.919297695159912 + }, + { + "auxiliary_loss_clip": 0.01102615, + "auxiliary_loss_mlp": 0.01045807, + "balance_loss_clip": 1.04651904, + "balance_loss_mlp": 1.03046513, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.3426302494330538, + "language_loss": 0.71447438, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73595858, + "num_input_tokens_seen": 165509225, + "step": 7712, + "time_per_iteration": 2.527737617492676 + }, + { + "auxiliary_loss_clip": 0.01106692, + "auxiliary_loss_mlp": 0.01043126, + "balance_loss_clip": 1.04694748, + "balance_loss_mlp": 1.02682447, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 1.7217233642819085, + "language_loss": 0.73266667, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.75416487, + "num_input_tokens_seen": 165529945, + "step": 7713, + "time_per_iteration": 2.5963666439056396 + }, + { + "auxiliary_loss_clip": 0.01097242, + "auxiliary_loss_mlp": 0.01042265, + "balance_loss_clip": 1.04390109, + "balance_loss_mlp": 1.02611256, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 2.755345210305684, + "language_loss": 0.58896649, + "learning_rate": 2.3301090827294e-06, + "loss": 0.61036152, + "num_input_tokens_seen": 165550690, + "step": 7714, + "time_per_iteration": 2.540799140930176 + }, + { + "auxiliary_loss_clip": 0.01114587, + "auxiliary_loss_mlp": 0.01035739, + "balance_loss_clip": 1.04574823, + "balance_loss_mlp": 1.0214572, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 1.9636298334411826, + "language_loss": 0.70448327, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.7259866, + "num_input_tokens_seen": 165567775, + "step": 7715, + "time_per_iteration": 2.4630215167999268 + }, + { + "auxiliary_loss_clip": 0.01134023, + "auxiliary_loss_mlp": 0.01042615, + "balance_loss_clip": 1.04758871, + "balance_loss_mlp": 1.02746379, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 2.270145867482809, + "language_loss": 0.6861757, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70794213, + "num_input_tokens_seen": 165587010, + "step": 7716, + "time_per_iteration": 2.4567344188690186 + }, + { + "auxiliary_loss_clip": 0.01127286, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.04528737, + "balance_loss_mlp": 1.01641321, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.7337865976247695, + "language_loss": 0.81049705, + "learning_rate": 2.328956666474691e-06, + "loss": 0.8320905, + "num_input_tokens_seen": 165607850, + "step": 7717, + "time_per_iteration": 2.481550931930542 + }, + { + "auxiliary_loss_clip": 0.01125983, + "auxiliary_loss_mlp": 0.01037297, + "balance_loss_clip": 1.04526472, + "balance_loss_mlp": 1.022295, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.6528628337570512, + "language_loss": 0.73163778, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75327063, + "num_input_tokens_seen": 165627175, + "step": 7718, + "time_per_iteration": 2.4350790977478027 + }, + { + "auxiliary_loss_clip": 0.0112434, + "auxiliary_loss_mlp": 0.00780287, + "balance_loss_clip": 1.04496145, + "balance_loss_mlp": 1.00030959, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.8831351043405746, + "language_loss": 0.70481056, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72385687, + "num_input_tokens_seen": 165648340, + "step": 7719, + "time_per_iteration": 2.5442638397216797 + }, + { + "auxiliary_loss_clip": 0.01106144, + "auxiliary_loss_mlp": 0.0103978, + "balance_loss_clip": 1.04650044, + "balance_loss_mlp": 1.02435422, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 1.6878218756037504, + "language_loss": 0.86628497, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88774419, + "num_input_tokens_seen": 165667195, + "step": 7720, + "time_per_iteration": 2.53045392036438 + }, + { + "auxiliary_loss_clip": 0.01031093, + "auxiliary_loss_mlp": 0.01001099, + "balance_loss_clip": 1.02372026, + "balance_loss_mlp": 0.99956113, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7193263299086532, + "language_loss": 0.55054808, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57087004, + "num_input_tokens_seen": 165726760, + "step": 7721, + "time_per_iteration": 3.1260128021240234 + }, + { + "auxiliary_loss_clip": 0.01103167, + "auxiliary_loss_mlp": 0.01044164, + "balance_loss_clip": 1.04333472, + "balance_loss_mlp": 1.02827346, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 3.7650453088393863, + "language_loss": 0.8008979, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.82237124, + "num_input_tokens_seen": 165745005, + "step": 7722, + "time_per_iteration": 2.5134599208831787 + }, + { + "auxiliary_loss_clip": 0.01114586, + "auxiliary_loss_mlp": 0.01039907, + "balance_loss_clip": 1.04176641, + "balance_loss_mlp": 1.02463615, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 2.6300937111152525, + "language_loss": 0.77727985, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.79882479, + "num_input_tokens_seen": 165765750, + "step": 7723, + "time_per_iteration": 2.5125491619110107 + }, + { + "auxiliary_loss_clip": 0.01045736, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.04108739, + "balance_loss_mlp": 1.01977861, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.720570378324533, + "language_loss": 0.68582034, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70661658, + "num_input_tokens_seen": 165787515, + "step": 7724, + "time_per_iteration": 2.929626703262329 + }, + { + "auxiliary_loss_clip": 0.01111526, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.04740846, + "balance_loss_mlp": 1.02054811, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 2.7470794792207633, + "language_loss": 0.66775799, + "learning_rate": 2.325883008671415e-06, + "loss": 0.68922502, + "num_input_tokens_seen": 165806675, + "step": 7725, + "time_per_iteration": 2.6227807998657227 + }, + { + "auxiliary_loss_clip": 0.01108918, + "auxiliary_loss_mlp": 0.0103821, + "balance_loss_clip": 1.04406309, + "balance_loss_mlp": 1.02520466, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.5908903734124245, + "language_loss": 0.64952362, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.67099488, + "num_input_tokens_seen": 165829835, + "step": 7726, + "time_per_iteration": 4.105157136917114 + }, + { + "auxiliary_loss_clip": 0.01102616, + "auxiliary_loss_mlp": 0.00780384, + "balance_loss_clip": 1.04385316, + "balance_loss_mlp": 1.0003686, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 2.5539640297504005, + "language_loss": 0.75039178, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.76922178, + "num_input_tokens_seen": 165849380, + "step": 7727, + "time_per_iteration": 2.535905122756958 + }, + { + "auxiliary_loss_clip": 0.0110048, + "auxiliary_loss_mlp": 0.01043465, + "balance_loss_clip": 1.0425303, + "balance_loss_mlp": 1.02859402, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 2.146495024262345, + "language_loss": 0.7868858, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.80832523, + "num_input_tokens_seen": 165868620, + "step": 7728, + "time_per_iteration": 2.6148815155029297 + }, + { + "auxiliary_loss_clip": 0.01090648, + "auxiliary_loss_mlp": 0.01039761, + "balance_loss_clip": 1.0410651, + "balance_loss_mlp": 1.02600408, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 1.8109744966398487, + "language_loss": 0.75971997, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78102398, + "num_input_tokens_seen": 165885915, + "step": 7729, + "time_per_iteration": 2.540717363357544 + }, + { + "auxiliary_loss_clip": 0.01101358, + "auxiliary_loss_mlp": 0.01056436, + "balance_loss_clip": 1.04400897, + "balance_loss_mlp": 1.04109943, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 1.6313608196912264, + "language_loss": 0.79708695, + "learning_rate": 2.323961570451588e-06, + "loss": 0.81866491, + "num_input_tokens_seen": 165905465, + "step": 7730, + "time_per_iteration": 2.6608121395111084 + }, + { + "auxiliary_loss_clip": 0.01123243, + "auxiliary_loss_mlp": 0.01042967, + "balance_loss_clip": 1.04452968, + "balance_loss_mlp": 1.02873325, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.7127632632560845, + "language_loss": 0.76926094, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.790923, + "num_input_tokens_seen": 165924640, + "step": 7731, + "time_per_iteration": 2.485740900039673 + }, + { + "auxiliary_loss_clip": 0.01088707, + "auxiliary_loss_mlp": 0.01032819, + "balance_loss_clip": 1.0452323, + "balance_loss_mlp": 1.01876473, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 1.7489249103185325, + "language_loss": 0.65859908, + "learning_rate": 2.323192909069061e-06, + "loss": 0.67981434, + "num_input_tokens_seen": 165945765, + "step": 7732, + "time_per_iteration": 2.634753942489624 + }, + { + "auxiliary_loss_clip": 0.01105463, + "auxiliary_loss_mlp": 0.01039391, + "balance_loss_clip": 1.04257429, + "balance_loss_mlp": 1.02245152, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.2614829102962233, + "language_loss": 0.73121512, + "learning_rate": 2.32280855998725e-06, + "loss": 0.75266361, + "num_input_tokens_seen": 165964025, + "step": 7733, + "time_per_iteration": 2.5049679279327393 + }, + { + "auxiliary_loss_clip": 0.01045502, + "auxiliary_loss_mlp": 0.01000651, + "balance_loss_clip": 1.01906443, + "balance_loss_mlp": 0.99937564, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.243398874894626, + "language_loss": 0.51899016, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.53945166, + "num_input_tokens_seen": 166021950, + "step": 7734, + "time_per_iteration": 2.955495834350586 + }, + { + "auxiliary_loss_clip": 0.01100202, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.04475331, + "balance_loss_mlp": 1.01755881, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 1.9817674957980291, + "language_loss": 0.75710535, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.77842659, + "num_input_tokens_seen": 166039675, + "step": 7735, + "time_per_iteration": 2.488806962966919 + }, + { + "auxiliary_loss_clip": 0.01084383, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.04463553, + "balance_loss_mlp": 1.02992129, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 1.7216926108591273, + "language_loss": 0.70043349, + "learning_rate": 2.321655439354519e-06, + "loss": 0.72172356, + "num_input_tokens_seen": 166057745, + "step": 7736, + "time_per_iteration": 2.558324098587036 + }, + { + "auxiliary_loss_clip": 0.01123465, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.04660642, + "balance_loss_mlp": 1.02011228, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.6547381790999838, + "language_loss": 0.72430182, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74586761, + "num_input_tokens_seen": 166076440, + "step": 7737, + "time_per_iteration": 2.4489872455596924 + }, + { + "auxiliary_loss_clip": 0.01104494, + "auxiliary_loss_mlp": 0.01046786, + "balance_loss_clip": 1.04875731, + "balance_loss_mlp": 1.03137803, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 1.7797961234838842, + "language_loss": 0.83626097, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85777378, + "num_input_tokens_seen": 166092520, + "step": 7738, + "time_per_iteration": 2.4735028743743896 + }, + { + "auxiliary_loss_clip": 0.01036739, + "auxiliary_loss_mlp": 0.01000466, + "balance_loss_clip": 1.02006102, + "balance_loss_mlp": 0.99921417, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7699553570271216, + "language_loss": 0.57828349, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59865552, + "num_input_tokens_seen": 166156285, + "step": 7739, + "time_per_iteration": 3.093266010284424 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.01039701, + "balance_loss_clip": 1.04575443, + "balance_loss_mlp": 1.02634335, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 1.9030549191363413, + "language_loss": 0.85041577, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.87186134, + "num_input_tokens_seen": 166173455, + "step": 7740, + "time_per_iteration": 2.4831626415252686 + }, + { + "auxiliary_loss_clip": 0.01098277, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.04038703, + "balance_loss_mlp": 1.02612174, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 5.133965795106015, + "language_loss": 0.76072019, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.78211319, + "num_input_tokens_seen": 166194370, + "step": 7741, + "time_per_iteration": 2.511829376220703 + }, + { + "auxiliary_loss_clip": 0.01097127, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.04470372, + "balance_loss_mlp": 1.02221811, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 2.20637912402172, + "language_loss": 0.80555737, + "learning_rate": 2.319348869158064e-06, + "loss": 0.82689309, + "num_input_tokens_seen": 166213195, + "step": 7742, + "time_per_iteration": 4.28493332862854 + }, + { + "auxiliary_loss_clip": 0.01105883, + "auxiliary_loss_mlp": 0.01043724, + "balance_loss_clip": 1.04252851, + "balance_loss_mlp": 1.02835798, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 2.9032105657234037, + "language_loss": 0.72747374, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74896985, + "num_input_tokens_seen": 166231350, + "step": 7743, + "time_per_iteration": 2.538661479949951 + }, + { + "auxiliary_loss_clip": 0.01094905, + "auxiliary_loss_mlp": 0.01039864, + "balance_loss_clip": 1.04447365, + "balance_loss_mlp": 1.02445662, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 1.8522988039614987, + "language_loss": 0.7155503, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73689795, + "num_input_tokens_seen": 166250530, + "step": 7744, + "time_per_iteration": 2.493746757507324 + }, + { + "auxiliary_loss_clip": 0.01076212, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.04732418, + "balance_loss_mlp": 1.02001941, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.5355487193915451, + "language_loss": 0.8515563, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87264729, + "num_input_tokens_seen": 166272545, + "step": 7745, + "time_per_iteration": 2.708678722381592 + }, + { + "auxiliary_loss_clip": 0.01115734, + "auxiliary_loss_mlp": 0.01041167, + "balance_loss_clip": 1.04584253, + "balance_loss_mlp": 1.02655149, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.418374690568018, + "language_loss": 0.73233247, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75390148, + "num_input_tokens_seen": 166292135, + "step": 7746, + "time_per_iteration": 2.5083417892456055 + }, + { + "auxiliary_loss_clip": 0.01112561, + "auxiliary_loss_mlp": 0.01041647, + "balance_loss_clip": 1.04638124, + "balance_loss_mlp": 1.02764606, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.617265436558783, + "language_loss": 0.70250529, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.72404736, + "num_input_tokens_seen": 166316710, + "step": 7747, + "time_per_iteration": 2.8236372470855713 + }, + { + "auxiliary_loss_clip": 0.01085291, + "auxiliary_loss_mlp": 0.01042064, + "balance_loss_clip": 1.03965998, + "balance_loss_mlp": 1.02659094, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 1.8336135055550822, + "language_loss": 0.6740368, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69531035, + "num_input_tokens_seen": 166338535, + "step": 7748, + "time_per_iteration": 2.615666151046753 + }, + { + "auxiliary_loss_clip": 0.01090831, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.04472673, + "balance_loss_mlp": 1.02049541, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 2.0077443002965243, + "language_loss": 0.63909274, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.66037351, + "num_input_tokens_seen": 166355540, + "step": 7749, + "time_per_iteration": 4.053802490234375 + }, + { + "auxiliary_loss_clip": 0.01119381, + "auxiliary_loss_mlp": 0.01040039, + "balance_loss_clip": 1.04606414, + "balance_loss_mlp": 1.02417827, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 2.224610185929357, + "language_loss": 0.74169886, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.76329309, + "num_input_tokens_seen": 166372635, + "step": 7750, + "time_per_iteration": 3.872962236404419 + }, + { + "auxiliary_loss_clip": 0.01106704, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.04688525, + "balance_loss_mlp": 1.01715541, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 1.8774589465354212, + "language_loss": 0.74478388, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.76617014, + "num_input_tokens_seen": 166393175, + "step": 7751, + "time_per_iteration": 2.5960941314697266 + }, + { + "auxiliary_loss_clip": 0.0109931, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.04719329, + "balance_loss_mlp": 1.02109313, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 1.9882897651411982, + "language_loss": 0.73629999, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.75765783, + "num_input_tokens_seen": 166408630, + "step": 7752, + "time_per_iteration": 2.5235133171081543 + }, + { + "auxiliary_loss_clip": 0.01106303, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.04917991, + "balance_loss_mlp": 1.0302527, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 2.8387371766966, + "language_loss": 0.694502, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71602178, + "num_input_tokens_seen": 166428170, + "step": 7753, + "time_per_iteration": 2.544386863708496 + }, + { + "auxiliary_loss_clip": 0.0109794, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.04242754, + "balance_loss_mlp": 1.02215052, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 3.2656320816802076, + "language_loss": 0.73730958, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.75865841, + "num_input_tokens_seen": 166446705, + "step": 7754, + "time_per_iteration": 2.494610071182251 + }, + { + "auxiliary_loss_clip": 0.01108668, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.04567409, + "balance_loss_mlp": 1.01906776, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.7832362657829712, + "language_loss": 0.79205883, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.8134914, + "num_input_tokens_seen": 166466750, + "step": 7755, + "time_per_iteration": 2.540537118911743 + }, + { + "auxiliary_loss_clip": 0.01112468, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.04465139, + "balance_loss_mlp": 1.02016902, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.8303707110796594, + "language_loss": 0.72528231, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74675393, + "num_input_tokens_seen": 166485400, + "step": 7756, + "time_per_iteration": 2.482567548751831 + }, + { + "auxiliary_loss_clip": 0.01113788, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.04446149, + "balance_loss_mlp": 1.02320218, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 2.3565166895932443, + "language_loss": 0.78340977, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80492461, + "num_input_tokens_seen": 166505730, + "step": 7757, + "time_per_iteration": 2.5091753005981445 + }, + { + "auxiliary_loss_clip": 0.01086652, + "auxiliary_loss_mlp": 0.01033784, + "balance_loss_clip": 1.0406456, + "balance_loss_mlp": 1.01859713, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 1.6402527023481968, + "language_loss": 0.6622026, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68340701, + "num_input_tokens_seen": 166523770, + "step": 7758, + "time_per_iteration": 2.5732333660125732 + }, + { + "auxiliary_loss_clip": 0.01094566, + "auxiliary_loss_mlp": 0.01044836, + "balance_loss_clip": 1.04036117, + "balance_loss_mlp": 1.03005362, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.7049371917542797, + "language_loss": 0.74885345, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.77024746, + "num_input_tokens_seen": 166542935, + "step": 7759, + "time_per_iteration": 2.485696315765381 + }, + { + "auxiliary_loss_clip": 0.01106292, + "auxiliary_loss_mlp": 0.0104077, + "balance_loss_clip": 1.04645789, + "balance_loss_mlp": 1.02660823, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.5293112099165374, + "language_loss": 0.7779941, + "learning_rate": 2.312426555462893e-06, + "loss": 0.7994647, + "num_input_tokens_seen": 166563935, + "step": 7760, + "time_per_iteration": 2.5265719890594482 + }, + { + "auxiliary_loss_clip": 0.01100746, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.04435849, + "balance_loss_mlp": 1.01944458, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.6692036823628853, + "language_loss": 0.73914403, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76049256, + "num_input_tokens_seen": 166582175, + "step": 7761, + "time_per_iteration": 2.4948198795318604 + }, + { + "auxiliary_loss_clip": 0.01113571, + "auxiliary_loss_mlp": 0.0103889, + "balance_loss_clip": 1.04471922, + "balance_loss_mlp": 1.02195621, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.7890957377913552, + "language_loss": 0.78993374, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.81145835, + "num_input_tokens_seen": 166601870, + "step": 7762, + "time_per_iteration": 2.487931489944458 + }, + { + "auxiliary_loss_clip": 0.01033278, + "auxiliary_loss_mlp": 0.0100187, + "balance_loss_clip": 1.01497412, + "balance_loss_mlp": 1.00051081, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7990504409122481, + "language_loss": 0.59802884, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61838031, + "num_input_tokens_seen": 166668960, + "step": 7763, + "time_per_iteration": 3.118476152420044 + }, + { + "auxiliary_loss_clip": 0.01087546, + "auxiliary_loss_mlp": 0.01042734, + "balance_loss_clip": 1.03924668, + "balance_loss_mlp": 1.02581239, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 2.7753167535187844, + "language_loss": 0.78508669, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.80638945, + "num_input_tokens_seen": 166686110, + "step": 7764, + "time_per_iteration": 2.5430541038513184 + }, + { + "auxiliary_loss_clip": 0.01094666, + "auxiliary_loss_mlp": 0.01037195, + "balance_loss_clip": 1.04505146, + "balance_loss_mlp": 1.0240227, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.9760935824874242, + "language_loss": 0.72379595, + "learning_rate": 2.310503005696839e-06, + "loss": 0.74511456, + "num_input_tokens_seen": 166703930, + "step": 7765, + "time_per_iteration": 2.5123417377471924 + }, + { + "auxiliary_loss_clip": 0.01086314, + "auxiliary_loss_mlp": 0.01041982, + "balance_loss_clip": 1.04683638, + "balance_loss_mlp": 1.02693808, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 1.9085550529663378, + "language_loss": 0.77779329, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.7990762, + "num_input_tokens_seen": 166719940, + "step": 7766, + "time_per_iteration": 4.09860897064209 + }, + { + "auxiliary_loss_clip": 0.0110726, + "auxiliary_loss_mlp": 0.01039183, + "balance_loss_clip": 1.03952503, + "balance_loss_mlp": 1.02465773, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 2.1286098414168224, + "language_loss": 0.65623534, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.6776998, + "num_input_tokens_seen": 166738285, + "step": 7767, + "time_per_iteration": 2.438272714614868 + }, + { + "auxiliary_loss_clip": 0.01113181, + "auxiliary_loss_mlp": 0.01042913, + "balance_loss_clip": 1.0434649, + "balance_loss_mlp": 1.02851236, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 1.9646299328780057, + "language_loss": 0.74940884, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.77096975, + "num_input_tokens_seen": 166758170, + "step": 7768, + "time_per_iteration": 2.4966604709625244 + }, + { + "auxiliary_loss_clip": 0.010943, + "auxiliary_loss_mlp": 0.01036908, + "balance_loss_clip": 1.04245293, + "balance_loss_mlp": 1.02226949, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.7661928650922536, + "language_loss": 0.70799339, + "learning_rate": 2.308963953858982e-06, + "loss": 0.72930551, + "num_input_tokens_seen": 166775750, + "step": 7769, + "time_per_iteration": 2.4802002906799316 + }, + { + "auxiliary_loss_clip": 0.01122844, + "auxiliary_loss_mlp": 0.01037418, + "balance_loss_clip": 1.04266489, + "balance_loss_mlp": 1.02349985, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 2.204657288366959, + "language_loss": 0.81356096, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83516353, + "num_input_tokens_seen": 166791720, + "step": 7770, + "time_per_iteration": 2.398916721343994 + }, + { + "auxiliary_loss_clip": 0.01043896, + "auxiliary_loss_mlp": 0.01004158, + "balance_loss_clip": 1.01757312, + "balance_loss_mlp": 1.00278687, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.8485589895034042, + "language_loss": 0.55655074, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57703131, + "num_input_tokens_seen": 166856360, + "step": 7771, + "time_per_iteration": 3.0330493450164795 + }, + { + "auxiliary_loss_clip": 0.01110442, + "auxiliary_loss_mlp": 0.00779256, + "balance_loss_clip": 1.0409236, + "balance_loss_mlp": 1.00044787, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 2.3300303946090346, + "language_loss": 0.65973121, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.67862821, + "num_input_tokens_seen": 166875925, + "step": 7772, + "time_per_iteration": 2.501049518585205 + }, + { + "auxiliary_loss_clip": 0.01113023, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.04797149, + "balance_loss_mlp": 1.02015936, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 4.637316479804755, + "language_loss": 0.63728452, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.65875387, + "num_input_tokens_seen": 166896520, + "step": 7773, + "time_per_iteration": 2.538116216659546 + }, + { + "auxiliary_loss_clip": 0.01109733, + "auxiliary_loss_mlp": 0.0103687, + "balance_loss_clip": 1.04694581, + "balance_loss_mlp": 1.02150369, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 2.2607947543585976, + "language_loss": 0.80221844, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.82368445, + "num_input_tokens_seen": 166915370, + "step": 7774, + "time_per_iteration": 2.4994940757751465 + }, + { + "auxiliary_loss_clip": 0.01095248, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.04332018, + "balance_loss_mlp": 1.01634812, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.6176919711837145, + "language_loss": 0.77662754, + "learning_rate": 2.306655024915726e-06, + "loss": 0.79789573, + "num_input_tokens_seen": 166934875, + "step": 7775, + "time_per_iteration": 2.5422933101654053 + }, + { + "auxiliary_loss_clip": 0.01093464, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.04370022, + "balance_loss_mlp": 1.01848543, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 6.839397517357377, + "language_loss": 0.69955719, + "learning_rate": 2.306270162640694e-06, + "loss": 0.72082901, + "num_input_tokens_seen": 166954285, + "step": 7776, + "time_per_iteration": 2.5074450969696045 + }, + { + "auxiliary_loss_clip": 0.01117321, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.04695797, + "balance_loss_mlp": 1.01925397, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.5261659031481007, + "language_loss": 0.7401672, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.761666, + "num_input_tokens_seen": 166975975, + "step": 7777, + "time_per_iteration": 2.5376107692718506 + }, + { + "auxiliary_loss_clip": 0.01114148, + "auxiliary_loss_mlp": 0.01037865, + "balance_loss_clip": 1.04411077, + "balance_loss_mlp": 1.02338743, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.126278762371088, + "language_loss": 0.69744784, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71896797, + "num_input_tokens_seen": 166996140, + "step": 7778, + "time_per_iteration": 2.5768213272094727 + }, + { + "auxiliary_loss_clip": 0.0111502, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.04481506, + "balance_loss_mlp": 1.02561903, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 1.6484642346126097, + "language_loss": 0.73193461, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75349605, + "num_input_tokens_seen": 167016105, + "step": 7779, + "time_per_iteration": 2.5241858959198 + }, + { + "auxiliary_loss_clip": 0.01082939, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.04209435, + "balance_loss_mlp": 1.02420354, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.5332451284627406, + "language_loss": 0.72589558, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74710315, + "num_input_tokens_seen": 167036185, + "step": 7780, + "time_per_iteration": 2.601825475692749 + }, + { + "auxiliary_loss_clip": 0.01096724, + "auxiliary_loss_mlp": 0.01052911, + "balance_loss_clip": 1.03753054, + "balance_loss_mlp": 1.03505313, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 2.0917940335133185, + "language_loss": 0.73834014, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.75983655, + "num_input_tokens_seen": 167054515, + "step": 7781, + "time_per_iteration": 4.02583384513855 + }, + { + "auxiliary_loss_clip": 0.01114827, + "auxiliary_loss_mlp": 0.01037362, + "balance_loss_clip": 1.04270804, + "balance_loss_mlp": 1.02203143, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 1.8215282218309585, + "language_loss": 0.63441181, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.65593368, + "num_input_tokens_seen": 167077245, + "step": 7782, + "time_per_iteration": 2.5854036808013916 + }, + { + "auxiliary_loss_clip": 0.01108062, + "auxiliary_loss_mlp": 0.01045989, + "balance_loss_clip": 1.04433846, + "balance_loss_mlp": 1.03085494, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 2.394835977394018, + "language_loss": 0.63334489, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65488541, + "num_input_tokens_seen": 167097235, + "step": 7783, + "time_per_iteration": 2.5629489421844482 + }, + { + "auxiliary_loss_clip": 0.01117431, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.04608965, + "balance_loss_mlp": 1.02409863, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.892839216603469, + "language_loss": 0.6841259, + "learning_rate": 2.303190847569801e-06, + "loss": 0.7056998, + "num_input_tokens_seen": 167113155, + "step": 7784, + "time_per_iteration": 2.4392616748809814 + }, + { + "auxiliary_loss_clip": 0.01094367, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.04332423, + "balance_loss_mlp": 1.02153492, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 1.7515740698662667, + "language_loss": 0.84251451, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.86381131, + "num_input_tokens_seen": 167131765, + "step": 7785, + "time_per_iteration": 2.4585065841674805 + }, + { + "auxiliary_loss_clip": 0.0109315, + "auxiliary_loss_mlp": 0.01034547, + "balance_loss_clip": 1.04822516, + "balance_loss_mlp": 1.01938415, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 1.856102584622496, + "language_loss": 0.77370656, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79498357, + "num_input_tokens_seen": 167149030, + "step": 7786, + "time_per_iteration": 2.498009443283081 + }, + { + "auxiliary_loss_clip": 0.01109169, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.04301941, + "balance_loss_mlp": 1.01630437, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 1.8526159863636333, + "language_loss": 0.74459743, + "learning_rate": 2.302035914315856e-06, + "loss": 0.7659862, + "num_input_tokens_seen": 167167375, + "step": 7787, + "time_per_iteration": 2.490567445755005 + }, + { + "auxiliary_loss_clip": 0.0109609, + "auxiliary_loss_mlp": 0.01039739, + "balance_loss_clip": 1.04110503, + "balance_loss_mlp": 1.02523136, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.835555024048758, + "language_loss": 0.65268403, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67404234, + "num_input_tokens_seen": 167188065, + "step": 7788, + "time_per_iteration": 4.102871417999268 + }, + { + "auxiliary_loss_clip": 0.0111115, + "auxiliary_loss_mlp": 0.01036356, + "balance_loss_clip": 1.04356122, + "balance_loss_mlp": 1.02326035, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.506237904063135, + "language_loss": 0.63963246, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.66110748, + "num_input_tokens_seen": 167209675, + "step": 7789, + "time_per_iteration": 2.5230510234832764 + }, + { + "auxiliary_loss_clip": 0.01037365, + "auxiliary_loss_mlp": 0.01002234, + "balance_loss_clip": 1.02101564, + "balance_loss_mlp": 1.00083959, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.6927719348396966, + "language_loss": 0.61861461, + "learning_rate": 2.300880877982825e-06, + "loss": 0.63901061, + "num_input_tokens_seen": 167273940, + "step": 7790, + "time_per_iteration": 4.508617639541626 + }, + { + "auxiliary_loss_clip": 0.01086999, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.04572833, + "balance_loss_mlp": 1.02274561, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.5964489871796976, + "language_loss": 0.78897399, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81021982, + "num_input_tokens_seen": 167292730, + "step": 7791, + "time_per_iteration": 2.540543794631958 + }, + { + "auxiliary_loss_clip": 0.0111712, + "auxiliary_loss_mlp": 0.01037758, + "balance_loss_clip": 1.04708123, + "balance_loss_mlp": 1.02305984, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.557841785712611, + "language_loss": 0.74845701, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.77000576, + "num_input_tokens_seen": 167313460, + "step": 7792, + "time_per_iteration": 2.5239813327789307 + }, + { + "auxiliary_loss_clip": 0.01092656, + "auxiliary_loss_mlp": 0.01044611, + "balance_loss_clip": 1.03896892, + "balance_loss_mlp": 1.02910221, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.4345715495745657, + "language_loss": 0.68473488, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70610756, + "num_input_tokens_seen": 167335385, + "step": 7793, + "time_per_iteration": 2.564054489135742 + }, + { + "auxiliary_loss_clip": 0.01114964, + "auxiliary_loss_mlp": 0.00778084, + "balance_loss_clip": 1.04668772, + "balance_loss_mlp": 1.00028002, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.5306102834317552, + "language_loss": 0.73554254, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.75447297, + "num_input_tokens_seen": 167353625, + "step": 7794, + "time_per_iteration": 2.4865987300872803 + }, + { + "auxiliary_loss_clip": 0.01097261, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.04525197, + "balance_loss_mlp": 1.02565169, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.612636420334097, + "language_loss": 0.6309756, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65235668, + "num_input_tokens_seen": 167374565, + "step": 7795, + "time_per_iteration": 2.575660228729248 + }, + { + "auxiliary_loss_clip": 0.01087194, + "auxiliary_loss_mlp": 0.01031822, + "balance_loss_clip": 1.04137504, + "balance_loss_mlp": 1.01681352, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 1.6030561165935178, + "language_loss": 0.67786467, + "learning_rate": 2.298570497656304e-06, + "loss": 0.69905484, + "num_input_tokens_seen": 167395010, + "step": 7796, + "time_per_iteration": 2.650590181350708 + }, + { + "auxiliary_loss_clip": 0.01124592, + "auxiliary_loss_mlp": 0.00780036, + "balance_loss_clip": 1.04511786, + "balance_loss_mlp": 1.00028419, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.871734565271466, + "language_loss": 0.70030099, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.71934724, + "num_input_tokens_seen": 167415285, + "step": 7797, + "time_per_iteration": 2.4752461910247803 + }, + { + "auxiliary_loss_clip": 0.01104124, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.04430914, + "balance_loss_mlp": 1.02212512, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 2.319558419448278, + "language_loss": 0.66936535, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69078195, + "num_input_tokens_seen": 167432405, + "step": 7798, + "time_per_iteration": 2.4771194458007812 + }, + { + "auxiliary_loss_clip": 0.01037777, + "auxiliary_loss_mlp": 0.01002429, + "balance_loss_clip": 1.02030611, + "balance_loss_mlp": 1.00105214, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9401196909162994, + "language_loss": 0.64532268, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66572475, + "num_input_tokens_seen": 167499365, + "step": 7799, + "time_per_iteration": 3.222271680831909 + }, + { + "auxiliary_loss_clip": 0.01099367, + "auxiliary_loss_mlp": 0.0102927, + "balance_loss_clip": 1.04880989, + "balance_loss_mlp": 1.01513815, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.32129729219226, + "language_loss": 0.72148669, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74277306, + "num_input_tokens_seen": 167520390, + "step": 7800, + "time_per_iteration": 2.5893213748931885 + }, + { + "auxiliary_loss_clip": 0.01124057, + "auxiliary_loss_mlp": 0.01037379, + "balance_loss_clip": 1.04733896, + "balance_loss_mlp": 1.02408099, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 1.7048427133007515, + "language_loss": 0.72580343, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74741781, + "num_input_tokens_seen": 167539865, + "step": 7801, + "time_per_iteration": 2.471555233001709 + }, + { + "auxiliary_loss_clip": 0.01096435, + "auxiliary_loss_mlp": 0.01046472, + "balance_loss_clip": 1.04219604, + "balance_loss_mlp": 1.02990162, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 2.0530087579831706, + "language_loss": 0.62956494, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.650994, + "num_input_tokens_seen": 167558190, + "step": 7802, + "time_per_iteration": 2.528181552886963 + }, + { + "auxiliary_loss_clip": 0.01125916, + "auxiliary_loss_mlp": 0.01040405, + "balance_loss_clip": 1.04499841, + "balance_loss_mlp": 1.02582574, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 3.3670149221021344, + "language_loss": 0.73759538, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.75925863, + "num_input_tokens_seen": 167577685, + "step": 7803, + "time_per_iteration": 2.4832329750061035 + }, + { + "auxiliary_loss_clip": 0.01100653, + "auxiliary_loss_mlp": 0.00779442, + "balance_loss_clip": 1.04398584, + "balance_loss_mlp": 1.00024414, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 2.2816815170837157, + "language_loss": 0.77576828, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.79456925, + "num_input_tokens_seen": 167596390, + "step": 7804, + "time_per_iteration": 2.48545503616333 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.04928303, + "balance_loss_mlp": 1.01774716, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 1.882137501072085, + "language_loss": 0.77305883, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79440522, + "num_input_tokens_seen": 167614980, + "step": 7805, + "time_per_iteration": 3.9695234298706055 + }, + { + "auxiliary_loss_clip": 0.01134253, + "auxiliary_loss_mlp": 0.01050758, + "balance_loss_clip": 1.04821908, + "balance_loss_mlp": 1.03483164, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.570443586081812, + "language_loss": 0.82863247, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.85048264, + "num_input_tokens_seen": 167635895, + "step": 7806, + "time_per_iteration": 2.5065741539001465 + }, + { + "auxiliary_loss_clip": 0.01103281, + "auxiliary_loss_mlp": 0.01038114, + "balance_loss_clip": 1.04385185, + "balance_loss_mlp": 1.02363002, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 1.9077536568831803, + "language_loss": 0.77341276, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79482675, + "num_input_tokens_seen": 167657440, + "step": 7807, + "time_per_iteration": 2.6228787899017334 + }, + { + "auxiliary_loss_clip": 0.01106012, + "auxiliary_loss_mlp": 0.01040264, + "balance_loss_clip": 1.05062807, + "balance_loss_mlp": 1.0251776, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 2.1217648185056577, + "language_loss": 0.51513386, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.5365966, + "num_input_tokens_seen": 167675025, + "step": 7808, + "time_per_iteration": 2.4954752922058105 + }, + { + "auxiliary_loss_clip": 0.01003661, + "auxiliary_loss_mlp": 0.01000306, + "balance_loss_clip": 1.01632869, + "balance_loss_mlp": 0.99892867, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.7880792094743957, + "language_loss": 0.57834053, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59838009, + "num_input_tokens_seen": 167729635, + "step": 7809, + "time_per_iteration": 2.9858779907226562 + }, + { + "auxiliary_loss_clip": 0.01084622, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.04619062, + "balance_loss_mlp": 1.02282739, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 2.507709757737659, + "language_loss": 0.71744138, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.73865944, + "num_input_tokens_seen": 167745135, + "step": 7810, + "time_per_iteration": 2.53324818611145 + }, + { + "auxiliary_loss_clip": 0.01118646, + "auxiliary_loss_mlp": 0.01040013, + "balance_loss_clip": 1.04703903, + "balance_loss_mlp": 1.02537417, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 1.911655353848844, + "language_loss": 0.81088138, + "learning_rate": 2.29279277055369e-06, + "loss": 0.83246803, + "num_input_tokens_seen": 167763875, + "step": 7811, + "time_per_iteration": 2.47802472114563 + }, + { + "auxiliary_loss_clip": 0.01112628, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.0475502, + "balance_loss_mlp": 1.02329826, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.6538356662303424, + "language_loss": 0.80976677, + "learning_rate": 2.292407499379644e-06, + "loss": 0.83127367, + "num_input_tokens_seen": 167784895, + "step": 7812, + "time_per_iteration": 2.4805657863616943 + }, + { + "auxiliary_loss_clip": 0.0107139, + "auxiliary_loss_mlp": 0.01038631, + "balance_loss_clip": 1.03968012, + "balance_loss_mlp": 1.02414703, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.7964309598079102, + "language_loss": 0.74229282, + "learning_rate": 2.292022217117477e-06, + "loss": 0.76339304, + "num_input_tokens_seen": 167803185, + "step": 7813, + "time_per_iteration": 2.5514461994171143 + }, + { + "auxiliary_loss_clip": 0.01103119, + "auxiliary_loss_mlp": 0.01033167, + "balance_loss_clip": 1.04919124, + "balance_loss_mlp": 1.01799214, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.508775137198475, + "language_loss": 0.84827077, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86963356, + "num_input_tokens_seen": 167816550, + "step": 7814, + "time_per_iteration": 2.4756038188934326 + }, + { + "auxiliary_loss_clip": 0.01103677, + "auxiliary_loss_mlp": 0.01039839, + "balance_loss_clip": 1.04383373, + "balance_loss_mlp": 1.02592182, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 2.241474454105294, + "language_loss": 0.81756049, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83899564, + "num_input_tokens_seen": 167831845, + "step": 7815, + "time_per_iteration": 2.5094828605651855 + }, + { + "auxiliary_loss_clip": 0.01083723, + "auxiliary_loss_mlp": 0.01035458, + "balance_loss_clip": 1.04600787, + "balance_loss_mlp": 1.01984119, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 2.140593668251049, + "language_loss": 0.7762779, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79746962, + "num_input_tokens_seen": 167850360, + "step": 7816, + "time_per_iteration": 2.573572874069214 + }, + { + "auxiliary_loss_clip": 0.01046332, + "auxiliary_loss_mlp": 0.01000603, + "balance_loss_clip": 1.0203011, + "balance_loss_mlp": 0.99917811, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.8440688354747076, + "language_loss": 0.59099358, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61146295, + "num_input_tokens_seen": 167908660, + "step": 7817, + "time_per_iteration": 3.037658929824829 + }, + { + "auxiliary_loss_clip": 0.01103625, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.04591787, + "balance_loss_mlp": 1.02109396, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.840394908554011, + "language_loss": 0.79409075, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81547415, + "num_input_tokens_seen": 167927905, + "step": 7818, + "time_per_iteration": 2.5391736030578613 + }, + { + "auxiliary_loss_clip": 0.01125275, + "auxiliary_loss_mlp": 0.01037738, + "balance_loss_clip": 1.04475212, + "balance_loss_mlp": 1.02413046, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 1.6352701847610935, + "language_loss": 0.83931106, + "learning_rate": 2.289710291512104e-06, + "loss": 0.86094123, + "num_input_tokens_seen": 167945995, + "step": 7819, + "time_per_iteration": 2.451120376586914 + }, + { + "auxiliary_loss_clip": 0.01099704, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.04395676, + "balance_loss_mlp": 1.02235556, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 2.268076879774856, + "language_loss": 0.76732445, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78869891, + "num_input_tokens_seen": 167963380, + "step": 7820, + "time_per_iteration": 3.9490973949432373 + }, + { + "auxiliary_loss_clip": 0.01113791, + "auxiliary_loss_mlp": 0.01043991, + "balance_loss_clip": 1.05157185, + "balance_loss_mlp": 1.03025866, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 2.011721279959789, + "language_loss": 0.74590814, + "learning_rate": 2.288939561601039e-06, + "loss": 0.76748598, + "num_input_tokens_seen": 167981740, + "step": 7821, + "time_per_iteration": 2.4543466567993164 + }, + { + "auxiliary_loss_clip": 0.01123653, + "auxiliary_loss_mlp": 0.01050329, + "balance_loss_clip": 1.04532719, + "balance_loss_mlp": 1.03712678, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 1.8248451769246294, + "language_loss": 0.88951159, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91125143, + "num_input_tokens_seen": 167999380, + "step": 7822, + "time_per_iteration": 2.453488826751709 + }, + { + "auxiliary_loss_clip": 0.01107195, + "auxiliary_loss_mlp": 0.01034646, + "balance_loss_clip": 1.04425097, + "balance_loss_mlp": 1.0216279, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.7187164128048587, + "language_loss": 0.79465103, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.81606948, + "num_input_tokens_seen": 168018395, + "step": 7823, + "time_per_iteration": 2.4827537536621094 + }, + { + "auxiliary_loss_clip": 0.01041108, + "auxiliary_loss_mlp": 0.01006808, + "balance_loss_clip": 1.03934276, + "balance_loss_mlp": 1.00537765, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.7335608234132108, + "language_loss": 0.56674701, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.58722615, + "num_input_tokens_seen": 168084080, + "step": 7824, + "time_per_iteration": 3.1551408767700195 + }, + { + "auxiliary_loss_clip": 0.01101247, + "auxiliary_loss_mlp": 0.01045855, + "balance_loss_clip": 1.04164171, + "balance_loss_mlp": 1.03056073, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.875957732709927, + "language_loss": 0.8153339, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83680493, + "num_input_tokens_seen": 168101555, + "step": 7825, + "time_per_iteration": 2.5040252208709717 + }, + { + "auxiliary_loss_clip": 0.01105367, + "auxiliary_loss_mlp": 0.01036533, + "balance_loss_clip": 1.04479241, + "balance_loss_mlp": 1.02226377, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.6709501883715416, + "language_loss": 0.66957599, + "learning_rate": 2.287012545338324e-06, + "loss": 0.69099504, + "num_input_tokens_seen": 168121530, + "step": 7826, + "time_per_iteration": 2.61015248298645 + }, + { + "auxiliary_loss_clip": 0.01098723, + "auxiliary_loss_mlp": 0.0104443, + "balance_loss_clip": 1.03983808, + "balance_loss_mlp": 1.02950513, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 1.8137230626924117, + "language_loss": 0.84135437, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.86278594, + "num_input_tokens_seen": 168140335, + "step": 7827, + "time_per_iteration": 4.007572174072266 + }, + { + "auxiliary_loss_clip": 0.01025435, + "auxiliary_loss_mlp": 0.01012961, + "balance_loss_clip": 1.01777148, + "balance_loss_mlp": 1.01150692, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.959422771833342, + "language_loss": 0.55694771, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57733172, + "num_input_tokens_seen": 168200535, + "step": 7828, + "time_per_iteration": 3.060180187225342 + }, + { + "auxiliary_loss_clip": 0.01122487, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.04479742, + "balance_loss_mlp": 1.02032399, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 3.8787746372660403, + "language_loss": 0.81114376, + "learning_rate": 2.285856204861245e-06, + "loss": 0.83271116, + "num_input_tokens_seen": 168219610, + "step": 7829, + "time_per_iteration": 2.414858341217041 + }, + { + "auxiliary_loss_clip": 0.01121809, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.04492128, + "balance_loss_mlp": 1.0237689, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.2681646064198144, + "language_loss": 0.76065224, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78223479, + "num_input_tokens_seen": 168242505, + "step": 7830, + "time_per_iteration": 3.8941941261291504 + }, + { + "auxiliary_loss_clip": 0.01089745, + "auxiliary_loss_mlp": 0.01033815, + "balance_loss_clip": 1.04256308, + "balance_loss_mlp": 1.01945019, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 2.0434121165243995, + "language_loss": 0.7905097, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.81174529, + "num_input_tokens_seen": 168260220, + "step": 7831, + "time_per_iteration": 2.518437385559082 + }, + { + "auxiliary_loss_clip": 0.0108341, + "auxiliary_loss_mlp": 0.0104768, + "balance_loss_clip": 1.03791022, + "balance_loss_mlp": 1.0307169, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 1.8962776233445877, + "language_loss": 0.75534207, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.77665299, + "num_input_tokens_seen": 168277360, + "step": 7832, + "time_per_iteration": 2.586268424987793 + }, + { + "auxiliary_loss_clip": 0.0109625, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.04316044, + "balance_loss_mlp": 1.02020574, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.2066803651354054, + "language_loss": 0.74621439, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76750422, + "num_input_tokens_seen": 168296605, + "step": 7833, + "time_per_iteration": 2.512817859649658 + }, + { + "auxiliary_loss_clip": 0.01112434, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.04511714, + "balance_loss_mlp": 1.02511239, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.5800076517384052, + "language_loss": 0.75559586, + "learning_rate": 2.283928754133762e-06, + "loss": 0.77710778, + "num_input_tokens_seen": 168316205, + "step": 7834, + "time_per_iteration": 2.4862864017486572 + }, + { + "auxiliary_loss_clip": 0.0107771, + "auxiliary_loss_mlp": 0.01039124, + "balance_loss_clip": 1.04415119, + "balance_loss_mlp": 1.02578425, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.4277023775991908, + "language_loss": 0.66483456, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68600285, + "num_input_tokens_seen": 168338935, + "step": 7835, + "time_per_iteration": 2.7680182456970215 + }, + { + "auxiliary_loss_clip": 0.01033696, + "auxiliary_loss_mlp": 0.00755262, + "balance_loss_clip": 1.01594996, + "balance_loss_mlp": 1.00003183, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8714028001476893, + "language_loss": 0.62085879, + "learning_rate": 2.283157698374194e-06, + "loss": 0.63874841, + "num_input_tokens_seen": 168392800, + "step": 7836, + "time_per_iteration": 3.044044017791748 + }, + { + "auxiliary_loss_clip": 0.01091215, + "auxiliary_loss_mlp": 0.00780263, + "balance_loss_clip": 1.04424703, + "balance_loss_mlp": 1.00027442, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 1.6765563997382962, + "language_loss": 0.69799381, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71670854, + "num_input_tokens_seen": 168412940, + "step": 7837, + "time_per_iteration": 2.592686891555786 + }, + { + "auxiliary_loss_clip": 0.01111717, + "auxiliary_loss_mlp": 0.0104128, + "balance_loss_clip": 1.04539466, + "balance_loss_mlp": 1.02633715, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 2.59408487601582, + "language_loss": 0.660685, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68221498, + "num_input_tokens_seen": 168431995, + "step": 7838, + "time_per_iteration": 2.4647796154022217 + }, + { + "auxiliary_loss_clip": 0.01097884, + "auxiliary_loss_mlp": 0.01039795, + "balance_loss_clip": 1.03986263, + "balance_loss_mlp": 1.02471519, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.95889848210071, + "language_loss": 0.77091253, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79228938, + "num_input_tokens_seen": 168454585, + "step": 7839, + "time_per_iteration": 2.560403347015381 + }, + { + "auxiliary_loss_clip": 0.01085122, + "auxiliary_loss_mlp": 0.01038739, + "balance_loss_clip": 1.04112875, + "balance_loss_mlp": 1.02556586, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 2.226030555392228, + "language_loss": 0.72602314, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.74726176, + "num_input_tokens_seen": 168471265, + "step": 7840, + "time_per_iteration": 2.5706963539123535 + }, + { + "auxiliary_loss_clip": 0.01097698, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.04519629, + "balance_loss_mlp": 1.01917613, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.6914236600566699, + "language_loss": 0.75017571, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77147698, + "num_input_tokens_seen": 168491360, + "step": 7841, + "time_per_iteration": 2.5198781490325928 + }, + { + "auxiliary_loss_clip": 0.01097878, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.04720902, + "balance_loss_mlp": 1.01929653, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 3.2632364694739056, + "language_loss": 0.70516491, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72647017, + "num_input_tokens_seen": 168511335, + "step": 7842, + "time_per_iteration": 2.5046069622039795 + }, + { + "auxiliary_loss_clip": 0.01117253, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.04744947, + "balance_loss_mlp": 1.02199292, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 2.5876407601113747, + "language_loss": 0.78829509, + "learning_rate": 2.280458665756177e-06, + "loss": 0.80982685, + "num_input_tokens_seen": 168529920, + "step": 7843, + "time_per_iteration": 2.458566904067993 + }, + { + "auxiliary_loss_clip": 0.011089, + "auxiliary_loss_mlp": 0.01032218, + "balance_loss_clip": 1.04350257, + "balance_loss_mlp": 1.01873565, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.9146546709071297, + "language_loss": 0.74203682, + "learning_rate": 2.280073047010832e-06, + "loss": 0.763448, + "num_input_tokens_seen": 168550595, + "step": 7844, + "time_per_iteration": 2.484013080596924 + }, + { + "auxiliary_loss_clip": 0.01097592, + "auxiliary_loss_mlp": 0.0104688, + "balance_loss_clip": 1.0447644, + "balance_loss_mlp": 1.03215742, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 3.8592861512728973, + "language_loss": 0.7836833, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80512798, + "num_input_tokens_seen": 168569765, + "step": 7845, + "time_per_iteration": 3.9781062602996826 + }, + { + "auxiliary_loss_clip": 0.01109064, + "auxiliary_loss_mlp": 0.01035762, + "balance_loss_clip": 1.04137182, + "balance_loss_mlp": 1.02261341, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.3497701078945694, + "language_loss": 0.73270828, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75415653, + "num_input_tokens_seen": 168591525, + "step": 7846, + "time_per_iteration": 2.507202386856079 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.0103501, + "balance_loss_clip": 1.0424757, + "balance_loss_mlp": 1.02177215, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.29035005717471, + "language_loss": 0.74324763, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76467168, + "num_input_tokens_seen": 168611235, + "step": 7847, + "time_per_iteration": 2.5078487396240234 + }, + { + "auxiliary_loss_clip": 0.01074057, + "auxiliary_loss_mlp": 0.01036044, + "balance_loss_clip": 1.03966618, + "balance_loss_mlp": 1.02275825, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 2.822717250551554, + "language_loss": 0.80836886, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82946986, + "num_input_tokens_seen": 168628710, + "step": 7848, + "time_per_iteration": 2.53439998626709 + }, + { + "auxiliary_loss_clip": 0.01112963, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.04442012, + "balance_loss_mlp": 1.02029943, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 2.199685048376207, + "language_loss": 0.70593548, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72740108, + "num_input_tokens_seen": 168645645, + "step": 7849, + "time_per_iteration": 2.434302806854248 + }, + { + "auxiliary_loss_clip": 0.01096091, + "auxiliary_loss_mlp": 0.01042855, + "balance_loss_clip": 1.04336286, + "balance_loss_mlp": 1.0271194, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.1558873791139015, + "language_loss": 0.70065939, + "learning_rate": 2.277759112022224e-06, + "loss": 0.72204888, + "num_input_tokens_seen": 168664165, + "step": 7850, + "time_per_iteration": 2.5099446773529053 + }, + { + "auxiliary_loss_clip": 0.01065763, + "auxiliary_loss_mlp": 0.01032096, + "balance_loss_clip": 1.03804922, + "balance_loss_mlp": 1.01800549, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 1.7158288447510912, + "language_loss": 0.7470957, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.76807427, + "num_input_tokens_seen": 168681940, + "step": 7851, + "time_per_iteration": 2.58968448638916 + }, + { + "auxiliary_loss_clip": 0.01059458, + "auxiliary_loss_mlp": 0.01047623, + "balance_loss_clip": 1.03395224, + "balance_loss_mlp": 1.02992082, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 2.137379281694059, + "language_loss": 0.76370263, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78477347, + "num_input_tokens_seen": 168698830, + "step": 7852, + "time_per_iteration": 2.564020872116089 + }, + { + "auxiliary_loss_clip": 0.01089319, + "auxiliary_loss_mlp": 0.0102999, + "balance_loss_clip": 1.04478574, + "balance_loss_mlp": 1.01554799, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.5244850122409652, + "language_loss": 0.69402063, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71521366, + "num_input_tokens_seen": 168718305, + "step": 7853, + "time_per_iteration": 2.543661117553711 + }, + { + "auxiliary_loss_clip": 0.01007899, + "auxiliary_loss_mlp": 0.01003868, + "balance_loss_clip": 1.01693153, + "balance_loss_mlp": 1.00222325, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.7017289581858462, + "language_loss": 0.50218606, + "learning_rate": 2.276216277848432e-06, + "loss": 0.5223037, + "num_input_tokens_seen": 168782365, + "step": 7854, + "time_per_iteration": 3.3023324012756348 + }, + { + "auxiliary_loss_clip": 0.01115024, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.04493999, + "balance_loss_mlp": 1.02233219, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 2.0181562282989414, + "language_loss": 0.64214015, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.66366076, + "num_input_tokens_seen": 168800485, + "step": 7855, + "time_per_iteration": 2.5191872119903564 + }, + { + "auxiliary_loss_clip": 0.01111353, + "auxiliary_loss_mlp": 0.0103585, + "balance_loss_clip": 1.04308999, + "balance_loss_mlp": 1.02092469, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 1.9242235949085786, + "language_loss": 0.75599837, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.77747035, + "num_input_tokens_seen": 168818965, + "step": 7856, + "time_per_iteration": 2.5118935108184814 + }, + { + "auxiliary_loss_clip": 0.01095976, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.04030514, + "balance_loss_mlp": 1.02047241, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.8202494333481682, + "language_loss": 0.74867117, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.76995927, + "num_input_tokens_seen": 168840355, + "step": 7857, + "time_per_iteration": 2.5428965091705322 + }, + { + "auxiliary_loss_clip": 0.01098688, + "auxiliary_loss_mlp": 0.01040753, + "balance_loss_clip": 1.04190469, + "balance_loss_mlp": 1.02817082, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.9697506299762904, + "language_loss": 0.64748627, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.66888064, + "num_input_tokens_seen": 168861765, + "step": 7858, + "time_per_iteration": 2.590208053588867 + }, + { + "auxiliary_loss_clip": 0.01110727, + "auxiliary_loss_mlp": 0.00779806, + "balance_loss_clip": 1.04264212, + "balance_loss_mlp": 1.00026596, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 8.482132095616006, + "language_loss": 0.70612001, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72502542, + "num_input_tokens_seen": 168881310, + "step": 7859, + "time_per_iteration": 3.9970908164978027 + }, + { + "auxiliary_loss_clip": 0.01127565, + "auxiliary_loss_mlp": 0.01039914, + "balance_loss_clip": 1.04467654, + "balance_loss_mlp": 1.02595413, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 7.710777073412638, + "language_loss": 0.61857021, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64024496, + "num_input_tokens_seen": 168899470, + "step": 7860, + "time_per_iteration": 2.436720371246338 + }, + { + "auxiliary_loss_clip": 0.01102281, + "auxiliary_loss_mlp": 0.01042702, + "balance_loss_clip": 1.04312921, + "balance_loss_mlp": 1.02886176, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.5285918651104544, + "language_loss": 0.72137904, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.74282885, + "num_input_tokens_seen": 168921495, + "step": 7861, + "time_per_iteration": 2.6053414344787598 + }, + { + "auxiliary_loss_clip": 0.01099177, + "auxiliary_loss_mlp": 0.01035997, + "balance_loss_clip": 1.04182327, + "balance_loss_mlp": 1.02215648, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 1.8807782124551338, + "language_loss": 0.85010612, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87145787, + "num_input_tokens_seen": 168940515, + "step": 7862, + "time_per_iteration": 2.493136405944824 + }, + { + "auxiliary_loss_clip": 0.01123946, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.04270363, + "balance_loss_mlp": 1.01895332, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 1.8832985007202503, + "language_loss": 0.84723222, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86879921, + "num_input_tokens_seen": 168958340, + "step": 7863, + "time_per_iteration": 2.4076037406921387 + }, + { + "auxiliary_loss_clip": 0.01101939, + "auxiliary_loss_mlp": 0.01043267, + "balance_loss_clip": 1.04503667, + "balance_loss_mlp": 1.02983212, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 1.9944764474769805, + "language_loss": 0.65791756, + "learning_rate": 2.272358461271467e-06, + "loss": 0.67936963, + "num_input_tokens_seen": 168974850, + "step": 7864, + "time_per_iteration": 2.484724998474121 + }, + { + "auxiliary_loss_clip": 0.0112263, + "auxiliary_loss_mlp": 0.01036602, + "balance_loss_clip": 1.04353845, + "balance_loss_mlp": 1.02235675, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 2.105096710420786, + "language_loss": 0.64895099, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67054331, + "num_input_tokens_seen": 168992860, + "step": 7865, + "time_per_iteration": 2.3983402252197266 + }, + { + "auxiliary_loss_clip": 0.01093795, + "auxiliary_loss_mlp": 0.00779684, + "balance_loss_clip": 1.04049039, + "balance_loss_mlp": 1.00028324, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 1.6716658254459362, + "language_loss": 0.74190706, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76064181, + "num_input_tokens_seen": 169010325, + "step": 7866, + "time_per_iteration": 2.493596315383911 + }, + { + "auxiliary_loss_clip": 0.01123239, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.04199743, + "balance_loss_mlp": 1.02200055, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 2.022060331175174, + "language_loss": 0.82734251, + "learning_rate": 2.271200914239451e-06, + "loss": 0.84893572, + "num_input_tokens_seen": 169029840, + "step": 7867, + "time_per_iteration": 3.940336227416992 + }, + { + "auxiliary_loss_clip": 0.01107676, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.04150677, + "balance_loss_mlp": 1.02181888, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.6806608500004554, + "language_loss": 0.79337549, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81480432, + "num_input_tokens_seen": 169049975, + "step": 7868, + "time_per_iteration": 2.469966411590576 + }, + { + "auxiliary_loss_clip": 0.01046728, + "auxiliary_loss_mlp": 0.01040441, + "balance_loss_clip": 1.03505731, + "balance_loss_mlp": 1.02447319, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 1.9128343299060593, + "language_loss": 0.75057673, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.77144837, + "num_input_tokens_seen": 169069540, + "step": 7869, + "time_per_iteration": 2.631709098815918 + }, + { + "auxiliary_loss_clip": 0.01103835, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_clip": 1.0428375, + "balance_loss_mlp": 1.0338943, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 1.5097100718430176, + "language_loss": 0.73925281, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.76078713, + "num_input_tokens_seen": 169089940, + "step": 7870, + "time_per_iteration": 3.9042282104492188 + }, + { + "auxiliary_loss_clip": 0.01128535, + "auxiliary_loss_mlp": 0.01040538, + "balance_loss_clip": 1.04485011, + "balance_loss_mlp": 1.02493334, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 2.514486763146203, + "language_loss": 0.81802952, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83972025, + "num_input_tokens_seen": 169109650, + "step": 7871, + "time_per_iteration": 2.478135347366333 + }, + { + "auxiliary_loss_clip": 0.01109341, + "auxiliary_loss_mlp": 0.0103525, + "balance_loss_clip": 1.04055274, + "balance_loss_mlp": 1.02063501, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.7086548997476423, + "language_loss": 0.75811327, + "learning_rate": 2.269271463701879e-06, + "loss": 0.77955914, + "num_input_tokens_seen": 169128990, + "step": 7872, + "time_per_iteration": 2.4768075942993164 + }, + { + "auxiliary_loss_clip": 0.01086506, + "auxiliary_loss_mlp": 0.01038192, + "balance_loss_clip": 1.03748739, + "balance_loss_mlp": 1.02398849, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 1.9024747486253948, + "language_loss": 0.67711735, + "learning_rate": 2.268885542903428e-06, + "loss": 0.69836426, + "num_input_tokens_seen": 169154645, + "step": 7873, + "time_per_iteration": 2.732901096343994 + }, + { + "auxiliary_loss_clip": 0.01113412, + "auxiliary_loss_mlp": 0.01034903, + "balance_loss_clip": 1.04348242, + "balance_loss_mlp": 1.02088428, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.4650468520434075, + "language_loss": 0.72688806, + "learning_rate": 2.26849961190881e-06, + "loss": 0.74837112, + "num_input_tokens_seen": 169174995, + "step": 7874, + "time_per_iteration": 2.5451619625091553 + }, + { + "auxiliary_loss_clip": 0.01106316, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_clip": 1.04459238, + "balance_loss_mlp": 1.02469814, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.477123355730147, + "language_loss": 0.65171945, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67316562, + "num_input_tokens_seen": 169191815, + "step": 7875, + "time_per_iteration": 2.4929401874542236 + }, + { + "auxiliary_loss_clip": 0.01078169, + "auxiliary_loss_mlp": 0.0103308, + "balance_loss_clip": 1.04381514, + "balance_loss_mlp": 1.01897717, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.544571422961043, + "language_loss": 0.81177098, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83288348, + "num_input_tokens_seen": 169210430, + "step": 7876, + "time_per_iteration": 2.6381959915161133 + }, + { + "auxiliary_loss_clip": 0.01094661, + "auxiliary_loss_mlp": 0.01050461, + "balance_loss_clip": 1.03954792, + "balance_loss_mlp": 1.03551841, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.6953094871238985, + "language_loss": 0.78892004, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81037128, + "num_input_tokens_seen": 169229295, + "step": 7877, + "time_per_iteration": 2.489874839782715 + }, + { + "auxiliary_loss_clip": 0.01111922, + "auxiliary_loss_mlp": 0.00777908, + "balance_loss_clip": 1.04159427, + "balance_loss_mlp": 1.00029933, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 2.0522257032192326, + "language_loss": 0.70705199, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.7259503, + "num_input_tokens_seen": 169247855, + "step": 7878, + "time_per_iteration": 2.486346483230591 + }, + { + "auxiliary_loss_clip": 0.01084106, + "auxiliary_loss_mlp": 0.01035466, + "balance_loss_clip": 1.04527867, + "balance_loss_mlp": 1.02225196, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.5521121068462327, + "language_loss": 0.75441986, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77561563, + "num_input_tokens_seen": 169268860, + "step": 7879, + "time_per_iteration": 2.617516279220581 + }, + { + "auxiliary_loss_clip": 0.01031152, + "auxiliary_loss_mlp": 0.01005152, + "balance_loss_clip": 1.02443492, + "balance_loss_mlp": 1.00335217, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 2.2882820858076798, + "language_loss": 0.61268389, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63304698, + "num_input_tokens_seen": 169331855, + "step": 7880, + "time_per_iteration": 3.1008124351501465 + }, + { + "auxiliary_loss_clip": 0.01101316, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.04150641, + "balance_loss_mlp": 1.02318883, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 1.5080689228986401, + "language_loss": 0.68050104, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70189619, + "num_input_tokens_seen": 169352175, + "step": 7881, + "time_per_iteration": 2.5413732528686523 + }, + { + "auxiliary_loss_clip": 0.01063149, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.0464381, + "balance_loss_mlp": 1.01522481, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.8614980384130846, + "language_loss": 0.77213681, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79305053, + "num_input_tokens_seen": 169371215, + "step": 7882, + "time_per_iteration": 2.6341123580932617 + }, + { + "auxiliary_loss_clip": 0.01111913, + "auxiliary_loss_mlp": 0.0103189, + "balance_loss_clip": 1.04344094, + "balance_loss_mlp": 1.01792455, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.5184359727451509, + "language_loss": 0.76092684, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78236485, + "num_input_tokens_seen": 169391745, + "step": 7883, + "time_per_iteration": 2.564847230911255 + }, + { + "auxiliary_loss_clip": 0.01104226, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.04613709, + "balance_loss_mlp": 1.02046704, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.7172438544404436, + "language_loss": 0.71802843, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.73940372, + "num_input_tokens_seen": 169409845, + "step": 7884, + "time_per_iteration": 4.024056673049927 + }, + { + "auxiliary_loss_clip": 0.01115722, + "auxiliary_loss_mlp": 0.01037442, + "balance_loss_clip": 1.04432034, + "balance_loss_mlp": 1.02295256, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 2.0645005892497426, + "language_loss": 0.82294387, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84447551, + "num_input_tokens_seen": 169426085, + "step": 7885, + "time_per_iteration": 2.418795585632324 + }, + { + "auxiliary_loss_clip": 0.01092487, + "auxiliary_loss_mlp": 0.01043274, + "balance_loss_clip": 1.04080176, + "balance_loss_mlp": 1.02698982, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.7526431576815071, + "language_loss": 0.73892081, + "learning_rate": 2.263867649999751e-06, + "loss": 0.7602784, + "num_input_tokens_seen": 169444705, + "step": 7886, + "time_per_iteration": 2.4803667068481445 + }, + { + "auxiliary_loss_clip": 0.01102051, + "auxiliary_loss_mlp": 0.01036852, + "balance_loss_clip": 1.04103124, + "balance_loss_mlp": 1.02162933, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 1.9902280164756807, + "language_loss": 0.73910391, + "learning_rate": 2.263481587786849e-06, + "loss": 0.76049292, + "num_input_tokens_seen": 169460850, + "step": 7887, + "time_per_iteration": 2.475148916244507 + }, + { + "auxiliary_loss_clip": 0.01112056, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.0444262, + "balance_loss_mlp": 1.01859272, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.7131860757406918, + "language_loss": 0.77018899, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79162741, + "num_input_tokens_seen": 169478890, + "step": 7888, + "time_per_iteration": 2.4646382331848145 + }, + { + "auxiliary_loss_clip": 0.01112295, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.04291415, + "balance_loss_mlp": 1.0203855, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.5585581256900713, + "language_loss": 0.72567463, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.7471385, + "num_input_tokens_seen": 169499690, + "step": 7889, + "time_per_iteration": 2.5025129318237305 + }, + { + "auxiliary_loss_clip": 0.01047959, + "auxiliary_loss_mlp": 0.01004258, + "balance_loss_clip": 1.02127957, + "balance_loss_mlp": 1.00269639, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 0.7142498062810468, + "language_loss": 0.56005299, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58057517, + "num_input_tokens_seen": 169560475, + "step": 7890, + "time_per_iteration": 3.0810186862945557 + }, + { + "auxiliary_loss_clip": 0.01115209, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.04620862, + "balance_loss_mlp": 1.01864326, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 2.3567049551324892, + "language_loss": 0.65374589, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67523766, + "num_input_tokens_seen": 169580110, + "step": 7891, + "time_per_iteration": 2.483186721801758 + }, + { + "auxiliary_loss_clip": 0.01128849, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.04524982, + "balance_loss_mlp": 1.02492833, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.7182363996719907, + "language_loss": 0.70487607, + "learning_rate": 2.26155112714642e-06, + "loss": 0.72656977, + "num_input_tokens_seen": 169597510, + "step": 7892, + "time_per_iteration": 2.4236817359924316 + }, + { + "auxiliary_loss_clip": 0.01032016, + "auxiliary_loss_mlp": 0.01006766, + "balance_loss_clip": 1.03122807, + "balance_loss_mlp": 1.0052166, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8140526567989949, + "language_loss": 0.58559436, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60598218, + "num_input_tokens_seen": 169660010, + "step": 7893, + "time_per_iteration": 3.14772367477417 + }, + { + "auxiliary_loss_clip": 0.0111427, + "auxiliary_loss_mlp": 0.01038723, + "balance_loss_clip": 1.04613137, + "balance_loss_mlp": 1.02582431, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 1.7239820706385511, + "language_loss": 0.77798134, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.79951131, + "num_input_tokens_seen": 169678485, + "step": 7894, + "time_per_iteration": 2.4620110988616943 + }, + { + "auxiliary_loss_clip": 0.01114339, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.04469228, + "balance_loss_mlp": 1.02042317, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 1.7260681330249004, + "language_loss": 0.74511349, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76659703, + "num_input_tokens_seen": 169697335, + "step": 7895, + "time_per_iteration": 2.4767327308654785 + }, + { + "auxiliary_loss_clip": 0.01108679, + "auxiliary_loss_mlp": 0.01030257, + "balance_loss_clip": 1.04035497, + "balance_loss_mlp": 1.01626778, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 1.9961032272780213, + "language_loss": 0.82785511, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84924448, + "num_input_tokens_seen": 169715395, + "step": 7896, + "time_per_iteration": 2.4599766731262207 + }, + { + "auxiliary_loss_clip": 0.01110124, + "auxiliary_loss_mlp": 0.0103199, + "balance_loss_clip": 1.0426383, + "balance_loss_mlp": 1.01792347, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 3.3630973387333807, + "language_loss": 0.75636792, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77778906, + "num_input_tokens_seen": 169733755, + "step": 7897, + "time_per_iteration": 2.434727191925049 + }, + { + "auxiliary_loss_clip": 0.01102389, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.0434711, + "balance_loss_mlp": 1.02206182, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 2.1107480264393303, + "language_loss": 0.63398218, + "learning_rate": 2.25923424724351e-06, + "loss": 0.65536535, + "num_input_tokens_seen": 169751390, + "step": 7898, + "time_per_iteration": 2.4784958362579346 + }, + { + "auxiliary_loss_clip": 0.01089053, + "auxiliary_loss_mlp": 0.01047588, + "balance_loss_clip": 1.04154468, + "balance_loss_mlp": 1.03162599, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 3.2730990138774096, + "language_loss": 0.69776553, + "learning_rate": 2.258848066101946e-06, + "loss": 0.71913189, + "num_input_tokens_seen": 169769500, + "step": 7899, + "time_per_iteration": 4.03910756111145 + }, + { + "auxiliary_loss_clip": 0.01111533, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.04120612, + "balance_loss_mlp": 1.02254915, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 1.9080009950113317, + "language_loss": 0.68499875, + "learning_rate": 2.258461875144837e-06, + "loss": 0.7064805, + "num_input_tokens_seen": 169789215, + "step": 7900, + "time_per_iteration": 2.5191962718963623 + }, + { + "auxiliary_loss_clip": 0.01086597, + "auxiliary_loss_mlp": 0.0103946, + "balance_loss_clip": 1.04421568, + "balance_loss_mlp": 1.02603054, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 1.9844330092119915, + "language_loss": 0.70997691, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.73123753, + "num_input_tokens_seen": 169808825, + "step": 7901, + "time_per_iteration": 2.6100962162017822 + }, + { + "auxiliary_loss_clip": 0.01095663, + "auxiliary_loss_mlp": 0.01050585, + "balance_loss_clip": 1.03986788, + "balance_loss_mlp": 1.03536201, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.8486819974660809, + "language_loss": 0.73817432, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75963682, + "num_input_tokens_seen": 169827590, + "step": 7902, + "time_per_iteration": 2.499591112136841 + }, + { + "auxiliary_loss_clip": 0.01080645, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.03931677, + "balance_loss_mlp": 1.02335536, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 1.8779044417519377, + "language_loss": 0.68588233, + "learning_rate": 2.257303243526688e-06, + "loss": 0.70705062, + "num_input_tokens_seen": 169844925, + "step": 7903, + "time_per_iteration": 2.5397067070007324 + }, + { + "auxiliary_loss_clip": 0.01098295, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.04123747, + "balance_loss_mlp": 1.022192, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.5429519009399228, + "language_loss": 0.72115207, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74248081, + "num_input_tokens_seen": 169862705, + "step": 7904, + "time_per_iteration": 2.488450765609741 + }, + { + "auxiliary_loss_clip": 0.01063834, + "auxiliary_loss_mlp": 0.01041983, + "balance_loss_clip": 1.03534544, + "balance_loss_mlp": 1.02791631, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.6502137437946602, + "language_loss": 0.86228621, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88334441, + "num_input_tokens_seen": 169880155, + "step": 7905, + "time_per_iteration": 2.6062569618225098 + }, + { + "auxiliary_loss_clip": 0.01103133, + "auxiliary_loss_mlp": 0.01037871, + "balance_loss_clip": 1.03918147, + "balance_loss_mlp": 1.025038, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.5253973433199288, + "language_loss": 0.82379562, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84520566, + "num_input_tokens_seen": 169901525, + "step": 7906, + "time_per_iteration": 2.5372378826141357 + }, + { + "auxiliary_loss_clip": 0.01020391, + "auxiliary_loss_mlp": 0.01010429, + "balance_loss_clip": 1.02246046, + "balance_loss_mlp": 1.00909984, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6953919287034485, + "language_loss": 0.58956158, + "learning_rate": 2.255758264840002e-06, + "loss": 0.60986972, + "num_input_tokens_seen": 169970345, + "step": 7907, + "time_per_iteration": 4.669548273086548 + }, + { + "auxiliary_loss_clip": 0.01108433, + "auxiliary_loss_mlp": 0.01038007, + "balance_loss_clip": 1.04170644, + "balance_loss_mlp": 1.02480495, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 1.927839163790046, + "language_loss": 0.81114727, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83261168, + "num_input_tokens_seen": 169986440, + "step": 7908, + "time_per_iteration": 2.4644522666931152 + }, + { + "auxiliary_loss_clip": 0.01113854, + "auxiliary_loss_mlp": 0.01036229, + "balance_loss_clip": 1.04610658, + "balance_loss_mlp": 1.0224483, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.6517184700272518, + "language_loss": 0.7393353, + "learning_rate": 2.254985717247797e-06, + "loss": 0.76083612, + "num_input_tokens_seen": 170005705, + "step": 7909, + "time_per_iteration": 3.8891196250915527 + }, + { + "auxiliary_loss_clip": 0.01096438, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.04091918, + "balance_loss_mlp": 1.02349281, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.6136496667733813, + "language_loss": 0.75698268, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77832305, + "num_input_tokens_seen": 170023415, + "step": 7910, + "time_per_iteration": 2.5077109336853027 + }, + { + "auxiliary_loss_clip": 0.01108047, + "auxiliary_loss_mlp": 0.01025758, + "balance_loss_clip": 1.04194474, + "balance_loss_mlp": 1.01400971, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.7803071767751635, + "language_loss": 0.79076618, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81210423, + "num_input_tokens_seen": 170042395, + "step": 7911, + "time_per_iteration": 2.468825101852417 + }, + { + "auxiliary_loss_clip": 0.01095025, + "auxiliary_loss_mlp": 0.00780122, + "balance_loss_clip": 1.03895104, + "balance_loss_mlp": 1.0003196, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 2.4820616552806425, + "language_loss": 0.75414324, + "learning_rate": 2.253826823377983e-06, + "loss": 0.77289468, + "num_input_tokens_seen": 170061610, + "step": 7912, + "time_per_iteration": 2.5023367404937744 + }, + { + "auxiliary_loss_clip": 0.01119596, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_clip": 1.04248905, + "balance_loss_mlp": 1.02728939, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.5517613158160055, + "language_loss": 0.74564785, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76724231, + "num_input_tokens_seen": 170083505, + "step": 7913, + "time_per_iteration": 2.4701077938079834 + }, + { + "auxiliary_loss_clip": 0.01103939, + "auxiliary_loss_mlp": 0.01029606, + "balance_loss_clip": 1.04468453, + "balance_loss_mlp": 1.01577139, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 3.0468105583973295, + "language_loss": 0.72087526, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74221075, + "num_input_tokens_seen": 170100690, + "step": 7914, + "time_per_iteration": 2.471088409423828 + }, + { + "auxiliary_loss_clip": 0.01102449, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.04897559, + "balance_loss_mlp": 1.02485454, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 4.5462795540323775, + "language_loss": 0.64883769, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.67023921, + "num_input_tokens_seen": 170119240, + "step": 7915, + "time_per_iteration": 2.5104458332061768 + }, + { + "auxiliary_loss_clip": 0.01117773, + "auxiliary_loss_mlp": 0.01032778, + "balance_loss_clip": 1.04237115, + "balance_loss_mlp": 1.01964116, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.6650806057292136, + "language_loss": 0.77147126, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.79297674, + "num_input_tokens_seen": 170136450, + "step": 7916, + "time_per_iteration": 2.39443039894104 + }, + { + "auxiliary_loss_clip": 0.01120727, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.04266238, + "balance_loss_mlp": 1.0187273, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 1.720388596438965, + "language_loss": 0.64385951, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66537929, + "num_input_tokens_seen": 170155295, + "step": 7917, + "time_per_iteration": 2.444092273712158 + }, + { + "auxiliary_loss_clip": 0.01019608, + "auxiliary_loss_mlp": 0.01000961, + "balance_loss_clip": 1.022017, + "balance_loss_mlp": 0.99954242, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8351877636315734, + "language_loss": 0.65637112, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67657685, + "num_input_tokens_seen": 170222325, + "step": 7918, + "time_per_iteration": 3.1236255168914795 + }, + { + "auxiliary_loss_clip": 0.01110177, + "auxiliary_loss_mlp": 0.00777849, + "balance_loss_clip": 1.04095221, + "balance_loss_mlp": 1.00029886, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 1.7902694775136312, + "language_loss": 0.68927407, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70815432, + "num_input_tokens_seen": 170241625, + "step": 7919, + "time_per_iteration": 2.4770336151123047 + }, + { + "auxiliary_loss_clip": 0.01098307, + "auxiliary_loss_mlp": 0.01035649, + "balance_loss_clip": 1.04025912, + "balance_loss_mlp": 1.02286983, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 1.4614249351901758, + "language_loss": 0.74773288, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.76907247, + "num_input_tokens_seen": 170262470, + "step": 7920, + "time_per_iteration": 2.5677597522735596 + }, + { + "auxiliary_loss_clip": 0.0110433, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.04506946, + "balance_loss_mlp": 1.01751494, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.4731880872470855, + "language_loss": 0.77686262, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.7982192, + "num_input_tokens_seen": 170283460, + "step": 7921, + "time_per_iteration": 2.54129695892334 + }, + { + "auxiliary_loss_clip": 0.01103616, + "auxiliary_loss_mlp": 0.01036987, + "balance_loss_clip": 1.04283988, + "balance_loss_mlp": 1.02249146, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 1.6039430510856585, + "language_loss": 0.7826395, + "learning_rate": 2.249963220399845e-06, + "loss": 0.8040455, + "num_input_tokens_seen": 170304225, + "step": 7922, + "time_per_iteration": 2.561525344848633 + }, + { + "auxiliary_loss_clip": 0.01092039, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.04320073, + "balance_loss_mlp": 1.02323306, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.8929559129324722, + "language_loss": 0.72339833, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.74469697, + "num_input_tokens_seen": 170322110, + "step": 7923, + "time_per_iteration": 4.085254430770874 + }, + { + "auxiliary_loss_clip": 0.01097312, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.04620051, + "balance_loss_mlp": 1.02405941, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 2.0911544762663103, + "language_loss": 0.82322192, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.8445667, + "num_input_tokens_seen": 170340700, + "step": 7924, + "time_per_iteration": 2.558379888534546 + }, + { + "auxiliary_loss_clip": 0.01119967, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.04669118, + "balance_loss_mlp": 1.0213294, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 2.0498708526292995, + "language_loss": 0.80292284, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82448041, + "num_input_tokens_seen": 170359780, + "step": 7925, + "time_per_iteration": 2.514700412750244 + }, + { + "auxiliary_loss_clip": 0.0109832, + "auxiliary_loss_mlp": 0.01038903, + "balance_loss_clip": 1.04098153, + "balance_loss_mlp": 1.02583766, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 1.5866247954587787, + "language_loss": 0.72177881, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74315107, + "num_input_tokens_seen": 170381260, + "step": 7926, + "time_per_iteration": 2.5434629917144775 + }, + { + "auxiliary_loss_clip": 0.01115122, + "auxiliary_loss_mlp": 0.01035116, + "balance_loss_clip": 1.04384375, + "balance_loss_mlp": 1.0205729, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 2.655715924501947, + "language_loss": 0.68568271, + "learning_rate": 2.248031062546432e-06, + "loss": 0.70718509, + "num_input_tokens_seen": 170400595, + "step": 7927, + "time_per_iteration": 2.4964756965637207 + }, + { + "auxiliary_loss_clip": 0.01088514, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.04283941, + "balance_loss_mlp": 1.01524282, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.562398281339081, + "language_loss": 0.68097234, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70213878, + "num_input_tokens_seen": 170421110, + "step": 7928, + "time_per_iteration": 2.5879833698272705 + }, + { + "auxiliary_loss_clip": 0.01121983, + "auxiliary_loss_mlp": 0.01033038, + "balance_loss_clip": 1.0431565, + "balance_loss_mlp": 1.01919174, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 1.9807024654365137, + "language_loss": 0.78538394, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.80693424, + "num_input_tokens_seen": 170436700, + "step": 7929, + "time_per_iteration": 2.386503219604492 + }, + { + "auxiliary_loss_clip": 0.01098452, + "auxiliary_loss_mlp": 0.01040502, + "balance_loss_clip": 1.03967047, + "balance_loss_mlp": 1.02830672, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 1.814049240900952, + "language_loss": 0.67158431, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.69297385, + "num_input_tokens_seen": 170459555, + "step": 7930, + "time_per_iteration": 2.646698236465454 + }, + { + "auxiliary_loss_clip": 0.0110892, + "auxiliary_loss_mlp": 0.01035446, + "balance_loss_clip": 1.04333997, + "balance_loss_mlp": 1.02247608, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 1.8141834095225031, + "language_loss": 0.79978549, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.82122922, + "num_input_tokens_seen": 170479175, + "step": 7931, + "time_per_iteration": 2.4989404678344727 + }, + { + "auxiliary_loss_clip": 0.01097498, + "auxiliary_loss_mlp": 0.01038706, + "balance_loss_clip": 1.03943753, + "balance_loss_mlp": 1.02425766, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 1.8858145498665284, + "language_loss": 0.7623136, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78367561, + "num_input_tokens_seen": 170498450, + "step": 7932, + "time_per_iteration": 2.504600763320923 + }, + { + "auxiliary_loss_clip": 0.01100525, + "auxiliary_loss_mlp": 0.00777386, + "balance_loss_clip": 1.04412079, + "balance_loss_mlp": 1.00024104, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 1.7671511434441323, + "language_loss": 0.79522079, + "learning_rate": 2.245712162906593e-06, + "loss": 0.81399989, + "num_input_tokens_seen": 170516255, + "step": 7933, + "time_per_iteration": 2.487318277359009 + }, + { + "auxiliary_loss_clip": 0.01118063, + "auxiliary_loss_mlp": 0.01043391, + "balance_loss_clip": 1.04359913, + "balance_loss_mlp": 1.02803707, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 1.8113830941274587, + "language_loss": 0.74698937, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.76860392, + "num_input_tokens_seen": 170532705, + "step": 7934, + "time_per_iteration": 2.4402360916137695 + }, + { + "auxiliary_loss_clip": 0.01112946, + "auxiliary_loss_mlp": 0.01036365, + "balance_loss_clip": 1.04233408, + "balance_loss_mlp": 1.02313304, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 1.905328720267122, + "language_loss": 0.80025941, + "learning_rate": 2.244939121664211e-06, + "loss": 0.82175255, + "num_input_tokens_seen": 170551925, + "step": 7935, + "time_per_iteration": 2.489858865737915 + }, + { + "auxiliary_loss_clip": 0.01100073, + "auxiliary_loss_mlp": 0.01043765, + "balance_loss_clip": 1.04529536, + "balance_loss_mlp": 1.02935815, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 1.8709852242893397, + "language_loss": 0.7117694, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.7332077, + "num_input_tokens_seen": 170572320, + "step": 7936, + "time_per_iteration": 2.614492654800415 + }, + { + "auxiliary_loss_clip": 0.01123897, + "auxiliary_loss_mlp": 0.01038025, + "balance_loss_clip": 1.04257226, + "balance_loss_mlp": 1.02429175, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 2.246692751272466, + "language_loss": 0.6810137, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.7026329, + "num_input_tokens_seen": 170589470, + "step": 7937, + "time_per_iteration": 2.4667270183563232 + }, + { + "auxiliary_loss_clip": 0.01037983, + "auxiliary_loss_mlp": 0.01005352, + "balance_loss_clip": 1.0217464, + "balance_loss_mlp": 1.00423765, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.721216316849899, + "language_loss": 0.56372303, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58415639, + "num_input_tokens_seen": 170662265, + "step": 7938, + "time_per_iteration": 4.7687342166900635 + }, + { + "auxiliary_loss_clip": 0.01094734, + "auxiliary_loss_mlp": 0.01046772, + "balance_loss_clip": 1.03860295, + "balance_loss_mlp": 1.03097725, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 1.600439401930489, + "language_loss": 0.88671345, + "learning_rate": 2.243392927839317e-06, + "loss": 0.9081285, + "num_input_tokens_seen": 170679680, + "step": 7939, + "time_per_iteration": 2.494727611541748 + }, + { + "auxiliary_loss_clip": 0.01112468, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.04052663, + "balance_loss_mlp": 1.02283871, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 1.8734883268685834, + "language_loss": 0.76942182, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.79090095, + "num_input_tokens_seen": 170697340, + "step": 7940, + "time_per_iteration": 2.4430434703826904 + }, + { + "auxiliary_loss_clip": 0.01099818, + "auxiliary_loss_mlp": 0.01035874, + "balance_loss_clip": 1.0445379, + "balance_loss_mlp": 1.02329707, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.7695119042317753, + "language_loss": 0.84890568, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.87026262, + "num_input_tokens_seen": 170714905, + "step": 7941, + "time_per_iteration": 2.4850990772247314 + }, + { + "auxiliary_loss_clip": 0.01106199, + "auxiliary_loss_mlp": 0.01039509, + "balance_loss_clip": 1.04318929, + "balance_loss_mlp": 1.02504277, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 2.6407384964256986, + "language_loss": 0.75866365, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78012067, + "num_input_tokens_seen": 170731810, + "step": 7942, + "time_per_iteration": 2.474424123764038 + }, + { + "auxiliary_loss_clip": 0.01115236, + "auxiliary_loss_mlp": 0.01037853, + "balance_loss_clip": 1.0485816, + "balance_loss_mlp": 1.0242691, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.9909434240597588, + "language_loss": 0.6457355, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66726637, + "num_input_tokens_seen": 170750270, + "step": 7943, + "time_per_iteration": 2.454397678375244 + }, + { + "auxiliary_loss_clip": 0.01086302, + "auxiliary_loss_mlp": 0.01035838, + "balance_loss_clip": 1.03814387, + "balance_loss_mlp": 1.02097225, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 1.803794586282648, + "language_loss": 0.73541451, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.7566359, + "num_input_tokens_seen": 170769015, + "step": 7944, + "time_per_iteration": 2.5254764556884766 + }, + { + "auxiliary_loss_clip": 0.01109723, + "auxiliary_loss_mlp": 0.01031881, + "balance_loss_clip": 1.04474926, + "balance_loss_mlp": 1.01696253, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 2.0792421844123874, + "language_loss": 0.6857608, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.7071768, + "num_input_tokens_seen": 170785725, + "step": 7945, + "time_per_iteration": 2.447272300720215 + }, + { + "auxiliary_loss_clip": 0.01085155, + "auxiliary_loss_mlp": 0.00783177, + "balance_loss_clip": 1.03624988, + "balance_loss_mlp": 1.00024319, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.9232804900191955, + "language_loss": 0.75603217, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77471548, + "num_input_tokens_seen": 170804600, + "step": 7946, + "time_per_iteration": 4.283180236816406 + }, + { + "auxiliary_loss_clip": 0.01101156, + "auxiliary_loss_mlp": 0.0104288, + "balance_loss_clip": 1.04133177, + "balance_loss_mlp": 1.02714431, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.694296227758092, + "language_loss": 0.79041791, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81185818, + "num_input_tokens_seen": 170824230, + "step": 7947, + "time_per_iteration": 2.552243709564209 + }, + { + "auxiliary_loss_clip": 0.01088482, + "auxiliary_loss_mlp": 0.01034038, + "balance_loss_clip": 1.04121995, + "balance_loss_mlp": 1.02035832, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.9559509789078047, + "language_loss": 0.73872119, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75994647, + "num_input_tokens_seen": 170843365, + "step": 7948, + "time_per_iteration": 2.4721593856811523 + }, + { + "auxiliary_loss_clip": 0.01102607, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.03993905, + "balance_loss_mlp": 1.01745474, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.4326222403579398, + "language_loss": 0.77960515, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80094993, + "num_input_tokens_seen": 170863515, + "step": 7949, + "time_per_iteration": 3.918358564376831 + }, + { + "auxiliary_loss_clip": 0.01093936, + "auxiliary_loss_mlp": 0.01033583, + "balance_loss_clip": 1.03868675, + "balance_loss_mlp": 1.02003539, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 2.1898483873540884, + "language_loss": 0.7399748, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.76124996, + "num_input_tokens_seen": 170881245, + "step": 7950, + "time_per_iteration": 2.4842498302459717 + }, + { + "auxiliary_loss_clip": 0.01092182, + "auxiliary_loss_mlp": 0.01044535, + "balance_loss_clip": 1.03892028, + "balance_loss_mlp": 1.02945471, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.708523006556028, + "language_loss": 0.73830116, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.75966835, + "num_input_tokens_seen": 170901285, + "step": 7951, + "time_per_iteration": 2.6001079082489014 + }, + { + "auxiliary_loss_clip": 0.01091435, + "auxiliary_loss_mlp": 0.01032789, + "balance_loss_clip": 1.04327035, + "balance_loss_mlp": 1.0180788, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 2.175528021745276, + "language_loss": 0.80519497, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82643723, + "num_input_tokens_seen": 170919740, + "step": 7952, + "time_per_iteration": 2.6060478687286377 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.010407, + "balance_loss_clip": 1.0400517, + "balance_loss_mlp": 1.02609658, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.6184760873138777, + "language_loss": 0.7841748, + "learning_rate": 2.23798009269438e-06, + "loss": 0.8056109, + "num_input_tokens_seen": 170938510, + "step": 7953, + "time_per_iteration": 2.490856409072876 + }, + { + "auxiliary_loss_clip": 0.0111337, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.04168439, + "balance_loss_mlp": 1.01985455, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 2.2473055789289837, + "language_loss": 0.84004289, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86151421, + "num_input_tokens_seen": 170951170, + "step": 7954, + "time_per_iteration": 2.4165563583374023 + }, + { + "auxiliary_loss_clip": 0.01094907, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.03929675, + "balance_loss_mlp": 1.02150035, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.4354700191617138, + "language_loss": 0.70463014, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72594106, + "num_input_tokens_seen": 170970990, + "step": 7955, + "time_per_iteration": 2.5015382766723633 + }, + { + "auxiliary_loss_clip": 0.01099241, + "auxiliary_loss_mlp": 0.01036805, + "balance_loss_clip": 1.04246521, + "balance_loss_mlp": 1.02335179, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 3.369051626266829, + "language_loss": 0.82029712, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.84165752, + "num_input_tokens_seen": 170991215, + "step": 7956, + "time_per_iteration": 2.5184996128082275 + }, + { + "auxiliary_loss_clip": 0.01101495, + "auxiliary_loss_mlp": 0.01032688, + "balance_loss_clip": 1.04377401, + "balance_loss_mlp": 1.01791167, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 2.173024032282296, + "language_loss": 0.84939098, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.87073278, + "num_input_tokens_seen": 171007325, + "step": 7957, + "time_per_iteration": 2.4953255653381348 + }, + { + "auxiliary_loss_clip": 0.01110323, + "auxiliary_loss_mlp": 0.01037121, + "balance_loss_clip": 1.04000223, + "balance_loss_mlp": 1.02355456, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.6157217918719569, + "language_loss": 0.79352903, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81500345, + "num_input_tokens_seen": 171025650, + "step": 7958, + "time_per_iteration": 2.5009281635284424 + }, + { + "auxiliary_loss_clip": 0.01082259, + "auxiliary_loss_mlp": 0.00782937, + "balance_loss_clip": 1.03393221, + "balance_loss_mlp": 1.00026727, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 3.2765788049500517, + "language_loss": 0.83394802, + "learning_rate": 2.235659762404047e-06, + "loss": 0.85259992, + "num_input_tokens_seen": 171045045, + "step": 7959, + "time_per_iteration": 2.5752458572387695 + }, + { + "auxiliary_loss_clip": 0.01088839, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.0439558, + "balance_loss_mlp": 1.01966357, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.198771171828669, + "language_loss": 0.7248776, + "learning_rate": 2.235273009326599e-06, + "loss": 0.7460776, + "num_input_tokens_seen": 171062910, + "step": 7960, + "time_per_iteration": 2.5688867568969727 + }, + { + "auxiliary_loss_clip": 0.0108938, + "auxiliary_loss_mlp": 0.0103779, + "balance_loss_clip": 1.04347801, + "balance_loss_mlp": 1.02493358, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.7070949281965995, + "language_loss": 0.77483749, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.7961092, + "num_input_tokens_seen": 171080875, + "step": 7961, + "time_per_iteration": 2.529278516769409 + }, + { + "auxiliary_loss_clip": 0.01083076, + "auxiliary_loss_mlp": 0.01032196, + "balance_loss_clip": 1.04280639, + "balance_loss_mlp": 1.01877284, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.6193568821319164, + "language_loss": 0.7783426, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.79949534, + "num_input_tokens_seen": 171099190, + "step": 7962, + "time_per_iteration": 3.998448133468628 + }, + { + "auxiliary_loss_clip": 0.01098225, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.0421505, + "balance_loss_mlp": 1.02486956, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 1.6473512454884849, + "language_loss": 0.64797688, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.66934818, + "num_input_tokens_seen": 171119060, + "step": 7963, + "time_per_iteration": 2.542081117630005 + }, + { + "auxiliary_loss_clip": 0.01112825, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.04215956, + "balance_loss_mlp": 1.02035475, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 1.722914138121308, + "language_loss": 0.77526569, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.79673231, + "num_input_tokens_seen": 171141900, + "step": 7964, + "time_per_iteration": 2.6733670234680176 + }, + { + "auxiliary_loss_clip": 0.01115238, + "auxiliary_loss_mlp": 0.01036813, + "balance_loss_clip": 1.04264045, + "balance_loss_mlp": 1.02097595, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 1.7535502395165163, + "language_loss": 0.76352912, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78504962, + "num_input_tokens_seen": 171161045, + "step": 7965, + "time_per_iteration": 2.480958938598633 + }, + { + "auxiliary_loss_clip": 0.01066628, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_clip": 1.03653121, + "balance_loss_mlp": 1.03905964, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 1.5213185661997635, + "language_loss": 0.74572015, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76693273, + "num_input_tokens_seen": 171179675, + "step": 7966, + "time_per_iteration": 2.5509681701660156 + }, + { + "auxiliary_loss_clip": 0.0109734, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.04181385, + "balance_loss_mlp": 1.02109337, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.5638514265169847, + "language_loss": 0.73097217, + "learning_rate": 2.232565488801655e-06, + "loss": 0.75230098, + "num_input_tokens_seen": 171201175, + "step": 7967, + "time_per_iteration": 2.542832374572754 + }, + { + "auxiliary_loss_clip": 0.01100425, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.04147887, + "balance_loss_mlp": 1.01706815, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 1.8988973155896416, + "language_loss": 0.79027933, + "learning_rate": 2.232178664762267e-06, + "loss": 0.8115949, + "num_input_tokens_seen": 171221750, + "step": 7968, + "time_per_iteration": 2.552809238433838 + }, + { + "auxiliary_loss_clip": 0.01021211, + "auxiliary_loss_mlp": 0.01002742, + "balance_loss_clip": 1.02169228, + "balance_loss_mlp": 1.00141895, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7622352053069216, + "language_loss": 0.62232554, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64256501, + "num_input_tokens_seen": 171292235, + "step": 7969, + "time_per_iteration": 3.259377956390381 + }, + { + "auxiliary_loss_clip": 0.01086102, + "auxiliary_loss_mlp": 0.01031844, + "balance_loss_clip": 1.04287112, + "balance_loss_mlp": 1.01905251, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.5777458619456097, + "language_loss": 0.77336538, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79454482, + "num_input_tokens_seen": 171312215, + "step": 7970, + "time_per_iteration": 2.5898263454437256 + }, + { + "auxiliary_loss_clip": 0.01112938, + "auxiliary_loss_mlp": 0.0103959, + "balance_loss_clip": 1.04315102, + "balance_loss_mlp": 1.02595246, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 1.5864893335216348, + "language_loss": 0.70302916, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72455442, + "num_input_tokens_seen": 171332975, + "step": 7971, + "time_per_iteration": 2.6087069511413574 + }, + { + "auxiliary_loss_clip": 0.01074998, + "auxiliary_loss_mlp": 0.01035182, + "balance_loss_clip": 1.04107022, + "balance_loss_mlp": 1.02000666, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.3357353875552394, + "language_loss": 0.80017507, + "learning_rate": 2.230631280709021e-06, + "loss": 0.8212769, + "num_input_tokens_seen": 171353880, + "step": 7972, + "time_per_iteration": 2.5933923721313477 + }, + { + "auxiliary_loss_clip": 0.01111978, + "auxiliary_loss_mlp": 0.01030426, + "balance_loss_clip": 1.0420115, + "balance_loss_mlp": 1.01640749, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 2.0033458552250516, + "language_loss": 0.69923186, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.72065592, + "num_input_tokens_seen": 171370930, + "step": 7973, + "time_per_iteration": 2.457778215408325 + }, + { + "auxiliary_loss_clip": 0.01114502, + "auxiliary_loss_mlp": 0.01039416, + "balance_loss_clip": 1.04633677, + "balance_loss_mlp": 1.02658319, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.7490625084804903, + "language_loss": 0.79207361, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.81361282, + "num_input_tokens_seen": 171387575, + "step": 7974, + "time_per_iteration": 2.513580322265625 + }, + { + "auxiliary_loss_clip": 0.01031202, + "auxiliary_loss_mlp": 0.01008619, + "balance_loss_clip": 1.02296042, + "balance_loss_mlp": 1.00705719, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7694988181237569, + "language_loss": 0.54032934, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56072754, + "num_input_tokens_seen": 171449980, + "step": 7975, + "time_per_iteration": 3.1237974166870117 + }, + { + "auxiliary_loss_clip": 0.01111216, + "auxiliary_loss_mlp": 0.01041988, + "balance_loss_clip": 1.04257202, + "balance_loss_mlp": 1.02641892, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 2.69090338561379, + "language_loss": 0.89999181, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.92152381, + "num_input_tokens_seen": 171465290, + "step": 7976, + "time_per_iteration": 2.4826557636260986 + }, + { + "auxiliary_loss_clip": 0.01130965, + "auxiliary_loss_mlp": 0.01044374, + "balance_loss_clip": 1.04707026, + "balance_loss_mlp": 1.02866244, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.3928360190645184, + "language_loss": 0.73858452, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.76033789, + "num_input_tokens_seen": 171481130, + "step": 7977, + "time_per_iteration": 3.8532161712646484 + }, + { + "auxiliary_loss_clip": 0.01106829, + "auxiliary_loss_mlp": 0.00779064, + "balance_loss_clip": 1.0420351, + "balance_loss_mlp": 1.00025988, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.5573562624124955, + "language_loss": 0.7866137, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80547267, + "num_input_tokens_seen": 171501140, + "step": 7978, + "time_per_iteration": 2.511348009109497 + }, + { + "auxiliary_loss_clip": 0.01102155, + "auxiliary_loss_mlp": 0.01040904, + "balance_loss_clip": 1.04321933, + "balance_loss_mlp": 1.0270822, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.6033580328795636, + "language_loss": 0.8929137, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91434431, + "num_input_tokens_seen": 171519835, + "step": 7979, + "time_per_iteration": 2.500337600708008 + }, + { + "auxiliary_loss_clip": 0.01118976, + "auxiliary_loss_mlp": 0.0103902, + "balance_loss_clip": 1.04577172, + "balance_loss_mlp": 1.023785, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.5588740040660642, + "language_loss": 0.77279121, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79437119, + "num_input_tokens_seen": 171540980, + "step": 7980, + "time_per_iteration": 2.5160775184631348 + }, + { + "auxiliary_loss_clip": 0.01103131, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.04613566, + "balance_loss_mlp": 1.02585447, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 2.017244768180104, + "language_loss": 0.71768612, + "learning_rate": 2.227149156404295e-06, + "loss": 0.73913074, + "num_input_tokens_seen": 171563600, + "step": 7981, + "time_per_iteration": 2.6545488834381104 + }, + { + "auxiliary_loss_clip": 0.01124378, + "auxiliary_loss_mlp": 0.01037104, + "balance_loss_clip": 1.04606175, + "balance_loss_mlp": 1.02325189, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 2.019553838016695, + "language_loss": 0.70070779, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72232258, + "num_input_tokens_seen": 171580700, + "step": 7982, + "time_per_iteration": 2.419969081878662 + }, + { + "auxiliary_loss_clip": 0.01099545, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.04459763, + "balance_loss_mlp": 1.01744115, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 2.6375131645010974, + "language_loss": 0.71451461, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73580563, + "num_input_tokens_seen": 171602035, + "step": 7983, + "time_per_iteration": 2.551204204559326 + }, + { + "auxiliary_loss_clip": 0.01041156, + "auxiliary_loss_mlp": 0.00755401, + "balance_loss_clip": 1.02373624, + "balance_loss_mlp": 1.00027645, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.7980017172860907, + "language_loss": 0.59413171, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61209726, + "num_input_tokens_seen": 171659215, + "step": 7984, + "time_per_iteration": 3.0108609199523926 + }, + { + "auxiliary_loss_clip": 0.01069318, + "auxiliary_loss_mlp": 0.01056791, + "balance_loss_clip": 1.03568101, + "balance_loss_mlp": 1.03991103, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 2.8746822675303134, + "language_loss": 0.66835958, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.68962067, + "num_input_tokens_seen": 171675710, + "step": 7985, + "time_per_iteration": 4.076654672622681 + }, + { + "auxiliary_loss_clip": 0.01107653, + "auxiliary_loss_mlp": 0.01039328, + "balance_loss_clip": 1.04297912, + "balance_loss_mlp": 1.02500474, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 1.7328192998837022, + "language_loss": 0.70110953, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72257936, + "num_input_tokens_seen": 171692510, + "step": 7986, + "time_per_iteration": 2.4774630069732666 + }, + { + "auxiliary_loss_clip": 0.01091397, + "auxiliary_loss_mlp": 0.01042366, + "balance_loss_clip": 1.04414177, + "balance_loss_mlp": 1.02753615, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 4.929696699031717, + "language_loss": 0.79426497, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.81560266, + "num_input_tokens_seen": 171710235, + "step": 7987, + "time_per_iteration": 3.935520887374878 + }, + { + "auxiliary_loss_clip": 0.01074826, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.03914833, + "balance_loss_mlp": 1.02638721, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 1.9432432380923728, + "language_loss": 0.75299656, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.77415341, + "num_input_tokens_seen": 171726715, + "step": 7988, + "time_per_iteration": 2.560438394546509 + }, + { + "auxiliary_loss_clip": 0.01097103, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.04552412, + "balance_loss_mlp": 1.02530777, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 2.4104004823589977, + "language_loss": 0.79239613, + "learning_rate": 2.224053348748365e-06, + "loss": 0.81375754, + "num_input_tokens_seen": 171743605, + "step": 7989, + "time_per_iteration": 2.543377637863159 + }, + { + "auxiliary_loss_clip": 0.01106147, + "auxiliary_loss_mlp": 0.0104866, + "balance_loss_clip": 1.04133487, + "balance_loss_mlp": 1.03372872, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.7800998311453315, + "language_loss": 0.7353664, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75691456, + "num_input_tokens_seen": 171765445, + "step": 7990, + "time_per_iteration": 2.6458232402801514 + }, + { + "auxiliary_loss_clip": 0.01037694, + "auxiliary_loss_mlp": 0.0075588, + "balance_loss_clip": 1.02033544, + "balance_loss_mlp": 1.00014615, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 0.7735559614299474, + "language_loss": 0.59123051, + "learning_rate": 2.223279311579633e-06, + "loss": 0.60916626, + "num_input_tokens_seen": 171830115, + "step": 7991, + "time_per_iteration": 3.1473228931427 + }, + { + "auxiliary_loss_clip": 0.01116389, + "auxiliary_loss_mlp": 0.007802, + "balance_loss_clip": 1.04501355, + "balance_loss_mlp": 1.00032711, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 2.4720355809722347, + "language_loss": 0.66869479, + "learning_rate": 2.222892280287768e-06, + "loss": 0.68766069, + "num_input_tokens_seen": 171849135, + "step": 7992, + "time_per_iteration": 2.5790915489196777 + }, + { + "auxiliary_loss_clip": 0.01098548, + "auxiliary_loss_mlp": 0.010425, + "balance_loss_clip": 1.037233, + "balance_loss_mlp": 1.02812326, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 5.142370596222478, + "language_loss": 0.76415545, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78556591, + "num_input_tokens_seen": 171868880, + "step": 7993, + "time_per_iteration": 2.5277068614959717 + }, + { + "auxiliary_loss_clip": 0.01080498, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.04334962, + "balance_loss_mlp": 1.02230644, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.6959733628322147, + "language_loss": 0.78536093, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80652332, + "num_input_tokens_seen": 171889455, + "step": 7994, + "time_per_iteration": 2.6083836555480957 + }, + { + "auxiliary_loss_clip": 0.01105585, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.04125893, + "balance_loss_mlp": 1.0198729, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 1.8947289358010508, + "language_loss": 0.79506916, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81646019, + "num_input_tokens_seen": 171906070, + "step": 7995, + "time_per_iteration": 2.4848368167877197 + }, + { + "auxiliary_loss_clip": 0.01074511, + "auxiliary_loss_mlp": 0.0103762, + "balance_loss_clip": 1.04154372, + "balance_loss_mlp": 1.02313566, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 2.474789933528854, + "language_loss": 0.82728946, + "learning_rate": 2.2213440707461e-06, + "loss": 0.84841073, + "num_input_tokens_seen": 171926515, + "step": 7996, + "time_per_iteration": 2.6034562587738037 + }, + { + "auxiliary_loss_clip": 0.01054845, + "auxiliary_loss_mlp": 0.0103903, + "balance_loss_clip": 1.03479242, + "balance_loss_mlp": 1.02502847, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.6509499319944043, + "language_loss": 0.80409849, + "learning_rate": 2.220956997340516e-06, + "loss": 0.82503718, + "num_input_tokens_seen": 171943845, + "step": 7997, + "time_per_iteration": 2.5839414596557617 + }, + { + "auxiliary_loss_clip": 0.01078386, + "auxiliary_loss_mlp": 0.01037593, + "balance_loss_clip": 1.03939807, + "balance_loss_mlp": 1.02335393, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 1.8438190013972118, + "language_loss": 0.73130965, + "learning_rate": 2.220569915556221e-06, + "loss": 0.75246948, + "num_input_tokens_seen": 171964970, + "step": 7998, + "time_per_iteration": 2.627415180206299 + }, + { + "auxiliary_loss_clip": 0.01125504, + "auxiliary_loss_mlp": 0.01037612, + "balance_loss_clip": 1.04453373, + "balance_loss_mlp": 1.02349174, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.6927347631964507, + "language_loss": 0.70660716, + "learning_rate": 2.220182825407892e-06, + "loss": 0.72823834, + "num_input_tokens_seen": 171986340, + "step": 7999, + "time_per_iteration": 2.482776403427124 + }, + { + "auxiliary_loss_clip": 0.01116496, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.04231369, + "balance_loss_mlp": 1.03005481, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 1.5160040751061492, + "language_loss": 0.71093178, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73254204, + "num_input_tokens_seen": 172007300, + "step": 8000, + "time_per_iteration": 2.4876773357391357 + }, + { + "auxiliary_loss_clip": 0.01117307, + "auxiliary_loss_mlp": 0.01043326, + "balance_loss_clip": 1.04667425, + "balance_loss_mlp": 1.02862799, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.335923076021091, + "language_loss": 0.74696547, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.76857173, + "num_input_tokens_seen": 172029585, + "step": 8001, + "time_per_iteration": 4.126708745956421 + }, + { + "auxiliary_loss_clip": 0.01117029, + "auxiliary_loss_mlp": 0.01040441, + "balance_loss_clip": 1.04535151, + "balance_loss_mlp": 1.02570057, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.9038090347016925, + "language_loss": 0.8168751, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83844984, + "num_input_tokens_seen": 172047495, + "step": 8002, + "time_per_iteration": 2.4617204666137695 + }, + { + "auxiliary_loss_clip": 0.01122217, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.04841733, + "balance_loss_mlp": 1.02472746, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 1.7803491572998704, + "language_loss": 0.71656615, + "learning_rate": 2.218634381467819e-06, + "loss": 0.73818529, + "num_input_tokens_seen": 172067625, + "step": 8003, + "time_per_iteration": 2.5294365882873535 + }, + { + "auxiliary_loss_clip": 0.01110448, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.04301012, + "balance_loss_mlp": 1.02393126, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.5597187032965178, + "language_loss": 0.82276744, + "learning_rate": 2.218247249719507e-06, + "loss": 0.8442452, + "num_input_tokens_seen": 172087885, + "step": 8004, + "time_per_iteration": 2.4701828956604004 + }, + { + "auxiliary_loss_clip": 0.01109002, + "auxiliary_loss_mlp": 0.01045668, + "balance_loss_clip": 1.04396451, + "balance_loss_mlp": 1.02828169, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.4474800547795983, + "language_loss": 0.7786597, + "learning_rate": 2.217860109695239e-06, + "loss": 0.80020642, + "num_input_tokens_seen": 172105815, + "step": 8005, + "time_per_iteration": 2.462063789367676 + }, + { + "auxiliary_loss_clip": 0.01112726, + "auxiliary_loss_mlp": 0.01038468, + "balance_loss_clip": 1.04637742, + "balance_loss_mlp": 1.02362657, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 1.803829626599271, + "language_loss": 0.70954156, + "learning_rate": 2.217472961409692e-06, + "loss": 0.73105353, + "num_input_tokens_seen": 172126125, + "step": 8006, + "time_per_iteration": 2.4908101558685303 + }, + { + "auxiliary_loss_clip": 0.01099722, + "auxiliary_loss_mlp": 0.01046819, + "balance_loss_clip": 1.04060316, + "balance_loss_mlp": 1.03123856, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.9265998539677376, + "language_loss": 0.70558786, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72705328, + "num_input_tokens_seen": 172141945, + "step": 8007, + "time_per_iteration": 2.5253615379333496 + }, + { + "auxiliary_loss_clip": 0.0112896, + "auxiliary_loss_mlp": 0.0103967, + "balance_loss_clip": 1.0462153, + "balance_loss_mlp": 1.0251143, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 5.243709140993146, + "language_loss": 0.71997154, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.74165785, + "num_input_tokens_seen": 172161095, + "step": 8008, + "time_per_iteration": 2.4292593002319336 + }, + { + "auxiliary_loss_clip": 0.01095404, + "auxiliary_loss_mlp": 0.01047214, + "balance_loss_clip": 1.04537654, + "balance_loss_mlp": 1.03127587, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 2.0064071546153293, + "language_loss": 0.60756409, + "learning_rate": 2.216311467132199e-06, + "loss": 0.62899029, + "num_input_tokens_seen": 172178750, + "step": 8009, + "time_per_iteration": 2.5255141258239746 + }, + { + "auxiliary_loss_clip": 0.01023185, + "auxiliary_loss_mlp": 0.01002835, + "balance_loss_clip": 1.02434528, + "balance_loss_mlp": 1.00161898, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.8919648550707019, + "language_loss": 0.6130923, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63335252, + "num_input_tokens_seen": 172240235, + "step": 8010, + "time_per_iteration": 3.108870029449463 + }, + { + "auxiliary_loss_clip": 0.01120274, + "auxiliary_loss_mlp": 0.01047766, + "balance_loss_clip": 1.04837823, + "balance_loss_mlp": 1.03232813, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 1.9017369495619687, + "language_loss": 0.73370063, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75538111, + "num_input_tokens_seen": 172259875, + "step": 8011, + "time_per_iteration": 2.49646258354187 + }, + { + "auxiliary_loss_clip": 0.01101468, + "auxiliary_loss_mlp": 0.01037341, + "balance_loss_clip": 1.04100358, + "balance_loss_mlp": 1.02390575, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.892485908271602, + "language_loss": 0.79563838, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.8170265, + "num_input_tokens_seen": 172280150, + "step": 8012, + "time_per_iteration": 2.535707473754883 + }, + { + "auxiliary_loss_clip": 0.01095093, + "auxiliary_loss_mlp": 0.01048932, + "balance_loss_clip": 1.05099416, + "balance_loss_mlp": 1.0338943, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 2.1232003138632987, + "language_loss": 0.73429501, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75573528, + "num_input_tokens_seen": 172300810, + "step": 8013, + "time_per_iteration": 2.605523109436035 + }, + { + "auxiliary_loss_clip": 0.01100956, + "auxiliary_loss_mlp": 0.01034479, + "balance_loss_clip": 1.04860079, + "balance_loss_mlp": 1.02069271, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 2.3158924646654793, + "language_loss": 0.90434837, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92570269, + "num_input_tokens_seen": 172317930, + "step": 8014, + "time_per_iteration": 2.486048698425293 + }, + { + "auxiliary_loss_clip": 0.01131885, + "auxiliary_loss_mlp": 0.01044676, + "balance_loss_clip": 1.04695535, + "balance_loss_mlp": 1.02938759, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 2.091580379237072, + "language_loss": 0.74432427, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76608986, + "num_input_tokens_seen": 172336340, + "step": 8015, + "time_per_iteration": 2.4174399375915527 + }, + { + "auxiliary_loss_clip": 0.01112498, + "auxiliary_loss_mlp": 0.01043043, + "balance_loss_clip": 1.04536581, + "balance_loss_mlp": 1.02807677, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 1.9266449753581465, + "language_loss": 0.80352539, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82508075, + "num_input_tokens_seen": 172354315, + "step": 8016, + "time_per_iteration": 3.9494500160217285 + }, + { + "auxiliary_loss_clip": 0.01113165, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.0487206, + "balance_loss_mlp": 1.02244854, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 1.8516884260910873, + "language_loss": 0.77599686, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.7974906, + "num_input_tokens_seen": 172372695, + "step": 8017, + "time_per_iteration": 2.4815425872802734 + }, + { + "auxiliary_loss_clip": 0.01113362, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.04676533, + "balance_loss_mlp": 1.01955009, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 1.8445374026596533, + "language_loss": 0.80549073, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82696545, + "num_input_tokens_seen": 172390905, + "step": 8018, + "time_per_iteration": 2.53342866897583 + }, + { + "auxiliary_loss_clip": 0.01104643, + "auxiliary_loss_mlp": 0.01036947, + "balance_loss_clip": 1.05127645, + "balance_loss_mlp": 1.02383423, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.6851504345000494, + "language_loss": 0.76434612, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.78576195, + "num_input_tokens_seen": 172412295, + "step": 8019, + "time_per_iteration": 2.674337387084961 + }, + { + "auxiliary_loss_clip": 0.01091747, + "auxiliary_loss_mlp": 0.01043261, + "balance_loss_clip": 1.04586077, + "balance_loss_mlp": 1.02970672, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 2.3479982352924615, + "language_loss": 0.79232001, + "learning_rate": 2.212052026199701e-06, + "loss": 0.8136701, + "num_input_tokens_seen": 172432625, + "step": 8020, + "time_per_iteration": 2.5592093467712402 + }, + { + "auxiliary_loss_clip": 0.01123278, + "auxiliary_loss_mlp": 0.01040089, + "balance_loss_clip": 1.04411745, + "balance_loss_mlp": 1.0253365, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 2.179510816916591, + "language_loss": 0.70220381, + "learning_rate": 2.211664755756855e-06, + "loss": 0.72383744, + "num_input_tokens_seen": 172450010, + "step": 8021, + "time_per_iteration": 2.4085774421691895 + }, + { + "auxiliary_loss_clip": 0.01102006, + "auxiliary_loss_mlp": 0.0104416, + "balance_loss_clip": 1.04260063, + "balance_loss_mlp": 1.02897847, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 1.7958769798863565, + "language_loss": 0.62629592, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.64775753, + "num_input_tokens_seen": 172469080, + "step": 8022, + "time_per_iteration": 2.525595188140869 + }, + { + "auxiliary_loss_clip": 0.0110478, + "auxiliary_loss_mlp": 0.00779197, + "balance_loss_clip": 1.04707575, + "balance_loss_mlp": 1.0002929, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.6417122612784265, + "language_loss": 0.66381299, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.68265277, + "num_input_tokens_seen": 172484850, + "step": 8023, + "time_per_iteration": 2.510216236114502 + }, + { + "auxiliary_loss_clip": 0.0105779, + "auxiliary_loss_mlp": 0.01042663, + "balance_loss_clip": 1.03890634, + "balance_loss_mlp": 1.02642679, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 1.7903933542487183, + "language_loss": 0.76769322, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78869772, + "num_input_tokens_seen": 172503525, + "step": 8024, + "time_per_iteration": 2.6455395221710205 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01037709, + "balance_loss_clip": 1.04622507, + "balance_loss_mlp": 1.02205706, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.6820234433178936, + "language_loss": 0.7563318, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77777141, + "num_input_tokens_seen": 172524360, + "step": 8025, + "time_per_iteration": 4.019365549087524 + }, + { + "auxiliary_loss_clip": 0.0112671, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.04588842, + "balance_loss_mlp": 1.02320409, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.6982408997089358, + "language_loss": 0.70673305, + "learning_rate": 2.209728283441112e-06, + "loss": 0.72837996, + "num_input_tokens_seen": 172541480, + "step": 8026, + "time_per_iteration": 2.421562671661377 + }, + { + "auxiliary_loss_clip": 0.01113839, + "auxiliary_loss_mlp": 0.0104621, + "balance_loss_clip": 1.0410651, + "balance_loss_mlp": 1.02991366, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 1.981664409430425, + "language_loss": 0.74732769, + "learning_rate": 2.209340965060465e-06, + "loss": 0.76892811, + "num_input_tokens_seen": 172559005, + "step": 8027, + "time_per_iteration": 3.8702919483184814 + }, + { + "auxiliary_loss_clip": 0.01102456, + "auxiliary_loss_mlp": 0.01039757, + "balance_loss_clip": 1.04430246, + "balance_loss_mlp": 1.02538085, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.6671876351837862, + "language_loss": 0.67184031, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69326246, + "num_input_tokens_seen": 172578435, + "step": 8028, + "time_per_iteration": 2.5200977325439453 + }, + { + "auxiliary_loss_clip": 0.01104358, + "auxiliary_loss_mlp": 0.01042846, + "balance_loss_clip": 1.04355836, + "balance_loss_mlp": 1.02848744, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 1.4985096985507511, + "language_loss": 0.73022676, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75169885, + "num_input_tokens_seen": 172596095, + "step": 8029, + "time_per_iteration": 2.475353956222534 + }, + { + "auxiliary_loss_clip": 0.0110357, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.04697466, + "balance_loss_mlp": 1.01748371, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 2.0715773237941524, + "language_loss": 0.85251027, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.87386918, + "num_input_tokens_seen": 172615255, + "step": 8030, + "time_per_iteration": 2.546407461166382 + }, + { + "auxiliary_loss_clip": 0.01092397, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.03891206, + "balance_loss_mlp": 1.02206898, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 8.789956396009883, + "language_loss": 0.74242383, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76370406, + "num_input_tokens_seen": 172633185, + "step": 8031, + "time_per_iteration": 2.5147886276245117 + }, + { + "auxiliary_loss_clip": 0.01098866, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.04093158, + "balance_loss_mlp": 1.03531694, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 2.4079533334572925, + "language_loss": 0.71406674, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.73556691, + "num_input_tokens_seen": 172654280, + "step": 8032, + "time_per_iteration": 2.577432155609131 + }, + { + "auxiliary_loss_clip": 0.01108169, + "auxiliary_loss_mlp": 0.01046086, + "balance_loss_clip": 1.04037273, + "balance_loss_mlp": 1.03108358, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 1.6783796580646835, + "language_loss": 0.73835927, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.75990176, + "num_input_tokens_seen": 172675545, + "step": 8033, + "time_per_iteration": 2.4951820373535156 + }, + { + "auxiliary_loss_clip": 0.01076997, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.04923916, + "balance_loss_mlp": 1.02238703, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.5872396951216967, + "language_loss": 0.83634543, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85748255, + "num_input_tokens_seen": 172696455, + "step": 8034, + "time_per_iteration": 2.6493403911590576 + }, + { + "auxiliary_loss_clip": 0.01093623, + "auxiliary_loss_mlp": 0.0103443, + "balance_loss_clip": 1.04550838, + "balance_loss_mlp": 1.02081048, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 1.5933530278760653, + "language_loss": 0.79517072, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81645125, + "num_input_tokens_seen": 172716720, + "step": 8035, + "time_per_iteration": 2.5774450302124023 + }, + { + "auxiliary_loss_clip": 0.01103231, + "auxiliary_loss_mlp": 0.00781843, + "balance_loss_clip": 1.04332137, + "balance_loss_mlp": 1.000296, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 1.921009589884622, + "language_loss": 0.69395316, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71280384, + "num_input_tokens_seen": 172737435, + "step": 8036, + "time_per_iteration": 2.661067485809326 + }, + { + "auxiliary_loss_clip": 0.01112149, + "auxiliary_loss_mlp": 0.0103779, + "balance_loss_clip": 1.0428679, + "balance_loss_mlp": 1.02387834, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 1.7398361907737447, + "language_loss": 0.72982848, + "learning_rate": 2.205467347074847e-06, + "loss": 0.75132787, + "num_input_tokens_seen": 172755700, + "step": 8037, + "time_per_iteration": 2.4670424461364746 + }, + { + "auxiliary_loss_clip": 0.01081859, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_clip": 1.04521251, + "balance_loss_mlp": 1.02257562, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 3.0496616038191307, + "language_loss": 0.69723016, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71844733, + "num_input_tokens_seen": 172775185, + "step": 8038, + "time_per_iteration": 2.6004319190979004 + }, + { + "auxiliary_loss_clip": 0.01089892, + "auxiliary_loss_mlp": 0.01040483, + "balance_loss_clip": 1.04191589, + "balance_loss_mlp": 1.02599287, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.3868070905407681, + "language_loss": 0.7887013, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81000507, + "num_input_tokens_seen": 172796990, + "step": 8039, + "time_per_iteration": 2.6288700103759766 + }, + { + "auxiliary_loss_clip": 0.01114179, + "auxiliary_loss_mlp": 0.01034399, + "balance_loss_clip": 1.04499149, + "balance_loss_mlp": 1.02005863, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 1.52717836485625, + "language_loss": 0.7711395, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79262525, + "num_input_tokens_seen": 172814915, + "step": 8040, + "time_per_iteration": 3.919635534286499 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01037704, + "balance_loss_clip": 1.04550862, + "balance_loss_mlp": 1.0221293, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.632358716189522, + "language_loss": 0.75501549, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77656877, + "num_input_tokens_seen": 172837060, + "step": 8041, + "time_per_iteration": 2.5833799839019775 + }, + { + "auxiliary_loss_clip": 0.01089485, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.0427897, + "balance_loss_mlp": 1.02127254, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 2.384581841620609, + "language_loss": 0.66896838, + "learning_rate": 2.203530244988624e-06, + "loss": 0.69021446, + "num_input_tokens_seen": 172856545, + "step": 8042, + "time_per_iteration": 2.582956075668335 + }, + { + "auxiliary_loss_clip": 0.01027278, + "auxiliary_loss_mlp": 0.01007476, + "balance_loss_clip": 1.01895845, + "balance_loss_mlp": 1.00627804, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.6917605034106875, + "language_loss": 0.58497185, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60531938, + "num_input_tokens_seen": 172923055, + "step": 8043, + "time_per_iteration": 3.1631617546081543 + }, + { + "auxiliary_loss_clip": 0.01105298, + "auxiliary_loss_mlp": 0.01045585, + "balance_loss_clip": 1.04446721, + "balance_loss_mlp": 1.0286212, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 2.206067734369972, + "language_loss": 0.72371554, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.74522436, + "num_input_tokens_seen": 172940700, + "step": 8044, + "time_per_iteration": 2.477142333984375 + }, + { + "auxiliary_loss_clip": 0.01075762, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.0415988, + "balance_loss_mlp": 1.02209938, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.29697556075965, + "language_loss": 0.75918728, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78032124, + "num_input_tokens_seen": 172961125, + "step": 8045, + "time_per_iteration": 2.5804443359375 + }, + { + "auxiliary_loss_clip": 0.0108118, + "auxiliary_loss_mlp": 0.01035687, + "balance_loss_clip": 1.04561853, + "balance_loss_mlp": 1.02158463, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 1.6395402175734473, + "language_loss": 0.69170225, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71287096, + "num_input_tokens_seen": 172980405, + "step": 8046, + "time_per_iteration": 2.580824136734009 + }, + { + "auxiliary_loss_clip": 0.01124697, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.0435276, + "balance_loss_mlp": 1.02453411, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 2.0056635180451057, + "language_loss": 0.8265816, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84822321, + "num_input_tokens_seen": 172999105, + "step": 8047, + "time_per_iteration": 2.449455976486206 + }, + { + "auxiliary_loss_clip": 0.01095229, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.04103494, + "balance_loss_mlp": 1.02295053, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.6557280878525216, + "language_loss": 0.80292594, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.8242501, + "num_input_tokens_seen": 173019935, + "step": 8048, + "time_per_iteration": 2.545524835586548 + }, + { + "auxiliary_loss_clip": 0.01116173, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.0444994, + "balance_loss_mlp": 1.02461851, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.637539348080687, + "language_loss": 0.8112241, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83277839, + "num_input_tokens_seen": 173039700, + "step": 8049, + "time_per_iteration": 2.538059711456299 + }, + { + "auxiliary_loss_clip": 0.01102098, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.04626131, + "balance_loss_mlp": 1.01681232, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.6730461918595265, + "language_loss": 0.72550511, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74681687, + "num_input_tokens_seen": 173059170, + "step": 8050, + "time_per_iteration": 2.5633044242858887 + }, + { + "auxiliary_loss_clip": 0.01034732, + "auxiliary_loss_mlp": 0.0075544, + "balance_loss_clip": 1.0164336, + "balance_loss_mlp": 1.00013316, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.6965324333147513, + "language_loss": 0.56337869, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58128047, + "num_input_tokens_seen": 173119000, + "step": 8051, + "time_per_iteration": 3.093923330307007 + }, + { + "auxiliary_loss_clip": 0.01089115, + "auxiliary_loss_mlp": 0.01036626, + "balance_loss_clip": 1.04379237, + "balance_loss_mlp": 1.02249432, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 2.6937859742913686, + "language_loss": 0.75382304, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77508044, + "num_input_tokens_seen": 173137570, + "step": 8052, + "time_per_iteration": 2.5490520000457764 + }, + { + "auxiliary_loss_clip": 0.01109832, + "auxiliary_loss_mlp": 0.01033777, + "balance_loss_clip": 1.0437212, + "balance_loss_mlp": 1.0204494, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 2.5543047648816772, + "language_loss": 0.66109574, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68253183, + "num_input_tokens_seen": 173154355, + "step": 8053, + "time_per_iteration": 2.4497687816619873 + }, + { + "auxiliary_loss_clip": 0.01115037, + "auxiliary_loss_mlp": 0.01032917, + "balance_loss_clip": 1.04728448, + "balance_loss_mlp": 1.01965523, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 2.2776359553314354, + "language_loss": 0.69260895, + "learning_rate": 2.198880416254091e-06, + "loss": 0.7140885, + "num_input_tokens_seen": 173174845, + "step": 8054, + "time_per_iteration": 2.565274715423584 + }, + { + "auxiliary_loss_clip": 0.01064228, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.04373574, + "balance_loss_mlp": 1.0188297, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 2.872951039405111, + "language_loss": 0.69613051, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.71710056, + "num_input_tokens_seen": 173195025, + "step": 8055, + "time_per_iteration": 4.161544561386108 + }, + { + "auxiliary_loss_clip": 0.01115663, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.04606676, + "balance_loss_mlp": 1.02176988, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.2194646043827935, + "language_loss": 0.63228732, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65380281, + "num_input_tokens_seen": 173213065, + "step": 8056, + "time_per_iteration": 2.687865972518921 + }, + { + "auxiliary_loss_clip": 0.01111727, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.04204321, + "balance_loss_mlp": 1.01992035, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.7739474976701683, + "language_loss": 0.67438716, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69585568, + "num_input_tokens_seen": 173234545, + "step": 8057, + "time_per_iteration": 2.5652201175689697 + }, + { + "auxiliary_loss_clip": 0.01094124, + "auxiliary_loss_mlp": 0.01042954, + "balance_loss_clip": 1.04434419, + "balance_loss_mlp": 1.02747488, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.618568616519641, + "language_loss": 0.81626248, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83763331, + "num_input_tokens_seen": 173252175, + "step": 8058, + "time_per_iteration": 2.613323450088501 + }, + { + "auxiliary_loss_clip": 0.01113211, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.04245508, + "balance_loss_mlp": 1.02308393, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.9782220287387773, + "language_loss": 0.79747832, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.81898975, + "num_input_tokens_seen": 173268790, + "step": 8059, + "time_per_iteration": 2.4829938411712646 + }, + { + "auxiliary_loss_clip": 0.01130702, + "auxiliary_loss_mlp": 0.01042908, + "balance_loss_clip": 1.04774976, + "balance_loss_mlp": 1.02800047, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 3.8368555981477233, + "language_loss": 0.66794205, + "learning_rate": 2.196555093055352e-06, + "loss": 0.68967819, + "num_input_tokens_seen": 173288030, + "step": 8060, + "time_per_iteration": 2.5620803833007812 + }, + { + "auxiliary_loss_clip": 0.01115482, + "auxiliary_loss_mlp": 0.01042608, + "balance_loss_clip": 1.0492512, + "balance_loss_mlp": 1.02815425, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 1.6517276729550008, + "language_loss": 0.67263812, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69421899, + "num_input_tokens_seen": 173305965, + "step": 8061, + "time_per_iteration": 2.4845902919769287 + }, + { + "auxiliary_loss_clip": 0.01112365, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.05034089, + "balance_loss_mlp": 1.02907729, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 3.2487780373596133, + "language_loss": 0.82227576, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84384179, + "num_input_tokens_seen": 173321985, + "step": 8062, + "time_per_iteration": 2.4802048206329346 + }, + { + "auxiliary_loss_clip": 0.01066359, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.04516757, + "balance_loss_mlp": 1.01823258, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.779737801180613, + "language_loss": 0.741283, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76226652, + "num_input_tokens_seen": 173341315, + "step": 8063, + "time_per_iteration": 2.6065423488616943 + }, + { + "auxiliary_loss_clip": 0.0110137, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.04219985, + "balance_loss_mlp": 1.01937914, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 1.651127454306322, + "language_loss": 0.78827512, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.80962396, + "num_input_tokens_seen": 173361055, + "step": 8064, + "time_per_iteration": 4.062010049819946 + }, + { + "auxiliary_loss_clip": 0.0112286, + "auxiliary_loss_mlp": 0.00777423, + "balance_loss_clip": 1.04656363, + "balance_loss_mlp": 1.00037146, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 2.019796799593025, + "language_loss": 0.79142308, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81042588, + "num_input_tokens_seen": 173379255, + "step": 8065, + "time_per_iteration": 2.4445276260375977 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.00777202, + "balance_loss_clip": 1.04225147, + "balance_loss_mlp": 1.00029802, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 1.772067220669279, + "language_loss": 0.76179779, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78064382, + "num_input_tokens_seen": 173398370, + "step": 8066, + "time_per_iteration": 3.9385483264923096 + }, + { + "auxiliary_loss_clip": 0.01122992, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.04472756, + "balance_loss_mlp": 1.02125359, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.3435376592106125, + "language_loss": 0.72375679, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74533087, + "num_input_tokens_seen": 173419595, + "step": 8067, + "time_per_iteration": 2.495100736618042 + }, + { + "auxiliary_loss_clip": 0.01059061, + "auxiliary_loss_mlp": 0.01034185, + "balance_loss_clip": 1.04577947, + "balance_loss_mlp": 1.02005315, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 1.9812636816098192, + "language_loss": 0.78692722, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.80785978, + "num_input_tokens_seen": 173435390, + "step": 8068, + "time_per_iteration": 2.552830934524536 + }, + { + "auxiliary_loss_clip": 0.01094146, + "auxiliary_loss_mlp": 0.01035793, + "balance_loss_clip": 1.0389787, + "balance_loss_mlp": 1.02316916, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.4341465194182026, + "language_loss": 0.8446483, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86594772, + "num_input_tokens_seen": 173454095, + "step": 8069, + "time_per_iteration": 2.4799985885620117 + }, + { + "auxiliary_loss_clip": 0.01091153, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.04433346, + "balance_loss_mlp": 1.01960719, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.7390092686830163, + "language_loss": 0.77982432, + "learning_rate": 2.192678959687493e-06, + "loss": 0.80106246, + "num_input_tokens_seen": 173475300, + "step": 8070, + "time_per_iteration": 2.5858254432678223 + }, + { + "auxiliary_loss_clip": 0.01069962, + "auxiliary_loss_mlp": 0.01033825, + "balance_loss_clip": 1.04465568, + "balance_loss_mlp": 1.02057528, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 2.168250386844426, + "language_loss": 0.78135514, + "learning_rate": 2.192291305922943e-06, + "loss": 0.80239308, + "num_input_tokens_seen": 173492005, + "step": 8071, + "time_per_iteration": 2.562629461288452 + }, + { + "auxiliary_loss_clip": 0.01062614, + "auxiliary_loss_mlp": 0.01036068, + "balance_loss_clip": 1.03968108, + "balance_loss_mlp": 1.02194738, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 2.129439327362716, + "language_loss": 0.7178756, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.73886245, + "num_input_tokens_seen": 173511995, + "step": 8072, + "time_per_iteration": 2.649501323699951 + }, + { + "auxiliary_loss_clip": 0.01081023, + "auxiliary_loss_mlp": 0.01043066, + "balance_loss_clip": 1.04430127, + "balance_loss_mlp": 1.02875471, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 2.1799356159990984, + "language_loss": 0.87939459, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.90063548, + "num_input_tokens_seen": 173530215, + "step": 8073, + "time_per_iteration": 2.54535174369812 + }, + { + "auxiliary_loss_clip": 0.01081982, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.03872514, + "balance_loss_mlp": 1.02384377, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.6624973749814016, + "language_loss": 0.60854781, + "learning_rate": 2.19112830093786e-06, + "loss": 0.62975538, + "num_input_tokens_seen": 173550920, + "step": 8074, + "time_per_iteration": 2.5924291610717773 + }, + { + "auxiliary_loss_clip": 0.01089752, + "auxiliary_loss_mlp": 0.00780569, + "balance_loss_clip": 1.04292941, + "balance_loss_mlp": 1.00043082, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 2.0819104587007327, + "language_loss": 0.7306819, + "learning_rate": 2.19074061809469e-06, + "loss": 0.74938512, + "num_input_tokens_seen": 173569065, + "step": 8075, + "time_per_iteration": 2.5629477500915527 + }, + { + "auxiliary_loss_clip": 0.01119885, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.04467559, + "balance_loss_mlp": 1.02647591, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.5816436917346053, + "language_loss": 0.81867021, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.84026265, + "num_input_tokens_seen": 173596085, + "step": 8076, + "time_per_iteration": 2.865473985671997 + }, + { + "auxiliary_loss_clip": 0.0110759, + "auxiliary_loss_mlp": 0.01035242, + "balance_loss_clip": 1.04711306, + "balance_loss_mlp": 1.01992309, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 1.7491886419307687, + "language_loss": 0.86465502, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.8860833, + "num_input_tokens_seen": 173613900, + "step": 8077, + "time_per_iteration": 2.492405414581299 + }, + { + "auxiliary_loss_clip": 0.01009507, + "auxiliary_loss_mlp": 0.01002422, + "balance_loss_clip": 1.02046084, + "balance_loss_mlp": 1.00102711, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.9044285232634764, + "language_loss": 0.58462292, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60474217, + "num_input_tokens_seen": 173671305, + "step": 8078, + "time_per_iteration": 3.0771660804748535 + }, + { + "auxiliary_loss_clip": 0.01129656, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.04808807, + "balance_loss_mlp": 1.02085543, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 1.6480895860025828, + "language_loss": 0.72506136, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.7467041, + "num_input_tokens_seen": 173692070, + "step": 8079, + "time_per_iteration": 2.495556592941284 + }, + { + "auxiliary_loss_clip": 0.01091637, + "auxiliary_loss_mlp": 0.01035886, + "balance_loss_clip": 1.04773533, + "balance_loss_mlp": 1.02212346, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 3.323363905624295, + "language_loss": 0.79625899, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81753421, + "num_input_tokens_seen": 173709785, + "step": 8080, + "time_per_iteration": 4.073915481567383 + }, + { + "auxiliary_loss_clip": 0.01097451, + "auxiliary_loss_mlp": 0.01038871, + "balance_loss_clip": 1.03988028, + "balance_loss_mlp": 1.02427948, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 2.103768378363285, + "language_loss": 0.84335732, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86472052, + "num_input_tokens_seen": 173728770, + "step": 8081, + "time_per_iteration": 2.4972147941589355 + }, + { + "auxiliary_loss_clip": 0.01109767, + "auxiliary_loss_mlp": 0.01041141, + "balance_loss_clip": 1.04005909, + "balance_loss_mlp": 1.02570975, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.8540984412999402, + "language_loss": 0.8304168, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85192585, + "num_input_tokens_seen": 173747355, + "step": 8082, + "time_per_iteration": 2.4697775840759277 + }, + { + "auxiliary_loss_clip": 0.01105103, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.04795921, + "balance_loss_mlp": 1.02411687, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 2.0807353435185485, + "language_loss": 0.8737241, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89514494, + "num_input_tokens_seen": 173764825, + "step": 8083, + "time_per_iteration": 2.4985384941101074 + }, + { + "auxiliary_loss_clip": 0.01079958, + "auxiliary_loss_mlp": 0.01043572, + "balance_loss_clip": 1.04720426, + "balance_loss_mlp": 1.03106654, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.684905579819136, + "language_loss": 0.80719388, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.8284291, + "num_input_tokens_seen": 173783215, + "step": 8084, + "time_per_iteration": 2.5435597896575928 + }, + { + "auxiliary_loss_clip": 0.01113713, + "auxiliary_loss_mlp": 0.01038491, + "balance_loss_clip": 1.04483652, + "balance_loss_mlp": 1.0247041, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 3.368520847806667, + "language_loss": 0.68439889, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70592093, + "num_input_tokens_seen": 173801905, + "step": 8085, + "time_per_iteration": 2.4823381900787354 + }, + { + "auxiliary_loss_clip": 0.01113979, + "auxiliary_loss_mlp": 0.01040038, + "balance_loss_clip": 1.04524589, + "balance_loss_mlp": 1.02631128, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.518332881078115, + "language_loss": 0.77477109, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79631126, + "num_input_tokens_seen": 173824690, + "step": 8086, + "time_per_iteration": 2.548119068145752 + }, + { + "auxiliary_loss_clip": 0.01123351, + "auxiliary_loss_mlp": 0.01033147, + "balance_loss_clip": 1.04476666, + "balance_loss_mlp": 1.01905656, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 2.291269817833539, + "language_loss": 0.69509256, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.71665752, + "num_input_tokens_seen": 173844450, + "step": 8087, + "time_per_iteration": 2.560316324234009 + }, + { + "auxiliary_loss_clip": 0.0111635, + "auxiliary_loss_mlp": 0.01040228, + "balance_loss_clip": 1.04342771, + "balance_loss_mlp": 1.02521932, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.8692984800166765, + "language_loss": 0.72341686, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.74498266, + "num_input_tokens_seen": 173864975, + "step": 8088, + "time_per_iteration": 2.574941873550415 + }, + { + "auxiliary_loss_clip": 0.01101099, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.04288912, + "balance_loss_mlp": 1.02462316, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.8345275941904986, + "language_loss": 0.75438905, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77578366, + "num_input_tokens_seen": 173883805, + "step": 8089, + "time_per_iteration": 2.5009751319885254 + }, + { + "auxiliary_loss_clip": 0.01092985, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.04420519, + "balance_loss_mlp": 1.0197928, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 1.7386952812611303, + "language_loss": 0.84103769, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86230808, + "num_input_tokens_seen": 173903520, + "step": 8090, + "time_per_iteration": 2.544318914413452 + }, + { + "auxiliary_loss_clip": 0.01122663, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.04672289, + "balance_loss_mlp": 1.02097368, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 2.0043973704832267, + "language_loss": 0.75983548, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78140879, + "num_input_tokens_seen": 173924255, + "step": 8091, + "time_per_iteration": 2.454754590988159 + }, + { + "auxiliary_loss_clip": 0.01113745, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.04303765, + "balance_loss_mlp": 1.01755333, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.5227860404196158, + "language_loss": 0.80597919, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82743073, + "num_input_tokens_seen": 173943285, + "step": 8092, + "time_per_iteration": 2.5148072242736816 + }, + { + "auxiliary_loss_clip": 0.0109835, + "auxiliary_loss_mlp": 0.00784619, + "balance_loss_clip": 1.04184282, + "balance_loss_mlp": 1.00045121, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.4965593435932076, + "language_loss": 0.71802998, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73685968, + "num_input_tokens_seen": 173962205, + "step": 8093, + "time_per_iteration": 2.4957454204559326 + }, + { + "auxiliary_loss_clip": 0.0112325, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.0443238, + "balance_loss_mlp": 1.02105784, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 1.828657307273878, + "language_loss": 0.67648363, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.69806468, + "num_input_tokens_seen": 173980945, + "step": 8094, + "time_per_iteration": 2.453927755355835 + }, + { + "auxiliary_loss_clip": 0.01107964, + "auxiliary_loss_mlp": 0.0104434, + "balance_loss_clip": 1.04686439, + "balance_loss_mlp": 1.02934361, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 2.2780408639591654, + "language_loss": 0.66423011, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.68575317, + "num_input_tokens_seen": 173998860, + "step": 8095, + "time_per_iteration": 4.197656869888306 + }, + { + "auxiliary_loss_clip": 0.01109886, + "auxiliary_loss_mlp": 0.0103861, + "balance_loss_clip": 1.04325819, + "balance_loss_mlp": 1.02321422, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 1.9836196990087778, + "language_loss": 0.78706288, + "learning_rate": 2.182597630229345e-06, + "loss": 0.80854785, + "num_input_tokens_seen": 174016665, + "step": 8096, + "time_per_iteration": 2.452028751373291 + }, + { + "auxiliary_loss_clip": 0.01094298, + "auxiliary_loss_mlp": 0.01040128, + "balance_loss_clip": 1.03736687, + "balance_loss_mlp": 1.02528036, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 1.856406738893639, + "language_loss": 0.67366534, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69500965, + "num_input_tokens_seen": 174034800, + "step": 8097, + "time_per_iteration": 2.5296566486358643 + }, + { + "auxiliary_loss_clip": 0.01097185, + "auxiliary_loss_mlp": 0.01036967, + "balance_loss_clip": 1.04244661, + "balance_loss_mlp": 1.02268529, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.7089793228306538, + "language_loss": 0.71469283, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73603439, + "num_input_tokens_seen": 174054445, + "step": 8098, + "time_per_iteration": 2.524340867996216 + }, + { + "auxiliary_loss_clip": 0.01118932, + "auxiliary_loss_mlp": 0.01039249, + "balance_loss_clip": 1.04496157, + "balance_loss_mlp": 1.02401996, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 1.947913566107484, + "language_loss": 0.65935361, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68093538, + "num_input_tokens_seen": 174077890, + "step": 8099, + "time_per_iteration": 2.670588970184326 + }, + { + "auxiliary_loss_clip": 0.01068725, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_clip": 1.03465891, + "balance_loss_mlp": 1.02731848, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.8965779157366427, + "language_loss": 0.67297447, + "learning_rate": 2.181046234549138e-06, + "loss": 0.69407368, + "num_input_tokens_seen": 174097460, + "step": 8100, + "time_per_iteration": 2.583772659301758 + }, + { + "auxiliary_loss_clip": 0.01090094, + "auxiliary_loss_mlp": 0.01036258, + "balance_loss_clip": 1.04357982, + "balance_loss_mlp": 1.02262056, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.36845888066012, + "language_loss": 0.76701319, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78827673, + "num_input_tokens_seen": 174120775, + "step": 8101, + "time_per_iteration": 2.597431182861328 + }, + { + "auxiliary_loss_clip": 0.01042582, + "auxiliary_loss_mlp": 0.01008741, + "balance_loss_clip": 1.01606345, + "balance_loss_mlp": 1.00726306, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6994129605292798, + "language_loss": 0.52320564, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54371887, + "num_input_tokens_seen": 174189135, + "step": 8102, + "time_per_iteration": 3.1605777740478516 + }, + { + "auxiliary_loss_clip": 0.01098225, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.04806018, + "balance_loss_mlp": 1.01963639, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 2.2164353208960366, + "language_loss": 0.73797894, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.75929719, + "num_input_tokens_seen": 174203250, + "step": 8103, + "time_per_iteration": 2.449112892150879 + }, + { + "auxiliary_loss_clip": 0.0111442, + "auxiliary_loss_mlp": 0.01045116, + "balance_loss_clip": 1.04468417, + "balance_loss_mlp": 1.03053069, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 1.6212821391662655, + "language_loss": 0.63399476, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.65559006, + "num_input_tokens_seen": 174224145, + "step": 8104, + "time_per_iteration": 3.9950060844421387 + }, + { + "auxiliary_loss_clip": 0.01123986, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.04579353, + "balance_loss_mlp": 1.01939034, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 1.608798259546893, + "language_loss": 0.68900818, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71058178, + "num_input_tokens_seen": 174244435, + "step": 8105, + "time_per_iteration": 3.900933265686035 + }, + { + "auxiliary_loss_clip": 0.01085587, + "auxiliary_loss_mlp": 0.01031532, + "balance_loss_clip": 1.04119706, + "balance_loss_mlp": 1.0175308, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.794124768163697, + "language_loss": 0.73446256, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75563377, + "num_input_tokens_seen": 174262710, + "step": 8106, + "time_per_iteration": 2.50935959815979 + }, + { + "auxiliary_loss_clip": 0.01107489, + "auxiliary_loss_mlp": 0.00781406, + "balance_loss_clip": 1.04616332, + "balance_loss_mlp": 1.00051725, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 1.9624712864834066, + "language_loss": 0.76538789, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.78427684, + "num_input_tokens_seen": 174281545, + "step": 8107, + "time_per_iteration": 2.5372047424316406 + }, + { + "auxiliary_loss_clip": 0.01071011, + "auxiliary_loss_mlp": 0.01032425, + "balance_loss_clip": 1.04453015, + "balance_loss_mlp": 1.01912749, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 2.0581204970875646, + "language_loss": 0.75524569, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77628005, + "num_input_tokens_seen": 174300290, + "step": 8108, + "time_per_iteration": 2.5804879665374756 + }, + { + "auxiliary_loss_clip": 0.01108728, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.04361176, + "balance_loss_mlp": 1.02062798, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.9901110920045653, + "language_loss": 0.73452163, + "learning_rate": 2.177555194083212e-06, + "loss": 0.75593638, + "num_input_tokens_seen": 174318490, + "step": 8109, + "time_per_iteration": 2.4599339962005615 + }, + { + "auxiliary_loss_clip": 0.01110814, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.04466987, + "balance_loss_mlp": 1.0205338, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 2.050506533687165, + "language_loss": 0.78709793, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80855167, + "num_input_tokens_seen": 174335505, + "step": 8110, + "time_per_iteration": 2.4847803115844727 + }, + { + "auxiliary_loss_clip": 0.01112723, + "auxiliary_loss_mlp": 0.01041356, + "balance_loss_clip": 1.04595017, + "balance_loss_mlp": 1.02768922, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 1.765254402970385, + "language_loss": 0.72563243, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74717325, + "num_input_tokens_seen": 174353990, + "step": 8111, + "time_per_iteration": 2.437246561050415 + }, + { + "auxiliary_loss_clip": 0.01111083, + "auxiliary_loss_mlp": 0.01039937, + "balance_loss_clip": 1.04524994, + "balance_loss_mlp": 1.02595341, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.5925473603530824, + "language_loss": 0.76681197, + "learning_rate": 2.17639139220597e-06, + "loss": 0.78832221, + "num_input_tokens_seen": 174373425, + "step": 8112, + "time_per_iteration": 2.4669272899627686 + }, + { + "auxiliary_loss_clip": 0.01115068, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.04417634, + "balance_loss_mlp": 1.0271033, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 1.7694130894948699, + "language_loss": 0.75163722, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77320546, + "num_input_tokens_seen": 174393070, + "step": 8113, + "time_per_iteration": 2.470911979675293 + }, + { + "auxiliary_loss_clip": 0.01025246, + "auxiliary_loss_mlp": 0.00755444, + "balance_loss_clip": 1.01741099, + "balance_loss_mlp": 1.00046337, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.7941510656224192, + "language_loss": 0.48821044, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50601739, + "num_input_tokens_seen": 174446880, + "step": 8114, + "time_per_iteration": 2.9904708862304688 + }, + { + "auxiliary_loss_clip": 0.01096913, + "auxiliary_loss_mlp": 0.01043242, + "balance_loss_clip": 1.04635406, + "balance_loss_mlp": 1.02842402, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.4456733151627084, + "language_loss": 0.7654078, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78680933, + "num_input_tokens_seen": 174468485, + "step": 8115, + "time_per_iteration": 2.642334222793579 + }, + { + "auxiliary_loss_clip": 0.01104199, + "auxiliary_loss_mlp": 0.01039353, + "balance_loss_clip": 1.04457545, + "balance_loss_mlp": 1.02535796, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 1.9213870586585047, + "language_loss": 0.72118151, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74261701, + "num_input_tokens_seen": 174486360, + "step": 8116, + "time_per_iteration": 2.5077872276306152 + }, + { + "auxiliary_loss_clip": 0.01088791, + "auxiliary_loss_mlp": 0.01036821, + "balance_loss_clip": 1.04142451, + "balance_loss_mlp": 1.02345228, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 2.2962506792851074, + "language_loss": 0.63308024, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65433633, + "num_input_tokens_seen": 174505075, + "step": 8117, + "time_per_iteration": 2.5241880416870117 + }, + { + "auxiliary_loss_clip": 0.01091532, + "auxiliary_loss_mlp": 0.01046423, + "balance_loss_clip": 1.03821111, + "balance_loss_mlp": 1.03171253, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.700342163006105, + "language_loss": 0.79074001, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81211954, + "num_input_tokens_seen": 174523385, + "step": 8118, + "time_per_iteration": 2.514204740524292 + }, + { + "auxiliary_loss_clip": 0.01098758, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.04079485, + "balance_loss_mlp": 1.02356899, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 1.7642959480673048, + "language_loss": 0.63329118, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65465438, + "num_input_tokens_seen": 174542200, + "step": 8119, + "time_per_iteration": 4.0075976848602295 + }, + { + "auxiliary_loss_clip": 0.0106577, + "auxiliary_loss_mlp": 0.00778172, + "balance_loss_clip": 1.04538107, + "balance_loss_mlp": 1.00037956, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 2.515010802200704, + "language_loss": 0.72931254, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74775189, + "num_input_tokens_seen": 174563620, + "step": 8120, + "time_per_iteration": 2.644801139831543 + }, + { + "auxiliary_loss_clip": 0.01114306, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.04308426, + "balance_loss_mlp": 1.01802945, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 1.9618099430063358, + "language_loss": 0.63854432, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.66000748, + "num_input_tokens_seen": 174586465, + "step": 8121, + "time_per_iteration": 2.590177536010742 + }, + { + "auxiliary_loss_clip": 0.01112802, + "auxiliary_loss_mlp": 0.01041855, + "balance_loss_clip": 1.04181659, + "balance_loss_mlp": 1.02684093, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 1.8200496449578616, + "language_loss": 0.82607305, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.84761965, + "num_input_tokens_seen": 174604035, + "step": 8122, + "time_per_iteration": 2.471820592880249 + }, + { + "auxiliary_loss_clip": 0.01111313, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.04176903, + "balance_loss_mlp": 1.02936959, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 1.6633729186407236, + "language_loss": 0.85780025, + "learning_rate": 2.172123606640866e-06, + "loss": 0.87936997, + "num_input_tokens_seen": 174621715, + "step": 8123, + "time_per_iteration": 2.448122501373291 + }, + { + "auxiliary_loss_clip": 0.01091519, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.04664862, + "balance_loss_mlp": 1.01779723, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 1.9303105637237727, + "language_loss": 0.85402393, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.875247, + "num_input_tokens_seen": 174643835, + "step": 8124, + "time_per_iteration": 2.5770890712738037 + }, + { + "auxiliary_loss_clip": 0.01100316, + "auxiliary_loss_mlp": 0.01036683, + "balance_loss_clip": 1.04396129, + "balance_loss_mlp": 1.02352262, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 1.8094817500419047, + "language_loss": 0.79329991, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81466985, + "num_input_tokens_seen": 174660955, + "step": 8125, + "time_per_iteration": 2.498074531555176 + }, + { + "auxiliary_loss_clip": 0.01075258, + "auxiliary_loss_mlp": 0.01041479, + "balance_loss_clip": 1.03961861, + "balance_loss_mlp": 1.02729344, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 2.0605021388430926, + "language_loss": 0.72793901, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74910635, + "num_input_tokens_seen": 174678270, + "step": 8126, + "time_per_iteration": 2.5497028827667236 + }, + { + "auxiliary_loss_clip": 0.01111388, + "auxiliary_loss_mlp": 0.01036585, + "balance_loss_clip": 1.04056287, + "balance_loss_mlp": 1.02265549, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 1.5761626274752054, + "language_loss": 0.69201994, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.71349967, + "num_input_tokens_seen": 174698360, + "step": 8127, + "time_per_iteration": 2.5328423976898193 + }, + { + "auxiliary_loss_clip": 0.01124774, + "auxiliary_loss_mlp": 0.01038148, + "balance_loss_clip": 1.04200435, + "balance_loss_mlp": 1.02349758, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 1.687857312204896, + "language_loss": 0.76066208, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78229129, + "num_input_tokens_seen": 174716755, + "step": 8128, + "time_per_iteration": 2.4109654426574707 + }, + { + "auxiliary_loss_clip": 0.01123335, + "auxiliary_loss_mlp": 0.01033779, + "balance_loss_clip": 1.04421365, + "balance_loss_mlp": 1.02045774, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.5926947738243036, + "language_loss": 0.76145488, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78302604, + "num_input_tokens_seen": 174735560, + "step": 8129, + "time_per_iteration": 2.4225354194641113 + }, + { + "auxiliary_loss_clip": 0.01112038, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.04361737, + "balance_loss_mlp": 1.0222497, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 7.742660298939345, + "language_loss": 0.65229821, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67378217, + "num_input_tokens_seen": 174752730, + "step": 8130, + "time_per_iteration": 2.427860736846924 + }, + { + "auxiliary_loss_clip": 0.01086461, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.03842735, + "balance_loss_mlp": 1.02114177, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 5.328015213453795, + "language_loss": 0.72144771, + "learning_rate": 2.169019265427658e-06, + "loss": 0.74265695, + "num_input_tokens_seen": 174772520, + "step": 8131, + "time_per_iteration": 2.5660367012023926 + }, + { + "auxiliary_loss_clip": 0.01113162, + "auxiliary_loss_mlp": 0.01043869, + "balance_loss_clip": 1.04255331, + "balance_loss_mlp": 1.02984357, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.3587638995741345, + "language_loss": 0.69576466, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.71733499, + "num_input_tokens_seen": 174796540, + "step": 8132, + "time_per_iteration": 2.6097521781921387 + }, + { + "auxiliary_loss_clip": 0.01104532, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.04205382, + "balance_loss_mlp": 1.01969862, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.5700523220840965, + "language_loss": 0.69945979, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72084355, + "num_input_tokens_seen": 174817840, + "step": 8133, + "time_per_iteration": 2.513667583465576 + }, + { + "auxiliary_loss_clip": 0.01068427, + "auxiliary_loss_mlp": 0.01044684, + "balance_loss_clip": 1.03815162, + "balance_loss_mlp": 1.03003931, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 1.6791601287094742, + "language_loss": 0.70651066, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.72764182, + "num_input_tokens_seen": 174837885, + "step": 8134, + "time_per_iteration": 4.0455873012542725 + }, + { + "auxiliary_loss_clip": 0.01080699, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.04080558, + "balance_loss_mlp": 1.02163303, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 2.588779870909645, + "language_loss": 0.80721849, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82839, + "num_input_tokens_seen": 174855240, + "step": 8135, + "time_per_iteration": 2.596742630004883 + }, + { + "auxiliary_loss_clip": 0.01120059, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.04243672, + "balance_loss_mlp": 1.02557182, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.6658227984078515, + "language_loss": 0.74932933, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.77091587, + "num_input_tokens_seen": 174875145, + "step": 8136, + "time_per_iteration": 2.437396287918091 + }, + { + "auxiliary_loss_clip": 0.01094061, + "auxiliary_loss_mlp": 0.01042487, + "balance_loss_clip": 1.03945684, + "balance_loss_mlp": 1.02852166, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 1.533441131561574, + "language_loss": 0.73461527, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75598073, + "num_input_tokens_seen": 174894770, + "step": 8137, + "time_per_iteration": 2.5228195190429688 + }, + { + "auxiliary_loss_clip": 0.01052034, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.03818488, + "balance_loss_mlp": 1.01933479, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 2.4687837836339113, + "language_loss": 0.75583148, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.776685, + "num_input_tokens_seen": 174912780, + "step": 8138, + "time_per_iteration": 2.6349847316741943 + }, + { + "auxiliary_loss_clip": 0.01093467, + "auxiliary_loss_mlp": 0.010357, + "balance_loss_clip": 1.04922605, + "balance_loss_mlp": 1.0227958, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.5942820238557653, + "language_loss": 0.74261236, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76390409, + "num_input_tokens_seen": 174931250, + "step": 8139, + "time_per_iteration": 2.553730010986328 + }, + { + "auxiliary_loss_clip": 0.01110991, + "auxiliary_loss_mlp": 0.01038317, + "balance_loss_clip": 1.04041076, + "balance_loss_mlp": 1.02511489, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 2.0101913869142, + "language_loss": 0.62194282, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64343596, + "num_input_tokens_seen": 174951105, + "step": 8140, + "time_per_iteration": 2.473914384841919 + }, + { + "auxiliary_loss_clip": 0.01091605, + "auxiliary_loss_mlp": 0.01041617, + "balance_loss_clip": 1.04366624, + "balance_loss_mlp": 1.02732384, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 3.4006008524065274, + "language_loss": 0.82457811, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84591031, + "num_input_tokens_seen": 174969120, + "step": 8141, + "time_per_iteration": 2.5884501934051514 + }, + { + "auxiliary_loss_clip": 0.01093594, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.04759598, + "balance_loss_mlp": 1.02056384, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.8630640374175278, + "language_loss": 0.72126067, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74254346, + "num_input_tokens_seen": 174991295, + "step": 8142, + "time_per_iteration": 2.590928077697754 + }, + { + "auxiliary_loss_clip": 0.01120118, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.04230309, + "balance_loss_mlp": 1.02257216, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 1.6182413684977337, + "language_loss": 0.66933131, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.69088984, + "num_input_tokens_seen": 175012830, + "step": 8143, + "time_per_iteration": 2.4874086380004883 + }, + { + "auxiliary_loss_clip": 0.01109609, + "auxiliary_loss_mlp": 0.00777014, + "balance_loss_clip": 1.04254532, + "balance_loss_mlp": 1.00033295, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.5296954571204777, + "language_loss": 0.75159121, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77045739, + "num_input_tokens_seen": 175035695, + "step": 8144, + "time_per_iteration": 4.047656774520874 + }, + { + "auxiliary_loss_clip": 0.01095616, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.03788304, + "balance_loss_mlp": 1.01685941, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 1.3838750203896386, + "language_loss": 0.76015115, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.78141552, + "num_input_tokens_seen": 175056425, + "step": 8145, + "time_per_iteration": 3.917693614959717 + }, + { + "auxiliary_loss_clip": 0.01101861, + "auxiliary_loss_mlp": 0.00780591, + "balance_loss_clip": 1.03986573, + "balance_loss_mlp": 1.00043178, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.7496953683789975, + "language_loss": 0.80651605, + "learning_rate": 2.163197525984761e-06, + "loss": 0.82534051, + "num_input_tokens_seen": 175074800, + "step": 8146, + "time_per_iteration": 2.4873287677764893 + }, + { + "auxiliary_loss_clip": 0.01105696, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.04016066, + "balance_loss_mlp": 1.01834428, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 1.629354747768459, + "language_loss": 0.74243176, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76380885, + "num_input_tokens_seen": 175094500, + "step": 8147, + "time_per_iteration": 2.483207941055298 + }, + { + "auxiliary_loss_clip": 0.01090509, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.04047918, + "balance_loss_mlp": 1.01749671, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 2.2101352023281944, + "language_loss": 0.82631755, + "learning_rate": 2.162421187770864e-06, + "loss": 0.84753555, + "num_input_tokens_seen": 175112920, + "step": 8148, + "time_per_iteration": 2.459606170654297 + }, + { + "auxiliary_loss_clip": 0.01086669, + "auxiliary_loss_mlp": 0.01032885, + "balance_loss_clip": 1.03785574, + "balance_loss_mlp": 1.02077377, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 1.705222217816374, + "language_loss": 0.73685551, + "learning_rate": 2.162033009418015e-06, + "loss": 0.75805104, + "num_input_tokens_seen": 175129910, + "step": 8149, + "time_per_iteration": 2.4960544109344482 + }, + { + "auxiliary_loss_clip": 0.01127593, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.04559255, + "balance_loss_mlp": 1.01926804, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 1.9577402139132662, + "language_loss": 0.7641747, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.78578961, + "num_input_tokens_seen": 175148705, + "step": 8150, + "time_per_iteration": 2.465202808380127 + }, + { + "auxiliary_loss_clip": 0.0110016, + "auxiliary_loss_mlp": 0.01034233, + "balance_loss_clip": 1.0443511, + "balance_loss_mlp": 1.02021992, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 2.6806378167069034, + "language_loss": 0.72822356, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.74956751, + "num_input_tokens_seen": 175167425, + "step": 8151, + "time_per_iteration": 2.4887495040893555 + }, + { + "auxiliary_loss_clip": 0.01007026, + "auxiliary_loss_mlp": 0.01001302, + "balance_loss_clip": 1.01787734, + "balance_loss_mlp": 0.99993747, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8374074289331582, + "language_loss": 0.54327118, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56335449, + "num_input_tokens_seen": 175227985, + "step": 8152, + "time_per_iteration": 3.0969791412353516 + }, + { + "auxiliary_loss_clip": 0.01065315, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.03831363, + "balance_loss_mlp": 1.01901555, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.8372919220228852, + "language_loss": 0.61577892, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.63675874, + "num_input_tokens_seen": 175251895, + "step": 8153, + "time_per_iteration": 2.773054838180542 + }, + { + "auxiliary_loss_clip": 0.01086984, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.0399251, + "balance_loss_mlp": 1.02453375, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.5924634767043122, + "language_loss": 0.77067101, + "learning_rate": 2.160092025783549e-06, + "loss": 0.79192507, + "num_input_tokens_seen": 175272770, + "step": 8154, + "time_per_iteration": 2.600536584854126 + }, + { + "auxiliary_loss_clip": 0.01022336, + "auxiliary_loss_mlp": 0.01004457, + "balance_loss_clip": 1.01462126, + "balance_loss_mlp": 1.00313997, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9826407393106396, + "language_loss": 0.67034447, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69061238, + "num_input_tokens_seen": 175336320, + "step": 8155, + "time_per_iteration": 3.146655559539795 + }, + { + "auxiliary_loss_clip": 0.01123453, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.04409373, + "balance_loss_mlp": 1.0157876, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 2.3559943792298395, + "language_loss": 0.76413214, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.78565133, + "num_input_tokens_seen": 175353540, + "step": 8156, + "time_per_iteration": 2.426912784576416 + }, + { + "auxiliary_loss_clip": 0.01110857, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.04255319, + "balance_loss_mlp": 1.01847053, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.0405066501891596, + "language_loss": 0.83527231, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85669458, + "num_input_tokens_seen": 175370445, + "step": 8157, + "time_per_iteration": 2.4789271354675293 + }, + { + "auxiliary_loss_clip": 0.01112641, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.04196882, + "balance_loss_mlp": 1.01789415, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.8179419687523146, + "language_loss": 0.79888213, + "learning_rate": 2.158539129514956e-06, + "loss": 0.82032061, + "num_input_tokens_seen": 175389020, + "step": 8158, + "time_per_iteration": 4.011245489120483 + }, + { + "auxiliary_loss_clip": 0.0112602, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.04508924, + "balance_loss_mlp": 1.01669168, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 1.561765506376334, + "language_loss": 0.69265234, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71421653, + "num_input_tokens_seen": 175409545, + "step": 8159, + "time_per_iteration": 2.4800994396209717 + }, + { + "auxiliary_loss_clip": 0.01105538, + "auxiliary_loss_mlp": 0.01042894, + "balance_loss_clip": 1.04026198, + "balance_loss_mlp": 1.02870166, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 2.1763555804415975, + "language_loss": 0.73294723, + "learning_rate": 2.157762645250854e-06, + "loss": 0.75443155, + "num_input_tokens_seen": 175429335, + "step": 8160, + "time_per_iteration": 2.4627785682678223 + }, + { + "auxiliary_loss_clip": 0.01114363, + "auxiliary_loss_mlp": 0.01041278, + "balance_loss_clip": 1.04696357, + "balance_loss_mlp": 1.02639496, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 1.8776659737377237, + "language_loss": 0.71617508, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73773146, + "num_input_tokens_seen": 175446955, + "step": 8161, + "time_per_iteration": 2.438272476196289 + }, + { + "auxiliary_loss_clip": 0.01076613, + "auxiliary_loss_mlp": 0.01038404, + "balance_loss_clip": 1.03936779, + "balance_loss_mlp": 1.02484989, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 3.244242411267998, + "language_loss": 0.68840873, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70955896, + "num_input_tokens_seen": 175468195, + "step": 8162, + "time_per_iteration": 2.6133029460906982 + }, + { + "auxiliary_loss_clip": 0.01110281, + "auxiliary_loss_mlp": 0.01040809, + "balance_loss_clip": 1.04328275, + "balance_loss_mlp": 1.0252645, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 1.875808288942691, + "language_loss": 0.63666809, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65817893, + "num_input_tokens_seen": 175487455, + "step": 8163, + "time_per_iteration": 2.4834303855895996 + }, + { + "auxiliary_loss_clip": 0.01083246, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.03886521, + "balance_loss_mlp": 1.01891673, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 2.509858435193745, + "language_loss": 0.77246213, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.79361451, + "num_input_tokens_seen": 175504450, + "step": 8164, + "time_per_iteration": 2.6518285274505615 + }, + { + "auxiliary_loss_clip": 0.01105484, + "auxiliary_loss_mlp": 0.01035519, + "balance_loss_clip": 1.03809011, + "balance_loss_mlp": 1.01985514, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.6080774249413063, + "language_loss": 0.76963007, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.79104006, + "num_input_tokens_seen": 175523600, + "step": 8165, + "time_per_iteration": 2.451504707336426 + }, + { + "auxiliary_loss_clip": 0.0110291, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.04541469, + "balance_loss_mlp": 1.02393627, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 2.0526086461928355, + "language_loss": 0.77736008, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79876161, + "num_input_tokens_seen": 175542720, + "step": 8166, + "time_per_iteration": 2.5027594566345215 + }, + { + "auxiliary_loss_clip": 0.01031414, + "auxiliary_loss_mlp": 0.01000674, + "balance_loss_clip": 1.01457262, + "balance_loss_mlp": 0.99940413, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.7956459293146992, + "language_loss": 0.54170418, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56202507, + "num_input_tokens_seen": 175598640, + "step": 8167, + "time_per_iteration": 3.060723066329956 + }, + { + "auxiliary_loss_clip": 0.01079564, + "auxiliary_loss_mlp": 0.01030444, + "balance_loss_clip": 1.04594278, + "balance_loss_mlp": 1.01740789, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 6.325350604629843, + "language_loss": 0.85729796, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.878398, + "num_input_tokens_seen": 175615675, + "step": 8168, + "time_per_iteration": 2.549424886703491 + }, + { + "auxiliary_loss_clip": 0.01103664, + "auxiliary_loss_mlp": 0.01042249, + "balance_loss_clip": 1.03992212, + "balance_loss_mlp": 1.02832556, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.6253990321690808, + "language_loss": 0.73168689, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75314605, + "num_input_tokens_seen": 175632255, + "step": 8169, + "time_per_iteration": 2.449575185775757 + }, + { + "auxiliary_loss_clip": 0.01105474, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.03927863, + "balance_loss_mlp": 1.01449156, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.901229680174477, + "language_loss": 0.77973151, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80105579, + "num_input_tokens_seen": 175651625, + "step": 8170, + "time_per_iteration": 2.4776077270507812 + }, + { + "auxiliary_loss_clip": 0.01093482, + "auxiliary_loss_mlp": 0.01037341, + "balance_loss_clip": 1.03790259, + "balance_loss_mlp": 1.02423358, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 2.162950039396376, + "language_loss": 0.76125383, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.78256202, + "num_input_tokens_seen": 175669265, + "step": 8171, + "time_per_iteration": 2.4796478748321533 + }, + { + "auxiliary_loss_clip": 0.01101525, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.04067278, + "balance_loss_mlp": 1.02596104, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 2.6143612312990756, + "language_loss": 0.8113929, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83280516, + "num_input_tokens_seen": 175686065, + "step": 8172, + "time_per_iteration": 2.4913077354431152 + }, + { + "auxiliary_loss_clip": 0.01029995, + "auxiliary_loss_mlp": 0.01002063, + "balance_loss_clip": 1.01369298, + "balance_loss_mlp": 1.00077569, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6924545378542969, + "language_loss": 0.53340912, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55372971, + "num_input_tokens_seen": 175748595, + "step": 8173, + "time_per_iteration": 3.053560256958008 + }, + { + "auxiliary_loss_clip": 0.01115101, + "auxiliary_loss_mlp": 0.00779272, + "balance_loss_clip": 1.04392529, + "balance_loss_mlp": 1.00035822, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 1.760385396825082, + "language_loss": 0.62586027, + "learning_rate": 2.152326591972107e-06, + "loss": 0.644804, + "num_input_tokens_seen": 175766770, + "step": 8174, + "time_per_iteration": 3.974029302597046 + }, + { + "auxiliary_loss_clip": 0.01087312, + "auxiliary_loss_mlp": 0.01046105, + "balance_loss_clip": 1.04242682, + "balance_loss_mlp": 1.03073847, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 1.6785167972882287, + "language_loss": 0.69432354, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71565771, + "num_input_tokens_seen": 175783605, + "step": 8175, + "time_per_iteration": 2.5289621353149414 + }, + { + "auxiliary_loss_clip": 0.01112478, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.04377544, + "balance_loss_mlp": 1.01879311, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.5610766959654627, + "language_loss": 0.74573922, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76717967, + "num_input_tokens_seen": 175801390, + "step": 8176, + "time_per_iteration": 2.472407817840576 + }, + { + "auxiliary_loss_clip": 0.01113677, + "auxiliary_loss_mlp": 0.01042825, + "balance_loss_clip": 1.04312563, + "balance_loss_mlp": 1.02933598, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.6944827091613122, + "language_loss": 0.69687498, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.71844006, + "num_input_tokens_seen": 175819830, + "step": 8177, + "time_per_iteration": 2.478095293045044 + }, + { + "auxiliary_loss_clip": 0.01021616, + "auxiliary_loss_mlp": 0.00755457, + "balance_loss_clip": 1.0151664, + "balance_loss_mlp": 1.00015855, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.724621776691244, + "language_loss": 0.46229008, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48006085, + "num_input_tokens_seen": 175881765, + "step": 8178, + "time_per_iteration": 3.070598602294922 + }, + { + "auxiliary_loss_clip": 0.01126835, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.04446435, + "balance_loss_mlp": 1.02275503, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.972885499363524, + "language_loss": 0.65792924, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.67957389, + "num_input_tokens_seen": 175901795, + "step": 8179, + "time_per_iteration": 2.4172699451446533 + }, + { + "auxiliary_loss_clip": 0.01044792, + "auxiliary_loss_mlp": 0.01049592, + "balance_loss_clip": 1.03920233, + "balance_loss_mlp": 1.03327179, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 1.8844904685867716, + "language_loss": 0.69979769, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72074151, + "num_input_tokens_seen": 175917770, + "step": 8180, + "time_per_iteration": 2.803297758102417 + }, + { + "auxiliary_loss_clip": 0.0109771, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.04132366, + "balance_loss_mlp": 1.02397847, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.6358832288507916, + "language_loss": 0.84603739, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.8673923, + "num_input_tokens_seen": 175937000, + "step": 8181, + "time_per_iteration": 2.7883872985839844 + }, + { + "auxiliary_loss_clip": 0.01120015, + "auxiliary_loss_mlp": 0.01037266, + "balance_loss_clip": 1.04363298, + "balance_loss_mlp": 1.02457023, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 1.9781005761987744, + "language_loss": 0.72473878, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.74631155, + "num_input_tokens_seen": 175955170, + "step": 8182, + "time_per_iteration": 2.4268412590026855 + }, + { + "auxiliary_loss_clip": 0.01083498, + "auxiliary_loss_mlp": 0.01039619, + "balance_loss_clip": 1.04032326, + "balance_loss_mlp": 1.02663732, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 2.048714403959269, + "language_loss": 0.72310293, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.7443341, + "num_input_tokens_seen": 175973725, + "step": 8183, + "time_per_iteration": 4.056917667388916 + }, + { + "auxiliary_loss_clip": 0.01068992, + "auxiliary_loss_mlp": 0.0103659, + "balance_loss_clip": 1.03826547, + "balance_loss_mlp": 1.02142692, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 2.75536897452953, + "language_loss": 0.77244627, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79350209, + "num_input_tokens_seen": 175993885, + "step": 8184, + "time_per_iteration": 4.279320240020752 + }, + { + "auxiliary_loss_clip": 0.01095852, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.04414034, + "balance_loss_mlp": 1.02331233, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.6721341541547807, + "language_loss": 0.70661277, + "learning_rate": 2.148054610995789e-06, + "loss": 0.72794187, + "num_input_tokens_seen": 176014210, + "step": 8185, + "time_per_iteration": 2.5420589447021484 + }, + { + "auxiliary_loss_clip": 0.01103978, + "auxiliary_loss_mlp": 0.01041407, + "balance_loss_clip": 1.04284465, + "balance_loss_mlp": 1.02633345, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 1.6987024055837765, + "language_loss": 0.75622517, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77767903, + "num_input_tokens_seen": 176033890, + "step": 8186, + "time_per_iteration": 2.5215277671813965 + }, + { + "auxiliary_loss_clip": 0.0111066, + "auxiliary_loss_mlp": 0.01042289, + "balance_loss_clip": 1.04205441, + "balance_loss_mlp": 1.02847266, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 2.1744769838985696, + "language_loss": 0.67893195, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.70046145, + "num_input_tokens_seen": 176052720, + "step": 8187, + "time_per_iteration": 2.4612581729888916 + }, + { + "auxiliary_loss_clip": 0.01076189, + "auxiliary_loss_mlp": 0.01040079, + "balance_loss_clip": 1.03687727, + "balance_loss_mlp": 1.02628684, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.7639757031630212, + "language_loss": 0.66634482, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.68750751, + "num_input_tokens_seen": 176072545, + "step": 8188, + "time_per_iteration": 2.567960739135742 + }, + { + "auxiliary_loss_clip": 0.01113026, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.04391265, + "balance_loss_mlp": 1.01938486, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.8255480580121113, + "language_loss": 0.74787807, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.7693271, + "num_input_tokens_seen": 176091490, + "step": 8189, + "time_per_iteration": 2.5024187564849854 + }, + { + "auxiliary_loss_clip": 0.01100655, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.0414083, + "balance_loss_mlp": 1.0175252, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.6913202807773657, + "language_loss": 0.64551425, + "learning_rate": 2.146112575713104e-06, + "loss": 0.6668334, + "num_input_tokens_seen": 176113200, + "step": 8190, + "time_per_iteration": 2.615647792816162 + }, + { + "auxiliary_loss_clip": 0.01122081, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.0446533, + "balance_loss_mlp": 1.01924968, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 1.8895424910571932, + "language_loss": 0.71245754, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73400933, + "num_input_tokens_seen": 176132485, + "step": 8191, + "time_per_iteration": 2.405914306640625 + }, + { + "auxiliary_loss_clip": 0.01119397, + "auxiliary_loss_mlp": 0.00778616, + "balance_loss_clip": 1.0413115, + "balance_loss_mlp": 1.0004375, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.508070784571063, + "language_loss": 0.7207495, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.7397297, + "num_input_tokens_seen": 176155755, + "step": 8192, + "time_per_iteration": 2.5833396911621094 + }, + { + "auxiliary_loss_clip": 0.01019894, + "auxiliary_loss_mlp": 0.01003766, + "balance_loss_clip": 1.01324224, + "balance_loss_mlp": 1.00241876, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.718746148278456, + "language_loss": 0.52110362, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54134023, + "num_input_tokens_seen": 176216295, + "step": 8193, + "time_per_iteration": 3.1422934532165527 + }, + { + "auxiliary_loss_clip": 0.01122314, + "auxiliary_loss_mlp": 0.01042805, + "balance_loss_clip": 1.04507184, + "balance_loss_mlp": 1.0292449, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.5866817639227009, + "language_loss": 0.77087158, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79252279, + "num_input_tokens_seen": 176235925, + "step": 8194, + "time_per_iteration": 2.443000555038452 + }, + { + "auxiliary_loss_clip": 0.01092624, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.03532422, + "balance_loss_mlp": 1.01730871, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 1.9804655808068619, + "language_loss": 0.70306158, + "learning_rate": 2.144170401915341e-06, + "loss": 0.72429174, + "num_input_tokens_seen": 176253865, + "step": 8195, + "time_per_iteration": 2.5235519409179688 + }, + { + "auxiliary_loss_clip": 0.01086654, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.04793882, + "balance_loss_mlp": 1.01753592, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 3.233412105037161, + "language_loss": 0.81134987, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83252394, + "num_input_tokens_seen": 176271525, + "step": 8196, + "time_per_iteration": 2.5987284183502197 + }, + { + "auxiliary_loss_clip": 0.01089235, + "auxiliary_loss_mlp": 0.01035739, + "balance_loss_clip": 1.03743255, + "balance_loss_mlp": 1.02166605, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.6840806117946434, + "language_loss": 0.70573133, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.72698104, + "num_input_tokens_seen": 176290810, + "step": 8197, + "time_per_iteration": 4.043405771255493 + }, + { + "auxiliary_loss_clip": 0.01106629, + "auxiliary_loss_mlp": 0.0103258, + "balance_loss_clip": 1.04194903, + "balance_loss_mlp": 1.01962781, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 1.8416025574166515, + "language_loss": 0.8424294, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86382151, + "num_input_tokens_seen": 176309165, + "step": 8198, + "time_per_iteration": 2.4386980533599854 + }, + { + "auxiliary_loss_clip": 0.0111506, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.04343987, + "balance_loss_mlp": 1.01912725, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 2.1348732325072235, + "language_loss": 0.75817746, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.77965933, + "num_input_tokens_seen": 176324960, + "step": 8199, + "time_per_iteration": 2.437831401824951 + }, + { + "auxiliary_loss_clip": 0.01098242, + "auxiliary_loss_mlp": 0.01034978, + "balance_loss_clip": 1.03914607, + "balance_loss_mlp": 1.02024364, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.382053389115302, + "language_loss": 0.60026968, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62160194, + "num_input_tokens_seen": 176346195, + "step": 8200, + "time_per_iteration": 2.530191421508789 + }, + { + "auxiliary_loss_clip": 0.0110905, + "auxiliary_loss_mlp": 0.01038, + "balance_loss_clip": 1.04338241, + "balance_loss_mlp": 1.02473211, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.380136911713962, + "language_loss": 0.79601669, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81748724, + "num_input_tokens_seen": 176366735, + "step": 8201, + "time_per_iteration": 2.4836628437042236 + }, + { + "auxiliary_loss_clip": 0.01114562, + "auxiliary_loss_mlp": 0.010376, + "balance_loss_clip": 1.03975022, + "balance_loss_mlp": 1.02275801, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 3.0963797198054395, + "language_loss": 0.67468303, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69620466, + "num_input_tokens_seen": 176384475, + "step": 8202, + "time_per_iteration": 2.433300733566284 + }, + { + "auxiliary_loss_clip": 0.01096196, + "auxiliary_loss_mlp": 0.01034001, + "balance_loss_clip": 1.04155612, + "balance_loss_mlp": 1.0210017, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 1.9654427956981275, + "language_loss": 0.74892902, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77023101, + "num_input_tokens_seen": 176402645, + "step": 8203, + "time_per_iteration": 2.542567014694214 + }, + { + "auxiliary_loss_clip": 0.01069865, + "auxiliary_loss_mlp": 0.01037654, + "balance_loss_clip": 1.0421443, + "balance_loss_mlp": 1.02424359, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 2.1779647449739463, + "language_loss": 0.8059994, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.82707465, + "num_input_tokens_seen": 176416715, + "step": 8204, + "time_per_iteration": 2.5318803787231445 + }, + { + "auxiliary_loss_clip": 0.01107414, + "auxiliary_loss_mlp": 0.01042785, + "balance_loss_clip": 1.04182255, + "balance_loss_mlp": 1.03022051, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 2.4386468842885605, + "language_loss": 0.66374737, + "learning_rate": 2.140285646139455e-06, + "loss": 0.68524933, + "num_input_tokens_seen": 176435755, + "step": 8205, + "time_per_iteration": 2.4933252334594727 + }, + { + "auxiliary_loss_clip": 0.01127556, + "auxiliary_loss_mlp": 0.01041077, + "balance_loss_clip": 1.04424214, + "balance_loss_mlp": 1.02567577, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 2.572110183638827, + "language_loss": 0.66738528, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68907166, + "num_input_tokens_seen": 176453915, + "step": 8206, + "time_per_iteration": 2.4283721446990967 + }, + { + "auxiliary_loss_clip": 0.01076701, + "auxiliary_loss_mlp": 0.01043103, + "balance_loss_clip": 1.03746223, + "balance_loss_mlp": 1.02860737, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.8208533904000361, + "language_loss": 0.76496083, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.78615892, + "num_input_tokens_seen": 176475175, + "step": 8207, + "time_per_iteration": 2.584275245666504 + }, + { + "auxiliary_loss_clip": 0.01100378, + "auxiliary_loss_mlp": 0.01037794, + "balance_loss_clip": 1.0432446, + "balance_loss_mlp": 1.02339911, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.693998745314644, + "language_loss": 0.60705483, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.62843651, + "num_input_tokens_seen": 176494250, + "step": 8208, + "time_per_iteration": 2.5068318843841553 + }, + { + "auxiliary_loss_clip": 0.01106356, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.04568708, + "balance_loss_mlp": 1.02253759, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 1.6926275704118552, + "language_loss": 0.78224522, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.80367631, + "num_input_tokens_seen": 176513325, + "step": 8209, + "time_per_iteration": 2.510270833969116 + }, + { + "auxiliary_loss_clip": 0.01090943, + "auxiliary_loss_mlp": 0.00784088, + "balance_loss_clip": 1.03521729, + "balance_loss_mlp": 1.00042236, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 2.0016324156287295, + "language_loss": 0.78847301, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80722326, + "num_input_tokens_seen": 176532915, + "step": 8210, + "time_per_iteration": 2.499490976333618 + }, + { + "auxiliary_loss_clip": 0.01116896, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.0486784, + "balance_loss_mlp": 1.02280784, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 1.6574414399157198, + "language_loss": 0.81475681, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.8362968, + "num_input_tokens_seen": 176552775, + "step": 8211, + "time_per_iteration": 2.4873011112213135 + }, + { + "auxiliary_loss_clip": 0.01078811, + "auxiliary_loss_mlp": 0.01057589, + "balance_loss_clip": 1.03623641, + "balance_loss_mlp": 1.04148364, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 3.881970299814091, + "language_loss": 0.9142518, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93561578, + "num_input_tokens_seen": 176572185, + "step": 8212, + "time_per_iteration": 4.058124303817749 + }, + { + "auxiliary_loss_clip": 0.0108201, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.04047668, + "balance_loss_mlp": 1.02963471, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 2.0514290147001035, + "language_loss": 0.6485045, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.66976738, + "num_input_tokens_seen": 176591490, + "step": 8213, + "time_per_iteration": 2.553393840789795 + }, + { + "auxiliary_loss_clip": 0.01069529, + "auxiliary_loss_mlp": 0.00779939, + "balance_loss_clip": 1.03549147, + "balance_loss_mlp": 1.00051212, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 2.8610035702628345, + "language_loss": 0.75851005, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77700472, + "num_input_tokens_seen": 176612715, + "step": 8214, + "time_per_iteration": 2.670917272567749 + }, + { + "auxiliary_loss_clip": 0.01125085, + "auxiliary_loss_mlp": 0.01039333, + "balance_loss_clip": 1.04507542, + "balance_loss_mlp": 1.02578497, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 1.9480908983174696, + "language_loss": 0.84590209, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86754632, + "num_input_tokens_seen": 176631950, + "step": 8215, + "time_per_iteration": 2.4452993869781494 + }, + { + "auxiliary_loss_clip": 0.01104535, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.04091883, + "balance_loss_mlp": 1.02236176, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.644308575230269, + "language_loss": 0.83336186, + "learning_rate": 2.136011800934292e-06, + "loss": 0.85475528, + "num_input_tokens_seen": 176653060, + "step": 8216, + "time_per_iteration": 2.5522022247314453 + }, + { + "auxiliary_loss_clip": 0.01092364, + "auxiliary_loss_mlp": 0.01033643, + "balance_loss_clip": 1.04046202, + "balance_loss_mlp": 1.02035093, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.5068200765918445, + "language_loss": 0.74707401, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76833403, + "num_input_tokens_seen": 176673895, + "step": 8217, + "time_per_iteration": 2.5400092601776123 + }, + { + "auxiliary_loss_clip": 0.01118913, + "auxiliary_loss_mlp": 0.00778407, + "balance_loss_clip": 1.04319477, + "balance_loss_mlp": 1.00047398, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 2.350550855241812, + "language_loss": 0.78388059, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80285382, + "num_input_tokens_seen": 176692550, + "step": 8218, + "time_per_iteration": 2.432011842727661 + }, + { + "auxiliary_loss_clip": 0.01073271, + "auxiliary_loss_mlp": 0.00777733, + "balance_loss_clip": 1.04051936, + "balance_loss_mlp": 1.00041294, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 1.9268058998098254, + "language_loss": 0.76447177, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78298175, + "num_input_tokens_seen": 176709335, + "step": 8219, + "time_per_iteration": 2.5230398178100586 + }, + { + "auxiliary_loss_clip": 0.01100405, + "auxiliary_loss_mlp": 0.01034748, + "balance_loss_clip": 1.04261088, + "balance_loss_mlp": 1.02103305, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.9460005588910005, + "language_loss": 0.62242353, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64377499, + "num_input_tokens_seen": 176727715, + "step": 8220, + "time_per_iteration": 2.458935499191284 + }, + { + "auxiliary_loss_clip": 0.01120602, + "auxiliary_loss_mlp": 0.01028326, + "balance_loss_clip": 1.04131722, + "balance_loss_mlp": 1.0147481, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 2.983605617211477, + "language_loss": 0.72147793, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74296725, + "num_input_tokens_seen": 176747530, + "step": 8221, + "time_per_iteration": 2.427720069885254 + }, + { + "auxiliary_loss_clip": 0.01085493, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.0458194, + "balance_loss_mlp": 1.02152276, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.6263613938737087, + "language_loss": 0.79379922, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81499624, + "num_input_tokens_seen": 176765260, + "step": 8222, + "time_per_iteration": 2.50423264503479 + }, + { + "auxiliary_loss_clip": 0.01111742, + "auxiliary_loss_mlp": 0.01036562, + "balance_loss_clip": 1.04414785, + "balance_loss_mlp": 1.02191162, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 3.605251293135799, + "language_loss": 0.7220574, + "learning_rate": 2.133291755093088e-06, + "loss": 0.74354041, + "num_input_tokens_seen": 176781770, + "step": 8223, + "time_per_iteration": 5.256489992141724 + }, + { + "auxiliary_loss_clip": 0.01110989, + "auxiliary_loss_mlp": 0.0103968, + "balance_loss_clip": 1.04186177, + "balance_loss_mlp": 1.02532721, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 1.5760133200829598, + "language_loss": 0.75463963, + "learning_rate": 2.132903156780144e-06, + "loss": 0.77614629, + "num_input_tokens_seen": 176800655, + "step": 8224, + "time_per_iteration": 2.4598164558410645 + }, + { + "auxiliary_loss_clip": 0.01098758, + "auxiliary_loss_mlp": 0.01035254, + "balance_loss_clip": 1.04328084, + "balance_loss_mlp": 1.02135372, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.3110485757686234, + "language_loss": 0.63820148, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.65954161, + "num_input_tokens_seen": 176820610, + "step": 8225, + "time_per_iteration": 2.5183403491973877 + }, + { + "auxiliary_loss_clip": 0.01101233, + "auxiliary_loss_mlp": 0.01036758, + "balance_loss_clip": 1.04302478, + "balance_loss_mlp": 1.02377009, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 2.424634375924101, + "language_loss": 0.76272207, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78410196, + "num_input_tokens_seen": 176840520, + "step": 8226, + "time_per_iteration": 2.5026557445526123 + }, + { + "auxiliary_loss_clip": 0.01124447, + "auxiliary_loss_mlp": 0.0103617, + "balance_loss_clip": 1.04324102, + "balance_loss_mlp": 1.02130497, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.7947339609704935, + "language_loss": 0.70931512, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73092127, + "num_input_tokens_seen": 176860265, + "step": 8227, + "time_per_iteration": 2.462801218032837 + }, + { + "auxiliary_loss_clip": 0.01103843, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.04357922, + "balance_loss_mlp": 1.02566683, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 1.6195304680011249, + "language_loss": 0.71578598, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73722374, + "num_input_tokens_seen": 176882910, + "step": 8228, + "time_per_iteration": 2.56010365486145 + }, + { + "auxiliary_loss_clip": 0.0111929, + "auxiliary_loss_mlp": 0.01032394, + "balance_loss_clip": 1.04210877, + "balance_loss_mlp": 1.01853585, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.6298068049540517, + "language_loss": 0.84120953, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.86272639, + "num_input_tokens_seen": 176903030, + "step": 8229, + "time_per_iteration": 2.4539880752563477 + }, + { + "auxiliary_loss_clip": 0.01112087, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.04078972, + "balance_loss_mlp": 1.02231741, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 1.8282920040679282, + "language_loss": 0.74284625, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.76434445, + "num_input_tokens_seen": 176919025, + "step": 8230, + "time_per_iteration": 2.444765329360962 + }, + { + "auxiliary_loss_clip": 0.01105232, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.04336405, + "balance_loss_mlp": 1.01736724, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 6.60404274847812, + "language_loss": 0.79685658, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.81821322, + "num_input_tokens_seen": 176937945, + "step": 8231, + "time_per_iteration": 2.4243626594543457 + }, + { + "auxiliary_loss_clip": 0.01030407, + "auxiliary_loss_mlp": 0.01004709, + "balance_loss_clip": 1.01467657, + "balance_loss_mlp": 1.00338602, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.747610201648865, + "language_loss": 0.60214865, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62249982, + "num_input_tokens_seen": 177004575, + "step": 8232, + "time_per_iteration": 3.159236192703247 + }, + { + "auxiliary_loss_clip": 0.01102768, + "auxiliary_loss_mlp": 0.01042099, + "balance_loss_clip": 1.04003751, + "balance_loss_mlp": 1.02714443, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 2.680660584666449, + "language_loss": 0.69206709, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71351576, + "num_input_tokens_seen": 177024155, + "step": 8233, + "time_per_iteration": 2.520739793777466 + }, + { + "auxiliary_loss_clip": 0.01065524, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.0354327, + "balance_loss_mlp": 1.02286601, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 1.9687530536865172, + "language_loss": 0.66493332, + "learning_rate": 2.129016898898633e-06, + "loss": 0.6859675, + "num_input_tokens_seen": 177046185, + "step": 8234, + "time_per_iteration": 2.649113893508911 + }, + { + "auxiliary_loss_clip": 0.01028309, + "auxiliary_loss_mlp": 0.01010689, + "balance_loss_clip": 1.02065158, + "balance_loss_mlp": 1.00950873, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.8020943689743343, + "language_loss": 0.58012611, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60051614, + "num_input_tokens_seen": 177099025, + "step": 8235, + "time_per_iteration": 2.9946224689483643 + }, + { + "auxiliary_loss_clip": 0.01090985, + "auxiliary_loss_mlp": 0.01038851, + "balance_loss_clip": 1.03690982, + "balance_loss_mlp": 1.02454567, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.6564210536010604, + "language_loss": 0.77422851, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.79552686, + "num_input_tokens_seen": 177118365, + "step": 8236, + "time_per_iteration": 2.5351459980010986 + }, + { + "auxiliary_loss_clip": 0.01085336, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.04493904, + "balance_loss_mlp": 1.02067137, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.6427024504156604, + "language_loss": 0.72566867, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.74686503, + "num_input_tokens_seen": 177136415, + "step": 8237, + "time_per_iteration": 4.16965651512146 + }, + { + "auxiliary_loss_clip": 0.01119071, + "auxiliary_loss_mlp": 0.01031384, + "balance_loss_clip": 1.04180503, + "balance_loss_mlp": 1.01779449, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.7422509566626778, + "language_loss": 0.75688362, + "learning_rate": 2.127462257935406e-06, + "loss": 0.7783882, + "num_input_tokens_seen": 177155690, + "step": 8238, + "time_per_iteration": 2.4674336910247803 + }, + { + "auxiliary_loss_clip": 0.01080841, + "auxiliary_loss_mlp": 0.01041903, + "balance_loss_clip": 1.04043245, + "balance_loss_mlp": 1.02553582, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.8106653733344, + "language_loss": 0.73679042, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.7580179, + "num_input_tokens_seen": 177173350, + "step": 8239, + "time_per_iteration": 2.5036251544952393 + }, + { + "auxiliary_loss_clip": 0.01042065, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.04176962, + "balance_loss_mlp": 1.01968277, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 3.8497147957763964, + "language_loss": 0.78470945, + "learning_rate": 2.126684908394552e-06, + "loss": 0.80548441, + "num_input_tokens_seen": 177191115, + "step": 8240, + "time_per_iteration": 2.745262861251831 + }, + { + "auxiliary_loss_clip": 0.01110326, + "auxiliary_loss_mlp": 0.01041526, + "balance_loss_clip": 1.04341948, + "balance_loss_mlp": 1.02858591, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 2.776984128996074, + "language_loss": 0.85480618, + "learning_rate": 2.126296226410898e-06, + "loss": 0.87632465, + "num_input_tokens_seen": 177206155, + "step": 8241, + "time_per_iteration": 2.661341667175293 + }, + { + "auxiliary_loss_clip": 0.01072269, + "auxiliary_loss_mlp": 0.01039871, + "balance_loss_clip": 1.04553199, + "balance_loss_mlp": 1.02661514, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 1.8615572710427168, + "language_loss": 0.77312624, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79424763, + "num_input_tokens_seen": 177224815, + "step": 8242, + "time_per_iteration": 2.5608761310577393 + }, + { + "auxiliary_loss_clip": 0.0109994, + "auxiliary_loss_mlp": 0.00779509, + "balance_loss_clip": 1.04245925, + "balance_loss_mlp": 1.00042677, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 2.004482356984139, + "language_loss": 0.67244005, + "learning_rate": 2.125518848090833e-06, + "loss": 0.69123459, + "num_input_tokens_seen": 177244490, + "step": 8243, + "time_per_iteration": 2.546527147293091 + }, + { + "auxiliary_loss_clip": 0.01113644, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.05135167, + "balance_loss_mlp": 1.01859391, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 1.5924004415267003, + "language_loss": 0.68066782, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70211899, + "num_input_tokens_seen": 177264340, + "step": 8244, + "time_per_iteration": 2.5047643184661865 + }, + { + "auxiliary_loss_clip": 0.01096159, + "auxiliary_loss_mlp": 0.01039213, + "balance_loss_clip": 1.04131722, + "balance_loss_mlp": 1.02407944, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 1.8166627906034671, + "language_loss": 0.7503615, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77171528, + "num_input_tokens_seen": 177283055, + "step": 8245, + "time_per_iteration": 2.479642629623413 + }, + { + "auxiliary_loss_clip": 0.01112327, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.04545033, + "balance_loss_mlp": 1.01835012, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 1.8976021351179297, + "language_loss": 0.82142615, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.84286737, + "num_input_tokens_seen": 177301140, + "step": 8246, + "time_per_iteration": 2.445523738861084 + }, + { + "auxiliary_loss_clip": 0.01087638, + "auxiliary_loss_mlp": 0.01040329, + "balance_loss_clip": 1.04323661, + "balance_loss_mlp": 1.02508235, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.6542803514662043, + "language_loss": 0.83756292, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85884261, + "num_input_tokens_seen": 177323095, + "step": 8247, + "time_per_iteration": 2.5982134342193604 + }, + { + "auxiliary_loss_clip": 0.01097836, + "auxiliary_loss_mlp": 0.01029349, + "balance_loss_clip": 1.0481801, + "balance_loss_mlp": 1.01611042, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 1.830225213818812, + "language_loss": 0.838175, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85944676, + "num_input_tokens_seen": 177339845, + "step": 8248, + "time_per_iteration": 2.5756728649139404 + }, + { + "auxiliary_loss_clip": 0.01115341, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.04496932, + "balance_loss_mlp": 1.01735628, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 1.6693046537702907, + "language_loss": 0.73325026, + "learning_rate": 2.123186599369812e-06, + "loss": 0.7547226, + "num_input_tokens_seen": 177359980, + "step": 8249, + "time_per_iteration": 2.5244743824005127 + }, + { + "auxiliary_loss_clip": 0.01107331, + "auxiliary_loss_mlp": 0.01043251, + "balance_loss_clip": 1.04572451, + "balance_loss_mlp": 1.02863574, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 2.7951432304101136, + "language_loss": 0.75964165, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78114748, + "num_input_tokens_seen": 177378580, + "step": 8250, + "time_per_iteration": 2.4782938957214355 + }, + { + "auxiliary_loss_clip": 0.01125221, + "auxiliary_loss_mlp": 0.01041543, + "balance_loss_clip": 1.04475737, + "balance_loss_mlp": 1.02688646, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 2.047980077186191, + "language_loss": 0.70202684, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.7236945, + "num_input_tokens_seen": 177398790, + "step": 8251, + "time_per_iteration": 2.441359043121338 + }, + { + "auxiliary_loss_clip": 0.01089916, + "auxiliary_loss_mlp": 0.00778196, + "balance_loss_clip": 1.04684412, + "balance_loss_mlp": 1.00039339, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 2.146605182440886, + "language_loss": 0.79513961, + "learning_rate": 2.122020411748461e-06, + "loss": 0.81382072, + "num_input_tokens_seen": 177416515, + "step": 8252, + "time_per_iteration": 4.204503297805786 + }, + { + "auxiliary_loss_clip": 0.01125037, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.04583478, + "balance_loss_mlp": 1.01579237, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 4.266697729223639, + "language_loss": 0.80845463, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.8300221, + "num_input_tokens_seen": 177434425, + "step": 8253, + "time_per_iteration": 2.4729461669921875 + }, + { + "auxiliary_loss_clip": 0.01091345, + "auxiliary_loss_mlp": 0.01034992, + "balance_loss_clip": 1.0408833, + "balance_loss_mlp": 1.02172351, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.3497533802192179, + "language_loss": 0.67264438, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69390774, + "num_input_tokens_seen": 177459675, + "step": 8254, + "time_per_iteration": 2.6613192558288574 + }, + { + "auxiliary_loss_clip": 0.01089053, + "auxiliary_loss_mlp": 0.01044054, + "balance_loss_clip": 1.04285431, + "balance_loss_mlp": 1.02815175, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.7739688137432268, + "language_loss": 0.73884624, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76017725, + "num_input_tokens_seen": 177478895, + "step": 8255, + "time_per_iteration": 2.556046724319458 + }, + { + "auxiliary_loss_clip": 0.01094058, + "auxiliary_loss_mlp": 0.01037478, + "balance_loss_clip": 1.03956401, + "balance_loss_mlp": 1.02359033, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 1.961639484913403, + "language_loss": 0.81363404, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.83494943, + "num_input_tokens_seen": 177494920, + "step": 8256, + "time_per_iteration": 2.467015266418457 + }, + { + "auxiliary_loss_clip": 0.011019, + "auxiliary_loss_mlp": 0.01030195, + "balance_loss_clip": 1.04885852, + "balance_loss_mlp": 1.01646161, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.5257684051961717, + "language_loss": 0.81047297, + "learning_rate": 2.120076673368901e-06, + "loss": 0.8317939, + "num_input_tokens_seen": 177515455, + "step": 8257, + "time_per_iteration": 2.5445780754089355 + }, + { + "auxiliary_loss_clip": 0.01128917, + "auxiliary_loss_mlp": 0.01043475, + "balance_loss_clip": 1.04429209, + "balance_loss_mlp": 1.02834737, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 2.582054673493522, + "language_loss": 0.66042888, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68215275, + "num_input_tokens_seen": 177534040, + "step": 8258, + "time_per_iteration": 2.427663564682007 + }, + { + "auxiliary_loss_clip": 0.01109697, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.04215491, + "balance_loss_mlp": 1.01784742, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 1.4333543822233719, + "language_loss": 0.77813238, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79953527, + "num_input_tokens_seen": 177554510, + "step": 8259, + "time_per_iteration": 2.495640754699707 + }, + { + "auxiliary_loss_clip": 0.01096629, + "auxiliary_loss_mlp": 0.01035196, + "balance_loss_clip": 1.04162705, + "balance_loss_mlp": 1.02107573, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.4835332479402918, + "language_loss": 0.78507966, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80639791, + "num_input_tokens_seen": 177575780, + "step": 8260, + "time_per_iteration": 2.549696683883667 + }, + { + "auxiliary_loss_clip": 0.01104583, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.04366171, + "balance_loss_mlp": 1.01830328, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 3.023044379000426, + "language_loss": 0.76150751, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78288674, + "num_input_tokens_seen": 177588965, + "step": 8261, + "time_per_iteration": 2.514026641845703 + }, + { + "auxiliary_loss_clip": 0.01075467, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.04067409, + "balance_loss_mlp": 1.01918054, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 2.097648685023103, + "language_loss": 0.89614105, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91722262, + "num_input_tokens_seen": 177608425, + "step": 8262, + "time_per_iteration": 5.560045957565308 + }, + { + "auxiliary_loss_clip": 0.01074693, + "auxiliary_loss_mlp": 0.01033889, + "balance_loss_clip": 1.04426789, + "balance_loss_mlp": 1.02045441, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.498482790897896, + "language_loss": 0.74077433, + "learning_rate": 2.11774403721606e-06, + "loss": 0.76186013, + "num_input_tokens_seen": 177628240, + "step": 8263, + "time_per_iteration": 2.6264138221740723 + }, + { + "auxiliary_loss_clip": 0.010822, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.04725623, + "balance_loss_mlp": 1.02028584, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 2.2304209183015464, + "language_loss": 0.69500452, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.71618491, + "num_input_tokens_seen": 177645920, + "step": 8264, + "time_per_iteration": 2.55737566947937 + }, + { + "auxiliary_loss_clip": 0.01096133, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.04156411, + "balance_loss_mlp": 1.01800084, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.5090382458710228, + "language_loss": 0.64532936, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.66661811, + "num_input_tokens_seen": 177667185, + "step": 8265, + "time_per_iteration": 2.526773691177368 + }, + { + "auxiliary_loss_clip": 0.01026061, + "auxiliary_loss_mlp": 0.0101913, + "balance_loss_clip": 1.01800275, + "balance_loss_mlp": 1.01753294, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 1.13752444558686, + "language_loss": 0.53499538, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55544728, + "num_input_tokens_seen": 177733020, + "step": 8266, + "time_per_iteration": 3.133758068084717 + }, + { + "auxiliary_loss_clip": 0.01110483, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.04368055, + "balance_loss_mlp": 1.01881695, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 1.575555246438026, + "language_loss": 0.79757309, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81900615, + "num_input_tokens_seen": 177753370, + "step": 8267, + "time_per_iteration": 2.5097436904907227 + }, + { + "auxiliary_loss_clip": 0.01107145, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.05135882, + "balance_loss_mlp": 1.01883745, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.260734919420013, + "language_loss": 0.74676073, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.76816869, + "num_input_tokens_seen": 177771530, + "step": 8268, + "time_per_iteration": 2.584524631500244 + }, + { + "auxiliary_loss_clip": 0.01113229, + "auxiliary_loss_mlp": 0.00780185, + "balance_loss_clip": 1.04174399, + "balance_loss_mlp": 1.0003469, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.3920273265759682, + "language_loss": 0.67638165, + "learning_rate": 2.115411240328073e-06, + "loss": 0.69531578, + "num_input_tokens_seen": 177796355, + "step": 8269, + "time_per_iteration": 2.7015621662139893 + }, + { + "auxiliary_loss_clip": 0.01095738, + "auxiliary_loss_mlp": 0.01038985, + "balance_loss_clip": 1.04213285, + "balance_loss_mlp": 1.02449512, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.4359598097274993, + "language_loss": 0.85365331, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.8750006, + "num_input_tokens_seen": 177814300, + "step": 8270, + "time_per_iteration": 2.502110242843628 + }, + { + "auxiliary_loss_clip": 0.01081713, + "auxiliary_loss_mlp": 0.00780088, + "balance_loss_clip": 1.0422554, + "balance_loss_mlp": 1.00039864, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.9007437534720137, + "language_loss": 0.70272326, + "learning_rate": 2.114633606196899e-06, + "loss": 0.72134125, + "num_input_tokens_seen": 177833615, + "step": 8271, + "time_per_iteration": 2.5957741737365723 + }, + { + "auxiliary_loss_clip": 0.01108824, + "auxiliary_loss_mlp": 0.01038173, + "balance_loss_clip": 1.04257286, + "balance_loss_mlp": 1.02325392, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 3.118481070683676, + "language_loss": 0.78548753, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80695748, + "num_input_tokens_seen": 177855315, + "step": 8272, + "time_per_iteration": 2.497051239013672 + }, + { + "auxiliary_loss_clip": 0.01089084, + "auxiliary_loss_mlp": 0.01038704, + "balance_loss_clip": 1.0436033, + "balance_loss_mlp": 1.024786, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 2.769406209402496, + "language_loss": 0.66658902, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68786693, + "num_input_tokens_seen": 177875590, + "step": 8273, + "time_per_iteration": 2.6700000762939453 + }, + { + "auxiliary_loss_clip": 0.01089137, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.04300463, + "balance_loss_mlp": 1.02454424, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.6478226163429743, + "language_loss": 0.78246939, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80374277, + "num_input_tokens_seen": 177894175, + "step": 8274, + "time_per_iteration": 2.5474488735198975 + }, + { + "auxiliary_loss_clip": 0.0109206, + "auxiliary_loss_mlp": 0.01036142, + "balance_loss_clip": 1.04525948, + "balance_loss_mlp": 1.02145565, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 1.871532337646671, + "language_loss": 0.75626147, + "learning_rate": 2.113078285889493e-06, + "loss": 0.77754349, + "num_input_tokens_seen": 177913920, + "step": 8275, + "time_per_iteration": 2.628251075744629 + }, + { + "auxiliary_loss_clip": 0.0111161, + "auxiliary_loss_mlp": 0.01038268, + "balance_loss_clip": 1.04223442, + "balance_loss_mlp": 1.02186441, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 1.9593863225374768, + "language_loss": 0.83575225, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.85725105, + "num_input_tokens_seen": 177930425, + "step": 8276, + "time_per_iteration": 2.454225778579712 + }, + { + "auxiliary_loss_clip": 0.01118757, + "auxiliary_loss_mlp": 0.00778823, + "balance_loss_clip": 1.04342341, + "balance_loss_mlp": 1.00030041, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.3862965989100597, + "language_loss": 0.70289171, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72186744, + "num_input_tokens_seen": 177949885, + "step": 8277, + "time_per_iteration": 3.9638781547546387 + }, + { + "auxiliary_loss_clip": 0.01106209, + "auxiliary_loss_mlp": 0.01040535, + "balance_loss_clip": 1.04272687, + "balance_loss_mlp": 1.02601552, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 1.8678970095727276, + "language_loss": 0.82516074, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84662825, + "num_input_tokens_seen": 177965720, + "step": 8278, + "time_per_iteration": 2.5394675731658936 + }, + { + "auxiliary_loss_clip": 0.01115738, + "auxiliary_loss_mlp": 0.01040412, + "balance_loss_clip": 1.04568386, + "balance_loss_mlp": 1.0259937, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 2.8015667528326493, + "language_loss": 0.67283392, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69439542, + "num_input_tokens_seen": 177983190, + "step": 8279, + "time_per_iteration": 2.4452450275421143 + }, + { + "auxiliary_loss_clip": 0.0111512, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.04236948, + "balance_loss_mlp": 1.02500272, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 2.3680231455679737, + "language_loss": 0.70700759, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.72856784, + "num_input_tokens_seen": 178000155, + "step": 8280, + "time_per_iteration": 2.460114002227783 + }, + { + "auxiliary_loss_clip": 0.01090736, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_clip": 1.04181027, + "balance_loss_mlp": 1.0254786, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 1.53573863256701, + "language_loss": 0.64406455, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.66537058, + "num_input_tokens_seen": 178021060, + "step": 8281, + "time_per_iteration": 2.5931081771850586 + }, + { + "auxiliary_loss_clip": 0.01118404, + "auxiliary_loss_mlp": 0.01039972, + "balance_loss_clip": 1.0459342, + "balance_loss_mlp": 1.02520156, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 2.017008315378647, + "language_loss": 0.72909534, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.75067908, + "num_input_tokens_seen": 178038180, + "step": 8282, + "time_per_iteration": 2.4343390464782715 + }, + { + "auxiliary_loss_clip": 0.0109929, + "auxiliary_loss_mlp": 0.01033371, + "balance_loss_clip": 1.04763985, + "balance_loss_mlp": 1.02047288, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.8742414157379723, + "language_loss": 0.73589659, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75722319, + "num_input_tokens_seen": 178057565, + "step": 8283, + "time_per_iteration": 2.565124750137329 + }, + { + "auxiliary_loss_clip": 0.01067217, + "auxiliary_loss_mlp": 0.01046857, + "balance_loss_clip": 1.03677011, + "balance_loss_mlp": 1.02948833, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 3.9539945398237886, + "language_loss": 0.78798711, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.80912787, + "num_input_tokens_seen": 178076965, + "step": 8284, + "time_per_iteration": 2.5611965656280518 + }, + { + "auxiliary_loss_clip": 0.01106176, + "auxiliary_loss_mlp": 0.01040292, + "balance_loss_clip": 1.0478543, + "balance_loss_mlp": 1.0253849, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 2.148452565229074, + "language_loss": 0.73887295, + "learning_rate": 2.109189687029526e-06, + "loss": 0.76033765, + "num_input_tokens_seen": 178095105, + "step": 8285, + "time_per_iteration": 2.5638535022735596 + }, + { + "auxiliary_loss_clip": 0.01114134, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.0482167, + "balance_loss_mlp": 1.0194428, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.9128957180504886, + "language_loss": 0.74221975, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76370692, + "num_input_tokens_seen": 178114505, + "step": 8286, + "time_per_iteration": 2.472641944885254 + }, + { + "auxiliary_loss_clip": 0.01104984, + "auxiliary_loss_mlp": 0.0104409, + "balance_loss_clip": 1.04599571, + "balance_loss_mlp": 1.02986288, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.7827037630558549, + "language_loss": 0.85336494, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87485576, + "num_input_tokens_seen": 178131595, + "step": 8287, + "time_per_iteration": 2.51495623588562 + }, + { + "auxiliary_loss_clip": 0.01076675, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.04051626, + "balance_loss_mlp": 1.01782072, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.7051070412506037, + "language_loss": 0.7258364, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74693072, + "num_input_tokens_seen": 178152055, + "step": 8288, + "time_per_iteration": 2.6713449954986572 + }, + { + "auxiliary_loss_clip": 0.011052, + "auxiliary_loss_mlp": 0.010395, + "balance_loss_clip": 1.04131794, + "balance_loss_mlp": 1.02316797, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 4.647364373680803, + "language_loss": 0.79929018, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82073712, + "num_input_tokens_seen": 178168150, + "step": 8289, + "time_per_iteration": 2.4703900814056396 + }, + { + "auxiliary_loss_clip": 0.01110508, + "auxiliary_loss_mlp": 0.01038621, + "balance_loss_clip": 1.04095232, + "balance_loss_mlp": 1.0243752, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.199467986699838, + "language_loss": 0.73356396, + "learning_rate": 2.107245231409784e-06, + "loss": 0.75505531, + "num_input_tokens_seen": 178186150, + "step": 8290, + "time_per_iteration": 2.46760630607605 + }, + { + "auxiliary_loss_clip": 0.01116623, + "auxiliary_loss_mlp": 0.01045081, + "balance_loss_clip": 1.04658651, + "balance_loss_mlp": 1.02858829, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.5681979161286, + "language_loss": 0.84334385, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86496091, + "num_input_tokens_seen": 178207665, + "step": 8291, + "time_per_iteration": 2.5106072425842285 + }, + { + "auxiliary_loss_clip": 0.01101054, + "auxiliary_loss_mlp": 0.01046833, + "balance_loss_clip": 1.0398252, + "balance_loss_mlp": 1.02954769, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.7092177391151784, + "language_loss": 0.67137504, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69285393, + "num_input_tokens_seen": 178226325, + "step": 8292, + "time_per_iteration": 3.964078903198242 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01038739, + "balance_loss_clip": 1.04467535, + "balance_loss_mlp": 1.02556038, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.7135183922902064, + "language_loss": 0.67118168, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69280577, + "num_input_tokens_seen": 178244960, + "step": 8293, + "time_per_iteration": 2.425293207168579 + }, + { + "auxiliary_loss_clip": 0.01110859, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.04249644, + "balance_loss_mlp": 1.01564133, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 2.299664478522598, + "language_loss": 0.82309651, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84449744, + "num_input_tokens_seen": 178265400, + "step": 8294, + "time_per_iteration": 2.5188546180725098 + }, + { + "auxiliary_loss_clip": 0.01111809, + "auxiliary_loss_mlp": 0.01031132, + "balance_loss_clip": 1.04108357, + "balance_loss_mlp": 1.01680875, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 1.5934518789527934, + "language_loss": 0.72739851, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.74882793, + "num_input_tokens_seen": 178284535, + "step": 8295, + "time_per_iteration": 2.472257137298584 + }, + { + "auxiliary_loss_clip": 0.01066908, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.04688537, + "balance_loss_mlp": 1.02103901, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.8093021760438621, + "language_loss": 0.67875659, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69976544, + "num_input_tokens_seen": 178302425, + "step": 8296, + "time_per_iteration": 2.618860960006714 + }, + { + "auxiliary_loss_clip": 0.01101763, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.04406452, + "balance_loss_mlp": 1.02555275, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 1.9744604271095285, + "language_loss": 0.64515948, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.66657674, + "num_input_tokens_seen": 178323065, + "step": 8297, + "time_per_iteration": 2.6153416633605957 + }, + { + "auxiliary_loss_clip": 0.01074583, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.04747343, + "balance_loss_mlp": 1.0223217, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.8515822751318993, + "language_loss": 0.69925404, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.72034442, + "num_input_tokens_seen": 178343985, + "step": 8298, + "time_per_iteration": 2.602980136871338 + }, + { + "auxiliary_loss_clip": 0.01118351, + "auxiliary_loss_mlp": 0.01038207, + "balance_loss_clip": 1.04106247, + "balance_loss_mlp": 1.02442694, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 1.7625275368912678, + "language_loss": 0.84620857, + "learning_rate": 2.103744956327814e-06, + "loss": 0.86777413, + "num_input_tokens_seen": 178362345, + "step": 8299, + "time_per_iteration": 2.424811363220215 + }, + { + "auxiliary_loss_clip": 0.01098523, + "auxiliary_loss_mlp": 0.01041586, + "balance_loss_clip": 1.04421782, + "balance_loss_mlp": 1.02607703, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 2.2218143319578334, + "language_loss": 0.69475514, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71615624, + "num_input_tokens_seen": 178383190, + "step": 8300, + "time_per_iteration": 2.5711252689361572 + }, + { + "auxiliary_loss_clip": 0.01026843, + "auxiliary_loss_mlp": 0.0100081, + "balance_loss_clip": 1.0265944, + "balance_loss_mlp": 0.99954015, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7610388142588219, + "language_loss": 0.51098269, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53125918, + "num_input_tokens_seen": 178444250, + "step": 8301, + "time_per_iteration": 4.7026207447052 + }, + { + "auxiliary_loss_clip": 0.0109299, + "auxiliary_loss_mlp": 0.01043988, + "balance_loss_clip": 1.03945422, + "balance_loss_mlp": 1.03017139, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.779356616868583, + "language_loss": 0.84237814, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86374784, + "num_input_tokens_seen": 178463250, + "step": 8302, + "time_per_iteration": 3.87316632270813 + }, + { + "auxiliary_loss_clip": 0.01110081, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.04434168, + "balance_loss_mlp": 1.0190171, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 2.0421305694453467, + "language_loss": 0.69218957, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71360457, + "num_input_tokens_seen": 178481340, + "step": 8303, + "time_per_iteration": 2.4701879024505615 + }, + { + "auxiliary_loss_clip": 0.01124427, + "auxiliary_loss_mlp": 0.01039123, + "balance_loss_clip": 1.0439415, + "balance_loss_mlp": 1.02484202, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.7300758725202803, + "language_loss": 0.73198617, + "learning_rate": 2.101800220681144e-06, + "loss": 0.7536217, + "num_input_tokens_seen": 178501545, + "step": 8304, + "time_per_iteration": 2.5271575450897217 + }, + { + "auxiliary_loss_clip": 0.01112144, + "auxiliary_loss_mlp": 0.01038283, + "balance_loss_clip": 1.04444516, + "balance_loss_mlp": 1.02514589, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 1.984593781025252, + "language_loss": 0.80808449, + "learning_rate": 2.10141126191199e-06, + "loss": 0.82958877, + "num_input_tokens_seen": 178519700, + "step": 8305, + "time_per_iteration": 2.503561019897461 + }, + { + "auxiliary_loss_clip": 0.01020144, + "auxiliary_loss_mlp": 0.01002954, + "balance_loss_clip": 1.02161586, + "balance_loss_mlp": 1.00168407, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7092701854629219, + "language_loss": 0.56943929, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.5896703, + "num_input_tokens_seen": 178576740, + "step": 8306, + "time_per_iteration": 3.2055628299713135 + }, + { + "auxiliary_loss_clip": 0.01124754, + "auxiliary_loss_mlp": 0.01039648, + "balance_loss_clip": 1.04620206, + "balance_loss_mlp": 1.02430558, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.8707713091650562, + "language_loss": 0.82729566, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.84893966, + "num_input_tokens_seen": 178594745, + "step": 8307, + "time_per_iteration": 2.4069924354553223 + }, + { + "auxiliary_loss_clip": 0.01120176, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.04211032, + "balance_loss_mlp": 1.02060211, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 1.6791626139691596, + "language_loss": 0.61357546, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.63512558, + "num_input_tokens_seen": 178614110, + "step": 8308, + "time_per_iteration": 2.5400543212890625 + }, + { + "auxiliary_loss_clip": 0.01117895, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.04177833, + "balance_loss_mlp": 1.02127051, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.5854542585179474, + "language_loss": 0.74670744, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76823187, + "num_input_tokens_seen": 178634170, + "step": 8309, + "time_per_iteration": 2.464564323425293 + }, + { + "auxiliary_loss_clip": 0.01100968, + "auxiliary_loss_mlp": 0.0103722, + "balance_loss_clip": 1.04210186, + "balance_loss_mlp": 1.02385664, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 1.9481391465534112, + "language_loss": 0.79656672, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.81794864, + "num_input_tokens_seen": 178651775, + "step": 8310, + "time_per_iteration": 2.4783501625061035 + }, + { + "auxiliary_loss_clip": 0.01108482, + "auxiliary_loss_mlp": 0.01039632, + "balance_loss_clip": 1.04204297, + "balance_loss_mlp": 1.0262146, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.6744630589451206, + "language_loss": 0.70792377, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.72940493, + "num_input_tokens_seen": 178669720, + "step": 8311, + "time_per_iteration": 2.4300730228424072 + }, + { + "auxiliary_loss_clip": 0.01095695, + "auxiliary_loss_mlp": 0.01035288, + "balance_loss_clip": 1.04484379, + "balance_loss_mlp": 1.02167988, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.9447081179685903, + "language_loss": 0.77267087, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79398066, + "num_input_tokens_seen": 178686765, + "step": 8312, + "time_per_iteration": 2.4751152992248535 + }, + { + "auxiliary_loss_clip": 0.01092156, + "auxiliary_loss_mlp": 0.0103618, + "balance_loss_clip": 1.04340601, + "balance_loss_mlp": 1.02248263, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.7022565792252464, + "language_loss": 0.84528208, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86656547, + "num_input_tokens_seen": 178705845, + "step": 8313, + "time_per_iteration": 2.5865323543548584 + }, + { + "auxiliary_loss_clip": 0.01095733, + "auxiliary_loss_mlp": 0.01029945, + "balance_loss_clip": 1.041924, + "balance_loss_mlp": 1.01625419, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 1.7727598087331902, + "language_loss": 0.80976886, + "learning_rate": 2.097910461710939e-06, + "loss": 0.8310256, + "num_input_tokens_seen": 178723410, + "step": 8314, + "time_per_iteration": 2.50723934173584 + }, + { + "auxiliary_loss_clip": 0.01090438, + "auxiliary_loss_mlp": 0.00779784, + "balance_loss_clip": 1.0413568, + "balance_loss_mlp": 1.00038195, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 1.8428357193717906, + "language_loss": 0.79371542, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81241763, + "num_input_tokens_seen": 178743560, + "step": 8315, + "time_per_iteration": 2.5715880393981934 + }, + { + "auxiliary_loss_clip": 0.01120097, + "auxiliary_loss_mlp": 0.0103383, + "balance_loss_clip": 1.0431751, + "balance_loss_mlp": 1.02079475, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 1.6577714405033956, + "language_loss": 0.74522722, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.76676643, + "num_input_tokens_seen": 178767225, + "step": 8316, + "time_per_iteration": 4.178116083145142 + }, + { + "auxiliary_loss_clip": 0.01104858, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.04569554, + "balance_loss_mlp": 1.02140248, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.5383363881533763, + "language_loss": 0.8157599, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83714634, + "num_input_tokens_seen": 178786810, + "step": 8317, + "time_per_iteration": 2.5125908851623535 + }, + { + "auxiliary_loss_clip": 0.01098328, + "auxiliary_loss_mlp": 0.01038994, + "balance_loss_clip": 1.04033065, + "balance_loss_mlp": 1.02389574, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 1.5951463237797046, + "language_loss": 0.83422804, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85560125, + "num_input_tokens_seen": 178805660, + "step": 8318, + "time_per_iteration": 2.4996466636657715 + }, + { + "auxiliary_loss_clip": 0.01110557, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.04287052, + "balance_loss_mlp": 1.01929939, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 2.57548839958949, + "language_loss": 0.81171918, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83314848, + "num_input_tokens_seen": 178824780, + "step": 8319, + "time_per_iteration": 2.457920789718628 + }, + { + "auxiliary_loss_clip": 0.01079467, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.03668988, + "balance_loss_mlp": 1.01921308, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.9689389651828932, + "language_loss": 0.71992463, + "learning_rate": 2.095576427171635e-06, + "loss": 0.7410382, + "num_input_tokens_seen": 178845640, + "step": 8320, + "time_per_iteration": 2.5784828662872314 + }, + { + "auxiliary_loss_clip": 0.01090119, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_clip": 1.04268718, + "balance_loss_mlp": 1.0292201, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 2.749358900735247, + "language_loss": 0.76653707, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.78788006, + "num_input_tokens_seen": 178862290, + "step": 8321, + "time_per_iteration": 2.4925084114074707 + }, + { + "auxiliary_loss_clip": 0.01113719, + "auxiliary_loss_mlp": 0.00779585, + "balance_loss_clip": 1.04511333, + "balance_loss_mlp": 1.00041783, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 1.7496052590086768, + "language_loss": 0.82762504, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.84655809, + "num_input_tokens_seen": 178879805, + "step": 8322, + "time_per_iteration": 2.4449946880340576 + }, + { + "auxiliary_loss_clip": 0.01113423, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.04189265, + "balance_loss_mlp": 1.02073967, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.3161810648049865, + "language_loss": 0.73315525, + "learning_rate": 2.094409360775228e-06, + "loss": 0.75463217, + "num_input_tokens_seen": 178896985, + "step": 8323, + "time_per_iteration": 2.460815668106079 + }, + { + "auxiliary_loss_clip": 0.01086839, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.04621291, + "balance_loss_mlp": 1.02303183, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.6211099505672841, + "language_loss": 0.69149482, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71273172, + "num_input_tokens_seen": 178920605, + "step": 8324, + "time_per_iteration": 2.608225107192993 + }, + { + "auxiliary_loss_clip": 0.01110843, + "auxiliary_loss_mlp": 0.00779748, + "balance_loss_clip": 1.04609692, + "balance_loss_mlp": 1.00035346, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 2.391615325733461, + "language_loss": 0.72075784, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.73966372, + "num_input_tokens_seen": 178937760, + "step": 8325, + "time_per_iteration": 2.4749932289123535 + }, + { + "auxiliary_loss_clip": 0.0108764, + "auxiliary_loss_mlp": 0.01045234, + "balance_loss_clip": 1.03829789, + "balance_loss_mlp": 1.02971315, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 46.80463417951947, + "language_loss": 0.73575687, + "learning_rate": 2.093242262158709e-06, + "loss": 0.75708568, + "num_input_tokens_seen": 178957985, + "step": 8326, + "time_per_iteration": 2.565495014190674 + }, + { + "auxiliary_loss_clip": 0.01093923, + "auxiliary_loss_mlp": 0.01031835, + "balance_loss_clip": 1.04094708, + "balance_loss_mlp": 1.01873994, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.4957861095146034, + "language_loss": 0.7801497, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80140734, + "num_input_tokens_seen": 178977070, + "step": 8327, + "time_per_iteration": 2.486536741256714 + }, + { + "auxiliary_loss_clip": 0.0112426, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.04413629, + "balance_loss_mlp": 1.02774811, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 6.5253962725070815, + "language_loss": 0.8704772, + "learning_rate": 2.092464178710997e-06, + "loss": 0.89213574, + "num_input_tokens_seen": 178994175, + "step": 8328, + "time_per_iteration": 2.41475248336792 + }, + { + "auxiliary_loss_clip": 0.01089101, + "auxiliary_loss_mlp": 0.01034954, + "balance_loss_clip": 1.04068494, + "balance_loss_mlp": 1.02162075, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 1.99037634315027, + "language_loss": 0.74311829, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76435876, + "num_input_tokens_seen": 179013710, + "step": 8329, + "time_per_iteration": 2.546623945236206 + }, + { + "auxiliary_loss_clip": 0.01120855, + "auxiliary_loss_mlp": 0.0103726, + "balance_loss_clip": 1.04404879, + "balance_loss_mlp": 1.02437901, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 2.0113452384004247, + "language_loss": 0.79340696, + "learning_rate": 2.091686081238281e-06, + "loss": 0.81498814, + "num_input_tokens_seen": 179035255, + "step": 8330, + "time_per_iteration": 2.526352643966675 + }, + { + "auxiliary_loss_clip": 0.01025242, + "auxiliary_loss_mlp": 0.00754606, + "balance_loss_clip": 1.02599621, + "balance_loss_mlp": 0.99980336, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7279041400610088, + "language_loss": 0.56064975, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.57844824, + "num_input_tokens_seen": 179090915, + "step": 8331, + "time_per_iteration": 4.307205438613892 + }, + { + "auxiliary_loss_clip": 0.01109551, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.04356337, + "balance_loss_mlp": 1.01971948, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 2.819199518828288, + "language_loss": 0.65908241, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.6805023, + "num_input_tokens_seen": 179109160, + "step": 8332, + "time_per_iteration": 2.513761281967163 + }, + { + "auxiliary_loss_clip": 0.01119045, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.04293656, + "balance_loss_mlp": 1.02551842, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.6759356191736197, + "language_loss": 0.74851871, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.77009535, + "num_input_tokens_seen": 179130610, + "step": 8333, + "time_per_iteration": 2.4806182384490967 + }, + { + "auxiliary_loss_clip": 0.01122674, + "auxiliary_loss_mlp": 0.0103463, + "balance_loss_clip": 1.04236913, + "balance_loss_mlp": 1.02170193, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 2.8300500302019134, + "language_loss": 0.8093009, + "learning_rate": 2.090129844689929e-06, + "loss": 0.83087397, + "num_input_tokens_seen": 179147860, + "step": 8334, + "time_per_iteration": 2.422412395477295 + }, + { + "auxiliary_loss_clip": 0.01031937, + "auxiliary_loss_mlp": 0.01005978, + "balance_loss_clip": 1.01554418, + "balance_loss_mlp": 1.0048039, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8934393444007083, + "language_loss": 0.62690312, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64728224, + "num_input_tokens_seen": 179210490, + "step": 8335, + "time_per_iteration": 3.0183753967285156 + }, + { + "auxiliary_loss_clip": 0.01107196, + "auxiliary_loss_mlp": 0.01032116, + "balance_loss_clip": 1.03948259, + "balance_loss_mlp": 1.01911068, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.8562225760464859, + "language_loss": 0.79709959, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81849277, + "num_input_tokens_seen": 179231360, + "step": 8336, + "time_per_iteration": 2.5110726356506348 + }, + { + "auxiliary_loss_clip": 0.01081375, + "auxiliary_loss_mlp": 0.01035495, + "balance_loss_clip": 1.03653097, + "balance_loss_mlp": 1.02093995, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.8031916711117502, + "language_loss": 0.80319369, + "learning_rate": 2.088962631340836e-06, + "loss": 0.8243624, + "num_input_tokens_seen": 179250625, + "step": 8337, + "time_per_iteration": 2.545550584793091 + }, + { + "auxiliary_loss_clip": 0.01125731, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.04281092, + "balance_loss_mlp": 1.02095973, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 1.8317305636925674, + "language_loss": 0.79415947, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.81576854, + "num_input_tokens_seen": 179267360, + "step": 8338, + "time_per_iteration": 2.4194977283477783 + }, + { + "auxiliary_loss_clip": 0.01096892, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.04017675, + "balance_loss_mlp": 1.0182339, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.6892284551012648, + "language_loss": 0.84937882, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87066817, + "num_input_tokens_seen": 179289810, + "step": 8339, + "time_per_iteration": 2.5465199947357178 + }, + { + "auxiliary_loss_clip": 0.01107301, + "auxiliary_loss_mlp": 0.01034808, + "balance_loss_clip": 1.04055679, + "balance_loss_mlp": 1.02145076, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 2.1551304962466684, + "language_loss": 0.70931089, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.73073196, + "num_input_tokens_seen": 179310620, + "step": 8340, + "time_per_iteration": 4.010798692703247 + }, + { + "auxiliary_loss_clip": 0.01088089, + "auxiliary_loss_mlp": 0.01043847, + "balance_loss_clip": 1.03871775, + "balance_loss_mlp": 1.02787292, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 2.689292901289623, + "language_loss": 0.78304625, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.80436563, + "num_input_tokens_seen": 179329005, + "step": 8341, + "time_per_iteration": 3.9171903133392334 + }, + { + "auxiliary_loss_clip": 0.01092498, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.04137623, + "balance_loss_mlp": 1.02395833, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 3.0199531321828283, + "language_loss": 0.89643312, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91774344, + "num_input_tokens_seen": 179343785, + "step": 8342, + "time_per_iteration": 2.490506410598755 + }, + { + "auxiliary_loss_clip": 0.01099578, + "auxiliary_loss_mlp": 0.01035719, + "balance_loss_clip": 1.04167223, + "balance_loss_mlp": 1.02187896, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 1.8081484452492922, + "language_loss": 0.7669031, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.78825605, + "num_input_tokens_seen": 179364070, + "step": 8343, + "time_per_iteration": 2.541440725326538 + }, + { + "auxiliary_loss_clip": 0.01107974, + "auxiliary_loss_mlp": 0.01029565, + "balance_loss_clip": 1.04082823, + "balance_loss_mlp": 1.01712513, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 1.708750459197784, + "language_loss": 0.66962534, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69100076, + "num_input_tokens_seen": 179384225, + "step": 8344, + "time_per_iteration": 2.471508264541626 + }, + { + "auxiliary_loss_clip": 0.01101266, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.04127288, + "balance_loss_mlp": 1.01991844, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 2.0888021301330033, + "language_loss": 0.75570905, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77705073, + "num_input_tokens_seen": 179402595, + "step": 8345, + "time_per_iteration": 2.5526177883148193 + }, + { + "auxiliary_loss_clip": 0.01107549, + "auxiliary_loss_mlp": 0.01033433, + "balance_loss_clip": 1.04509795, + "balance_loss_mlp": 1.01860321, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 1.9551216776013771, + "language_loss": 0.78196198, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.80337179, + "num_input_tokens_seen": 179419635, + "step": 8346, + "time_per_iteration": 2.464884042739868 + }, + { + "auxiliary_loss_clip": 0.01099542, + "auxiliary_loss_mlp": 0.00778242, + "balance_loss_clip": 1.03946745, + "balance_loss_mlp": 1.00025606, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.45021123219327, + "language_loss": 0.68934882, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.70812666, + "num_input_tokens_seen": 179438770, + "step": 8347, + "time_per_iteration": 2.501147985458374 + }, + { + "auxiliary_loss_clip": 0.01083584, + "auxiliary_loss_mlp": 0.01036363, + "balance_loss_clip": 1.03664923, + "balance_loss_mlp": 1.02218366, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 1.919211434461686, + "language_loss": 0.7095378, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73073733, + "num_input_tokens_seen": 179457475, + "step": 8348, + "time_per_iteration": 2.528087854385376 + }, + { + "auxiliary_loss_clip": 0.01107151, + "auxiliary_loss_mlp": 0.01034631, + "balance_loss_clip": 1.04156077, + "balance_loss_mlp": 1.02232885, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.5337537104786811, + "language_loss": 0.74274015, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76415801, + "num_input_tokens_seen": 179478140, + "step": 8349, + "time_per_iteration": 2.481516122817993 + }, + { + "auxiliary_loss_clip": 0.01109967, + "auxiliary_loss_mlp": 0.01032225, + "balance_loss_clip": 1.04062736, + "balance_loss_mlp": 1.0176338, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.009810677487088, + "language_loss": 0.63593256, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.65735453, + "num_input_tokens_seen": 179494325, + "step": 8350, + "time_per_iteration": 2.4495041370391846 + }, + { + "auxiliary_loss_clip": 0.01012535, + "auxiliary_loss_mlp": 0.01003629, + "balance_loss_clip": 1.01440334, + "balance_loss_mlp": 1.00215113, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 0.7713936981751303, + "language_loss": 0.59757483, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.61773646, + "num_input_tokens_seen": 179553545, + "step": 8351, + "time_per_iteration": 3.237375497817993 + }, + { + "auxiliary_loss_clip": 0.01095124, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.03792286, + "balance_loss_mlp": 1.0217886, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 1.7750256983560933, + "language_loss": 0.75237238, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77368128, + "num_input_tokens_seen": 179573645, + "step": 8352, + "time_per_iteration": 2.522357225418091 + }, + { + "auxiliary_loss_clip": 0.01095026, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.04068482, + "balance_loss_mlp": 1.01911616, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 1.8460394551135038, + "language_loss": 0.72143781, + "learning_rate": 2.082736990429464e-06, + "loss": 0.74271822, + "num_input_tokens_seen": 179591435, + "step": 8353, + "time_per_iteration": 2.4928336143493652 + }, + { + "auxiliary_loss_clip": 0.01118317, + "auxiliary_loss_mlp": 0.01037798, + "balance_loss_clip": 1.04739428, + "balance_loss_mlp": 1.02267051, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 2.1147192719172643, + "language_loss": 0.73826319, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.75982434, + "num_input_tokens_seen": 179609955, + "step": 8354, + "time_per_iteration": 2.4670469760894775 + }, + { + "auxiliary_loss_clip": 0.01097566, + "auxiliary_loss_mlp": 0.01039343, + "balance_loss_clip": 1.04059136, + "balance_loss_mlp": 1.02536559, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.6830084089264368, + "language_loss": 0.72401881, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74538791, + "num_input_tokens_seen": 179630875, + "step": 8355, + "time_per_iteration": 2.575582265853882 + }, + { + "auxiliary_loss_clip": 0.01114093, + "auxiliary_loss_mlp": 0.01037303, + "balance_loss_clip": 1.04398108, + "balance_loss_mlp": 1.02175856, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.5839417513367353, + "language_loss": 0.81310439, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83461827, + "num_input_tokens_seen": 179649835, + "step": 8356, + "time_per_iteration": 4.00567102432251 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.04302263, + "balance_loss_mlp": 1.02364147, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 5.0552840761961235, + "language_loss": 0.76337641, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78493011, + "num_input_tokens_seen": 179667605, + "step": 8357, + "time_per_iteration": 2.466205358505249 + }, + { + "auxiliary_loss_clip": 0.01113351, + "auxiliary_loss_mlp": 0.01035823, + "balance_loss_clip": 1.04418969, + "balance_loss_mlp": 1.02086782, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.9228719075987202, + "language_loss": 0.76264906, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78414083, + "num_input_tokens_seen": 179686910, + "step": 8358, + "time_per_iteration": 2.564803123474121 + }, + { + "auxiliary_loss_clip": 0.01099949, + "auxiliary_loss_mlp": 0.01038737, + "balance_loss_clip": 1.04023516, + "balance_loss_mlp": 1.02439022, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.449224835993534, + "language_loss": 0.72251177, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74389863, + "num_input_tokens_seen": 179706395, + "step": 8359, + "time_per_iteration": 2.5255236625671387 + }, + { + "auxiliary_loss_clip": 0.01093725, + "auxiliary_loss_mlp": 0.01043569, + "balance_loss_clip": 1.04013598, + "balance_loss_mlp": 1.02900743, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.6983835707131372, + "language_loss": 0.76984298, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79121596, + "num_input_tokens_seen": 179725735, + "step": 8360, + "time_per_iteration": 2.505389928817749 + }, + { + "auxiliary_loss_clip": 0.01081669, + "auxiliary_loss_mlp": 0.01035519, + "balance_loss_clip": 1.04421842, + "balance_loss_mlp": 1.02226305, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.5610043451029305, + "language_loss": 0.76841867, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78959054, + "num_input_tokens_seen": 179746150, + "step": 8361, + "time_per_iteration": 2.536986827850342 + }, + { + "auxiliary_loss_clip": 0.01092369, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.03939629, + "balance_loss_mlp": 1.01828909, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.6249054521666855, + "language_loss": 0.8517288, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.87298501, + "num_input_tokens_seen": 179767550, + "step": 8362, + "time_per_iteration": 2.5968666076660156 + }, + { + "auxiliary_loss_clip": 0.01098775, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.04158652, + "balance_loss_mlp": 1.01625597, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 1.6001017224597063, + "language_loss": 0.78490192, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80618811, + "num_input_tokens_seen": 179790075, + "step": 8363, + "time_per_iteration": 2.5424084663391113 + }, + { + "auxiliary_loss_clip": 0.01107622, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.04264593, + "balance_loss_mlp": 1.01879478, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 2.712726302389151, + "language_loss": 0.75876713, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.78016806, + "num_input_tokens_seen": 179806515, + "step": 8364, + "time_per_iteration": 2.4810001850128174 + }, + { + "auxiliary_loss_clip": 0.01121041, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.04396045, + "balance_loss_mlp": 1.01760006, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.510056338334513, + "language_loss": 0.69363075, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.71514916, + "num_input_tokens_seen": 179826450, + "step": 8365, + "time_per_iteration": 2.4297983646392822 + }, + { + "auxiliary_loss_clip": 0.01101775, + "auxiliary_loss_mlp": 0.0103488, + "balance_loss_clip": 1.04190063, + "balance_loss_mlp": 1.02039623, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.4895290821491165, + "language_loss": 0.73348129, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75484776, + "num_input_tokens_seen": 179846770, + "step": 8366, + "time_per_iteration": 2.643720865249634 + }, + { + "auxiliary_loss_clip": 0.01111877, + "auxiliary_loss_mlp": 0.01033105, + "balance_loss_clip": 1.04493475, + "balance_loss_mlp": 1.01943159, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.4822248218546799, + "language_loss": 0.78127164, + "learning_rate": 2.077288893713735e-06, + "loss": 0.8027215, + "num_input_tokens_seen": 179866585, + "step": 8367, + "time_per_iteration": 2.5232443809509277 + }, + { + "auxiliary_loss_clip": 0.01111413, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.041574, + "balance_loss_mlp": 1.01787877, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 2.208065387771614, + "language_loss": 0.70147127, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.7228989, + "num_input_tokens_seen": 179885575, + "step": 8368, + "time_per_iteration": 2.479663133621216 + }, + { + "auxiliary_loss_clip": 0.01030084, + "auxiliary_loss_mlp": 0.01002248, + "balance_loss_clip": 1.01221085, + "balance_loss_mlp": 1.00097203, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.9504820216149809, + "language_loss": 0.63371587, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65403914, + "num_input_tokens_seen": 179939650, + "step": 8369, + "time_per_iteration": 3.0340847969055176 + }, + { + "auxiliary_loss_clip": 0.01112244, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.04903328, + "balance_loss_mlp": 1.01853609, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 2.164309206953638, + "language_loss": 0.60451096, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62594515, + "num_input_tokens_seen": 179961765, + "step": 8370, + "time_per_iteration": 4.060229301452637 + }, + { + "auxiliary_loss_clip": 0.01074783, + "auxiliary_loss_mlp": 0.01043097, + "balance_loss_clip": 1.03792524, + "balance_loss_mlp": 1.02745676, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 2.8749203773813576, + "language_loss": 0.68460017, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.7057789, + "num_input_tokens_seen": 179983015, + "step": 8371, + "time_per_iteration": 2.6910741329193115 + }, + { + "auxiliary_loss_clip": 0.01098194, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.04064417, + "balance_loss_mlp": 1.01757073, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 1.7091836896778043, + "language_loss": 0.6760965, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.69741017, + "num_input_tokens_seen": 180003210, + "step": 8372, + "time_per_iteration": 2.61716365814209 + }, + { + "auxiliary_loss_clip": 0.01083515, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.0386889, + "balance_loss_mlp": 1.02602541, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 1.598300121880864, + "language_loss": 0.66503847, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68630004, + "num_input_tokens_seen": 180025530, + "step": 8373, + "time_per_iteration": 2.641731023788452 + }, + { + "auxiliary_loss_clip": 0.01097149, + "auxiliary_loss_mlp": 0.0103471, + "balance_loss_clip": 1.04167032, + "balance_loss_mlp": 1.02102494, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.7188189399037228, + "language_loss": 0.74910879, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.77042741, + "num_input_tokens_seen": 180043180, + "step": 8374, + "time_per_iteration": 2.5257842540740967 + }, + { + "auxiliary_loss_clip": 0.01097934, + "auxiliary_loss_mlp": 0.01038434, + "balance_loss_clip": 1.04146934, + "balance_loss_mlp": 1.02381873, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 2.0329236257145333, + "language_loss": 0.6816287, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70299238, + "num_input_tokens_seen": 180062905, + "step": 8375, + "time_per_iteration": 2.52756667137146 + }, + { + "auxiliary_loss_clip": 0.01076453, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.04279351, + "balance_loss_mlp": 1.02067494, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 1.671343949496694, + "language_loss": 0.78775233, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.80888444, + "num_input_tokens_seen": 180082000, + "step": 8376, + "time_per_iteration": 2.5870542526245117 + }, + { + "auxiliary_loss_clip": 0.0111587, + "auxiliary_loss_mlp": 0.00780057, + "balance_loss_clip": 1.04393435, + "balance_loss_mlp": 1.00041246, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 1.9030456923260601, + "language_loss": 0.59180474, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61076391, + "num_input_tokens_seen": 180101340, + "step": 8377, + "time_per_iteration": 2.558443784713745 + }, + { + "auxiliary_loss_clip": 0.01099556, + "auxiliary_loss_mlp": 0.01040585, + "balance_loss_clip": 1.04207349, + "balance_loss_mlp": 1.02621424, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 1.949109485877706, + "language_loss": 0.76183581, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78323722, + "num_input_tokens_seen": 180119160, + "step": 8378, + "time_per_iteration": 2.4751064777374268 + }, + { + "auxiliary_loss_clip": 0.01081241, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.03963566, + "balance_loss_mlp": 1.02461505, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.7586621272662613, + "language_loss": 0.74706018, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.76825416, + "num_input_tokens_seen": 180138730, + "step": 8379, + "time_per_iteration": 2.571211814880371 + }, + { + "auxiliary_loss_clip": 0.01109792, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.04541564, + "balance_loss_mlp": 1.02304196, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 3.581146213159492, + "language_loss": 0.66587812, + "learning_rate": 2.072229431544548e-06, + "loss": 0.68733883, + "num_input_tokens_seen": 180158810, + "step": 8380, + "time_per_iteration": 4.05018162727356 + }, + { + "auxiliary_loss_clip": 0.01066947, + "auxiliary_loss_mlp": 0.0103579, + "balance_loss_clip": 1.03959227, + "balance_loss_mlp": 1.02314758, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1.8172972837651156, + "language_loss": 0.6373648, + "learning_rate": 2.071840222561051e-06, + "loss": 0.65839219, + "num_input_tokens_seen": 180179700, + "step": 8381, + "time_per_iteration": 4.052227020263672 + }, + { + "auxiliary_loss_clip": 0.01099345, + "auxiliary_loss_mlp": 0.01040274, + "balance_loss_clip": 1.04049027, + "balance_loss_mlp": 1.02707791, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.444771951523214, + "language_loss": 0.67495638, + "learning_rate": 2.071451010853365e-06, + "loss": 0.6963526, + "num_input_tokens_seen": 180199890, + "step": 8382, + "time_per_iteration": 2.562023401260376 + }, + { + "auxiliary_loss_clip": 0.01112458, + "auxiliary_loss_mlp": 0.01041155, + "balance_loss_clip": 1.04641032, + "balance_loss_mlp": 1.02640855, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 1.900178626028111, + "language_loss": 0.62153685, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64307296, + "num_input_tokens_seen": 180217840, + "step": 8383, + "time_per_iteration": 2.4875590801239014 + }, + { + "auxiliary_loss_clip": 0.01086266, + "auxiliary_loss_mlp": 0.01035099, + "balance_loss_clip": 1.04249549, + "balance_loss_mlp": 1.02168214, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 1.8751815348892622, + "language_loss": 0.66577601, + "learning_rate": 2.070672579324465e-06, + "loss": 0.68698967, + "num_input_tokens_seen": 180236465, + "step": 8384, + "time_per_iteration": 2.546863317489624 + }, + { + "auxiliary_loss_clip": 0.01104201, + "auxiliary_loss_mlp": 0.01041737, + "balance_loss_clip": 1.04216528, + "balance_loss_mlp": 1.02878451, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.5837716827645756, + "language_loss": 0.70991719, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.73137653, + "num_input_tokens_seen": 180258025, + "step": 8385, + "time_per_iteration": 2.624561309814453 + }, + { + "auxiliary_loss_clip": 0.01107272, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.04082537, + "balance_loss_mlp": 1.01970458, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 2.1774529357524437, + "language_loss": 0.83156669, + "learning_rate": 2.069894137075919e-06, + "loss": 0.85296869, + "num_input_tokens_seen": 180277825, + "step": 8386, + "time_per_iteration": 2.5058066844940186 + }, + { + "auxiliary_loss_clip": 0.011144, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.04955828, + "balance_loss_mlp": 1.02128887, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.7785820959687613, + "language_loss": 0.66407919, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.68557948, + "num_input_tokens_seen": 180300465, + "step": 8387, + "time_per_iteration": 2.5765469074249268 + }, + { + "auxiliary_loss_clip": 0.0106716, + "auxiliary_loss_mlp": 0.01035833, + "balance_loss_clip": 1.03899646, + "balance_loss_mlp": 1.02311349, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.6736363172296154, + "language_loss": 0.80457306, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82560301, + "num_input_tokens_seen": 180321050, + "step": 8388, + "time_per_iteration": 2.6510298252105713 + }, + { + "auxiliary_loss_clip": 0.01110317, + "auxiliary_loss_mlp": 0.01035348, + "balance_loss_clip": 1.04331338, + "balance_loss_mlp": 1.02222276, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 3.6999842853040312, + "language_loss": 0.69790232, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.71935898, + "num_input_tokens_seen": 180338870, + "step": 8389, + "time_per_iteration": 2.5323684215545654 + }, + { + "auxiliary_loss_clip": 0.01096537, + "auxiliary_loss_mlp": 0.0104222, + "balance_loss_clip": 1.04545856, + "balance_loss_mlp": 1.02888632, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 1.5851942216452004, + "language_loss": 0.69185305, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71324062, + "num_input_tokens_seen": 180361285, + "step": 8390, + "time_per_iteration": 2.5838818550109863 + }, + { + "auxiliary_loss_clip": 0.01024939, + "auxiliary_loss_mlp": 0.01016197, + "balance_loss_clip": 1.01754546, + "balance_loss_mlp": 1.01506424, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.828900735944493, + "language_loss": 0.52863765, + "learning_rate": 2.067947985330974e-06, + "loss": 0.54904902, + "num_input_tokens_seen": 180415170, + "step": 8391, + "time_per_iteration": 2.8905255794525146 + }, + { + "auxiliary_loss_clip": 0.01036868, + "auxiliary_loss_mlp": 0.01004065, + "balance_loss_clip": 1.04428542, + "balance_loss_mlp": 1.00286698, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.8702857102058245, + "language_loss": 0.60728306, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.6276924, + "num_input_tokens_seen": 180468060, + "step": 8392, + "time_per_iteration": 2.952604293823242 + }, + { + "auxiliary_loss_clip": 0.0108701, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.04133713, + "balance_loss_mlp": 1.02246284, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.689798445894869, + "language_loss": 0.843503, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86472839, + "num_input_tokens_seen": 180486610, + "step": 8393, + "time_per_iteration": 2.6070315837860107 + }, + { + "auxiliary_loss_clip": 0.01090047, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.04287314, + "balance_loss_mlp": 1.01948798, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 1.9515238798546186, + "language_loss": 0.51211762, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.53334266, + "num_input_tokens_seen": 180508135, + "step": 8394, + "time_per_iteration": 4.248631954193115 + }, + { + "auxiliary_loss_clip": 0.01119208, + "auxiliary_loss_mlp": 0.01035619, + "balance_loss_clip": 1.04059672, + "balance_loss_mlp": 1.02111149, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.5107142146382075, + "language_loss": 0.75422037, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.7757687, + "num_input_tokens_seen": 180527000, + "step": 8395, + "time_per_iteration": 2.517484188079834 + }, + { + "auxiliary_loss_clip": 0.01103476, + "auxiliary_loss_mlp": 0.01035548, + "balance_loss_clip": 1.03990686, + "balance_loss_mlp": 1.02187526, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 1.8377006164222431, + "language_loss": 0.68118727, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.70257753, + "num_input_tokens_seen": 180544715, + "step": 8396, + "time_per_iteration": 2.4832053184509277 + }, + { + "auxiliary_loss_clip": 0.01110735, + "auxiliary_loss_mlp": 0.01033083, + "balance_loss_clip": 1.04400134, + "balance_loss_mlp": 1.02004743, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 2.126529613692863, + "language_loss": 0.78809994, + "learning_rate": 2.065612518371792e-06, + "loss": 0.80953813, + "num_input_tokens_seen": 180565365, + "step": 8397, + "time_per_iteration": 2.56264066696167 + }, + { + "auxiliary_loss_clip": 0.01080636, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.04265237, + "balance_loss_mlp": 1.01646709, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 1.5082429789557765, + "language_loss": 0.66408801, + "learning_rate": 2.065223265084376e-06, + "loss": 0.6851868, + "num_input_tokens_seen": 180586670, + "step": 8398, + "time_per_iteration": 2.6326327323913574 + }, + { + "auxiliary_loss_clip": 0.01108723, + "auxiliary_loss_mlp": 0.00779065, + "balance_loss_clip": 1.04313207, + "balance_loss_mlp": 1.00027633, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.8646590510128587, + "language_loss": 0.71818924, + "learning_rate": 2.064834009323688e-06, + "loss": 0.7370671, + "num_input_tokens_seen": 180605085, + "step": 8399, + "time_per_iteration": 2.5130701065063477 + }, + { + "auxiliary_loss_clip": 0.01089658, + "auxiliary_loss_mlp": 0.01051148, + "balance_loss_clip": 1.03931332, + "balance_loss_mlp": 1.03624701, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 1.7668004051434578, + "language_loss": 0.81368208, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.8350901, + "num_input_tokens_seen": 180624370, + "step": 8400, + "time_per_iteration": 2.5402045249938965 + }, + { + "auxiliary_loss_clip": 0.01080521, + "auxiliary_loss_mlp": 0.01040788, + "balance_loss_clip": 1.03822517, + "balance_loss_mlp": 1.02575541, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 1.9625339132368276, + "language_loss": 0.78874648, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.80995953, + "num_input_tokens_seen": 180642450, + "step": 8401, + "time_per_iteration": 2.5816903114318848 + }, + { + "auxiliary_loss_clip": 0.01120449, + "auxiliary_loss_mlp": 0.00778636, + "balance_loss_clip": 1.04113483, + "balance_loss_mlp": 1.00037932, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.5967537089477315, + "language_loss": 0.70224786, + "learning_rate": 2.063666227349593e-06, + "loss": 0.72123873, + "num_input_tokens_seen": 180665250, + "step": 8402, + "time_per_iteration": 2.5761868953704834 + }, + { + "auxiliary_loss_clip": 0.01109063, + "auxiliary_loss_mlp": 0.00779225, + "balance_loss_clip": 1.03969383, + "balance_loss_mlp": 1.00030613, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 1.656105994193941, + "language_loss": 0.69521892, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71410179, + "num_input_tokens_seen": 180687425, + "step": 8403, + "time_per_iteration": 2.5670254230499268 + }, + { + "auxiliary_loss_clip": 0.01106605, + "auxiliary_loss_mlp": 0.01036983, + "balance_loss_clip": 1.04084301, + "balance_loss_mlp": 1.02384639, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.661405125905616, + "language_loss": 0.85645252, + "learning_rate": 2.062887693937781e-06, + "loss": 0.87788844, + "num_input_tokens_seen": 180708725, + "step": 8404, + "time_per_iteration": 2.5401721000671387 + }, + { + "auxiliary_loss_clip": 0.01087551, + "auxiliary_loss_mlp": 0.00777021, + "balance_loss_clip": 1.04358053, + "balance_loss_mlp": 1.0003612, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.6367239938879714, + "language_loss": 0.75395441, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77260011, + "num_input_tokens_seen": 180727990, + "step": 8405, + "time_per_iteration": 2.6420607566833496 + }, + { + "auxiliary_loss_clip": 0.01121327, + "auxiliary_loss_mlp": 0.01025987, + "balance_loss_clip": 1.04279172, + "balance_loss_mlp": 1.01213515, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 2.156024067215698, + "language_loss": 0.73101646, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75248963, + "num_input_tokens_seen": 180749765, + "step": 8406, + "time_per_iteration": 2.650423765182495 + }, + { + "auxiliary_loss_clip": 0.0108608, + "auxiliary_loss_mlp": 0.01034597, + "balance_loss_clip": 1.04646873, + "balance_loss_mlp": 1.02210402, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 1.8968910607322635, + "language_loss": 0.76919156, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.79039836, + "num_input_tokens_seen": 180769580, + "step": 8407, + "time_per_iteration": 2.632394313812256 + }, + { + "auxiliary_loss_clip": 0.01084837, + "auxiliary_loss_mlp": 0.01034116, + "balance_loss_clip": 1.03779936, + "balance_loss_mlp": 1.02156305, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.8709871779635925, + "language_loss": 0.63398445, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.65517396, + "num_input_tokens_seen": 180790295, + "step": 8408, + "time_per_iteration": 2.652318000793457 + }, + { + "auxiliary_loss_clip": 0.01089924, + "auxiliary_loss_mlp": 0.01043464, + "balance_loss_clip": 1.0384649, + "balance_loss_mlp": 1.02717972, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 2.670188343325811, + "language_loss": 0.63954675, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.66088057, + "num_input_tokens_seen": 180807875, + "step": 8409, + "time_per_iteration": 4.057764291763306 + }, + { + "auxiliary_loss_clip": 0.01095984, + "auxiliary_loss_mlp": 0.01024194, + "balance_loss_clip": 1.04160738, + "balance_loss_mlp": 1.01213551, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.3411170194984194, + "language_loss": 0.71058375, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73178554, + "num_input_tokens_seen": 180831300, + "step": 8410, + "time_per_iteration": 2.6134750843048096 + }, + { + "auxiliary_loss_clip": 0.01094449, + "auxiliary_loss_mlp": 0.01040452, + "balance_loss_clip": 1.03870976, + "balance_loss_mlp": 1.02641511, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.5256083542946604, + "language_loss": 0.79033166, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81168067, + "num_input_tokens_seen": 180849055, + "step": 8411, + "time_per_iteration": 2.5327367782592773 + }, + { + "auxiliary_loss_clip": 0.01120446, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.04138017, + "balance_loss_mlp": 1.02432942, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 1.7372556773244188, + "language_loss": 0.81786287, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.83945572, + "num_input_tokens_seen": 180867395, + "step": 8412, + "time_per_iteration": 2.5825393199920654 + }, + { + "auxiliary_loss_clip": 0.01101092, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.04158282, + "balance_loss_mlp": 1.02689731, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 1.9125323174265678, + "language_loss": 0.80769551, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82911229, + "num_input_tokens_seen": 180886670, + "step": 8413, + "time_per_iteration": 2.5954232215881348 + }, + { + "auxiliary_loss_clip": 0.01087859, + "auxiliary_loss_mlp": 0.00779437, + "balance_loss_clip": 1.04075837, + "balance_loss_mlp": 1.00038922, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 2.054134957231423, + "language_loss": 0.80439985, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82307279, + "num_input_tokens_seen": 180904645, + "step": 8414, + "time_per_iteration": 2.6414401531219482 + }, + { + "auxiliary_loss_clip": 0.01106892, + "auxiliary_loss_mlp": 0.01030845, + "balance_loss_clip": 1.03802705, + "balance_loss_mlp": 1.01732063, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 1.9979064930558204, + "language_loss": 0.62284875, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64422607, + "num_input_tokens_seen": 180922340, + "step": 8415, + "time_per_iteration": 2.6074025630950928 + }, + { + "auxiliary_loss_clip": 0.01084421, + "auxiliary_loss_mlp": 0.01029611, + "balance_loss_clip": 1.03779674, + "balance_loss_mlp": 1.01636648, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.607071457348839, + "language_loss": 0.82261634, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.84375656, + "num_input_tokens_seen": 180941350, + "step": 8416, + "time_per_iteration": 2.5813982486724854 + }, + { + "auxiliary_loss_clip": 0.01089432, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.04653418, + "balance_loss_mlp": 1.02191591, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 1.6358428043530249, + "language_loss": 0.79296988, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81420815, + "num_input_tokens_seen": 180960720, + "step": 8417, + "time_per_iteration": 2.591970205307007 + }, + { + "auxiliary_loss_clip": 0.01068832, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.03695238, + "balance_loss_mlp": 1.02206993, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 1.9036817611898023, + "language_loss": 0.62799454, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.64903337, + "num_input_tokens_seen": 180979725, + "step": 8418, + "time_per_iteration": 2.675737142562866 + }, + { + "auxiliary_loss_clip": 0.01089379, + "auxiliary_loss_mlp": 0.01031341, + "balance_loss_clip": 1.04144406, + "balance_loss_mlp": 1.01888359, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 1.9851725248248897, + "language_loss": 0.7759912, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79719841, + "num_input_tokens_seen": 180998980, + "step": 8419, + "time_per_iteration": 7.391027212142944 + }, + { + "auxiliary_loss_clip": 0.01058521, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.03817153, + "balance_loss_mlp": 1.01966119, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.8676272367722861, + "language_loss": 0.77033424, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79125309, + "num_input_tokens_seen": 181019165, + "step": 8420, + "time_per_iteration": 4.112907409667969 + }, + { + "auxiliary_loss_clip": 0.01118974, + "auxiliary_loss_mlp": 0.01036577, + "balance_loss_clip": 1.04105949, + "balance_loss_mlp": 1.02264154, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 2.0750888874927615, + "language_loss": 0.7745654, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79612094, + "num_input_tokens_seen": 181037110, + "step": 8421, + "time_per_iteration": 2.476841688156128 + }, + { + "auxiliary_loss_clip": 0.01102618, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.03798711, + "balance_loss_mlp": 1.02240753, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 3.2389204301969143, + "language_loss": 0.66650087, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.68788809, + "num_input_tokens_seen": 181057775, + "step": 8422, + "time_per_iteration": 2.5523555278778076 + }, + { + "auxiliary_loss_clip": 0.01118553, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.04258704, + "balance_loss_mlp": 1.023633, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.7101864298009422, + "language_loss": 0.81763703, + "learning_rate": 2.05549116746431e-06, + "loss": 0.83919108, + "num_input_tokens_seen": 181078260, + "step": 8423, + "time_per_iteration": 2.5269336700439453 + }, + { + "auxiliary_loss_clip": 0.01121467, + "auxiliary_loss_mlp": 0.00779804, + "balance_loss_clip": 1.04194069, + "balance_loss_mlp": 1.00030243, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 1.8659033898977648, + "language_loss": 0.74867523, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76768792, + "num_input_tokens_seen": 181098755, + "step": 8424, + "time_per_iteration": 2.4902703762054443 + }, + { + "auxiliary_loss_clip": 0.01116074, + "auxiliary_loss_mlp": 0.0103635, + "balance_loss_clip": 1.04063451, + "balance_loss_mlp": 1.02259898, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.4367240835138462, + "language_loss": 0.71085, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73237425, + "num_input_tokens_seen": 181121570, + "step": 8425, + "time_per_iteration": 2.5723533630371094 + }, + { + "auxiliary_loss_clip": 0.01082399, + "auxiliary_loss_mlp": 0.01042544, + "balance_loss_clip": 1.03950107, + "balance_loss_mlp": 1.02830434, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 1.7382619560042036, + "language_loss": 0.78647852, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.80772799, + "num_input_tokens_seen": 181140240, + "step": 8426, + "time_per_iteration": 2.6071460247039795 + }, + { + "auxiliary_loss_clip": 0.01112246, + "auxiliary_loss_mlp": 0.01040288, + "balance_loss_clip": 1.04516304, + "balance_loss_mlp": 1.02718723, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 2.530971299202497, + "language_loss": 0.78170264, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80322796, + "num_input_tokens_seen": 181158630, + "step": 8427, + "time_per_iteration": 2.523798704147339 + }, + { + "auxiliary_loss_clip": 0.01116377, + "auxiliary_loss_mlp": 0.0102945, + "balance_loss_clip": 1.04105449, + "balance_loss_mlp": 1.01628947, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 1.7490311182301852, + "language_loss": 0.71315706, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73461527, + "num_input_tokens_seen": 181176405, + "step": 8428, + "time_per_iteration": 2.4677765369415283 + }, + { + "auxiliary_loss_clip": 0.01102964, + "auxiliary_loss_mlp": 0.00777693, + "balance_loss_clip": 1.03994966, + "balance_loss_mlp": 1.00033832, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.6036332370856872, + "language_loss": 0.83032262, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.8491292, + "num_input_tokens_seen": 181197595, + "step": 8429, + "time_per_iteration": 2.546358346939087 + }, + { + "auxiliary_loss_clip": 0.01092591, + "auxiliary_loss_mlp": 0.01038665, + "balance_loss_clip": 1.04518104, + "balance_loss_mlp": 1.02389503, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 1.9754428492803529, + "language_loss": 0.73287904, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75419152, + "num_input_tokens_seen": 181218560, + "step": 8430, + "time_per_iteration": 2.6564714908599854 + }, + { + "auxiliary_loss_clip": 0.01058698, + "auxiliary_loss_mlp": 0.01044125, + "balance_loss_clip": 1.03460836, + "balance_loss_mlp": 1.02895021, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.5975160238774935, + "language_loss": 0.76625109, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.78727937, + "num_input_tokens_seen": 181237095, + "step": 8431, + "time_per_iteration": 2.6338906288146973 + }, + { + "auxiliary_loss_clip": 0.0110502, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.03946614, + "balance_loss_mlp": 1.02581, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.9708454889874194, + "language_loss": 0.7266413, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74808466, + "num_input_tokens_seen": 181255940, + "step": 8432, + "time_per_iteration": 2.5299441814422607 + }, + { + "auxiliary_loss_clip": 0.01004641, + "auxiliary_loss_mlp": 0.01009076, + "balance_loss_clip": 1.02007937, + "balance_loss_mlp": 1.00722814, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7567180455577421, + "language_loss": 0.63649642, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65663362, + "num_input_tokens_seen": 181316945, + "step": 8433, + "time_per_iteration": 3.223116159439087 + }, + { + "auxiliary_loss_clip": 0.01086809, + "auxiliary_loss_mlp": 0.01042414, + "balance_loss_clip": 1.04137278, + "balance_loss_mlp": 1.0287292, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 2.6925182272888457, + "language_loss": 0.77345252, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79474473, + "num_input_tokens_seen": 181335555, + "step": 8434, + "time_per_iteration": 4.089850425720215 + }, + { + "auxiliary_loss_clip": 0.01098994, + "auxiliary_loss_mlp": 0.01035205, + "balance_loss_clip": 1.03979588, + "balance_loss_mlp": 1.02148414, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.9483380594889246, + "language_loss": 0.70563734, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.72697932, + "num_input_tokens_seen": 181354580, + "step": 8435, + "time_per_iteration": 2.620759963989258 + }, + { + "auxiliary_loss_clip": 0.0111652, + "auxiliary_loss_mlp": 0.01041983, + "balance_loss_clip": 1.04615068, + "balance_loss_mlp": 1.02670074, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 4.686786045887443, + "language_loss": 0.72168034, + "learning_rate": 2.050429942372112e-06, + "loss": 0.74326539, + "num_input_tokens_seen": 181374320, + "step": 8436, + "time_per_iteration": 2.574646472930908 + }, + { + "auxiliary_loss_clip": 0.01123014, + "auxiliary_loss_mlp": 0.01039507, + "balance_loss_clip": 1.04457736, + "balance_loss_mlp": 1.02455235, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 1.5879085954516738, + "language_loss": 0.83837879, + "learning_rate": 2.050040603565483e-06, + "loss": 0.86000395, + "num_input_tokens_seen": 181392190, + "step": 8437, + "time_per_iteration": 2.5262715816497803 + }, + { + "auxiliary_loss_clip": 0.01108658, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.04355741, + "balance_loss_mlp": 1.01751804, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.4503813473736764, + "language_loss": 0.80725354, + "learning_rate": 2.049651262861309e-06, + "loss": 0.82864618, + "num_input_tokens_seen": 181413890, + "step": 8438, + "time_per_iteration": 2.608825206756592 + }, + { + "auxiliary_loss_clip": 0.01085998, + "auxiliary_loss_mlp": 0.01040341, + "balance_loss_clip": 1.04889274, + "balance_loss_mlp": 1.02465296, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.544098045156807, + "language_loss": 0.79433084, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81559426, + "num_input_tokens_seen": 181433240, + "step": 8439, + "time_per_iteration": 2.6526150703430176 + }, + { + "auxiliary_loss_clip": 0.01090201, + "auxiliary_loss_mlp": 0.00782533, + "balance_loss_clip": 1.04224777, + "balance_loss_mlp": 1.00034547, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.693356379081181, + "language_loss": 0.71440566, + "learning_rate": 2.048872575819383e-06, + "loss": 0.73313296, + "num_input_tokens_seen": 181453535, + "step": 8440, + "time_per_iteration": 2.6096317768096924 + }, + { + "auxiliary_loss_clip": 0.01096348, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.04029191, + "balance_loss_mlp": 1.02604258, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.5222336036750186, + "language_loss": 0.70984459, + "learning_rate": 2.048483229511158e-06, + "loss": 0.73120821, + "num_input_tokens_seen": 181474195, + "step": 8441, + "time_per_iteration": 2.615694046020508 + }, + { + "auxiliary_loss_clip": 0.01111605, + "auxiliary_loss_mlp": 0.00779375, + "balance_loss_clip": 1.04200494, + "balance_loss_mlp": 1.00029707, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 1.5831790697865855, + "language_loss": 0.63572013, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65462995, + "num_input_tokens_seen": 181494000, + "step": 8442, + "time_per_iteration": 2.5023536682128906 + }, + { + "auxiliary_loss_clip": 0.010669, + "auxiliary_loss_mlp": 0.01027673, + "balance_loss_clip": 1.04053926, + "balance_loss_mlp": 1.01492953, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.4989719649357782, + "language_loss": 0.71282864, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73377442, + "num_input_tokens_seen": 181515955, + "step": 8443, + "time_per_iteration": 2.718599557876587 + }, + { + "auxiliary_loss_clip": 0.0103895, + "auxiliary_loss_mlp": 0.0103998, + "balance_loss_clip": 1.03084397, + "balance_loss_mlp": 1.0241369, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.205367460573575, + "language_loss": 0.6214574, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64224672, + "num_input_tokens_seen": 181540225, + "step": 8444, + "time_per_iteration": 2.973125457763672 + }, + { + "auxiliary_loss_clip": 0.01085011, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.03782344, + "balance_loss_mlp": 1.01962507, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.746094988924522, + "language_loss": 0.63990468, + "learning_rate": 2.046925826041012e-06, + "loss": 0.66108125, + "num_input_tokens_seen": 181560125, + "step": 8445, + "time_per_iteration": 3.108436107635498 + }, + { + "auxiliary_loss_clip": 0.01014408, + "auxiliary_loss_mlp": 0.01003189, + "balance_loss_clip": 1.02469599, + "balance_loss_mlp": 1.00172865, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8237212582625716, + "language_loss": 0.61885631, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.63903224, + "num_input_tokens_seen": 181618830, + "step": 8446, + "time_per_iteration": 3.162151336669922 + }, + { + "auxiliary_loss_clip": 0.01082479, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.04126096, + "balance_loss_mlp": 1.01519847, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.6245403271911987, + "language_loss": 0.80428851, + "learning_rate": 2.04614711357029e-06, + "loss": 0.82539505, + "num_input_tokens_seen": 181637120, + "step": 8447, + "time_per_iteration": 2.592041254043579 + }, + { + "auxiliary_loss_clip": 0.01109812, + "auxiliary_loss_mlp": 0.01031341, + "balance_loss_clip": 1.04334235, + "balance_loss_mlp": 1.01830006, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.8374056907881533, + "language_loss": 0.70454109, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72595263, + "num_input_tokens_seen": 181659965, + "step": 8448, + "time_per_iteration": 4.03373384475708 + }, + { + "auxiliary_loss_clip": 0.01118992, + "auxiliary_loss_mlp": 0.00776715, + "balance_loss_clip": 1.04398441, + "balance_loss_mlp": 1.00029325, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 1.5309466034137857, + "language_loss": 0.71836215, + "learning_rate": 2.045368394099955e-06, + "loss": 0.73731923, + "num_input_tokens_seen": 181685290, + "step": 8449, + "time_per_iteration": 2.6550958156585693 + }, + { + "auxiliary_loss_clip": 0.01094662, + "auxiliary_loss_mlp": 0.01032263, + "balance_loss_clip": 1.03937387, + "balance_loss_mlp": 1.01912069, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.477683677539269, + "language_loss": 0.72875762, + "learning_rate": 2.044979031776844e-06, + "loss": 0.75002688, + "num_input_tokens_seen": 181706080, + "step": 8450, + "time_per_iteration": 2.6459176540374756 + }, + { + "auxiliary_loss_clip": 0.01122394, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.0436703, + "balance_loss_mlp": 1.01870131, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.6977572847646758, + "language_loss": 0.77065963, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79220831, + "num_input_tokens_seen": 181724805, + "step": 8451, + "time_per_iteration": 2.700913429260254 + }, + { + "auxiliary_loss_clip": 0.01122711, + "auxiliary_loss_mlp": 0.01041831, + "balance_loss_clip": 1.04394853, + "balance_loss_mlp": 1.02874207, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.7061293902087213, + "language_loss": 0.8499279, + "learning_rate": 2.044200302028559e-06, + "loss": 0.87157333, + "num_input_tokens_seen": 181743725, + "step": 8452, + "time_per_iteration": 2.49804949760437 + }, + { + "auxiliary_loss_clip": 0.01127273, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.04577458, + "balance_loss_mlp": 1.02211547, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 2.73424782210946, + "language_loss": 0.77797496, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.79961538, + "num_input_tokens_seen": 181757720, + "step": 8453, + "time_per_iteration": 2.4379756450653076 + }, + { + "auxiliary_loss_clip": 0.01084525, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.04015648, + "balance_loss_mlp": 1.02318811, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 1.683906810685258, + "language_loss": 0.7689721, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.79017973, + "num_input_tokens_seen": 181778545, + "step": 8454, + "time_per_iteration": 2.636054277420044 + }, + { + "auxiliary_loss_clip": 0.01100343, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.04064202, + "balance_loss_mlp": 1.02524066, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.842670869474053, + "language_loss": 0.89351726, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91491389, + "num_input_tokens_seen": 181799495, + "step": 8455, + "time_per_iteration": 2.5748705863952637 + }, + { + "auxiliary_loss_clip": 0.01106463, + "auxiliary_loss_mlp": 0.00780772, + "balance_loss_clip": 1.04327381, + "balance_loss_mlp": 1.00039721, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 1.731307388576067, + "language_loss": 0.61880386, + "learning_rate": 2.042642822537149e-06, + "loss": 0.63767612, + "num_input_tokens_seen": 181818400, + "step": 8456, + "time_per_iteration": 2.6323225498199463 + }, + { + "auxiliary_loss_clip": 0.01029276, + "auxiliary_loss_mlp": 0.0099966, + "balance_loss_clip": 1.01331294, + "balance_loss_mlp": 0.99846184, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.8023225519001412, + "language_loss": 0.62283975, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64312911, + "num_input_tokens_seen": 181875975, + "step": 8457, + "time_per_iteration": 3.0319621562957764 + }, + { + "auxiliary_loss_clip": 0.01115865, + "auxiliary_loss_mlp": 0.01033978, + "balance_loss_clip": 1.04459834, + "balance_loss_mlp": 1.0193274, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.6248491752993317, + "language_loss": 0.67450708, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69600552, + "num_input_tokens_seen": 181896450, + "step": 8458, + "time_per_iteration": 4.102068185806274 + }, + { + "auxiliary_loss_clip": 0.01109112, + "auxiliary_loss_mlp": 0.01038555, + "balance_loss_clip": 1.04089379, + "balance_loss_mlp": 1.02424979, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 1.7129588051674314, + "language_loss": 0.77494466, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.79642135, + "num_input_tokens_seen": 181916770, + "step": 8459, + "time_per_iteration": 2.531759262084961 + }, + { + "auxiliary_loss_clip": 0.01127585, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.04624104, + "balance_loss_mlp": 1.02494144, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 9.148014796135346, + "language_loss": 0.80637425, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.82803988, + "num_input_tokens_seen": 181932710, + "step": 8460, + "time_per_iteration": 3.9790263175964355 + }, + { + "auxiliary_loss_clip": 0.01102428, + "auxiliary_loss_mlp": 0.01039312, + "balance_loss_clip": 1.04263663, + "balance_loss_mlp": 1.02597213, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.548114239947171, + "language_loss": 0.686903, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.70832038, + "num_input_tokens_seen": 181950665, + "step": 8461, + "time_per_iteration": 2.5447986125946045 + }, + { + "auxiliary_loss_clip": 0.01117178, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.04188538, + "balance_loss_mlp": 1.02029347, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.7143320552206676, + "language_loss": 0.75742805, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.77894044, + "num_input_tokens_seen": 181971270, + "step": 8462, + "time_per_iteration": 2.537574291229248 + }, + { + "auxiliary_loss_clip": 0.01082176, + "auxiliary_loss_mlp": 0.01042163, + "balance_loss_clip": 1.03893876, + "balance_loss_mlp": 1.02734494, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 3.199298411088687, + "language_loss": 0.81282699, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83407044, + "num_input_tokens_seen": 181988410, + "step": 8463, + "time_per_iteration": 2.5375964641571045 + }, + { + "auxiliary_loss_clip": 0.01104498, + "auxiliary_loss_mlp": 0.01042842, + "balance_loss_clip": 1.03974354, + "balance_loss_mlp": 1.02920413, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 1.7548725935221796, + "language_loss": 0.76405513, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78552854, + "num_input_tokens_seen": 182006530, + "step": 8464, + "time_per_iteration": 2.507983446121216 + }, + { + "auxiliary_loss_clip": 0.01029925, + "auxiliary_loss_mlp": 0.01013718, + "balance_loss_clip": 1.01361048, + "balance_loss_mlp": 1.01274014, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.6908660428141178, + "language_loss": 0.59375238, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61418885, + "num_input_tokens_seen": 182074240, + "step": 8465, + "time_per_iteration": 3.1951522827148438 + }, + { + "auxiliary_loss_clip": 0.01120059, + "auxiliary_loss_mlp": 0.01038864, + "balance_loss_clip": 1.04244661, + "balance_loss_mlp": 1.02528548, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 2.6406939186732297, + "language_loss": 0.80141151, + "learning_rate": 2.038749012684354e-06, + "loss": 0.82300079, + "num_input_tokens_seen": 182093360, + "step": 8466, + "time_per_iteration": 2.461668014526367 + }, + { + "auxiliary_loss_clip": 0.01107772, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.04008615, + "balance_loss_mlp": 1.0201807, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.6903410132827974, + "language_loss": 0.78974044, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.8111577, + "num_input_tokens_seen": 182110170, + "step": 8467, + "time_per_iteration": 2.4943339824676514 + }, + { + "auxiliary_loss_clip": 0.01117152, + "auxiliary_loss_mlp": 0.01034053, + "balance_loss_clip": 1.04192424, + "balance_loss_mlp": 1.02109492, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 1.7358379136440758, + "language_loss": 0.74387288, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76538491, + "num_input_tokens_seen": 182129570, + "step": 8468, + "time_per_iteration": 2.5044543743133545 + }, + { + "auxiliary_loss_clip": 0.01118761, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.04199696, + "balance_loss_mlp": 1.02188623, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 1.9019304031500641, + "language_loss": 0.777771, + "learning_rate": 2.03758084040404e-06, + "loss": 0.79930824, + "num_input_tokens_seen": 182147565, + "step": 8469, + "time_per_iteration": 2.4788978099823 + }, + { + "auxiliary_loss_clip": 0.01105708, + "auxiliary_loss_mlp": 0.0104196, + "balance_loss_clip": 1.04201674, + "balance_loss_mlp": 1.02735639, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.462615404563551, + "language_loss": 0.69507873, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71655542, + "num_input_tokens_seen": 182169695, + "step": 8470, + "time_per_iteration": 2.5667638778686523 + }, + { + "auxiliary_loss_clip": 0.01097052, + "auxiliary_loss_mlp": 0.01045661, + "balance_loss_clip": 1.04052007, + "balance_loss_mlp": 1.03022301, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.1763518493164127, + "language_loss": 0.73532009, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75674719, + "num_input_tokens_seen": 182186385, + "step": 8471, + "time_per_iteration": 2.538269281387329 + }, + { + "auxiliary_loss_clip": 0.01041456, + "auxiliary_loss_mlp": 0.01005314, + "balance_loss_clip": 1.01525688, + "balance_loss_mlp": 1.00430059, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7568181235611237, + "language_loss": 0.58169329, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60216105, + "num_input_tokens_seen": 182247095, + "step": 8472, + "time_per_iteration": 3.1020994186401367 + }, + { + "auxiliary_loss_clip": 0.01069445, + "auxiliary_loss_mlp": 0.01039656, + "balance_loss_clip": 1.03551269, + "balance_loss_mlp": 1.02661455, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 2.1706440602911976, + "language_loss": 0.69178969, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71288067, + "num_input_tokens_seen": 182266380, + "step": 8473, + "time_per_iteration": 4.164538860321045 + }, + { + "auxiliary_loss_clip": 0.01096099, + "auxiliary_loss_mlp": 0.0103735, + "balance_loss_clip": 1.04074681, + "balance_loss_mlp": 1.02365303, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 1.8736387359492777, + "language_loss": 0.85443342, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87576789, + "num_input_tokens_seen": 182284685, + "step": 8474, + "time_per_iteration": 2.6091132164001465 + }, + { + "auxiliary_loss_clip": 0.01098341, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.04193389, + "balance_loss_mlp": 1.0188756, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 2.191278968462329, + "language_loss": 0.64954221, + "learning_rate": 2.035244457765222e-06, + "loss": 0.67085153, + "num_input_tokens_seen": 182301810, + "step": 8475, + "time_per_iteration": 2.5298354625701904 + }, + { + "auxiliary_loss_clip": 0.01101339, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.03860831, + "balance_loss_mlp": 1.02288783, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 2.9365557180269284, + "language_loss": 0.82053965, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84192717, + "num_input_tokens_seen": 182320285, + "step": 8476, + "time_per_iteration": 2.623438835144043 + }, + { + "auxiliary_loss_clip": 0.01069933, + "auxiliary_loss_mlp": 0.01043109, + "balance_loss_clip": 1.03646469, + "balance_loss_mlp": 1.02618146, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 2.0653623286659077, + "language_loss": 0.80752462, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.82865506, + "num_input_tokens_seen": 182339465, + "step": 8477, + "time_per_iteration": 2.6413886547088623 + }, + { + "auxiliary_loss_clip": 0.01096988, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.03894699, + "balance_loss_mlp": 1.01445448, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 3.3595757948995395, + "language_loss": 0.62181294, + "learning_rate": 2.034076248204082e-06, + "loss": 0.64308506, + "num_input_tokens_seen": 182358375, + "step": 8478, + "time_per_iteration": 2.574023485183716 + }, + { + "auxiliary_loss_clip": 0.01105111, + "auxiliary_loss_mlp": 0.01038063, + "balance_loss_clip": 1.04206526, + "balance_loss_mlp": 1.02501583, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.554159560642808, + "language_loss": 0.65705585, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.6784876, + "num_input_tokens_seen": 182377935, + "step": 8479, + "time_per_iteration": 2.612358570098877 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.04341447, + "balance_loss_mlp": 1.02012706, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.6097262855669356, + "language_loss": 0.69593787, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71736616, + "num_input_tokens_seen": 182396440, + "step": 8480, + "time_per_iteration": 2.544239044189453 + }, + { + "auxiliary_loss_clip": 0.01123231, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.04272306, + "balance_loss_mlp": 1.02275348, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 1.9563566541350306, + "language_loss": 0.79314351, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81474471, + "num_input_tokens_seen": 182415890, + "step": 8481, + "time_per_iteration": 2.5282211303710938 + }, + { + "auxiliary_loss_clip": 0.01103353, + "auxiliary_loss_mlp": 0.01038058, + "balance_loss_clip": 1.04053581, + "balance_loss_mlp": 1.02415824, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.5717344751292004, + "language_loss": 0.83410281, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85551691, + "num_input_tokens_seen": 182434235, + "step": 8482, + "time_per_iteration": 2.5753097534179688 + }, + { + "auxiliary_loss_clip": 0.01114016, + "auxiliary_loss_mlp": 0.00781472, + "balance_loss_clip": 1.04199934, + "balance_loss_mlp": 1.00049257, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 1.5983919267547126, + "language_loss": 0.85498053, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87393546, + "num_input_tokens_seen": 182454360, + "step": 8483, + "time_per_iteration": 2.6044771671295166 + }, + { + "auxiliary_loss_clip": 0.01109445, + "auxiliary_loss_mlp": 0.01034255, + "balance_loss_clip": 1.04167128, + "balance_loss_mlp": 1.02070737, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 1.9088234700160134, + "language_loss": 0.83166856, + "learning_rate": 2.031739794591775e-06, + "loss": 0.8531056, + "num_input_tokens_seen": 182471940, + "step": 8484, + "time_per_iteration": 2.571855068206787 + }, + { + "auxiliary_loss_clip": 0.01095781, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.03954184, + "balance_loss_mlp": 1.01744974, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 1.8849105353895117, + "language_loss": 0.81213623, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83341342, + "num_input_tokens_seen": 182490685, + "step": 8485, + "time_per_iteration": 2.5044105052948 + }, + { + "auxiliary_loss_clip": 0.01093222, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.03632259, + "balance_loss_mlp": 1.01664424, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 1.903902139205624, + "language_loss": 0.73898852, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.76022732, + "num_input_tokens_seen": 182508325, + "step": 8486, + "time_per_iteration": 2.501906156539917 + }, + { + "auxiliary_loss_clip": 0.01079031, + "auxiliary_loss_mlp": 0.01034499, + "balance_loss_clip": 1.03696096, + "balance_loss_mlp": 1.01974702, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.5372932646349735, + "language_loss": 0.70127112, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72240645, + "num_input_tokens_seen": 182527020, + "step": 8487, + "time_per_iteration": 2.56789231300354 + }, + { + "auxiliary_loss_clip": 0.01097257, + "auxiliary_loss_mlp": 0.01035419, + "balance_loss_clip": 1.04015994, + "balance_loss_mlp": 1.02050543, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 2.652364917381597, + "language_loss": 0.72809494, + "learning_rate": 2.030182134581827e-06, + "loss": 0.7494216, + "num_input_tokens_seen": 182543505, + "step": 8488, + "time_per_iteration": 3.9368579387664795 + }, + { + "auxiliary_loss_clip": 0.0107974, + "auxiliary_loss_mlp": 0.00780472, + "balance_loss_clip": 1.037992, + "balance_loss_mlp": 1.00063658, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 1.659536542936796, + "language_loss": 0.6929822, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71158433, + "num_input_tokens_seen": 182562250, + "step": 8489, + "time_per_iteration": 2.535794496536255 + }, + { + "auxiliary_loss_clip": 0.01097794, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.04073954, + "balance_loss_mlp": 1.02001929, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 2.046839281419867, + "language_loss": 0.72812223, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.74943548, + "num_input_tokens_seen": 182581910, + "step": 8490, + "time_per_iteration": 2.5908100605010986 + }, + { + "auxiliary_loss_clip": 0.0109152, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.03933024, + "balance_loss_mlp": 1.01765978, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.5001971031096613, + "language_loss": 0.80692971, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82815182, + "num_input_tokens_seen": 182601350, + "step": 8491, + "time_per_iteration": 2.6605918407440186 + }, + { + "auxiliary_loss_clip": 0.01106113, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.04040134, + "balance_loss_mlp": 1.01892829, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 2.167615960184243, + "language_loss": 0.78977025, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81115311, + "num_input_tokens_seen": 182619660, + "step": 8492, + "time_per_iteration": 2.5239901542663574 + }, + { + "auxiliary_loss_clip": 0.01087502, + "auxiliary_loss_mlp": 0.01041405, + "balance_loss_clip": 1.03800178, + "balance_loss_mlp": 1.02652133, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 1.8775600742266663, + "language_loss": 0.77722138, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.79851049, + "num_input_tokens_seen": 182639815, + "step": 8493, + "time_per_iteration": 2.628037452697754 + }, + { + "auxiliary_loss_clip": 0.01078934, + "auxiliary_loss_mlp": 0.01032958, + "balance_loss_clip": 1.03937888, + "balance_loss_mlp": 1.01755619, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 2.0472935027535315, + "language_loss": 0.83438641, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85550535, + "num_input_tokens_seen": 182659655, + "step": 8494, + "time_per_iteration": 2.650055170059204 + }, + { + "auxiliary_loss_clip": 0.01126149, + "auxiliary_loss_mlp": 0.01036785, + "balance_loss_clip": 1.04596567, + "balance_loss_mlp": 1.02349281, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 2.1537334535389094, + "language_loss": 0.79461706, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81624639, + "num_input_tokens_seen": 182677075, + "step": 8495, + "time_per_iteration": 2.600210666656494 + }, + { + "auxiliary_loss_clip": 0.01085892, + "auxiliary_loss_mlp": 0.01036171, + "balance_loss_clip": 1.03883505, + "balance_loss_mlp": 1.0223012, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 1.395600349338272, + "language_loss": 0.78138483, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80260551, + "num_input_tokens_seen": 182699625, + "step": 8496, + "time_per_iteration": 2.6499032974243164 + }, + { + "auxiliary_loss_clip": 0.01107029, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.04178429, + "balance_loss_mlp": 1.0176295, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 1.8331344956180688, + "language_loss": 0.78733325, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.80870736, + "num_input_tokens_seen": 182717020, + "step": 8497, + "time_per_iteration": 2.525676965713501 + }, + { + "auxiliary_loss_clip": 0.01120089, + "auxiliary_loss_mlp": 0.01037702, + "balance_loss_clip": 1.04312611, + "balance_loss_mlp": 1.02395153, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.7442720944592585, + "language_loss": 0.81655157, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.83812952, + "num_input_tokens_seen": 182736955, + "step": 8498, + "time_per_iteration": 4.599763631820679 + }, + { + "auxiliary_loss_clip": 0.01087792, + "auxiliary_loss_mlp": 0.00781126, + "balance_loss_clip": 1.03768027, + "balance_loss_mlp": 1.00042212, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 1.6815615136225424, + "language_loss": 0.71137238, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.73006153, + "num_input_tokens_seen": 182757620, + "step": 8499, + "time_per_iteration": 2.5551748275756836 + }, + { + "auxiliary_loss_clip": 0.01065216, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.03807902, + "balance_loss_mlp": 1.02321362, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.5414944006432547, + "language_loss": 0.72537863, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74641854, + "num_input_tokens_seen": 182780195, + "step": 8500, + "time_per_iteration": 4.131205081939697 + }, + { + "auxiliary_loss_clip": 0.01113969, + "auxiliary_loss_mlp": 0.01038401, + "balance_loss_clip": 1.04108465, + "balance_loss_mlp": 1.0225935, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 3.4250472782132206, + "language_loss": 0.62267369, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.6441974, + "num_input_tokens_seen": 182795765, + "step": 8501, + "time_per_iteration": 2.464014768600464 + }, + { + "auxiliary_loss_clip": 0.01120267, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.0402981, + "balance_loss_mlp": 1.02347851, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.702882963346636, + "language_loss": 0.88019145, + "learning_rate": 2.024730186540907e-06, + "loss": 0.90177345, + "num_input_tokens_seen": 182813120, + "step": 8502, + "time_per_iteration": 2.4333813190460205 + }, + { + "auxiliary_loss_clip": 0.01104671, + "auxiliary_loss_mlp": 0.0103507, + "balance_loss_clip": 1.03942227, + "balance_loss_mlp": 1.02205873, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 1.4395929115993524, + "language_loss": 0.82398093, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.8453784, + "num_input_tokens_seen": 182835745, + "step": 8503, + "time_per_iteration": 2.536465644836426 + }, + { + "auxiliary_loss_clip": 0.01024719, + "auxiliary_loss_mlp": 0.01010117, + "balance_loss_clip": 1.01882839, + "balance_loss_mlp": 1.00884712, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8597646210119406, + "language_loss": 0.63860738, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65895569, + "num_input_tokens_seen": 182892540, + "step": 8504, + "time_per_iteration": 3.152550220489502 + }, + { + "auxiliary_loss_clip": 0.01091073, + "auxiliary_loss_mlp": 0.00779483, + "balance_loss_clip": 1.04296982, + "balance_loss_mlp": 1.00037682, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 1.721866288414479, + "language_loss": 0.843678, + "learning_rate": 2.023561886666816e-06, + "loss": 0.8623836, + "num_input_tokens_seen": 182911515, + "step": 8505, + "time_per_iteration": 2.6551623344421387 + }, + { + "auxiliary_loss_clip": 0.01108992, + "auxiliary_loss_mlp": 0.01028251, + "balance_loss_clip": 1.04401696, + "balance_loss_mlp": 1.01475596, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 1.8207400914185898, + "language_loss": 0.75330335, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77467579, + "num_input_tokens_seen": 182930860, + "step": 8506, + "time_per_iteration": 2.6096010208129883 + }, + { + "auxiliary_loss_clip": 0.01121053, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.04247236, + "balance_loss_mlp": 1.01950932, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 1.5904262764695252, + "language_loss": 0.57632291, + "learning_rate": 2.022783015592131e-06, + "loss": 0.59787619, + "num_input_tokens_seen": 182949960, + "step": 8507, + "time_per_iteration": 2.4916939735412598 + }, + { + "auxiliary_loss_clip": 0.01111656, + "auxiliary_loss_mlp": 0.01045373, + "balance_loss_clip": 1.04395139, + "balance_loss_mlp": 1.03067422, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.909995241363996, + "language_loss": 0.85681379, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87838405, + "num_input_tokens_seen": 182968085, + "step": 8508, + "time_per_iteration": 2.484309434890747 + }, + { + "auxiliary_loss_clip": 0.01089594, + "auxiliary_loss_mlp": 0.00779995, + "balance_loss_clip": 1.0424664, + "balance_loss_mlp": 1.00047207, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.709727198027734, + "language_loss": 0.72202855, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74072444, + "num_input_tokens_seen": 182987275, + "step": 8509, + "time_per_iteration": 2.602491617202759 + }, + { + "auxiliary_loss_clip": 0.01116825, + "auxiliary_loss_mlp": 0.00778675, + "balance_loss_clip": 1.04078507, + "balance_loss_mlp": 1.00027871, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 2.5955339054459485, + "language_loss": 0.75899792, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.77795291, + "num_input_tokens_seen": 183004700, + "step": 8510, + "time_per_iteration": 2.454148292541504 + }, + { + "auxiliary_loss_clip": 0.01118827, + "auxiliary_loss_mlp": 0.01032626, + "balance_loss_clip": 1.0430181, + "balance_loss_mlp": 1.01933408, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.774495194842799, + "language_loss": 0.70771182, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.72922635, + "num_input_tokens_seen": 183025830, + "step": 8511, + "time_per_iteration": 2.5766232013702393 + }, + { + "auxiliary_loss_clip": 0.01094443, + "auxiliary_loss_mlp": 0.01029959, + "balance_loss_clip": 1.04040194, + "balance_loss_mlp": 1.01669717, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 1.8432493590060108, + "language_loss": 0.66288698, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68413103, + "num_input_tokens_seen": 183045140, + "step": 8512, + "time_per_iteration": 2.5534911155700684 + }, + { + "auxiliary_loss_clip": 0.0106092, + "auxiliary_loss_mlp": 0.01044446, + "balance_loss_clip": 1.03527856, + "balance_loss_mlp": 1.02875817, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 1.7408617587283506, + "language_loss": 0.66692495, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.68797863, + "num_input_tokens_seen": 183063935, + "step": 8513, + "time_per_iteration": 4.123725891113281 + }, + { + "auxiliary_loss_clip": 0.01080222, + "auxiliary_loss_mlp": 0.01037221, + "balance_loss_clip": 1.04233658, + "balance_loss_mlp": 1.02295709, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 1.9328660495178032, + "language_loss": 0.68856764, + "learning_rate": 2.0200569403921e-06, + "loss": 0.70974207, + "num_input_tokens_seen": 183084135, + "step": 8514, + "time_per_iteration": 2.599830150604248 + }, + { + "auxiliary_loss_clip": 0.01117438, + "auxiliary_loss_mlp": 0.01035124, + "balance_loss_clip": 1.04162931, + "balance_loss_mlp": 1.02249348, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.594393816213425, + "language_loss": 0.66178995, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68331558, + "num_input_tokens_seen": 183104570, + "step": 8515, + "time_per_iteration": 2.5292000770568848 + }, + { + "auxiliary_loss_clip": 0.01105992, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.04178226, + "balance_loss_mlp": 1.01875055, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 1.9654403139452261, + "language_loss": 0.75022483, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77160007, + "num_input_tokens_seen": 183123850, + "step": 8516, + "time_per_iteration": 2.623542547225952 + }, + { + "auxiliary_loss_clip": 0.01096183, + "auxiliary_loss_mlp": 0.01040009, + "balance_loss_clip": 1.04443431, + "balance_loss_mlp": 1.02601981, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 2.1481688607957854, + "language_loss": 0.78088653, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80224848, + "num_input_tokens_seen": 183141725, + "step": 8517, + "time_per_iteration": 2.621751308441162 + }, + { + "auxiliary_loss_clip": 0.01112046, + "auxiliary_loss_mlp": 0.01035604, + "balance_loss_clip": 1.04220045, + "balance_loss_mlp": 1.0222106, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 1.77146838869805, + "language_loss": 0.74112189, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.7625984, + "num_input_tokens_seen": 183161300, + "step": 8518, + "time_per_iteration": 2.5329699516296387 + }, + { + "auxiliary_loss_clip": 0.01108478, + "auxiliary_loss_mlp": 0.01040589, + "balance_loss_clip": 1.04577696, + "balance_loss_mlp": 1.02681398, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 3.4413000939288216, + "language_loss": 0.78037333, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80186403, + "num_input_tokens_seen": 183180495, + "step": 8519, + "time_per_iteration": 2.5317845344543457 + }, + { + "auxiliary_loss_clip": 0.01122306, + "auxiliary_loss_mlp": 0.01039456, + "balance_loss_clip": 1.04432416, + "balance_loss_mlp": 1.02538943, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.5838982972705569, + "language_loss": 0.79493159, + "learning_rate": 2.017720274652497e-06, + "loss": 0.81654918, + "num_input_tokens_seen": 183200330, + "step": 8520, + "time_per_iteration": 2.495608329772949 + }, + { + "auxiliary_loss_clip": 0.01102315, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_clip": 1.04049695, + "balance_loss_mlp": 1.0291605, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.725222234238781, + "language_loss": 0.81323898, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83470881, + "num_input_tokens_seen": 183218230, + "step": 8521, + "time_per_iteration": 2.487199306488037 + }, + { + "auxiliary_loss_clip": 0.01109332, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.03976226, + "balance_loss_mlp": 1.01578534, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 2.3744793539497255, + "language_loss": 0.6817109, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70310557, + "num_input_tokens_seen": 183236735, + "step": 8522, + "time_per_iteration": 2.5348520278930664 + }, + { + "auxiliary_loss_clip": 0.01095657, + "auxiliary_loss_mlp": 0.01044496, + "balance_loss_clip": 1.04375196, + "balance_loss_mlp": 1.02649546, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 1.9162589087281483, + "language_loss": 0.62077463, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.64217615, + "num_input_tokens_seen": 183257550, + "step": 8523, + "time_per_iteration": 2.600114107131958 + }, + { + "auxiliary_loss_clip": 0.01084036, + "auxiliary_loss_mlp": 0.0104747, + "balance_loss_clip": 1.03875065, + "balance_loss_mlp": 1.03354609, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 2.2721189475963572, + "language_loss": 0.77676117, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.79807627, + "num_input_tokens_seen": 183275515, + "step": 8524, + "time_per_iteration": 2.5592734813690186 + }, + { + "auxiliary_loss_clip": 0.0109679, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.04101777, + "balance_loss_mlp": 1.02264357, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 2.579478823296913, + "language_loss": 0.7504673, + "learning_rate": 2.015773034588706e-06, + "loss": 0.7717911, + "num_input_tokens_seen": 183293880, + "step": 8525, + "time_per_iteration": 2.5071606636047363 + }, + { + "auxiliary_loss_clip": 0.01097612, + "auxiliary_loss_mlp": 0.01044003, + "balance_loss_clip": 1.04124904, + "balance_loss_mlp": 1.02843451, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.750766483335507, + "language_loss": 0.74150288, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76291907, + "num_input_tokens_seen": 183315860, + "step": 8526, + "time_per_iteration": 2.648440361022949 + }, + { + "auxiliary_loss_clip": 0.01113996, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.0443455, + "balance_loss_mlp": 1.0287261, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.558927208547033, + "language_loss": 0.65251404, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67407906, + "num_input_tokens_seen": 183335480, + "step": 8527, + "time_per_iteration": 3.958122491836548 + }, + { + "auxiliary_loss_clip": 0.01095736, + "auxiliary_loss_mlp": 0.01037774, + "balance_loss_clip": 1.04300463, + "balance_loss_mlp": 1.02532244, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.4557630520177902, + "language_loss": 0.74423575, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76557088, + "num_input_tokens_seen": 183354395, + "step": 8528, + "time_per_iteration": 2.570645809173584 + }, + { + "auxiliary_loss_clip": 0.0110535, + "auxiliary_loss_mlp": 0.0103708, + "balance_loss_clip": 1.0396806, + "balance_loss_mlp": 1.02343023, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.697849722188113, + "language_loss": 0.83044422, + "learning_rate": 2.014215231682995e-06, + "loss": 0.85186857, + "num_input_tokens_seen": 183372980, + "step": 8529, + "time_per_iteration": 2.5299623012542725 + }, + { + "auxiliary_loss_clip": 0.01068767, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.03657985, + "balance_loss_mlp": 1.02218759, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 1.7957670343174847, + "language_loss": 0.74075854, + "learning_rate": 2.01382577957204e-06, + "loss": 0.76180512, + "num_input_tokens_seen": 183390160, + "step": 8530, + "time_per_iteration": 2.624364137649536 + }, + { + "auxiliary_loss_clip": 0.01012342, + "auxiliary_loss_mlp": 0.01005172, + "balance_loss_clip": 1.01586783, + "balance_loss_mlp": 1.00407553, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7486949562546064, + "language_loss": 0.60734713, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.62752235, + "num_input_tokens_seen": 183455280, + "step": 8531, + "time_per_iteration": 3.2820417881011963 + }, + { + "auxiliary_loss_clip": 0.01098835, + "auxiliary_loss_mlp": 0.01038361, + "balance_loss_clip": 1.04687405, + "balance_loss_mlp": 1.02371013, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.7912522171174956, + "language_loss": 0.7683326, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.78970456, + "num_input_tokens_seen": 183473955, + "step": 8532, + "time_per_iteration": 2.5901591777801514 + }, + { + "auxiliary_loss_clip": 0.01098058, + "auxiliary_loss_mlp": 0.0104167, + "balance_loss_clip": 1.04231036, + "balance_loss_mlp": 1.02754331, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 1.773857579541739, + "language_loss": 0.670591, + "learning_rate": 2.012657420152597e-06, + "loss": 0.69198835, + "num_input_tokens_seen": 183497195, + "step": 8533, + "time_per_iteration": 2.673957109451294 + }, + { + "auxiliary_loss_clip": 0.01089644, + "auxiliary_loss_mlp": 0.01039627, + "balance_loss_clip": 1.04179955, + "balance_loss_mlp": 1.02469015, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 2.02750209001958, + "language_loss": 0.82267094, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84396374, + "num_input_tokens_seen": 183513675, + "step": 8534, + "time_per_iteration": 2.59493088722229 + }, + { + "auxiliary_loss_clip": 0.01110603, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.04119623, + "balance_loss_mlp": 1.02373183, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.4424978158480966, + "language_loss": 0.63845587, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.6599462, + "num_input_tokens_seen": 183535165, + "step": 8535, + "time_per_iteration": 2.587578773498535 + }, + { + "auxiliary_loss_clip": 0.01113233, + "auxiliary_loss_mlp": 0.01028686, + "balance_loss_clip": 1.04605925, + "balance_loss_mlp": 1.01502454, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.598513985364238, + "language_loss": 0.70068276, + "learning_rate": 2.011489056413418e-06, + "loss": 0.72210193, + "num_input_tokens_seen": 183553780, + "step": 8536, + "time_per_iteration": 2.5411574840545654 + }, + { + "auxiliary_loss_clip": 0.01111407, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.0432893, + "balance_loss_mlp": 1.0202446, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 1.8511516413475724, + "language_loss": 0.71424448, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73570704, + "num_input_tokens_seen": 183572285, + "step": 8537, + "time_per_iteration": 4.050787448883057 + }, + { + "auxiliary_loss_clip": 0.01075663, + "auxiliary_loss_mlp": 0.01034785, + "balance_loss_clip": 1.0369041, + "balance_loss_mlp": 1.02044427, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 1.8443953659045118, + "language_loss": 0.80177575, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82288027, + "num_input_tokens_seen": 183589330, + "step": 8538, + "time_per_iteration": 4.013871669769287 + }, + { + "auxiliary_loss_clip": 0.01108, + "auxiliary_loss_mlp": 0.0103105, + "balance_loss_clip": 1.04126453, + "balance_loss_mlp": 1.01735306, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 1.822545255595614, + "language_loss": 0.78482234, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.8062129, + "num_input_tokens_seen": 183609205, + "step": 8539, + "time_per_iteration": 2.549628973007202 + }, + { + "auxiliary_loss_clip": 0.01097146, + "auxiliary_loss_mlp": 0.01037719, + "balance_loss_clip": 1.03850269, + "balance_loss_mlp": 1.02326441, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 3.1401198021048065, + "language_loss": 0.76060402, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78195262, + "num_input_tokens_seen": 183629985, + "step": 8540, + "time_per_iteration": 2.617206573486328 + }, + { + "auxiliary_loss_clip": 0.01071653, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.04156685, + "balance_loss_mlp": 1.01879144, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 1.8474350905489623, + "language_loss": 0.74587965, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76693708, + "num_input_tokens_seen": 183648220, + "step": 8541, + "time_per_iteration": 2.588648557662964 + }, + { + "auxiliary_loss_clip": 0.01058772, + "auxiliary_loss_mlp": 0.01037528, + "balance_loss_clip": 1.04013395, + "balance_loss_mlp": 1.02364635, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.7689005461617142, + "language_loss": 0.70385456, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72481757, + "num_input_tokens_seen": 183668230, + "step": 8542, + "time_per_iteration": 2.658813238143921 + }, + { + "auxiliary_loss_clip": 0.0110003, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.04152119, + "balance_loss_mlp": 1.01672423, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 3.4574918309661498, + "language_loss": 0.79719365, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.8184948, + "num_input_tokens_seen": 183687800, + "step": 8543, + "time_per_iteration": 2.582932472229004 + }, + { + "auxiliary_loss_clip": 0.01097411, + "auxiliary_loss_mlp": 0.01040479, + "balance_loss_clip": 1.04244137, + "balance_loss_mlp": 1.02585149, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 1.667938684333833, + "language_loss": 0.67981601, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70119494, + "num_input_tokens_seen": 183709025, + "step": 8544, + "time_per_iteration": 2.6478514671325684 + }, + { + "auxiliary_loss_clip": 0.01087347, + "auxiliary_loss_mlp": 0.0104982, + "balance_loss_clip": 1.03881836, + "balance_loss_mlp": 1.03457344, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.483719235487129, + "language_loss": 0.72219074, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.7435624, + "num_input_tokens_seen": 183725740, + "step": 8545, + "time_per_iteration": 2.578244686126709 + }, + { + "auxiliary_loss_clip": 0.01112534, + "auxiliary_loss_mlp": 0.01042364, + "balance_loss_clip": 1.04212582, + "balance_loss_mlp": 1.02650952, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 2.2726855642627277, + "language_loss": 0.82366526, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84521425, + "num_input_tokens_seen": 183743995, + "step": 8546, + "time_per_iteration": 2.6093175411224365 + }, + { + "auxiliary_loss_clip": 0.01109355, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.04095078, + "balance_loss_mlp": 1.02259636, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 1.6409144789258798, + "language_loss": 0.73401946, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75548011, + "num_input_tokens_seen": 183764150, + "step": 8547, + "time_per_iteration": 2.539586305618286 + }, + { + "auxiliary_loss_clip": 0.01109491, + "auxiliary_loss_mlp": 0.01043657, + "balance_loss_clip": 1.04073334, + "balance_loss_mlp": 1.02972782, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.614748125214593, + "language_loss": 0.73377043, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75530189, + "num_input_tokens_seen": 183783280, + "step": 8548, + "time_per_iteration": 2.5509557723999023 + }, + { + "auxiliary_loss_clip": 0.0108814, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.04077435, + "balance_loss_mlp": 1.01960933, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 1.7324884012786073, + "language_loss": 0.81927526, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.8404963, + "num_input_tokens_seen": 183800725, + "step": 8549, + "time_per_iteration": 2.5767080783843994 + }, + { + "auxiliary_loss_clip": 0.01111879, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.04458547, + "balance_loss_mlp": 1.01860964, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 2.010509204496292, + "language_loss": 0.71873671, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.740174, + "num_input_tokens_seen": 183818735, + "step": 8550, + "time_per_iteration": 2.538217306137085 + }, + { + "auxiliary_loss_clip": 0.0111276, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.04694057, + "balance_loss_mlp": 1.02286613, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.5964996118590653, + "language_loss": 0.75036567, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77186263, + "num_input_tokens_seen": 183840015, + "step": 8551, + "time_per_iteration": 2.558197498321533 + }, + { + "auxiliary_loss_clip": 0.01088983, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.04190254, + "balance_loss_mlp": 1.01748967, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.6496040085150823, + "language_loss": 0.68972462, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71092916, + "num_input_tokens_seen": 183860145, + "step": 8552, + "time_per_iteration": 4.083352327346802 + }, + { + "auxiliary_loss_clip": 0.01109251, + "auxiliary_loss_mlp": 0.01037725, + "balance_loss_clip": 1.04115963, + "balance_loss_mlp": 1.02312791, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 1.7482201366086363, + "language_loss": 0.7464627, + "learning_rate": 2.004868266210965e-06, + "loss": 0.76793242, + "num_input_tokens_seen": 183880540, + "step": 8553, + "time_per_iteration": 2.543673515319824 + }, + { + "auxiliary_loss_clip": 0.01120181, + "auxiliary_loss_mlp": 0.0103801, + "balance_loss_clip": 1.04308319, + "balance_loss_mlp": 1.02434874, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 2.4119857063265777, + "language_loss": 0.6769129, + "learning_rate": 2.004478805593435e-06, + "loss": 0.69849479, + "num_input_tokens_seen": 183900895, + "step": 8554, + "time_per_iteration": 2.4530956745147705 + }, + { + "auxiliary_loss_clip": 0.0111446, + "auxiliary_loss_mlp": 0.01039524, + "balance_loss_clip": 1.0413754, + "balance_loss_mlp": 1.02344882, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 1.8202669800219033, + "language_loss": 0.73436761, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75590742, + "num_input_tokens_seen": 183920335, + "step": 8555, + "time_per_iteration": 2.499142646789551 + }, + { + "auxiliary_loss_clip": 0.01085052, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.04414082, + "balance_loss_mlp": 1.02456927, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.4834334236551188, + "language_loss": 0.74888217, + "learning_rate": 2.003699883863633e-06, + "loss": 0.77010751, + "num_input_tokens_seen": 183936220, + "step": 8556, + "time_per_iteration": 2.533074378967285 + }, + { + "auxiliary_loss_clip": 0.01092706, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.04521775, + "balance_loss_mlp": 1.02324486, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.7644419504199274, + "language_loss": 0.85840976, + "learning_rate": 2.003310422780898e-06, + "loss": 0.87970126, + "num_input_tokens_seen": 183953250, + "step": 8557, + "time_per_iteration": 2.6398489475250244 + }, + { + "auxiliary_loss_clip": 0.01102003, + "auxiliary_loss_mlp": 0.01041138, + "balance_loss_clip": 1.03930259, + "balance_loss_mlp": 1.02697563, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.7328723833064934, + "language_loss": 0.88942242, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91085386, + "num_input_tokens_seen": 183973865, + "step": 8558, + "time_per_iteration": 2.5765466690063477 + }, + { + "auxiliary_loss_clip": 0.01119433, + "auxiliary_loss_mlp": 0.0077919, + "balance_loss_clip": 1.04429722, + "balance_loss_mlp": 1.00026917, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 1.9512610349458837, + "language_loss": 0.64837718, + "learning_rate": 2.002531500253602e-06, + "loss": 0.66736341, + "num_input_tokens_seen": 183992555, + "step": 8559, + "time_per_iteration": 2.4640557765960693 + }, + { + "auxiliary_loss_clip": 0.01103919, + "auxiliary_loss_mlp": 0.00779371, + "balance_loss_clip": 1.04128444, + "balance_loss_mlp": 1.00037909, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 2.892819940949724, + "language_loss": 0.63441956, + "learning_rate": 2.002142038838577e-06, + "loss": 0.65325248, + "num_input_tokens_seen": 184010825, + "step": 8560, + "time_per_iteration": 2.545199394226074 + }, + { + "auxiliary_loss_clip": 0.01119899, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.04327345, + "balance_loss_mlp": 1.01912618, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.505952358652961, + "language_loss": 0.70085895, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72238737, + "num_input_tokens_seen": 184030155, + "step": 8561, + "time_per_iteration": 2.468325138092041 + }, + { + "auxiliary_loss_clip": 0.01097424, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.03987026, + "balance_loss_mlp": 1.01885772, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.6147349373695796, + "language_loss": 0.66723192, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.68852234, + "num_input_tokens_seen": 184051440, + "step": 8562, + "time_per_iteration": 2.6498429775238037 + }, + { + "auxiliary_loss_clip": 0.01115495, + "auxiliary_loss_mlp": 0.01032643, + "balance_loss_clip": 1.04555094, + "balance_loss_mlp": 1.01908922, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.612600483396774, + "language_loss": 0.77529502, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79677629, + "num_input_tokens_seen": 184070205, + "step": 8563, + "time_per_iteration": 2.529088258743286 + }, + { + "auxiliary_loss_clip": 0.01112159, + "auxiliary_loss_mlp": 0.01037182, + "balance_loss_clip": 1.041538, + "balance_loss_mlp": 1.02173877, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 2.1430509183125155, + "language_loss": 0.8289243, + "learning_rate": 2.0005841925139e-06, + "loss": 0.85041773, + "num_input_tokens_seen": 184087345, + "step": 8564, + "time_per_iteration": 2.532299518585205 + }, + { + "auxiliary_loss_clip": 0.01102851, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.04202139, + "balance_loss_mlp": 1.01976943, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.8228652050447545, + "language_loss": 0.73009336, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75146115, + "num_input_tokens_seen": 184107110, + "step": 8565, + "time_per_iteration": 2.5162906646728516 + }, + { + "auxiliary_loss_clip": 0.01113167, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.04195499, + "balance_loss_mlp": 1.01951218, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 1.8161836593158804, + "language_loss": 0.68546176, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70694876, + "num_input_tokens_seen": 184127105, + "step": 8566, + "time_per_iteration": 3.9892075061798096 + }, + { + "auxiliary_loss_clip": 0.0112329, + "auxiliary_loss_mlp": 0.00780451, + "balance_loss_clip": 1.04204941, + "balance_loss_mlp": 1.00056589, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 1.8298533434212887, + "language_loss": 0.77864343, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.79768085, + "num_input_tokens_seen": 184148060, + "step": 8567, + "time_per_iteration": 2.4910221099853516 + }, + { + "auxiliary_loss_clip": 0.01112751, + "auxiliary_loss_mlp": 0.01037625, + "balance_loss_clip": 1.04341948, + "balance_loss_mlp": 1.02301025, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 2.1850085572717775, + "language_loss": 0.79635704, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81786078, + "num_input_tokens_seen": 184166175, + "step": 8568, + "time_per_iteration": 2.5366852283477783 + }, + { + "auxiliary_loss_clip": 0.01098329, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.042261, + "balance_loss_mlp": 1.02222705, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 2.421423979269937, + "language_loss": 0.91027641, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93161023, + "num_input_tokens_seen": 184182600, + "step": 8569, + "time_per_iteration": 2.5780293941497803 + }, + { + "auxiliary_loss_clip": 0.01125705, + "auxiliary_loss_mlp": 0.01036904, + "balance_loss_clip": 1.04547501, + "balance_loss_mlp": 1.02274776, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 1.760827012673513, + "language_loss": 0.76029545, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78192151, + "num_input_tokens_seen": 184202020, + "step": 8570, + "time_per_iteration": 2.528215169906616 + }, + { + "auxiliary_loss_clip": 0.01112821, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.04331636, + "balance_loss_mlp": 1.02600026, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.608131878883989, + "language_loss": 0.73827571, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.7598182, + "num_input_tokens_seen": 184224850, + "step": 8571, + "time_per_iteration": 2.687392473220825 + }, + { + "auxiliary_loss_clip": 0.0103071, + "auxiliary_loss_mlp": 0.01008187, + "balance_loss_clip": 1.02372372, + "balance_loss_mlp": 1.00700653, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7934223121231273, + "language_loss": 0.52882326, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54921222, + "num_input_tokens_seen": 184288520, + "step": 8572, + "time_per_iteration": 3.2121758460998535 + }, + { + "auxiliary_loss_clip": 0.01110866, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.04658294, + "balance_loss_mlp": 1.02543545, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.7477567961517357, + "language_loss": 0.76684415, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78833842, + "num_input_tokens_seen": 184308565, + "step": 8573, + "time_per_iteration": 2.6273670196533203 + }, + { + "auxiliary_loss_clip": 0.01108055, + "auxiliary_loss_mlp": 0.01032171, + "balance_loss_clip": 1.04156148, + "balance_loss_mlp": 1.01822352, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.8474151432528199, + "language_loss": 0.77481848, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79622078, + "num_input_tokens_seen": 184326795, + "step": 8574, + "time_per_iteration": 2.5615906715393066 + }, + { + "auxiliary_loss_clip": 0.01098711, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.04334736, + "balance_loss_mlp": 1.01883483, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 3.3835365826972152, + "language_loss": 0.85640895, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87771106, + "num_input_tokens_seen": 184345990, + "step": 8575, + "time_per_iteration": 2.6371700763702393 + }, + { + "auxiliary_loss_clip": 0.01110677, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.04110956, + "balance_loss_mlp": 1.01855314, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.696393764768156, + "language_loss": 0.76932824, + "learning_rate": 1.995910655193932e-06, + "loss": 0.79076219, + "num_input_tokens_seen": 184366300, + "step": 8576, + "time_per_iteration": 2.6073055267333984 + }, + { + "auxiliary_loss_clip": 0.01074467, + "auxiliary_loss_mlp": 0.00782044, + "balance_loss_clip": 1.03874469, + "balance_loss_mlp": 1.00048387, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 2.4910831909158295, + "language_loss": 0.7565071, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.77507222, + "num_input_tokens_seen": 184383030, + "step": 8577, + "time_per_iteration": 5.727568864822388 + }, + { + "auxiliary_loss_clip": 0.01097785, + "auxiliary_loss_mlp": 0.0104589, + "balance_loss_clip": 1.04042888, + "balance_loss_mlp": 1.02867627, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 1.8177469717564285, + "language_loss": 0.8066504, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.82808709, + "num_input_tokens_seen": 184403410, + "step": 8578, + "time_per_iteration": 2.5958244800567627 + }, + { + "auxiliary_loss_clip": 0.01118026, + "auxiliary_loss_mlp": 0.01035995, + "balance_loss_clip": 1.0417819, + "balance_loss_mlp": 1.02210093, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 2.8603655127229035, + "language_loss": 0.76023805, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78177822, + "num_input_tokens_seen": 184423830, + "step": 8579, + "time_per_iteration": 2.511340379714966 + }, + { + "auxiliary_loss_clip": 0.01088919, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.04544616, + "balance_loss_mlp": 1.01827562, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1.7462836969467368, + "language_loss": 0.79454756, + "learning_rate": 1.994352813122559e-06, + "loss": 0.81575072, + "num_input_tokens_seen": 184445050, + "step": 8580, + "time_per_iteration": 2.610167980194092 + }, + { + "auxiliary_loss_clip": 0.01084551, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_clip": 1.03900361, + "balance_loss_mlp": 1.0323596, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 2.0695134955581875, + "language_loss": 0.73155034, + "learning_rate": 1.99396335310315e-06, + "loss": 0.75288355, + "num_input_tokens_seen": 184460775, + "step": 8581, + "time_per_iteration": 2.5397884845733643 + }, + { + "auxiliary_loss_clip": 0.01110299, + "auxiliary_loss_mlp": 0.01036239, + "balance_loss_clip": 1.0435791, + "balance_loss_mlp": 1.02277446, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.788247204796059, + "language_loss": 0.74572486, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76719028, + "num_input_tokens_seen": 184477365, + "step": 8582, + "time_per_iteration": 2.454000949859619 + }, + { + "auxiliary_loss_clip": 0.01082642, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.0415895, + "balance_loss_mlp": 1.01997423, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 6.993480825152096, + "language_loss": 0.66159827, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68275285, + "num_input_tokens_seen": 184497045, + "step": 8583, + "time_per_iteration": 2.545315742492676 + }, + { + "auxiliary_loss_clip": 0.01112452, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.04301023, + "balance_loss_mlp": 1.02025998, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 1.5300216807118416, + "language_loss": 0.7609297, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78239942, + "num_input_tokens_seen": 184517675, + "step": 8584, + "time_per_iteration": 2.558574914932251 + }, + { + "auxiliary_loss_clip": 0.0109031, + "auxiliary_loss_mlp": 0.01040896, + "balance_loss_clip": 1.0411675, + "balance_loss_mlp": 1.02697849, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 2.134548475824443, + "language_loss": 0.79002285, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.81133497, + "num_input_tokens_seen": 184537745, + "step": 8585, + "time_per_iteration": 2.588404893875122 + }, + { + "auxiliary_loss_clip": 0.01104109, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.04228318, + "balance_loss_mlp": 1.02132797, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.1305855943795082, + "language_loss": 0.81389391, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83527601, + "num_input_tokens_seen": 184553630, + "step": 8586, + "time_per_iteration": 2.5124917030334473 + }, + { + "auxiliary_loss_clip": 0.0110325, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.04594898, + "balance_loss_mlp": 1.01925945, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 1.6362183364402048, + "language_loss": 0.72086895, + "learning_rate": 1.991626598310701e-06, + "loss": 0.74222863, + "num_input_tokens_seen": 184573530, + "step": 8587, + "time_per_iteration": 2.5532915592193604 + }, + { + "auxiliary_loss_clip": 0.0103563, + "auxiliary_loss_mlp": 0.01005698, + "balance_loss_clip": 1.01892269, + "balance_loss_mlp": 1.00416028, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7297581921080758, + "language_loss": 0.57909644, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59950972, + "num_input_tokens_seen": 184637875, + "step": 8588, + "time_per_iteration": 3.085186243057251 + }, + { + "auxiliary_loss_clip": 0.01098085, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.04270661, + "balance_loss_mlp": 1.0212028, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.7348363514106964, + "language_loss": 0.75200599, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77334148, + "num_input_tokens_seen": 184656125, + "step": 8589, + "time_per_iteration": 2.5266835689544678 + }, + { + "auxiliary_loss_clip": 0.01115073, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.04514503, + "balance_loss_mlp": 1.02027214, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 1.4558683995789157, + "language_loss": 0.67426074, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69574201, + "num_input_tokens_seen": 184675920, + "step": 8590, + "time_per_iteration": 2.5402755737304688 + }, + { + "auxiliary_loss_clip": 0.01037364, + "auxiliary_loss_mlp": 0.01000291, + "balance_loss_clip": 1.02172315, + "balance_loss_mlp": 0.99885482, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.8315215114881209, + "language_loss": 0.55867261, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57904917, + "num_input_tokens_seen": 184730520, + "step": 8591, + "time_per_iteration": 4.5772974491119385 + }, + { + "auxiliary_loss_clip": 0.01095883, + "auxiliary_loss_mlp": 0.01030582, + "balance_loss_clip": 1.04262316, + "balance_loss_mlp": 1.01846457, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 1.6441261303024277, + "language_loss": 0.81501287, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83627754, + "num_input_tokens_seen": 184748340, + "step": 8592, + "time_per_iteration": 2.5721778869628906 + }, + { + "auxiliary_loss_clip": 0.01109253, + "auxiliary_loss_mlp": 0.01029939, + "balance_loss_clip": 1.04459572, + "balance_loss_mlp": 1.01705813, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 1.924274654945936, + "language_loss": 0.8299467, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85133862, + "num_input_tokens_seen": 184766615, + "step": 8593, + "time_per_iteration": 2.5229220390319824 + }, + { + "auxiliary_loss_clip": 0.01096453, + "auxiliary_loss_mlp": 0.01041372, + "balance_loss_clip": 1.04134607, + "balance_loss_mlp": 1.02650714, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 1.7563120177291567, + "language_loss": 0.68831754, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.70969576, + "num_input_tokens_seen": 184788075, + "step": 8594, + "time_per_iteration": 2.5998597145080566 + }, + { + "auxiliary_loss_clip": 0.01078815, + "auxiliary_loss_mlp": 0.01035322, + "balance_loss_clip": 1.03787875, + "balance_loss_mlp": 1.02138686, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.4241207544786296, + "language_loss": 0.77420306, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79534447, + "num_input_tokens_seen": 184808710, + "step": 8595, + "time_per_iteration": 2.617279291152954 + }, + { + "auxiliary_loss_clip": 0.01121084, + "auxiliary_loss_mlp": 0.01041817, + "balance_loss_clip": 1.04463196, + "balance_loss_mlp": 1.0283041, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 2.030238658458469, + "language_loss": 0.65384889, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67547798, + "num_input_tokens_seen": 184826475, + "step": 8596, + "time_per_iteration": 2.454345703125 + }, + { + "auxiliary_loss_clip": 0.01080223, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.04011989, + "balance_loss_mlp": 1.02488196, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.5638882617853664, + "language_loss": 0.7571311, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77833569, + "num_input_tokens_seen": 184845245, + "step": 8597, + "time_per_iteration": 2.6437482833862305 + }, + { + "auxiliary_loss_clip": 0.01119472, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.0426811, + "balance_loss_mlp": 1.01490366, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 1.8568598517114756, + "language_loss": 0.81269145, + "learning_rate": 1.987342579847403e-06, + "loss": 0.83416605, + "num_input_tokens_seen": 184866605, + "step": 8598, + "time_per_iteration": 2.531750202178955 + }, + { + "auxiliary_loss_clip": 0.01071652, + "auxiliary_loss_mlp": 0.01045662, + "balance_loss_clip": 1.036623, + "balance_loss_mlp": 1.03169036, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.8401149944772166, + "language_loss": 0.75573087, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77690405, + "num_input_tokens_seen": 184886945, + "step": 8599, + "time_per_iteration": 2.6429529190063477 + }, + { + "auxiliary_loss_clip": 0.01106906, + "auxiliary_loss_mlp": 0.01036901, + "balance_loss_clip": 1.04721463, + "balance_loss_mlp": 1.02390695, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 2.4227698861721594, + "language_loss": 0.72043848, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.74187648, + "num_input_tokens_seen": 184905590, + "step": 8600, + "time_per_iteration": 2.599862575531006 + }, + { + "auxiliary_loss_clip": 0.01084651, + "auxiliary_loss_mlp": 0.01035399, + "balance_loss_clip": 1.03975177, + "balance_loss_mlp": 1.02144575, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 1.7433132729072385, + "language_loss": 0.74660021, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76780069, + "num_input_tokens_seen": 184925555, + "step": 8601, + "time_per_iteration": 2.6262083053588867 + }, + { + "auxiliary_loss_clip": 0.01110242, + "auxiliary_loss_mlp": 0.01040982, + "balance_loss_clip": 1.04307389, + "balance_loss_mlp": 1.02676034, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 2.2598533069824533, + "language_loss": 0.84007049, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.86158282, + "num_input_tokens_seen": 184944490, + "step": 8602, + "time_per_iteration": 2.5141642093658447 + }, + { + "auxiliary_loss_clip": 0.01121719, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.04381239, + "balance_loss_mlp": 1.02147651, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.8744480395458036, + "language_loss": 0.74645132, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.7680223, + "num_input_tokens_seen": 184963190, + "step": 8603, + "time_per_iteration": 2.5396103858947754 + }, + { + "auxiliary_loss_clip": 0.01099893, + "auxiliary_loss_mlp": 0.01039937, + "balance_loss_clip": 1.04332876, + "balance_loss_mlp": 1.02704477, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 2.1667976437196286, + "language_loss": 0.72706467, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74846292, + "num_input_tokens_seen": 184981220, + "step": 8604, + "time_per_iteration": 2.5075929164886475 + }, + { + "auxiliary_loss_clip": 0.011032, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.04240048, + "balance_loss_mlp": 1.02571023, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 2.3047330717719294, + "language_loss": 0.85473377, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87616742, + "num_input_tokens_seen": 184998810, + "step": 8605, + "time_per_iteration": 3.9666099548339844 + }, + { + "auxiliary_loss_clip": 0.01112426, + "auxiliary_loss_mlp": 0.01027205, + "balance_loss_clip": 1.05000627, + "balance_loss_mlp": 1.01449716, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.408764896088664, + "language_loss": 0.64557755, + "learning_rate": 1.984226965411294e-06, + "loss": 0.66697395, + "num_input_tokens_seen": 185021185, + "step": 8606, + "time_per_iteration": 2.567007064819336 + }, + { + "auxiliary_loss_clip": 0.01098684, + "auxiliary_loss_mlp": 0.01034191, + "balance_loss_clip": 1.04752493, + "balance_loss_mlp": 1.02119064, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.8666755112606235, + "language_loss": 0.78038144, + "learning_rate": 1.983837516143234e-06, + "loss": 0.80171013, + "num_input_tokens_seen": 185038465, + "step": 8607, + "time_per_iteration": 2.5598509311676025 + }, + { + "auxiliary_loss_clip": 0.01110298, + "auxiliary_loss_mlp": 0.01036075, + "balance_loss_clip": 1.04251957, + "balance_loss_mlp": 1.0219841, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 1.8302384565098524, + "language_loss": 0.72725976, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74872351, + "num_input_tokens_seen": 185057340, + "step": 8608, + "time_per_iteration": 2.5525949001312256 + }, + { + "auxiliary_loss_clip": 0.01117769, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.04397583, + "balance_loss_mlp": 1.02172923, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 1.7977804671104838, + "language_loss": 0.8597095, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88124597, + "num_input_tokens_seen": 185074935, + "step": 8609, + "time_per_iteration": 2.5138614177703857 + }, + { + "auxiliary_loss_clip": 0.01108062, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.03985143, + "balance_loss_mlp": 1.02309573, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 1.6148785103529397, + "language_loss": 0.73615539, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75759435, + "num_input_tokens_seen": 185095050, + "step": 8610, + "time_per_iteration": 2.5323896408081055 + }, + { + "auxiliary_loss_clip": 0.01126282, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.04520941, + "balance_loss_mlp": 1.02066112, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 2.0034556471766405, + "language_loss": 0.67633635, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69795573, + "num_input_tokens_seen": 185112275, + "step": 8611, + "time_per_iteration": 2.4333386421203613 + }, + { + "auxiliary_loss_clip": 0.01120501, + "auxiliary_loss_mlp": 0.0104211, + "balance_loss_clip": 1.04318833, + "balance_loss_mlp": 1.02835298, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 1.998593399714133, + "language_loss": 0.77363825, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79526436, + "num_input_tokens_seen": 185132165, + "step": 8612, + "time_per_iteration": 2.483563184738159 + }, + { + "auxiliary_loss_clip": 0.01112405, + "auxiliary_loss_mlp": 0.01041005, + "balance_loss_clip": 1.04240727, + "balance_loss_mlp": 1.02714658, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 1.88129854966152, + "language_loss": 0.8198548, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84138882, + "num_input_tokens_seen": 185151025, + "step": 8613, + "time_per_iteration": 2.46574330329895 + }, + { + "auxiliary_loss_clip": 0.01124131, + "auxiliary_loss_mlp": 0.01038753, + "balance_loss_clip": 1.04486859, + "balance_loss_mlp": 1.02462065, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.199616581818175, + "language_loss": 0.67128682, + "learning_rate": 1.981111389254541e-06, + "loss": 0.69291574, + "num_input_tokens_seen": 185168455, + "step": 8614, + "time_per_iteration": 2.432128429412842 + }, + { + "auxiliary_loss_clip": 0.01096978, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.04190278, + "balance_loss_mlp": 1.01879871, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 3.438468666309011, + "language_loss": 0.86887681, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.89018226, + "num_input_tokens_seen": 185184415, + "step": 8615, + "time_per_iteration": 2.5872020721435547 + }, + { + "auxiliary_loss_clip": 0.01108481, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.04199696, + "balance_loss_mlp": 1.02595687, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.6168653424090136, + "language_loss": 0.80688697, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.82835555, + "num_input_tokens_seen": 185202910, + "step": 8616, + "time_per_iteration": 5.425913572311401 + }, + { + "auxiliary_loss_clip": 0.01115646, + "auxiliary_loss_mlp": 0.00781497, + "balance_loss_clip": 1.04764795, + "balance_loss_mlp": 1.00067472, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 1.8217812467624268, + "language_loss": 0.75176579, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77073717, + "num_input_tokens_seen": 185223085, + "step": 8617, + "time_per_iteration": 2.517338991165161 + }, + { + "auxiliary_loss_clip": 0.01121651, + "auxiliary_loss_mlp": 0.01038168, + "balance_loss_clip": 1.04302835, + "balance_loss_mlp": 1.02363658, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 2.260963196965191, + "language_loss": 0.7037667, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72536492, + "num_input_tokens_seen": 185241295, + "step": 8618, + "time_per_iteration": 2.4515798091888428 + }, + { + "auxiliary_loss_clip": 0.010363, + "auxiliary_loss_mlp": 0.01009115, + "balance_loss_clip": 1.02019453, + "balance_loss_mlp": 1.00776231, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9483782169382908, + "language_loss": 0.67247474, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69292885, + "num_input_tokens_seen": 185298295, + "step": 8619, + "time_per_iteration": 3.0190236568450928 + }, + { + "auxiliary_loss_clip": 0.01073129, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.03968239, + "balance_loss_mlp": 1.01773643, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 1.9137219731180404, + "language_loss": 0.79739213, + "learning_rate": 1.97877473680631e-06, + "loss": 0.81843454, + "num_input_tokens_seen": 185317000, + "step": 8620, + "time_per_iteration": 2.5695953369140625 + }, + { + "auxiliary_loss_clip": 0.01060152, + "auxiliary_loss_mlp": 0.00779608, + "balance_loss_clip": 1.04032779, + "balance_loss_mlp": 1.00042534, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 1.970150544240096, + "language_loss": 0.82214904, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.84054661, + "num_input_tokens_seen": 185331185, + "step": 8621, + "time_per_iteration": 2.6136350631713867 + }, + { + "auxiliary_loss_clip": 0.01097484, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.03986859, + "balance_loss_mlp": 1.02300119, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 2.3598862065271926, + "language_loss": 0.65707457, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.67840886, + "num_input_tokens_seen": 185348955, + "step": 8622, + "time_per_iteration": 2.5408196449279785 + }, + { + "auxiliary_loss_clip": 0.01101248, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.04038095, + "balance_loss_mlp": 1.02256799, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 2.156877684465166, + "language_loss": 0.6073823, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62876546, + "num_input_tokens_seen": 185367330, + "step": 8623, + "time_per_iteration": 2.48964786529541 + }, + { + "auxiliary_loss_clip": 0.01120775, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.04256725, + "balance_loss_mlp": 1.02132392, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.8096163470536832, + "language_loss": 0.76359618, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78515166, + "num_input_tokens_seen": 185385060, + "step": 8624, + "time_per_iteration": 2.455303192138672 + }, + { + "auxiliary_loss_clip": 0.01076035, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.03639102, + "balance_loss_mlp": 1.02327502, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 2.1167391452733986, + "language_loss": 0.71199894, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73313928, + "num_input_tokens_seen": 185403745, + "step": 8625, + "time_per_iteration": 2.5800623893737793 + }, + { + "auxiliary_loss_clip": 0.01100744, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.04166603, + "balance_loss_mlp": 1.0199337, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.9188142794619032, + "language_loss": 0.68208861, + "learning_rate": 1.976438113333184e-06, + "loss": 0.70342374, + "num_input_tokens_seen": 185422620, + "step": 8626, + "time_per_iteration": 2.519310235977173 + }, + { + "auxiliary_loss_clip": 0.01109953, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.04215837, + "balance_loss_mlp": 1.01857889, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 2.0764225331950796, + "language_loss": 0.70957136, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.73099017, + "num_input_tokens_seen": 185439380, + "step": 8627, + "time_per_iteration": 2.46932315826416 + }, + { + "auxiliary_loss_clip": 0.01125763, + "auxiliary_loss_mlp": 0.00780259, + "balance_loss_clip": 1.04525673, + "balance_loss_mlp": 1.00041938, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 2.27453536922491, + "language_loss": 0.73014832, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.74920857, + "num_input_tokens_seen": 185458830, + "step": 8628, + "time_per_iteration": 2.4609017372131348 + }, + { + "auxiliary_loss_clip": 0.01098675, + "auxiliary_loss_mlp": 0.01034226, + "balance_loss_clip": 1.04355073, + "balance_loss_mlp": 1.02085114, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 1.569704415016817, + "language_loss": 0.77446491, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.79579389, + "num_input_tokens_seen": 185477270, + "step": 8629, + "time_per_iteration": 2.510549545288086 + }, + { + "auxiliary_loss_clip": 0.011136, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.04353309, + "balance_loss_mlp": 1.02021861, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 2.533636153665246, + "language_loss": 0.75002444, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.7715072, + "num_input_tokens_seen": 185495795, + "step": 8630, + "time_per_iteration": 2.5005788803100586 + }, + { + "auxiliary_loss_clip": 0.01110883, + "auxiliary_loss_mlp": 0.01038614, + "balance_loss_clip": 1.04246306, + "balance_loss_mlp": 1.0235343, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 2.71560204326229, + "language_loss": 0.80172694, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82322192, + "num_input_tokens_seen": 185514885, + "step": 8631, + "time_per_iteration": 3.9680733680725098 + }, + { + "auxiliary_loss_clip": 0.01113005, + "auxiliary_loss_mlp": 0.0103081, + "balance_loss_clip": 1.04425204, + "balance_loss_mlp": 1.01661801, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.5243251822898907, + "language_loss": 0.74366045, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76509857, + "num_input_tokens_seen": 185537155, + "step": 8632, + "time_per_iteration": 2.514580249786377 + }, + { + "auxiliary_loss_clip": 0.01090486, + "auxiliary_loss_mlp": 0.01031033, + "balance_loss_clip": 1.04464459, + "balance_loss_mlp": 1.01731229, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 1.9105566259433668, + "language_loss": 0.78535521, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.80657041, + "num_input_tokens_seen": 185555520, + "step": 8633, + "time_per_iteration": 2.5471999645233154 + }, + { + "auxiliary_loss_clip": 0.01112237, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.04275692, + "balance_loss_mlp": 1.01994002, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 4.404998206992137, + "language_loss": 0.80282986, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82428968, + "num_input_tokens_seen": 185573855, + "step": 8634, + "time_per_iteration": 2.506837844848633 + }, + { + "auxiliary_loss_clip": 0.01120375, + "auxiliary_loss_mlp": 0.01037487, + "balance_loss_clip": 1.04520071, + "balance_loss_mlp": 1.02399254, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 1.4737794953822476, + "language_loss": 0.6879338, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.70951241, + "num_input_tokens_seen": 185595145, + "step": 8635, + "time_per_iteration": 2.493990659713745 + }, + { + "auxiliary_loss_clip": 0.01099562, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.04086554, + "balance_loss_mlp": 1.02274597, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.5900433474719722, + "language_loss": 0.77568436, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79704672, + "num_input_tokens_seen": 185613320, + "step": 8636, + "time_per_iteration": 2.4911773204803467 + }, + { + "auxiliary_loss_clip": 0.01124021, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.04438519, + "balance_loss_mlp": 1.02202916, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 2.969610360303854, + "language_loss": 0.71137673, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.7329818, + "num_input_tokens_seen": 185630730, + "step": 8637, + "time_per_iteration": 2.42515230178833 + }, + { + "auxiliary_loss_clip": 0.01087236, + "auxiliary_loss_mlp": 0.01036946, + "balance_loss_clip": 1.04333639, + "balance_loss_mlp": 1.02312994, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 1.9573134697115644, + "language_loss": 0.75781882, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.77906066, + "num_input_tokens_seen": 185648515, + "step": 8638, + "time_per_iteration": 2.5369441509246826 + }, + { + "auxiliary_loss_clip": 0.01087438, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.040097, + "balance_loss_mlp": 1.01786113, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 1.8615098084432566, + "language_loss": 0.74325347, + "learning_rate": 1.971375543740272e-06, + "loss": 0.76443684, + "num_input_tokens_seen": 185665220, + "step": 8639, + "time_per_iteration": 2.5369374752044678 + }, + { + "auxiliary_loss_clip": 0.01121885, + "auxiliary_loss_mlp": 0.01035129, + "balance_loss_clip": 1.04489625, + "balance_loss_mlp": 1.0205977, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.6552222662993528, + "language_loss": 0.77309531, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79466546, + "num_input_tokens_seen": 185683750, + "step": 8640, + "time_per_iteration": 2.478937864303589 + }, + { + "auxiliary_loss_clip": 0.01081509, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.04102731, + "balance_loss_mlp": 1.02331114, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 2.1088512162919395, + "language_loss": 0.66549301, + "learning_rate": 1.97059670234927e-06, + "loss": 0.68667448, + "num_input_tokens_seen": 185700625, + "step": 8641, + "time_per_iteration": 2.5202832221984863 + }, + { + "auxiliary_loss_clip": 0.01120153, + "auxiliary_loss_mlp": 0.01034614, + "balance_loss_clip": 1.0441916, + "balance_loss_mlp": 1.02137637, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.962707959306952, + "language_loss": 0.76544285, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78699046, + "num_input_tokens_seen": 185721155, + "step": 8642, + "time_per_iteration": 2.5331153869628906 + }, + { + "auxiliary_loss_clip": 0.01117699, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.04225254, + "balance_loss_mlp": 1.01758742, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.6611597393207238, + "language_loss": 0.82919204, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85067666, + "num_input_tokens_seen": 185740990, + "step": 8643, + "time_per_iteration": 2.5997025966644287 + }, + { + "auxiliary_loss_clip": 0.0112242, + "auxiliary_loss_mlp": 0.01040232, + "balance_loss_clip": 1.0430119, + "balance_loss_mlp": 1.02591467, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.5079109552559748, + "language_loss": 0.70171958, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72334611, + "num_input_tokens_seen": 185762235, + "step": 8644, + "time_per_iteration": 2.495652198791504 + }, + { + "auxiliary_loss_clip": 0.0110787, + "auxiliary_loss_mlp": 0.00778872, + "balance_loss_clip": 1.04119062, + "balance_loss_mlp": 1.00041699, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.521627666908703, + "language_loss": 0.79911852, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.81798601, + "num_input_tokens_seen": 185783415, + "step": 8645, + "time_per_iteration": 3.9644641876220703 + }, + { + "auxiliary_loss_clip": 0.01118871, + "auxiliary_loss_mlp": 0.01031872, + "balance_loss_clip": 1.04164839, + "balance_loss_mlp": 1.01818657, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.7822681763764405, + "language_loss": 0.78274286, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80425024, + "num_input_tokens_seen": 185801345, + "step": 8646, + "time_per_iteration": 2.4121110439300537 + }, + { + "auxiliary_loss_clip": 0.01111427, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.04301548, + "balance_loss_mlp": 1.02559328, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.9787565147309583, + "language_loss": 0.65626717, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.67776763, + "num_input_tokens_seen": 185820815, + "step": 8647, + "time_per_iteration": 2.4673824310302734 + }, + { + "auxiliary_loss_clip": 0.01123478, + "auxiliary_loss_mlp": 0.01036526, + "balance_loss_clip": 1.0430994, + "balance_loss_mlp": 1.02124333, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 1.6748569236007176, + "language_loss": 0.71544969, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73704982, + "num_input_tokens_seen": 185841450, + "step": 8648, + "time_per_iteration": 2.4715826511383057 + }, + { + "auxiliary_loss_clip": 0.01101547, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.04408455, + "balance_loss_mlp": 1.0184238, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 1.9802698028278765, + "language_loss": 0.6404705, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66181558, + "num_input_tokens_seen": 185859935, + "step": 8649, + "time_per_iteration": 2.5135154724121094 + }, + { + "auxiliary_loss_clip": 0.01095804, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.03943765, + "balance_loss_mlp": 1.02161026, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 1.7021310186694083, + "language_loss": 0.70654762, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72787708, + "num_input_tokens_seen": 185876795, + "step": 8650, + "time_per_iteration": 2.458225727081299 + }, + { + "auxiliary_loss_clip": 0.01119774, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.04206657, + "balance_loss_mlp": 1.021631, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 1.7509163156138912, + "language_loss": 0.77369714, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79524851, + "num_input_tokens_seen": 185895570, + "step": 8651, + "time_per_iteration": 2.4108662605285645 + }, + { + "auxiliary_loss_clip": 0.01067048, + "auxiliary_loss_mlp": 0.01037495, + "balance_loss_clip": 1.04041195, + "balance_loss_mlp": 1.02307057, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 2.461847828015821, + "language_loss": 0.78234899, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80339444, + "num_input_tokens_seen": 185913700, + "step": 8652, + "time_per_iteration": 2.5804696083068848 + }, + { + "auxiliary_loss_clip": 0.01083405, + "auxiliary_loss_mlp": 0.01036895, + "balance_loss_clip": 1.03923893, + "balance_loss_mlp": 1.02103436, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 2.107729557762427, + "language_loss": 0.70463121, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72583413, + "num_input_tokens_seen": 185932460, + "step": 8653, + "time_per_iteration": 2.5509033203125 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.01042933, + "balance_loss_clip": 1.04441345, + "balance_loss_mlp": 1.02850866, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 1.8857135205745985, + "language_loss": 0.78790033, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80924714, + "num_input_tokens_seen": 185952030, + "step": 8654, + "time_per_iteration": 2.550985097885132 + }, + { + "auxiliary_loss_clip": 0.01112769, + "auxiliary_loss_mlp": 0.010431, + "balance_loss_clip": 1.04185784, + "balance_loss_mlp": 1.02819943, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 2.012506746554025, + "language_loss": 0.84298182, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86454058, + "num_input_tokens_seen": 185973130, + "step": 8655, + "time_per_iteration": 3.9227912425994873 + }, + { + "auxiliary_loss_clip": 0.01109764, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.04404712, + "balance_loss_mlp": 1.02204728, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 2.24556627175663, + "language_loss": 0.65783286, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.67928123, + "num_input_tokens_seen": 185990200, + "step": 8656, + "time_per_iteration": 3.9490888118743896 + }, + { + "auxiliary_loss_clip": 0.01079603, + "auxiliary_loss_mlp": 0.0103754, + "balance_loss_clip": 1.04066944, + "balance_loss_mlp": 1.02401519, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 2.051794567919227, + "language_loss": 0.73060393, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.75177538, + "num_input_tokens_seen": 186009880, + "step": 8657, + "time_per_iteration": 2.602905750274658 + }, + { + "auxiliary_loss_clip": 0.01088312, + "auxiliary_loss_mlp": 0.01040011, + "balance_loss_clip": 1.04072118, + "balance_loss_mlp": 1.02537227, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 2.9362903642029132, + "language_loss": 0.71833253, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73961568, + "num_input_tokens_seen": 186026680, + "step": 8658, + "time_per_iteration": 2.5504508018493652 + }, + { + "auxiliary_loss_clip": 0.01120662, + "auxiliary_loss_mlp": 0.01038098, + "balance_loss_clip": 1.04286742, + "balance_loss_mlp": 1.02407956, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 1.8516685887475484, + "language_loss": 0.83576912, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85735673, + "num_input_tokens_seen": 186046920, + "step": 8659, + "time_per_iteration": 2.4889683723449707 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01048732, + "balance_loss_clip": 1.0422802, + "balance_loss_mlp": 1.03208995, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 2.9477754956365696, + "language_loss": 0.75289237, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77441365, + "num_input_tokens_seen": 186062090, + "step": 8660, + "time_per_iteration": 2.507185935974121 + }, + { + "auxiliary_loss_clip": 0.0112089, + "auxiliary_loss_mlp": 0.01039627, + "balance_loss_clip": 1.04334295, + "balance_loss_mlp": 1.02594185, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 1.8121139721179549, + "language_loss": 0.77610868, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.79771388, + "num_input_tokens_seen": 186081135, + "step": 8661, + "time_per_iteration": 2.455775499343872 + }, + { + "auxiliary_loss_clip": 0.01099754, + "auxiliary_loss_mlp": 0.01035283, + "balance_loss_clip": 1.03914118, + "balance_loss_mlp": 1.0215559, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 1.740712257958717, + "language_loss": 0.70321029, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72456068, + "num_input_tokens_seen": 186099700, + "step": 8662, + "time_per_iteration": 2.5584957599639893 + }, + { + "auxiliary_loss_clip": 0.01106763, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.04062533, + "balance_loss_mlp": 1.02052331, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.586436683758185, + "language_loss": 0.6941514, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71557808, + "num_input_tokens_seen": 186119740, + "step": 8663, + "time_per_iteration": 2.5374526977539062 + }, + { + "auxiliary_loss_clip": 0.01094538, + "auxiliary_loss_mlp": 0.00782389, + "balance_loss_clip": 1.04018831, + "balance_loss_mlp": 1.00052702, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 1.4746406519061461, + "language_loss": 0.76956648, + "learning_rate": 1.961640376626072e-06, + "loss": 0.78833574, + "num_input_tokens_seen": 186140645, + "step": 8664, + "time_per_iteration": 2.537031412124634 + }, + { + "auxiliary_loss_clip": 0.01103288, + "auxiliary_loss_mlp": 0.01038289, + "balance_loss_clip": 1.04929018, + "balance_loss_mlp": 1.02385855, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 2.025141906033863, + "language_loss": 0.76688373, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78829944, + "num_input_tokens_seen": 186160130, + "step": 8665, + "time_per_iteration": 2.5338642597198486 + }, + { + "auxiliary_loss_clip": 0.01114246, + "auxiliary_loss_mlp": 0.01034273, + "balance_loss_clip": 1.04735017, + "balance_loss_mlp": 1.02118933, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.6498195431638725, + "language_loss": 0.72109199, + "learning_rate": 1.960861599474586e-06, + "loss": 0.7425772, + "num_input_tokens_seen": 186179485, + "step": 8666, + "time_per_iteration": 2.475386381149292 + }, + { + "auxiliary_loss_clip": 0.01107153, + "auxiliary_loss_mlp": 0.01041355, + "balance_loss_clip": 1.04294181, + "balance_loss_mlp": 1.02470195, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 1.965368268429852, + "language_loss": 0.68435442, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.70583951, + "num_input_tokens_seen": 186197140, + "step": 8667, + "time_per_iteration": 2.4910547733306885 + }, + { + "auxiliary_loss_clip": 0.01088541, + "auxiliary_loss_mlp": 0.01036387, + "balance_loss_clip": 1.05232286, + "balance_loss_mlp": 1.02183127, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.4658756663672796, + "language_loss": 0.810256, + "learning_rate": 1.960082828259629e-06, + "loss": 0.8315053, + "num_input_tokens_seen": 186216800, + "step": 8668, + "time_per_iteration": 2.6221375465393066 + }, + { + "auxiliary_loss_clip": 0.0110313, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.04267478, + "balance_loss_mlp": 1.01956558, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 2.4980000221682106, + "language_loss": 0.6348626, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.65622604, + "num_input_tokens_seen": 186235320, + "step": 8669, + "time_per_iteration": 2.5777084827423096 + }, + { + "auxiliary_loss_clip": 0.01101517, + "auxiliary_loss_mlp": 0.00780375, + "balance_loss_clip": 1.0454917, + "balance_loss_mlp": 1.00047517, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.5893231989007115, + "language_loss": 0.66714907, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68596792, + "num_input_tokens_seen": 186254460, + "step": 8670, + "time_per_iteration": 2.5341784954071045 + }, + { + "auxiliary_loss_clip": 0.01083288, + "auxiliary_loss_mlp": 0.01038291, + "balance_loss_clip": 1.04121041, + "balance_loss_mlp": 1.02471876, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.264838889526334, + "language_loss": 0.76190782, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78312355, + "num_input_tokens_seen": 186269465, + "step": 8671, + "time_per_iteration": 4.016963481903076 + }, + { + "auxiliary_loss_clip": 0.01092381, + "auxiliary_loss_mlp": 0.01040898, + "balance_loss_clip": 1.04591441, + "balance_loss_mlp": 1.02599645, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 1.9152263216605827, + "language_loss": 0.78612024, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80745304, + "num_input_tokens_seen": 186288660, + "step": 8672, + "time_per_iteration": 2.5350329875946045 + }, + { + "auxiliary_loss_clip": 0.01082675, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.03744268, + "balance_loss_mlp": 1.02072978, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.945808518696606, + "language_loss": 0.72122061, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74238169, + "num_input_tokens_seen": 186305760, + "step": 8673, + "time_per_iteration": 2.5336155891418457 + }, + { + "auxiliary_loss_clip": 0.01106546, + "auxiliary_loss_mlp": 0.01036999, + "balance_loss_clip": 1.04070067, + "balance_loss_mlp": 1.02265775, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.485331427281132, + "language_loss": 0.74768555, + "learning_rate": 1.957746551415166e-06, + "loss": 0.76912093, + "num_input_tokens_seen": 186324135, + "step": 8674, + "time_per_iteration": 2.4655940532684326 + }, + { + "auxiliary_loss_clip": 0.01094464, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.0400517, + "balance_loss_mlp": 1.01780379, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 2.034918969442806, + "language_loss": 0.86248362, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88375795, + "num_input_tokens_seen": 186340205, + "step": 8675, + "time_per_iteration": 2.478074312210083 + }, + { + "auxiliary_loss_clip": 0.01024701, + "auxiliary_loss_mlp": 0.01001518, + "balance_loss_clip": 1.02080989, + "balance_loss_mlp": 1.00008154, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.885206142274736, + "language_loss": 0.63132465, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65158689, + "num_input_tokens_seen": 186396940, + "step": 8676, + "time_per_iteration": 3.0436484813690186 + }, + { + "auxiliary_loss_clip": 0.01107629, + "auxiliary_loss_mlp": 0.01032621, + "balance_loss_clip": 1.04209316, + "balance_loss_mlp": 1.01931167, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.4963721059423878, + "language_loss": 0.68799961, + "learning_rate": 1.956578434424046e-06, + "loss": 0.70940208, + "num_input_tokens_seen": 186418680, + "step": 8677, + "time_per_iteration": 2.541274070739746 + }, + { + "auxiliary_loss_clip": 0.0110914, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.04193008, + "balance_loss_mlp": 1.02060056, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.8844392046849978, + "language_loss": 0.65247416, + "learning_rate": 1.956189065367086e-06, + "loss": 0.67391253, + "num_input_tokens_seen": 186438265, + "step": 8678, + "time_per_iteration": 2.5278308391571045 + }, + { + "auxiliary_loss_clip": 0.01102354, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.04070616, + "balance_loss_mlp": 1.02378368, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 2.1773903666262893, + "language_loss": 0.6824882, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.70390475, + "num_input_tokens_seen": 186456870, + "step": 8679, + "time_per_iteration": 2.515916347503662 + }, + { + "auxiliary_loss_clip": 0.01125665, + "auxiliary_loss_mlp": 0.010401, + "balance_loss_clip": 1.04744172, + "balance_loss_mlp": 1.02584279, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 2.0027979931150512, + "language_loss": 0.67002326, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.69168091, + "num_input_tokens_seen": 186476425, + "step": 8680, + "time_per_iteration": 2.4478070735931396 + }, + { + "auxiliary_loss_clip": 0.01123705, + "auxiliary_loss_mlp": 0.01041469, + "balance_loss_clip": 1.04542708, + "balance_loss_mlp": 1.02720523, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 2.014576040000099, + "language_loss": 0.83392, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85557169, + "num_input_tokens_seen": 186492555, + "step": 8681, + "time_per_iteration": 2.419405937194824 + }, + { + "auxiliary_loss_clip": 0.01094915, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.04063284, + "balance_loss_mlp": 1.01993012, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 1.90167462618087, + "language_loss": 0.77702802, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79831123, + "num_input_tokens_seen": 186513190, + "step": 8682, + "time_per_iteration": 2.5483951568603516 + }, + { + "auxiliary_loss_clip": 0.01086215, + "auxiliary_loss_mlp": 0.01045075, + "balance_loss_clip": 1.04027641, + "balance_loss_mlp": 1.03136575, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.7787278004922653, + "language_loss": 0.68770254, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.70901549, + "num_input_tokens_seen": 186534830, + "step": 8683, + "time_per_iteration": 2.675373077392578 + }, + { + "auxiliary_loss_clip": 0.01089904, + "auxiliary_loss_mlp": 0.01038701, + "balance_loss_clip": 1.04297268, + "balance_loss_mlp": 1.02505159, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 1.8885845536140344, + "language_loss": 0.76129043, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78257656, + "num_input_tokens_seen": 186554390, + "step": 8684, + "time_per_iteration": 2.5811142921447754 + }, + { + "auxiliary_loss_clip": 0.0110189, + "auxiliary_loss_mlp": 0.00779667, + "balance_loss_clip": 1.03991628, + "balance_loss_mlp": 1.00043571, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.686350707416154, + "language_loss": 0.75755227, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.77636778, + "num_input_tokens_seen": 186572360, + "step": 8685, + "time_per_iteration": 3.9785232543945312 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.01039749, + "balance_loss_clip": 1.04475021, + "balance_loss_mlp": 1.02602851, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 1.7426128875065858, + "language_loss": 0.80732489, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.82871807, + "num_input_tokens_seen": 186590655, + "step": 8686, + "time_per_iteration": 2.509694814682007 + }, + { + "auxiliary_loss_clip": 0.01101061, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.04471445, + "balance_loss_mlp": 1.0193491, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 1.7404207882963438, + "language_loss": 0.70193243, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72326243, + "num_input_tokens_seen": 186610345, + "step": 8687, + "time_per_iteration": 2.5727076530456543 + }, + { + "auxiliary_loss_clip": 0.01117764, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.04318309, + "balance_loss_mlp": 1.01767778, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 2.0058901362497057, + "language_loss": 0.8343274, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85581219, + "num_input_tokens_seen": 186624360, + "step": 8688, + "time_per_iteration": 2.4236209392547607 + }, + { + "auxiliary_loss_clip": 0.01108417, + "auxiliary_loss_mlp": 0.00779909, + "balance_loss_clip": 1.04324055, + "balance_loss_mlp": 1.00047266, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 3.147727457390661, + "language_loss": 0.7355634, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75444663, + "num_input_tokens_seen": 186638680, + "step": 8689, + "time_per_iteration": 2.439086437225342 + }, + { + "auxiliary_loss_clip": 0.0109608, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.0401119, + "balance_loss_mlp": 1.02010822, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 2.112789750797485, + "language_loss": 0.82981157, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85110378, + "num_input_tokens_seen": 186655840, + "step": 8690, + "time_per_iteration": 2.482382297515869 + }, + { + "auxiliary_loss_clip": 0.01085948, + "auxiliary_loss_mlp": 0.01040974, + "balance_loss_clip": 1.04303229, + "balance_loss_mlp": 1.02649021, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 2.178338352761243, + "language_loss": 0.79289234, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81416154, + "num_input_tokens_seen": 186674150, + "step": 8691, + "time_per_iteration": 2.661766529083252 + }, + { + "auxiliary_loss_clip": 0.01116984, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.045416, + "balance_loss_mlp": 1.02397442, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 2.5918666734547497, + "language_loss": 0.7617712, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78333229, + "num_input_tokens_seen": 186690675, + "step": 8692, + "time_per_iteration": 2.4966235160827637 + }, + { + "auxiliary_loss_clip": 0.01106363, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.04381049, + "balance_loss_mlp": 1.02239966, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.630334874527332, + "language_loss": 0.72293139, + "learning_rate": 1.950348737138691e-06, + "loss": 0.74434215, + "num_input_tokens_seen": 186710380, + "step": 8693, + "time_per_iteration": 2.5313282012939453 + }, + { + "auxiliary_loss_clip": 0.01124027, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.04350185, + "balance_loss_mlp": 1.02242231, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 2.631088207609035, + "language_loss": 0.82090122, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84251332, + "num_input_tokens_seen": 186729135, + "step": 8694, + "time_per_iteration": 2.446148633956909 + }, + { + "auxiliary_loss_clip": 0.01032045, + "auxiliary_loss_mlp": 0.01014108, + "balance_loss_clip": 1.04107785, + "balance_loss_mlp": 1.01285064, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.7972481398630022, + "language_loss": 0.55682766, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57728916, + "num_input_tokens_seen": 186791115, + "step": 8695, + "time_per_iteration": 4.530478239059448 + }, + { + "auxiliary_loss_clip": 0.01060381, + "auxiliary_loss_mlp": 0.01038305, + "balance_loss_clip": 1.04295003, + "balance_loss_mlp": 1.02513814, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.6073126117695065, + "language_loss": 0.73133308, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75231993, + "num_input_tokens_seen": 186808660, + "step": 8696, + "time_per_iteration": 4.106512069702148 + }, + { + "auxiliary_loss_clip": 0.01097249, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.04229224, + "balance_loss_mlp": 1.02009332, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.6268369230940392, + "language_loss": 0.71182716, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73313284, + "num_input_tokens_seen": 186825900, + "step": 8697, + "time_per_iteration": 2.4832680225372314 + }, + { + "auxiliary_loss_clip": 0.01094635, + "auxiliary_loss_mlp": 0.01032672, + "balance_loss_clip": 1.04784989, + "balance_loss_mlp": 1.02046537, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 2.26805832123696, + "language_loss": 0.80364537, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82491839, + "num_input_tokens_seen": 186843735, + "step": 8698, + "time_per_iteration": 2.5488226413726807 + }, + { + "auxiliary_loss_clip": 0.01107711, + "auxiliary_loss_mlp": 0.01033452, + "balance_loss_clip": 1.04158151, + "balance_loss_mlp": 1.02008891, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.7184350242585462, + "language_loss": 0.74235302, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76376468, + "num_input_tokens_seen": 186862440, + "step": 8699, + "time_per_iteration": 2.496446132659912 + }, + { + "auxiliary_loss_clip": 0.01109437, + "auxiliary_loss_mlp": 0.00781133, + "balance_loss_clip": 1.03928816, + "balance_loss_mlp": 1.00053716, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 1.5958010724909084, + "language_loss": 0.73340237, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75230801, + "num_input_tokens_seen": 186880940, + "step": 8700, + "time_per_iteration": 2.4907474517822266 + }, + { + "auxiliary_loss_clip": 0.01096572, + "auxiliary_loss_mlp": 0.01038173, + "balance_loss_clip": 1.0423429, + "balance_loss_mlp": 1.0232842, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.5921968627432923, + "language_loss": 0.67220056, + "learning_rate": 1.947234065463318e-06, + "loss": 0.69354796, + "num_input_tokens_seen": 186900785, + "step": 8701, + "time_per_iteration": 2.5439391136169434 + }, + { + "auxiliary_loss_clip": 0.01102419, + "auxiliary_loss_mlp": 0.00779056, + "balance_loss_clip": 1.04514885, + "balance_loss_mlp": 1.00046515, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 1.9179500262006937, + "language_loss": 0.66884363, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68765843, + "num_input_tokens_seen": 186920895, + "step": 8702, + "time_per_iteration": 2.5564682483673096 + }, + { + "auxiliary_loss_clip": 0.01097073, + "auxiliary_loss_mlp": 0.01037548, + "balance_loss_clip": 1.04112029, + "balance_loss_mlp": 1.02327859, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 1.7967273656097311, + "language_loss": 0.76619923, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78754544, + "num_input_tokens_seen": 186940605, + "step": 8703, + "time_per_iteration": 2.558459758758545 + }, + { + "auxiliary_loss_clip": 0.01113844, + "auxiliary_loss_mlp": 0.01041013, + "balance_loss_clip": 1.04411769, + "balance_loss_mlp": 1.02534854, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 1.9649455947969292, + "language_loss": 0.76648951, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.78803802, + "num_input_tokens_seen": 186960820, + "step": 8704, + "time_per_iteration": 2.607637882232666 + }, + { + "auxiliary_loss_clip": 0.01099831, + "auxiliary_loss_mlp": 0.01037695, + "balance_loss_clip": 1.0459969, + "balance_loss_mlp": 1.02508247, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.6811796454731938, + "language_loss": 0.78080672, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80218196, + "num_input_tokens_seen": 186976240, + "step": 8705, + "time_per_iteration": 2.4897613525390625 + }, + { + "auxiliary_loss_clip": 0.01103925, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.04514086, + "balance_loss_mlp": 1.01901734, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 2.4721744769546308, + "language_loss": 0.69747698, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.71885508, + "num_input_tokens_seen": 186992855, + "step": 8706, + "time_per_iteration": 2.524550437927246 + }, + { + "auxiliary_loss_clip": 0.01035206, + "auxiliary_loss_mlp": 0.01000422, + "balance_loss_clip": 1.01971173, + "balance_loss_mlp": 0.99894983, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6830015536391529, + "language_loss": 0.52459282, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54494905, + "num_input_tokens_seen": 187051205, + "step": 8707, + "time_per_iteration": 3.114241361618042 + }, + { + "auxiliary_loss_clip": 0.01096781, + "auxiliary_loss_mlp": 0.01038967, + "balance_loss_clip": 1.04024386, + "balance_loss_mlp": 1.02544236, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.9060546920161894, + "language_loss": 0.74923068, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.77058816, + "num_input_tokens_seen": 187070540, + "step": 8708, + "time_per_iteration": 2.523871660232544 + }, + { + "auxiliary_loss_clip": 0.01090892, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.04253387, + "balance_loss_mlp": 1.01849926, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.5431873488351877, + "language_loss": 0.77191448, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79313737, + "num_input_tokens_seen": 187089975, + "step": 8709, + "time_per_iteration": 2.5051393508911133 + }, + { + "auxiliary_loss_clip": 0.0107186, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.03535819, + "balance_loss_mlp": 1.02305222, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 1.93833474582159, + "language_loss": 0.83442467, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85553521, + "num_input_tokens_seen": 187108775, + "step": 8710, + "time_per_iteration": 3.972156524658203 + }, + { + "auxiliary_loss_clip": 0.01088692, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.03933716, + "balance_loss_mlp": 1.01777351, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 1.993676058481753, + "language_loss": 0.69479471, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71599567, + "num_input_tokens_seen": 187128830, + "step": 8711, + "time_per_iteration": 2.53126859664917 + }, + { + "auxiliary_loss_clip": 0.01108955, + "auxiliary_loss_mlp": 0.01038885, + "balance_loss_clip": 1.04296887, + "balance_loss_mlp": 1.02552783, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 1.86040092869133, + "language_loss": 0.82807028, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.8495487, + "num_input_tokens_seen": 187149570, + "step": 8712, + "time_per_iteration": 2.535942792892456 + }, + { + "auxiliary_loss_clip": 0.01121719, + "auxiliary_loss_mlp": 0.01042438, + "balance_loss_clip": 1.04336929, + "balance_loss_mlp": 1.02784657, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.8320308490401225, + "language_loss": 0.69635653, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.71799809, + "num_input_tokens_seen": 187170575, + "step": 8713, + "time_per_iteration": 2.4582762718200684 + }, + { + "auxiliary_loss_clip": 0.01083868, + "auxiliary_loss_mlp": 0.01038739, + "balance_loss_clip": 1.03673863, + "balance_loss_mlp": 1.0224669, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 3.5704834830281333, + "language_loss": 0.76647639, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.78770238, + "num_input_tokens_seen": 187187190, + "step": 8714, + "time_per_iteration": 2.5006608963012695 + }, + { + "auxiliary_loss_clip": 0.01083197, + "auxiliary_loss_mlp": 0.01040093, + "balance_loss_clip": 1.03847861, + "balance_loss_mlp": 1.02422643, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 1.8180784174923967, + "language_loss": 0.76230127, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78353411, + "num_input_tokens_seen": 187204350, + "step": 8715, + "time_per_iteration": 2.5098910331726074 + }, + { + "auxiliary_loss_clip": 0.01094446, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.03941357, + "balance_loss_mlp": 1.01977992, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.4542344632353486, + "language_loss": 0.71035665, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73163587, + "num_input_tokens_seen": 187225605, + "step": 8716, + "time_per_iteration": 2.6110098361968994 + }, + { + "auxiliary_loss_clip": 0.01119325, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_clip": 1.04266858, + "balance_loss_mlp": 1.02897358, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 2.3065369880838618, + "language_loss": 0.87287438, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89448273, + "num_input_tokens_seen": 187241335, + "step": 8717, + "time_per_iteration": 2.4654810428619385 + }, + { + "auxiliary_loss_clip": 0.01102497, + "auxiliary_loss_mlp": 0.01037624, + "balance_loss_clip": 1.04239941, + "balance_loss_mlp": 1.02446902, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 1.8504603314611547, + "language_loss": 0.60932517, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.63072634, + "num_input_tokens_seen": 187259925, + "step": 8718, + "time_per_iteration": 2.494654893875122 + }, + { + "auxiliary_loss_clip": 0.01097118, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.04642892, + "balance_loss_mlp": 1.02909684, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 1.7454886235467377, + "language_loss": 0.71992797, + "learning_rate": 1.940226533916872e-06, + "loss": 0.7413317, + "num_input_tokens_seen": 187279035, + "step": 8719, + "time_per_iteration": 2.541167974472046 + }, + { + "auxiliary_loss_clip": 0.01105081, + "auxiliary_loss_mlp": 0.01036208, + "balance_loss_clip": 1.04192352, + "balance_loss_mlp": 1.02403688, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 2.061386165912286, + "language_loss": 0.73211062, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.75352353, + "num_input_tokens_seen": 187297555, + "step": 8720, + "time_per_iteration": 2.459246873855591 + }, + { + "auxiliary_loss_clip": 0.01105947, + "auxiliary_loss_mlp": 0.01041324, + "balance_loss_clip": 1.03905475, + "balance_loss_mlp": 1.02701283, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.6464551439512896, + "language_loss": 0.70263672, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72410941, + "num_input_tokens_seen": 187320265, + "step": 8721, + "time_per_iteration": 2.557199716567993 + }, + { + "auxiliary_loss_clip": 0.01064863, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.03538072, + "balance_loss_mlp": 1.02812934, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 2.187497803376525, + "language_loss": 0.86611634, + "learning_rate": 1.939058681065813e-06, + "loss": 0.8871873, + "num_input_tokens_seen": 187338045, + "step": 8722, + "time_per_iteration": 2.6267266273498535 + }, + { + "auxiliary_loss_clip": 0.01118131, + "auxiliary_loss_mlp": 0.01037267, + "balance_loss_clip": 1.04321599, + "balance_loss_mlp": 1.02349854, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.6423725069840507, + "language_loss": 0.80031687, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82187086, + "num_input_tokens_seen": 187356040, + "step": 8723, + "time_per_iteration": 2.484173059463501 + }, + { + "auxiliary_loss_clip": 0.0110809, + "auxiliary_loss_mlp": 0.0104525, + "balance_loss_clip": 1.04523253, + "balance_loss_mlp": 1.0303247, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 1.8936541623917351, + "language_loss": 0.75061852, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.77215195, + "num_input_tokens_seen": 187374185, + "step": 8724, + "time_per_iteration": 3.999060869216919 + }, + { + "auxiliary_loss_clip": 0.01123892, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.0423677, + "balance_loss_mlp": 1.02317691, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 1.6257947235100794, + "language_loss": 0.70477152, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72639263, + "num_input_tokens_seen": 187396640, + "step": 8725, + "time_per_iteration": 2.5374553203582764 + }, + { + "auxiliary_loss_clip": 0.01015228, + "auxiliary_loss_mlp": 0.01018959, + "balance_loss_clip": 1.01762688, + "balance_loss_mlp": 1.01749289, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7610225479095917, + "language_loss": 0.55655134, + "learning_rate": 1.937501576352568e-06, + "loss": 0.57689321, + "num_input_tokens_seen": 187455945, + "step": 8726, + "time_per_iteration": 3.1021413803100586 + }, + { + "auxiliary_loss_clip": 0.01021684, + "auxiliary_loss_mlp": 0.01005926, + "balance_loss_clip": 1.02157092, + "balance_loss_mlp": 1.00434041, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.7922134513009366, + "language_loss": 0.5834527, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60372877, + "num_input_tokens_seen": 187519975, + "step": 8727, + "time_per_iteration": 3.050264596939087 + }, + { + "auxiliary_loss_clip": 0.01111923, + "auxiliary_loss_mlp": 0.01036048, + "balance_loss_clip": 1.04145801, + "balance_loss_mlp": 1.02118301, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.3120332450377665, + "language_loss": 0.70780516, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72928488, + "num_input_tokens_seen": 187541775, + "step": 8728, + "time_per_iteration": 2.517808198928833 + }, + { + "auxiliary_loss_clip": 0.01105074, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.03936958, + "balance_loss_mlp": 1.01619697, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.533206409721519, + "language_loss": 0.69881916, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.7201578, + "num_input_tokens_seen": 187560425, + "step": 8729, + "time_per_iteration": 2.469456434249878 + }, + { + "auxiliary_loss_clip": 0.0108181, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.03953195, + "balance_loss_mlp": 1.02066648, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.6620584429525584, + "language_loss": 0.83426154, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85542512, + "num_input_tokens_seen": 187579930, + "step": 8730, + "time_per_iteration": 2.549105644226074 + }, + { + "auxiliary_loss_clip": 0.01084819, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.03862786, + "balance_loss_mlp": 1.02037394, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 2.1983884170119117, + "language_loss": 0.79558927, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81677669, + "num_input_tokens_seen": 187595365, + "step": 8731, + "time_per_iteration": 2.5384397506713867 + }, + { + "auxiliary_loss_clip": 0.01102948, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.04036784, + "balance_loss_mlp": 1.01925755, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 1.6578068591751687, + "language_loss": 0.83060795, + "learning_rate": 1.935165990676312e-06, + "loss": 0.85196435, + "num_input_tokens_seen": 187614715, + "step": 8732, + "time_per_iteration": 2.527589797973633 + }, + { + "auxiliary_loss_clip": 0.01107339, + "auxiliary_loss_mlp": 0.01035562, + "balance_loss_clip": 1.04199195, + "balance_loss_mlp": 1.02247286, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.5015004628753408, + "language_loss": 0.77510607, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.79653502, + "num_input_tokens_seen": 187630745, + "step": 8733, + "time_per_iteration": 2.450889825820923 + }, + { + "auxiliary_loss_clip": 0.01123253, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.04431391, + "balance_loss_mlp": 1.02714777, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 1.8389715270347957, + "language_loss": 0.81679356, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83844113, + "num_input_tokens_seen": 187648200, + "step": 8734, + "time_per_iteration": 3.8723034858703613 + }, + { + "auxiliary_loss_clip": 0.01098854, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.04920053, + "balance_loss_mlp": 1.01573849, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.3734559387124399, + "language_loss": 0.76994914, + "learning_rate": 1.933998230828826e-06, + "loss": 0.79122698, + "num_input_tokens_seen": 187669205, + "step": 8735, + "time_per_iteration": 4.187237024307251 + }, + { + "auxiliary_loss_clip": 0.01112404, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.04478455, + "balance_loss_mlp": 1.0194633, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 4.988996637844647, + "language_loss": 0.80356711, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.8250106, + "num_input_tokens_seen": 187690890, + "step": 8736, + "time_per_iteration": 2.5132641792297363 + }, + { + "auxiliary_loss_clip": 0.01124552, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.04607105, + "balance_loss_mlp": 1.02022898, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 2.2725426147747965, + "language_loss": 0.69719195, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.71878684, + "num_input_tokens_seen": 187713045, + "step": 8737, + "time_per_iteration": 2.548001766204834 + }, + { + "auxiliary_loss_clip": 0.01099363, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.04259896, + "balance_loss_mlp": 1.02250051, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.476255955702352, + "language_loss": 0.77470237, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79605627, + "num_input_tokens_seen": 187733640, + "step": 8738, + "time_per_iteration": 2.544879198074341 + }, + { + "auxiliary_loss_clip": 0.01012597, + "auxiliary_loss_mlp": 0.00754831, + "balance_loss_clip": 1.01472366, + "balance_loss_mlp": 1.00011849, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.750148925920107, + "language_loss": 0.54494834, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56262261, + "num_input_tokens_seen": 187792930, + "step": 8739, + "time_per_iteration": 3.088222026824951 + }, + { + "auxiliary_loss_clip": 0.01091432, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.03921199, + "balance_loss_mlp": 1.02157569, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 1.6265813664907836, + "language_loss": 0.84749472, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.86875629, + "num_input_tokens_seen": 187812495, + "step": 8740, + "time_per_iteration": 2.6376078128814697 + }, + { + "auxiliary_loss_clip": 0.01105638, + "auxiliary_loss_mlp": 0.00780124, + "balance_loss_clip": 1.03997779, + "balance_loss_mlp": 1.00056624, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 2.4820043544498054, + "language_loss": 0.69363409, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71249169, + "num_input_tokens_seen": 187829685, + "step": 8741, + "time_per_iteration": 2.4671425819396973 + }, + { + "auxiliary_loss_clip": 0.01101153, + "auxiliary_loss_mlp": 0.01033116, + "balance_loss_clip": 1.04211986, + "balance_loss_mlp": 1.0192939, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 1.760946411599855, + "language_loss": 0.66280138, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68414408, + "num_input_tokens_seen": 187846495, + "step": 8742, + "time_per_iteration": 2.5042569637298584 + }, + { + "auxiliary_loss_clip": 0.01086992, + "auxiliary_loss_mlp": 0.01043066, + "balance_loss_clip": 1.04038656, + "balance_loss_mlp": 1.02761626, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 3.2552784178644214, + "language_loss": 0.62985605, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.65115666, + "num_input_tokens_seen": 187862010, + "step": 8743, + "time_per_iteration": 2.566847562789917 + }, + { + "auxiliary_loss_clip": 0.01033713, + "auxiliary_loss_mlp": 0.01013704, + "balance_loss_clip": 1.01648653, + "balance_loss_mlp": 1.0121665, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7839485707252555, + "language_loss": 0.54195482, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56242901, + "num_input_tokens_seen": 187922730, + "step": 8744, + "time_per_iteration": 3.157109022140503 + }, + { + "auxiliary_loss_clip": 0.01102746, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.04412282, + "balance_loss_mlp": 1.01846385, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.160398558500759, + "language_loss": 0.75883383, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.78020084, + "num_input_tokens_seen": 187940160, + "step": 8745, + "time_per_iteration": 2.5270016193389893 + }, + { + "auxiliary_loss_clip": 0.01110821, + "auxiliary_loss_mlp": 0.01037548, + "balance_loss_clip": 1.04340649, + "balance_loss_mlp": 1.0247035, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 1.8685800277463884, + "language_loss": 0.81002814, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.83151186, + "num_input_tokens_seen": 187958625, + "step": 8746, + "time_per_iteration": 2.4803147315979004 + }, + { + "auxiliary_loss_clip": 0.01109817, + "auxiliary_loss_mlp": 0.01034001, + "balance_loss_clip": 1.04291654, + "balance_loss_mlp": 1.02060211, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 1.8434641657247266, + "language_loss": 0.75452888, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.775967, + "num_input_tokens_seen": 187977575, + "step": 8747, + "time_per_iteration": 2.4862892627716064 + }, + { + "auxiliary_loss_clip": 0.01054488, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.03562629, + "balance_loss_mlp": 1.01873505, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 1.8282100076863201, + "language_loss": 0.8313008, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.85216308, + "num_input_tokens_seen": 187996650, + "step": 8748, + "time_per_iteration": 2.5905022621154785 + }, + { + "auxiliary_loss_clip": 0.0110154, + "auxiliary_loss_mlp": 0.01036257, + "balance_loss_clip": 1.04153752, + "balance_loss_mlp": 1.02182674, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 3.1693467909815727, + "language_loss": 0.80139607, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82277405, + "num_input_tokens_seen": 188013510, + "step": 8749, + "time_per_iteration": 4.009716510772705 + }, + { + "auxiliary_loss_clip": 0.01108782, + "auxiliary_loss_mlp": 0.01035556, + "balance_loss_clip": 1.04498029, + "balance_loss_mlp": 1.02214491, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 1.8652158127797778, + "language_loss": 0.72317624, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74461967, + "num_input_tokens_seen": 188032085, + "step": 8750, + "time_per_iteration": 2.5563418865203857 + }, + { + "auxiliary_loss_clip": 0.01099718, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.04112315, + "balance_loss_mlp": 1.02086091, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 1.3484069816968642, + "language_loss": 0.76102519, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.7823621, + "num_input_tokens_seen": 188050590, + "step": 8751, + "time_per_iteration": 2.531500816345215 + }, + { + "auxiliary_loss_clip": 0.0111784, + "auxiliary_loss_mlp": 0.01035801, + "balance_loss_clip": 1.04443705, + "balance_loss_mlp": 1.02303958, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.462377829762743, + "language_loss": 0.75947058, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78100699, + "num_input_tokens_seen": 188071620, + "step": 8752, + "time_per_iteration": 2.478067636489868 + }, + { + "auxiliary_loss_clip": 0.01111123, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.04318857, + "balance_loss_mlp": 1.01933956, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.7627564084104315, + "language_loss": 0.68260789, + "learning_rate": 1.926992158720058e-06, + "loss": 0.70405614, + "num_input_tokens_seen": 188091740, + "step": 8753, + "time_per_iteration": 2.5280447006225586 + }, + { + "auxiliary_loss_clip": 0.01111101, + "auxiliary_loss_mlp": 0.01037684, + "balance_loss_clip": 1.04495525, + "balance_loss_mlp": 1.02469599, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.5209734238517236, + "language_loss": 0.8375715, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.85905933, + "num_input_tokens_seen": 188111165, + "step": 8754, + "time_per_iteration": 2.533613681793213 + }, + { + "auxiliary_loss_clip": 0.01108878, + "auxiliary_loss_mlp": 0.01034687, + "balance_loss_clip": 1.04287684, + "balance_loss_mlp": 1.02158046, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.116494497635352, + "language_loss": 0.87229216, + "learning_rate": 1.926213760058522e-06, + "loss": 0.89372784, + "num_input_tokens_seen": 188127825, + "step": 8755, + "time_per_iteration": 2.455324411392212 + }, + { + "auxiliary_loss_clip": 0.01015483, + "auxiliary_loss_mlp": 0.01010038, + "balance_loss_clip": 1.02257383, + "balance_loss_mlp": 1.00869703, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7173066856213152, + "language_loss": 0.58799672, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60825193, + "num_input_tokens_seen": 188194050, + "step": 8756, + "time_per_iteration": 3.1751468181610107 + }, + { + "auxiliary_loss_clip": 0.01092726, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.04175735, + "balance_loss_mlp": 1.02236915, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 1.5926460539646496, + "language_loss": 0.70276284, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72405088, + "num_input_tokens_seen": 188212565, + "step": 8757, + "time_per_iteration": 2.5692195892333984 + }, + { + "auxiliary_loss_clip": 0.01108644, + "auxiliary_loss_mlp": 0.0103901, + "balance_loss_clip": 1.04311168, + "balance_loss_mlp": 1.02586734, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 1.9374429157865283, + "language_loss": 0.87775332, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.89922976, + "num_input_tokens_seen": 188229505, + "step": 8758, + "time_per_iteration": 2.467398166656494 + }, + { + "auxiliary_loss_clip": 0.01060719, + "auxiliary_loss_mlp": 0.01041281, + "balance_loss_clip": 1.03769839, + "balance_loss_mlp": 1.02647483, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 1.6592931430896016, + "language_loss": 0.7594471, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78046715, + "num_input_tokens_seen": 188250395, + "step": 8759, + "time_per_iteration": 2.654021978378296 + }, + { + "auxiliary_loss_clip": 0.01097557, + "auxiliary_loss_mlp": 0.01026956, + "balance_loss_clip": 1.04258978, + "balance_loss_mlp": 1.01368213, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 1.9241116488523544, + "language_loss": 0.71690989, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.73815501, + "num_input_tokens_seen": 188266785, + "step": 8760, + "time_per_iteration": 2.544447660446167 + }, + { + "auxiliary_loss_clip": 0.01106714, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.05046761, + "balance_loss_mlp": 1.02234006, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 2.268642919123824, + "language_loss": 0.75590742, + "learning_rate": 1.923878631697736e-06, + "loss": 0.77734363, + "num_input_tokens_seen": 188282525, + "step": 8761, + "time_per_iteration": 2.5590391159057617 + }, + { + "auxiliary_loss_clip": 0.01105997, + "auxiliary_loss_mlp": 0.0077962, + "balance_loss_clip": 1.04088426, + "balance_loss_mlp": 1.0005095, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 2.1720610945784546, + "language_loss": 0.70891309, + "learning_rate": 1.923489453654373e-06, + "loss": 0.72776926, + "num_input_tokens_seen": 188301395, + "step": 8762, + "time_per_iteration": 2.508035659790039 + }, + { + "auxiliary_loss_clip": 0.01021293, + "auxiliary_loss_mlp": 0.01001491, + "balance_loss_clip": 1.01772237, + "balance_loss_mlp": 0.99997157, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.926364964118702, + "language_loss": 0.65454513, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67477292, + "num_input_tokens_seen": 188357665, + "step": 8763, + "time_per_iteration": 4.406506061553955 + }, + { + "auxiliary_loss_clip": 0.01109657, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.04316032, + "balance_loss_mlp": 1.01813126, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 1.7670057865259834, + "language_loss": 0.71349192, + "learning_rate": 1.922711106286265e-06, + "loss": 0.73490393, + "num_input_tokens_seen": 188376935, + "step": 8764, + "time_per_iteration": 2.482104539871216 + }, + { + "auxiliary_loss_clip": 0.0108023, + "auxiliary_loss_mlp": 0.01034609, + "balance_loss_clip": 1.03663707, + "balance_loss_mlp": 1.01982689, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.8412591747953653, + "language_loss": 0.74217039, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76331878, + "num_input_tokens_seen": 188394995, + "step": 8765, + "time_per_iteration": 2.5414187908172607 + }, + { + "auxiliary_loss_clip": 0.01100045, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.03948665, + "balance_loss_mlp": 1.02122116, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.6126312557330327, + "language_loss": 0.85477161, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87613273, + "num_input_tokens_seen": 188415475, + "step": 8766, + "time_per_iteration": 2.5677855014801025 + }, + { + "auxiliary_loss_clip": 0.01124883, + "auxiliary_loss_mlp": 0.01039928, + "balance_loss_clip": 1.04602647, + "balance_loss_mlp": 1.02528954, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 1.7318670078139324, + "language_loss": 0.78893185, + "learning_rate": 1.921543607252017e-06, + "loss": 0.81058002, + "num_input_tokens_seen": 188435665, + "step": 8767, + "time_per_iteration": 2.470661163330078 + }, + { + "auxiliary_loss_clip": 0.01113591, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.04385769, + "balance_loss_mlp": 1.02444935, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 2.1466968626906398, + "language_loss": 0.73428798, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75580853, + "num_input_tokens_seen": 188455405, + "step": 8768, + "time_per_iteration": 2.5490548610687256 + }, + { + "auxiliary_loss_clip": 0.01093249, + "auxiliary_loss_mlp": 0.0104684, + "balance_loss_clip": 1.03869033, + "balance_loss_mlp": 1.0333333, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 2.4236388716559234, + "language_loss": 0.74201155, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76341242, + "num_input_tokens_seen": 188472940, + "step": 8769, + "time_per_iteration": 2.5454394817352295 + }, + { + "auxiliary_loss_clip": 0.0108449, + "auxiliary_loss_mlp": 0.01036258, + "balance_loss_clip": 1.03963804, + "balance_loss_mlp": 1.02220893, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 4.6898551696280615, + "language_loss": 0.7373302, + "learning_rate": 1.920376134993436e-06, + "loss": 0.75853771, + "num_input_tokens_seen": 188493035, + "step": 8770, + "time_per_iteration": 2.6303303241729736 + }, + { + "auxiliary_loss_clip": 0.01123608, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.04617131, + "balance_loss_mlp": 1.02191007, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 1.9157570464141283, + "language_loss": 0.68275416, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.70434332, + "num_input_tokens_seen": 188513860, + "step": 8771, + "time_per_iteration": 2.5297110080718994 + }, + { + "auxiliary_loss_clip": 0.0110881, + "auxiliary_loss_mlp": 0.01037865, + "balance_loss_clip": 1.04316878, + "balance_loss_mlp": 1.0242393, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.8048395677415547, + "language_loss": 0.76587868, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78734541, + "num_input_tokens_seen": 188533345, + "step": 8772, + "time_per_iteration": 2.5026745796203613 + }, + { + "auxiliary_loss_clip": 0.01106308, + "auxiliary_loss_mlp": 0.01046825, + "balance_loss_clip": 1.04118752, + "balance_loss_mlp": 1.03237128, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 1.9862002492615705, + "language_loss": 0.658306, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.67983735, + "num_input_tokens_seen": 188551550, + "step": 8773, + "time_per_iteration": 3.932798385620117 + }, + { + "auxiliary_loss_clip": 0.01088322, + "auxiliary_loss_mlp": 0.01044716, + "balance_loss_clip": 1.04075074, + "balance_loss_mlp": 1.03202677, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 1.6025246857016608, + "language_loss": 0.86201823, + "learning_rate": 1.91881954765502e-06, + "loss": 0.8833487, + "num_input_tokens_seen": 188571615, + "step": 8774, + "time_per_iteration": 4.148171663284302 + }, + { + "auxiliary_loss_clip": 0.01085898, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.0375632, + "balance_loss_mlp": 1.01944923, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.4861869625192194, + "language_loss": 0.79839027, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.81957239, + "num_input_tokens_seen": 188591965, + "step": 8775, + "time_per_iteration": 2.539168119430542 + }, + { + "auxiliary_loss_clip": 0.01096842, + "auxiliary_loss_mlp": 0.01037734, + "balance_loss_clip": 1.03942871, + "balance_loss_mlp": 1.02348876, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 3.215687110560189, + "language_loss": 0.83759391, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85893965, + "num_input_tokens_seen": 188610675, + "step": 8776, + "time_per_iteration": 2.565904140472412 + }, + { + "auxiliary_loss_clip": 0.01093255, + "auxiliary_loss_mlp": 0.01028666, + "balance_loss_clip": 1.04412174, + "balance_loss_mlp": 1.01467121, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 2.109717314193798, + "language_loss": 0.68120289, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70242214, + "num_input_tokens_seen": 188628235, + "step": 8777, + "time_per_iteration": 2.5222697257995605 + }, + { + "auxiliary_loss_clip": 0.01097858, + "auxiliary_loss_mlp": 0.0103865, + "balance_loss_clip": 1.04244804, + "balance_loss_mlp": 1.0250479, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 1.6779035975784649, + "language_loss": 0.82387537, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84524047, + "num_input_tokens_seen": 188648925, + "step": 8778, + "time_per_iteration": 2.5618960857391357 + }, + { + "auxiliary_loss_clip": 0.0111233, + "auxiliary_loss_mlp": 0.01035782, + "balance_loss_clip": 1.04465914, + "balance_loss_mlp": 1.02184641, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 2.303896970363576, + "language_loss": 0.79935157, + "learning_rate": 1.916873882856013e-06, + "loss": 0.82083267, + "num_input_tokens_seen": 188668125, + "step": 8779, + "time_per_iteration": 2.5635759830474854 + }, + { + "auxiliary_loss_clip": 0.01104143, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.04061496, + "balance_loss_mlp": 1.01729107, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.14973490264488, + "language_loss": 0.76328683, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78462881, + "num_input_tokens_seen": 188684410, + "step": 8780, + "time_per_iteration": 2.5374257564544678 + }, + { + "auxiliary_loss_clip": 0.01091391, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.04190755, + "balance_loss_mlp": 1.01779532, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 1.5685293152220334, + "language_loss": 0.69573295, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71697235, + "num_input_tokens_seen": 188706130, + "step": 8781, + "time_per_iteration": 2.7216217517852783 + }, + { + "auxiliary_loss_clip": 0.01107861, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.04430485, + "balance_loss_mlp": 1.02253628, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.625057400468846, + "language_loss": 0.72350001, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74492598, + "num_input_tokens_seen": 188725030, + "step": 8782, + "time_per_iteration": 2.53513503074646 + }, + { + "auxiliary_loss_clip": 0.01098558, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.0483439, + "balance_loss_mlp": 1.01493859, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.8021199790415268, + "language_loss": 0.68764758, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70890772, + "num_input_tokens_seen": 188744325, + "step": 8783, + "time_per_iteration": 2.564730405807495 + }, + { + "auxiliary_loss_clip": 0.01118397, + "auxiliary_loss_mlp": 0.01042341, + "balance_loss_clip": 1.04458165, + "balance_loss_mlp": 1.02606845, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 4.413656613464165, + "language_loss": 0.69430006, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71590745, + "num_input_tokens_seen": 188765100, + "step": 8784, + "time_per_iteration": 2.5965356826782227 + }, + { + "auxiliary_loss_clip": 0.0112413, + "auxiliary_loss_mlp": 0.01032869, + "balance_loss_clip": 1.0426569, + "balance_loss_mlp": 1.01805162, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 2.732013666537628, + "language_loss": 0.75253344, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77410346, + "num_input_tokens_seen": 188783995, + "step": 8785, + "time_per_iteration": 2.533097982406616 + }, + { + "auxiliary_loss_clip": 0.01110013, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.04371119, + "balance_loss_mlp": 1.01960504, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 2.0319899353328332, + "language_loss": 0.83509195, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85653734, + "num_input_tokens_seen": 188803120, + "step": 8786, + "time_per_iteration": 2.547653913497925 + }, + { + "auxiliary_loss_clip": 0.01082852, + "auxiliary_loss_mlp": 0.01025083, + "balance_loss_clip": 1.04069066, + "balance_loss_mlp": 1.01322746, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 2.3552818690282207, + "language_loss": 0.82824939, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.84932876, + "num_input_tokens_seen": 188820960, + "step": 8787, + "time_per_iteration": 2.63855242729187 + }, + { + "auxiliary_loss_clip": 0.01061526, + "auxiliary_loss_mlp": 0.0102896, + "balance_loss_clip": 1.03501439, + "balance_loss_mlp": 1.01663351, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.6984674935670507, + "language_loss": 0.83484846, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.8557533, + "num_input_tokens_seen": 188837165, + "step": 8788, + "time_per_iteration": 4.0000083446502686 + }, + { + "auxiliary_loss_clip": 0.01083692, + "auxiliary_loss_mlp": 0.01042639, + "balance_loss_clip": 1.045434, + "balance_loss_mlp": 1.0283581, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 1.5954488302378247, + "language_loss": 0.75257218, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.77383548, + "num_input_tokens_seen": 188858555, + "step": 8789, + "time_per_iteration": 2.7020866870880127 + }, + { + "auxiliary_loss_clip": 0.01112208, + "auxiliary_loss_mlp": 0.01032664, + "balance_loss_clip": 1.04569626, + "balance_loss_mlp": 1.01958084, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.545196193434357, + "language_loss": 0.70165825, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.72310698, + "num_input_tokens_seen": 188879050, + "step": 8790, + "time_per_iteration": 2.596158027648926 + }, + { + "auxiliary_loss_clip": 0.0111702, + "auxiliary_loss_mlp": 0.01024057, + "balance_loss_clip": 1.04358268, + "balance_loss_mlp": 1.01219571, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.5582770360836298, + "language_loss": 0.79143107, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81284177, + "num_input_tokens_seen": 188898885, + "step": 8791, + "time_per_iteration": 2.4923346042633057 + }, + { + "auxiliary_loss_clip": 0.01072821, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.04757547, + "balance_loss_mlp": 1.0155102, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 1.668738752690368, + "language_loss": 0.66158152, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68259954, + "num_input_tokens_seen": 188917225, + "step": 8792, + "time_per_iteration": 2.595608949661255 + }, + { + "auxiliary_loss_clip": 0.01091226, + "auxiliary_loss_mlp": 0.01038559, + "balance_loss_clip": 1.03941679, + "balance_loss_mlp": 1.0242058, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 1.8611671370448588, + "language_loss": 0.79650676, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.81780457, + "num_input_tokens_seen": 188936120, + "step": 8793, + "time_per_iteration": 2.5723893642425537 + }, + { + "auxiliary_loss_clip": 0.01119459, + "auxiliary_loss_mlp": 0.01043405, + "balance_loss_clip": 1.04350424, + "balance_loss_mlp": 1.02988696, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 2.0442831636301486, + "language_loss": 0.84919631, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.87082493, + "num_input_tokens_seen": 188953405, + "step": 8794, + "time_per_iteration": 2.4467921257019043 + }, + { + "auxiliary_loss_clip": 0.01094209, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.03751612, + "balance_loss_mlp": 1.01916564, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 1.9958192936976993, + "language_loss": 0.68078738, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.70206821, + "num_input_tokens_seen": 188971150, + "step": 8795, + "time_per_iteration": 2.5195071697235107 + }, + { + "auxiliary_loss_clip": 0.01102029, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.04958153, + "balance_loss_mlp": 1.01734924, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 1.9812393941659068, + "language_loss": 0.81050527, + "learning_rate": 1.910259223028374e-06, + "loss": 0.83182967, + "num_input_tokens_seen": 188989550, + "step": 8796, + "time_per_iteration": 2.5529351234436035 + }, + { + "auxiliary_loss_clip": 0.01080903, + "auxiliary_loss_mlp": 0.01050438, + "balance_loss_clip": 1.03883481, + "balance_loss_mlp": 1.03458333, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 2.1551938073283354, + "language_loss": 0.69384211, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71515548, + "num_input_tokens_seen": 189008795, + "step": 8797, + "time_per_iteration": 2.6101701259613037 + }, + { + "auxiliary_loss_clip": 0.01098491, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.04218221, + "balance_loss_mlp": 1.02536213, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.4700613660231434, + "language_loss": 0.82433915, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84570241, + "num_input_tokens_seen": 189025540, + "step": 8798, + "time_per_iteration": 2.497950553894043 + }, + { + "auxiliary_loss_clip": 0.01096689, + "auxiliary_loss_mlp": 0.00783717, + "balance_loss_clip": 1.04077411, + "balance_loss_mlp": 1.00065649, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 2.226620801801311, + "language_loss": 0.71218228, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.73098636, + "num_input_tokens_seen": 189044885, + "step": 8799, + "time_per_iteration": 2.556033134460449 + }, + { + "auxiliary_loss_clip": 0.01106809, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.04546118, + "balance_loss_mlp": 1.02224183, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 2.5031730148362636, + "language_loss": 0.69454443, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71595591, + "num_input_tokens_seen": 189061280, + "step": 8800, + "time_per_iteration": 2.4847490787506104 + }, + { + "auxiliary_loss_clip": 0.01017891, + "auxiliary_loss_mlp": 0.01010623, + "balance_loss_clip": 1.01945376, + "balance_loss_mlp": 1.00943685, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 0.9954708635468711, + "language_loss": 0.5697062, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.58999133, + "num_input_tokens_seen": 189114775, + "step": 8801, + "time_per_iteration": 3.0438344478607178 + }, + { + "auxiliary_loss_clip": 0.01106657, + "auxiliary_loss_mlp": 0.01035191, + "balance_loss_clip": 1.047261, + "balance_loss_mlp": 1.02235222, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.6536858759330444, + "language_loss": 0.63953012, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.66094863, + "num_input_tokens_seen": 189134700, + "step": 8802, + "time_per_iteration": 2.6326141357421875 + }, + { + "auxiliary_loss_clip": 0.01099456, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.04401183, + "balance_loss_mlp": 1.016559, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.736156140719979, + "language_loss": 0.69067323, + "learning_rate": 1.907535821289003e-06, + "loss": 0.71196735, + "num_input_tokens_seen": 189155365, + "step": 8803, + "time_per_iteration": 4.131426811218262 + }, + { + "auxiliary_loss_clip": 0.01107151, + "auxiliary_loss_mlp": 0.0077841, + "balance_loss_clip": 1.04367852, + "balance_loss_mlp": 1.00056553, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.84990467374266, + "language_loss": 0.76398432, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78283989, + "num_input_tokens_seen": 189173885, + "step": 8804, + "time_per_iteration": 2.5278513431549072 + }, + { + "auxiliary_loss_clip": 0.01035963, + "auxiliary_loss_mlp": 0.0099921, + "balance_loss_clip": 1.01952314, + "balance_loss_mlp": 0.99794596, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.7519169739320015, + "language_loss": 0.52974546, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55009723, + "num_input_tokens_seen": 189236515, + "step": 8805, + "time_per_iteration": 3.1628918647766113 + }, + { + "auxiliary_loss_clip": 0.01036632, + "auxiliary_loss_mlp": 0.01005808, + "balance_loss_clip": 1.01883519, + "balance_loss_mlp": 1.00435376, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7432322969438773, + "language_loss": 0.63811743, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65854186, + "num_input_tokens_seen": 189300500, + "step": 8806, + "time_per_iteration": 3.0611062049865723 + }, + { + "auxiliary_loss_clip": 0.01113786, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.04451835, + "balance_loss_mlp": 1.02177143, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.5141199399028737, + "language_loss": 0.7226426, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74412829, + "num_input_tokens_seen": 189319745, + "step": 8807, + "time_per_iteration": 2.5101540088653564 + }, + { + "auxiliary_loss_clip": 0.0108082, + "auxiliary_loss_mlp": 0.01030925, + "balance_loss_clip": 1.04310048, + "balance_loss_mlp": 1.01898003, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 2.0953804520673978, + "language_loss": 0.69161093, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71272838, + "num_input_tokens_seen": 189334550, + "step": 8808, + "time_per_iteration": 2.559931755065918 + }, + { + "auxiliary_loss_clip": 0.01108185, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.04380834, + "balance_loss_mlp": 1.02117586, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 2.2883869970381556, + "language_loss": 0.86916053, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.89057767, + "num_input_tokens_seen": 189351735, + "step": 8809, + "time_per_iteration": 2.511209726333618 + }, + { + "auxiliary_loss_clip": 0.01114513, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.04509521, + "balance_loss_mlp": 1.02272332, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.6323301933821528, + "language_loss": 0.64155221, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.6630711, + "num_input_tokens_seen": 189373105, + "step": 8810, + "time_per_iteration": 2.6590936183929443 + }, + { + "auxiliary_loss_clip": 0.01118033, + "auxiliary_loss_mlp": 0.01038434, + "balance_loss_clip": 1.04419112, + "balance_loss_mlp": 1.02500474, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.624088963034106, + "language_loss": 0.67767107, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.6992358, + "num_input_tokens_seen": 189394615, + "step": 8811, + "time_per_iteration": 2.5279297828674316 + }, + { + "auxiliary_loss_clip": 0.01010441, + "auxiliary_loss_mlp": 0.01005689, + "balance_loss_clip": 1.02023983, + "balance_loss_mlp": 1.00427055, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.6629120227716125, + "language_loss": 0.53363872, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55379999, + "num_input_tokens_seen": 189459750, + "step": 8812, + "time_per_iteration": 3.294158458709717 + }, + { + "auxiliary_loss_clip": 0.01022749, + "auxiliary_loss_mlp": 0.01003595, + "balance_loss_clip": 1.01718378, + "balance_loss_mlp": 1.00248682, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7243345860192568, + "language_loss": 0.56290287, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.5831663, + "num_input_tokens_seen": 189527540, + "step": 8813, + "time_per_iteration": 5.50199556350708 + }, + { + "auxiliary_loss_clip": 0.01066348, + "auxiliary_loss_mlp": 0.01034125, + "balance_loss_clip": 1.04183114, + "balance_loss_mlp": 1.0216136, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 1.5383233063045973, + "language_loss": 0.81934333, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.84034801, + "num_input_tokens_seen": 189546900, + "step": 8814, + "time_per_iteration": 4.198639869689941 + }, + { + "auxiliary_loss_clip": 0.01124993, + "auxiliary_loss_mlp": 0.01026523, + "balance_loss_clip": 1.046628, + "balance_loss_mlp": 1.01366639, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.5903966100874571, + "language_loss": 0.85010403, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.87161922, + "num_input_tokens_seen": 189566490, + "step": 8815, + "time_per_iteration": 2.505209445953369 + }, + { + "auxiliary_loss_clip": 0.01117714, + "auxiliary_loss_mlp": 0.01034856, + "balance_loss_clip": 1.04578018, + "balance_loss_mlp": 1.02267861, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.247074940237736, + "language_loss": 0.66869795, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.69022369, + "num_input_tokens_seen": 189585580, + "step": 8816, + "time_per_iteration": 2.4965760707855225 + }, + { + "auxiliary_loss_clip": 0.01098304, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.04298162, + "balance_loss_mlp": 1.0218935, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.6311603999716193, + "language_loss": 0.718458, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.73979259, + "num_input_tokens_seen": 189608485, + "step": 8817, + "time_per_iteration": 2.7534937858581543 + }, + { + "auxiliary_loss_clip": 0.01091205, + "auxiliary_loss_mlp": 0.01037419, + "balance_loss_clip": 1.04028153, + "balance_loss_mlp": 1.02245247, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.659236554191143, + "language_loss": 0.65106249, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67234874, + "num_input_tokens_seen": 189627815, + "step": 8818, + "time_per_iteration": 2.5727484226226807 + }, + { + "auxiliary_loss_clip": 0.01082887, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.04915941, + "balance_loss_mlp": 1.01621151, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 1.932180513569628, + "language_loss": 0.74916697, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77030146, + "num_input_tokens_seen": 189644850, + "step": 8819, + "time_per_iteration": 2.6119446754455566 + }, + { + "auxiliary_loss_clip": 0.01087371, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.04288363, + "balance_loss_mlp": 1.02456701, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 1.891991957518795, + "language_loss": 0.81951505, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.8407824, + "num_input_tokens_seen": 189660945, + "step": 8820, + "time_per_iteration": 2.564574956893921 + }, + { + "auxiliary_loss_clip": 0.01102195, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.04532003, + "balance_loss_mlp": 1.02074599, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 1.712531239417171, + "language_loss": 0.72570288, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.74705422, + "num_input_tokens_seen": 189680425, + "step": 8821, + "time_per_iteration": 2.6195101737976074 + }, + { + "auxiliary_loss_clip": 0.01090821, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.04235935, + "balance_loss_mlp": 1.01738203, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.482074374909102, + "language_loss": 0.74026757, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76147074, + "num_input_tokens_seen": 189700375, + "step": 8822, + "time_per_iteration": 2.642094135284424 + }, + { + "auxiliary_loss_clip": 0.01090347, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.04312265, + "balance_loss_mlp": 1.02260089, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 2.883984910083281, + "language_loss": 0.67419755, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69546288, + "num_input_tokens_seen": 189721225, + "step": 8823, + "time_per_iteration": 2.6546027660369873 + }, + { + "auxiliary_loss_clip": 0.01123437, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.04466903, + "balance_loss_mlp": 1.02278352, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.73859553357246, + "language_loss": 0.69619238, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71779758, + "num_input_tokens_seen": 189740170, + "step": 8824, + "time_per_iteration": 2.5761466026306152 + }, + { + "auxiliary_loss_clip": 0.01095783, + "auxiliary_loss_mlp": 0.0077828, + "balance_loss_clip": 1.04376626, + "balance_loss_mlp": 1.00058079, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 2.2685826000238642, + "language_loss": 0.76003861, + "learning_rate": 1.898977700702689e-06, + "loss": 0.77877927, + "num_input_tokens_seen": 189757890, + "step": 8825, + "time_per_iteration": 2.6136929988861084 + }, + { + "auxiliary_loss_clip": 0.01042563, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_clip": 1.03614628, + "balance_loss_mlp": 1.02881205, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 2.4335490531691435, + "language_loss": 0.85541666, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87626755, + "num_input_tokens_seen": 189775390, + "step": 8826, + "time_per_iteration": 2.8486602306365967 + }, + { + "auxiliary_loss_clip": 0.01115124, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.04172575, + "balance_loss_mlp": 1.01892889, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.620357230498687, + "language_loss": 0.64507955, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66655284, + "num_input_tokens_seen": 189793975, + "step": 8827, + "time_per_iteration": 4.097350835800171 + }, + { + "auxiliary_loss_clip": 0.01098982, + "auxiliary_loss_mlp": 0.01039904, + "balance_loss_clip": 1.04218984, + "balance_loss_mlp": 1.02618265, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.8457500395595878, + "language_loss": 0.59827352, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.6196624, + "num_input_tokens_seen": 189817870, + "step": 8828, + "time_per_iteration": 2.7886931896209717 + }, + { + "auxiliary_loss_clip": 0.01110882, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.04209542, + "balance_loss_mlp": 1.01872826, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.7758474496600725, + "language_loss": 0.81497395, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83640814, + "num_input_tokens_seen": 189837905, + "step": 8829, + "time_per_iteration": 2.5567808151245117 + }, + { + "auxiliary_loss_clip": 0.01101637, + "auxiliary_loss_mlp": 0.01031, + "balance_loss_clip": 1.04354668, + "balance_loss_mlp": 1.01790452, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.9122316806848918, + "language_loss": 0.78078568, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80211204, + "num_input_tokens_seen": 189856970, + "step": 8830, + "time_per_iteration": 2.5671887397766113 + }, + { + "auxiliary_loss_clip": 0.01106585, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.04205728, + "balance_loss_mlp": 1.01593351, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 2.065247416058492, + "language_loss": 0.80752569, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.82887781, + "num_input_tokens_seen": 189872830, + "step": 8831, + "time_per_iteration": 2.5188474655151367 + }, + { + "auxiliary_loss_clip": 0.01106309, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.0423876, + "balance_loss_mlp": 1.01766372, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 6.578801079080272, + "language_loss": 0.73230582, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75367993, + "num_input_tokens_seen": 189891635, + "step": 8832, + "time_per_iteration": 2.54011869430542 + }, + { + "auxiliary_loss_clip": 0.0108869, + "auxiliary_loss_mlp": 0.01035983, + "balance_loss_clip": 1.04167199, + "balance_loss_mlp": 1.02168953, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 3.242603987860067, + "language_loss": 0.75802672, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77927345, + "num_input_tokens_seen": 189909050, + "step": 8833, + "time_per_iteration": 2.6514675617218018 + }, + { + "auxiliary_loss_clip": 0.01088149, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.04012525, + "balance_loss_mlp": 1.0175786, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.7084099825179455, + "language_loss": 0.7374388, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.7586273, + "num_input_tokens_seen": 189927405, + "step": 8834, + "time_per_iteration": 2.733463764190674 + }, + { + "auxiliary_loss_clip": 0.01125274, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.04481566, + "balance_loss_mlp": 1.02378607, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 1.8901064337105644, + "language_loss": 0.77974164, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.80138129, + "num_input_tokens_seen": 189947740, + "step": 8835, + "time_per_iteration": 2.5768628120422363 + }, + { + "auxiliary_loss_clip": 0.0110048, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.04824817, + "balance_loss_mlp": 1.02218699, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.8843923340767657, + "language_loss": 0.72311759, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74449384, + "num_input_tokens_seen": 189966495, + "step": 8836, + "time_per_iteration": 2.6047322750091553 + }, + { + "auxiliary_loss_clip": 0.01103216, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.0457356, + "balance_loss_mlp": 1.02100444, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 1.7972956068248211, + "language_loss": 0.8063339, + "learning_rate": 1.894310406375987e-06, + "loss": 0.82771826, + "num_input_tokens_seen": 189985325, + "step": 8837, + "time_per_iteration": 2.608417272567749 + }, + { + "auxiliary_loss_clip": 0.01111242, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.05050516, + "balance_loss_mlp": 1.0172379, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 1.7401286230675062, + "language_loss": 0.85643637, + "learning_rate": 1.893921490881035e-06, + "loss": 0.87786222, + "num_input_tokens_seen": 190003290, + "step": 8838, + "time_per_iteration": 2.54614520072937 + }, + { + "auxiliary_loss_clip": 0.01094288, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.04105616, + "balance_loss_mlp": 1.01862812, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.7181615418172198, + "language_loss": 0.72887903, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.75013399, + "num_input_tokens_seen": 190023260, + "step": 8839, + "time_per_iteration": 2.6168534755706787 + }, + { + "auxiliary_loss_clip": 0.01100775, + "auxiliary_loss_mlp": 0.01035354, + "balance_loss_clip": 1.04327726, + "balance_loss_mlp": 1.02255714, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 1.9756783605127084, + "language_loss": 0.76571071, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.787072, + "num_input_tokens_seen": 190042035, + "step": 8840, + "time_per_iteration": 2.586075782775879 + }, + { + "auxiliary_loss_clip": 0.01081521, + "auxiliary_loss_mlp": 0.01035187, + "balance_loss_clip": 1.03992593, + "balance_loss_mlp": 1.02149558, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 2.071753049808909, + "language_loss": 0.77458274, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79574984, + "num_input_tokens_seen": 190057545, + "step": 8841, + "time_per_iteration": 4.106637477874756 + }, + { + "auxiliary_loss_clip": 0.01028331, + "auxiliary_loss_mlp": 0.01007449, + "balance_loss_clip": 1.02215505, + "balance_loss_mlp": 1.00614929, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.696231614506344, + "language_loss": 0.56780267, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.58816051, + "num_input_tokens_seen": 190123800, + "step": 8842, + "time_per_iteration": 3.2892544269561768 + }, + { + "auxiliary_loss_clip": 0.01098027, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_clip": 1.0422349, + "balance_loss_mlp": 1.0237149, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.7811757339708578, + "language_loss": 0.73535526, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.75671864, + "num_input_tokens_seen": 190141625, + "step": 8843, + "time_per_iteration": 2.561818838119507 + }, + { + "auxiliary_loss_clip": 0.01024972, + "auxiliary_loss_mlp": 0.01002243, + "balance_loss_clip": 1.01970029, + "balance_loss_mlp": 1.00103283, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.8751588967461502, + "language_loss": 0.60998416, + "learning_rate": 1.891588082900145e-06, + "loss": 0.6302563, + "num_input_tokens_seen": 190198110, + "step": 8844, + "time_per_iteration": 3.2123515605926514 + }, + { + "auxiliary_loss_clip": 0.0103376, + "auxiliary_loss_mlp": 0.01004054, + "balance_loss_clip": 1.01667762, + "balance_loss_mlp": 1.00274301, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.8461418430540547, + "language_loss": 0.6228981, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64327621, + "num_input_tokens_seen": 190259950, + "step": 8845, + "time_per_iteration": 3.105783700942993 + }, + { + "auxiliary_loss_clip": 0.01090137, + "auxiliary_loss_mlp": 0.01039314, + "balance_loss_clip": 1.0433923, + "balance_loss_mlp": 1.02486038, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 1.9178860268733633, + "language_loss": 0.75543594, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77673048, + "num_input_tokens_seen": 190278265, + "step": 8846, + "time_per_iteration": 2.6490070819854736 + }, + { + "auxiliary_loss_clip": 0.011104, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.04488111, + "balance_loss_mlp": 1.02116692, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 1.5519278161298689, + "language_loss": 0.75501299, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.7764467, + "num_input_tokens_seen": 190298400, + "step": 8847, + "time_per_iteration": 2.614004135131836 + }, + { + "auxiliary_loss_clip": 0.01099193, + "auxiliary_loss_mlp": 0.01031231, + "balance_loss_clip": 1.0430274, + "balance_loss_mlp": 1.01866043, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.7814498196686095, + "language_loss": 0.87626141, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.89756566, + "num_input_tokens_seen": 190316235, + "step": 8848, + "time_per_iteration": 2.6089060306549072 + }, + { + "auxiliary_loss_clip": 0.01083488, + "auxiliary_loss_mlp": 0.01039263, + "balance_loss_clip": 1.04062426, + "balance_loss_mlp": 1.02383709, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 1.8299846499644414, + "language_loss": 0.74387908, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76510656, + "num_input_tokens_seen": 190335060, + "step": 8849, + "time_per_iteration": 2.6312761306762695 + }, + { + "auxiliary_loss_clip": 0.01107382, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.03975582, + "balance_loss_mlp": 1.01495278, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 1.698966318843605, + "language_loss": 0.79873347, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.82009721, + "num_input_tokens_seen": 190353265, + "step": 8850, + "time_per_iteration": 2.5916571617126465 + }, + { + "auxiliary_loss_clip": 0.01120267, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.04329443, + "balance_loss_mlp": 1.01844597, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.4397772765239727, + "language_loss": 0.55131584, + "learning_rate": 1.888865960862821e-06, + "loss": 0.5728327, + "num_input_tokens_seen": 190376575, + "step": 8851, + "time_per_iteration": 2.6455564498901367 + }, + { + "auxiliary_loss_clip": 0.01108937, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.04173064, + "balance_loss_mlp": 1.0221535, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 2.116343313273888, + "language_loss": 0.68570566, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70715106, + "num_input_tokens_seen": 190395185, + "step": 8852, + "time_per_iteration": 2.582005262374878 + }, + { + "auxiliary_loss_clip": 0.0102024, + "auxiliary_loss_mlp": 0.00754209, + "balance_loss_clip": 1.01336932, + "balance_loss_mlp": 1.00011575, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8002595542018148, + "language_loss": 0.62857747, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64632189, + "num_input_tokens_seen": 190452595, + "step": 8853, + "time_per_iteration": 6.856045722961426 + }, + { + "auxiliary_loss_clip": 0.01110889, + "auxiliary_loss_mlp": 0.01031952, + "balance_loss_clip": 1.04103732, + "balance_loss_mlp": 1.01833224, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.565978327449507, + "language_loss": 0.79567933, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81710768, + "num_input_tokens_seen": 190469140, + "step": 8854, + "time_per_iteration": 2.5598790645599365 + }, + { + "auxiliary_loss_clip": 0.0109248, + "auxiliary_loss_mlp": 0.01026205, + "balance_loss_clip": 1.04429483, + "balance_loss_mlp": 1.01429605, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 2.256646694360929, + "language_loss": 0.73402941, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75521624, + "num_input_tokens_seen": 190489015, + "step": 8855, + "time_per_iteration": 2.6404216289520264 + }, + { + "auxiliary_loss_clip": 0.01094636, + "auxiliary_loss_mlp": 0.00778929, + "balance_loss_clip": 1.03993404, + "balance_loss_mlp": 1.00066876, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 2.206004184854093, + "language_loss": 0.6476292, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66636485, + "num_input_tokens_seen": 190508065, + "step": 8856, + "time_per_iteration": 2.667067050933838 + }, + { + "auxiliary_loss_clip": 0.01098822, + "auxiliary_loss_mlp": 0.01038803, + "balance_loss_clip": 1.04232252, + "balance_loss_mlp": 1.02409303, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 1.9158385882138034, + "language_loss": 0.77914441, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.8005206, + "num_input_tokens_seen": 190527045, + "step": 8857, + "time_per_iteration": 2.6184258460998535 + }, + { + "auxiliary_loss_clip": 0.01096903, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.04728758, + "balance_loss_mlp": 1.02036452, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 1.7964792760551556, + "language_loss": 0.7143172, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73562711, + "num_input_tokens_seen": 190544075, + "step": 8858, + "time_per_iteration": 2.6903269290924072 + }, + { + "auxiliary_loss_clip": 0.01108165, + "auxiliary_loss_mlp": 0.01037004, + "balance_loss_clip": 1.0419364, + "balance_loss_mlp": 1.02238917, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.7620943101398925, + "language_loss": 0.69395226, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71540397, + "num_input_tokens_seen": 190566030, + "step": 8859, + "time_per_iteration": 2.6019999980926514 + }, + { + "auxiliary_loss_clip": 0.01106995, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.04496741, + "balance_loss_mlp": 1.0156163, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.6293123975833903, + "language_loss": 0.69585645, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71719825, + "num_input_tokens_seen": 190585605, + "step": 8860, + "time_per_iteration": 2.8079185485839844 + }, + { + "auxiliary_loss_clip": 0.01101991, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.04638076, + "balance_loss_mlp": 1.01874089, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 1.7421321120911284, + "language_loss": 0.77732086, + "learning_rate": 1.884977574556683e-06, + "loss": 0.79865479, + "num_input_tokens_seen": 190604625, + "step": 8861, + "time_per_iteration": 2.7269535064697266 + }, + { + "auxiliary_loss_clip": 0.01078749, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.04166603, + "balance_loss_mlp": 1.02718985, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 1.7119730133018496, + "language_loss": 0.86162406, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.88282287, + "num_input_tokens_seen": 190625060, + "step": 8862, + "time_per_iteration": 2.7324442863464355 + }, + { + "auxiliary_loss_clip": 0.0109747, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.03965294, + "balance_loss_mlp": 1.02633131, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 1.82108178455198, + "language_loss": 0.61576092, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.63715363, + "num_input_tokens_seen": 190643150, + "step": 8863, + "time_per_iteration": 2.605343818664551 + }, + { + "auxiliary_loss_clip": 0.01099245, + "auxiliary_loss_mlp": 0.01042613, + "balance_loss_clip": 1.04650664, + "balance_loss_mlp": 1.02877271, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 1.8149956133954959, + "language_loss": 0.73440766, + "learning_rate": 1.883811143046377e-06, + "loss": 0.75582623, + "num_input_tokens_seen": 190662725, + "step": 8864, + "time_per_iteration": 2.657792806625366 + }, + { + "auxiliary_loss_clip": 0.01120388, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.04409671, + "balance_loss_mlp": 1.02322102, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.6934226745418877, + "language_loss": 0.63999563, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66156065, + "num_input_tokens_seen": 190683680, + "step": 8865, + "time_per_iteration": 2.6522915363311768 + }, + { + "auxiliary_loss_clip": 0.01112889, + "auxiliary_loss_mlp": 0.01030953, + "balance_loss_clip": 1.04450381, + "balance_loss_mlp": 1.01733899, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 1.884638281945193, + "language_loss": 0.78528756, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80672598, + "num_input_tokens_seen": 190703350, + "step": 8866, + "time_per_iteration": 4.029601335525513 + }, + { + "auxiliary_loss_clip": 0.01106439, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.04138005, + "balance_loss_mlp": 1.01820469, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 1.8886923491537273, + "language_loss": 0.73613364, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75750816, + "num_input_tokens_seen": 190721170, + "step": 8867, + "time_per_iteration": 2.535303831100464 + }, + { + "auxiliary_loss_clip": 0.01099441, + "auxiliary_loss_mlp": 0.01038896, + "balance_loss_clip": 1.04111481, + "balance_loss_mlp": 1.02405405, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.5429295452224137, + "language_loss": 0.72104633, + "learning_rate": 1.88225596278394e-06, + "loss": 0.74242967, + "num_input_tokens_seen": 190743795, + "step": 8868, + "time_per_iteration": 2.736833095550537 + }, + { + "auxiliary_loss_clip": 0.0109133, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.04173207, + "balance_loss_mlp": 1.01958573, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 1.6435635601469192, + "language_loss": 0.78483868, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80608523, + "num_input_tokens_seen": 190761560, + "step": 8869, + "time_per_iteration": 2.652571201324463 + }, + { + "auxiliary_loss_clip": 0.01113764, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.04372311, + "balance_loss_mlp": 1.01939785, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 2.0934773422805466, + "language_loss": 0.75563526, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.77710307, + "num_input_tokens_seen": 190778875, + "step": 8870, + "time_per_iteration": 2.539578437805176 + }, + { + "auxiliary_loss_clip": 0.01102506, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.04461217, + "balance_loss_mlp": 1.02231741, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 2.14127781131994, + "language_loss": 0.75126815, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77265847, + "num_input_tokens_seen": 190799830, + "step": 8871, + "time_per_iteration": 2.5969338417053223 + }, + { + "auxiliary_loss_clip": 0.01099359, + "auxiliary_loss_mlp": 0.01035217, + "balance_loss_clip": 1.04339361, + "balance_loss_mlp": 1.02152562, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 1.9094392738261063, + "language_loss": 0.7234267, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74477243, + "num_input_tokens_seen": 190817155, + "step": 8872, + "time_per_iteration": 2.5909271240234375 + }, + { + "auxiliary_loss_clip": 0.01098208, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.04674935, + "balance_loss_mlp": 1.02924824, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.5930438580189585, + "language_loss": 0.64889181, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67030376, + "num_input_tokens_seen": 190835240, + "step": 8873, + "time_per_iteration": 2.5464401245117188 + }, + { + "auxiliary_loss_clip": 0.01096149, + "auxiliary_loss_mlp": 0.01039036, + "balance_loss_clip": 1.04214454, + "balance_loss_mlp": 1.0260123, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 2.1550041638190853, + "language_loss": 0.80409163, + "learning_rate": 1.879923326631099e-06, + "loss": 0.82544351, + "num_input_tokens_seen": 190851620, + "step": 8874, + "time_per_iteration": 2.5620789527893066 + }, + { + "auxiliary_loss_clip": 0.01114805, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.04718971, + "balance_loss_mlp": 1.01812351, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 1.7459306549088602, + "language_loss": 0.69721687, + "learning_rate": 1.879534569789582e-06, + "loss": 0.71867895, + "num_input_tokens_seen": 190870545, + "step": 8875, + "time_per_iteration": 2.561681032180786 + }, + { + "auxiliary_loss_clip": 0.01038888, + "auxiliary_loss_mlp": 0.01006573, + "balance_loss_clip": 1.01355267, + "balance_loss_mlp": 1.00526762, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7269995892287896, + "language_loss": 0.59669864, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61715323, + "num_input_tokens_seen": 190931995, + "step": 8876, + "time_per_iteration": 3.2501752376556396 + }, + { + "auxiliary_loss_clip": 0.01110662, + "auxiliary_loss_mlp": 0.01033951, + "balance_loss_clip": 1.04522812, + "balance_loss_mlp": 1.02084446, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.8885395154467572, + "language_loss": 0.75255746, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.77400362, + "num_input_tokens_seen": 190949890, + "step": 8877, + "time_per_iteration": 2.714653491973877 + }, + { + "auxiliary_loss_clip": 0.01027326, + "auxiliary_loss_mlp": 0.01002135, + "balance_loss_clip": 1.01510525, + "balance_loss_mlp": 1.00096655, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7576456912485925, + "language_loss": 0.57162619, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59192079, + "num_input_tokens_seen": 191008480, + "step": 8878, + "time_per_iteration": 3.0005745887756348 + }, + { + "auxiliary_loss_clip": 0.01125864, + "auxiliary_loss_mlp": 0.01037783, + "balance_loss_clip": 1.04521477, + "balance_loss_mlp": 1.02326298, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 1.433847447726577, + "language_loss": 0.72289002, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74452651, + "num_input_tokens_seen": 191028995, + "step": 8879, + "time_per_iteration": 2.5497756004333496 + }, + { + "auxiliary_loss_clip": 0.01124211, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.04591656, + "balance_loss_mlp": 1.02018583, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.0685264795647895, + "language_loss": 0.83380032, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85538709, + "num_input_tokens_seen": 191045285, + "step": 8880, + "time_per_iteration": 2.5279555320739746 + }, + { + "auxiliary_loss_clip": 0.01053052, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.03794479, + "balance_loss_mlp": 1.02094388, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.3339876549267933, + "language_loss": 0.79340398, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81427121, + "num_input_tokens_seen": 191066105, + "step": 8881, + "time_per_iteration": 4.129899024963379 + }, + { + "auxiliary_loss_clip": 0.01020536, + "auxiliary_loss_mlp": 0.01002575, + "balance_loss_clip": 1.01894307, + "balance_loss_mlp": 1.00149, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7946370548105921, + "language_loss": 0.59289056, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61312163, + "num_input_tokens_seen": 191126315, + "step": 8882, + "time_per_iteration": 3.0727176666259766 + }, + { + "auxiliary_loss_clip": 0.01026283, + "auxiliary_loss_mlp": 0.01013564, + "balance_loss_clip": 1.01851773, + "balance_loss_mlp": 1.01242602, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8825530925237335, + "language_loss": 0.63745266, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65785116, + "num_input_tokens_seen": 191174240, + "step": 8883, + "time_per_iteration": 2.9178531169891357 + }, + { + "auxiliary_loss_clip": 0.01082912, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.04432726, + "balance_loss_mlp": 1.02118337, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 3.0005990079517093, + "language_loss": 0.82370043, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.84488112, + "num_input_tokens_seen": 191193335, + "step": 8884, + "time_per_iteration": 2.7602145671844482 + }, + { + "auxiliary_loss_clip": 0.01088357, + "auxiliary_loss_mlp": 0.01037915, + "balance_loss_clip": 1.04039598, + "balance_loss_mlp": 1.02438438, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 2.3573072188484074, + "language_loss": 0.7228837, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74414647, + "num_input_tokens_seen": 191210900, + "step": 8885, + "time_per_iteration": 2.611943006515503 + }, + { + "auxiliary_loss_clip": 0.01101564, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.04130626, + "balance_loss_mlp": 1.0208137, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 2.016377988090557, + "language_loss": 0.78892541, + "learning_rate": 1.87525854926798e-06, + "loss": 0.81029075, + "num_input_tokens_seen": 191226730, + "step": 8886, + "time_per_iteration": 2.6058056354522705 + }, + { + "auxiliary_loss_clip": 0.01090102, + "auxiliary_loss_mlp": 0.00780236, + "balance_loss_clip": 1.04852581, + "balance_loss_mlp": 1.00061131, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.4686077832796351, + "language_loss": 0.75037277, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76907617, + "num_input_tokens_seen": 191250435, + "step": 8887, + "time_per_iteration": 2.7644307613372803 + }, + { + "auxiliary_loss_clip": 0.01096522, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.04186559, + "balance_loss_mlp": 1.02007937, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 2.337227230894072, + "language_loss": 0.69308656, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.71438766, + "num_input_tokens_seen": 191268315, + "step": 8888, + "time_per_iteration": 2.5868256092071533 + }, + { + "auxiliary_loss_clip": 0.01117579, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.04453063, + "balance_loss_mlp": 1.02197242, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 2.1331169149077396, + "language_loss": 0.77490532, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79644573, + "num_input_tokens_seen": 191287000, + "step": 8889, + "time_per_iteration": 2.5485448837280273 + }, + { + "auxiliary_loss_clip": 0.01120836, + "auxiliary_loss_mlp": 0.01040528, + "balance_loss_clip": 1.0449636, + "balance_loss_mlp": 1.02674127, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 2.871229660337128, + "language_loss": 0.68966961, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71128327, + "num_input_tokens_seen": 191304565, + "step": 8890, + "time_per_iteration": 2.528775691986084 + }, + { + "auxiliary_loss_clip": 0.01124845, + "auxiliary_loss_mlp": 0.01042927, + "balance_loss_clip": 1.04517174, + "balance_loss_mlp": 1.02788317, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 2.935283940434291, + "language_loss": 0.77163327, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.793311, + "num_input_tokens_seen": 191318300, + "step": 8891, + "time_per_iteration": 2.5341176986694336 + }, + { + "auxiliary_loss_clip": 0.0110353, + "auxiliary_loss_mlp": 0.01046288, + "balance_loss_clip": 1.04095149, + "balance_loss_mlp": 1.03331184, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.5050756464383703, + "language_loss": 0.74417597, + "learning_rate": 1.872926414425699e-06, + "loss": 0.76567411, + "num_input_tokens_seen": 191337925, + "step": 8892, + "time_per_iteration": 5.489374160766602 + }, + { + "auxiliary_loss_clip": 0.01107658, + "auxiliary_loss_mlp": 0.01036579, + "balance_loss_clip": 1.04899359, + "balance_loss_mlp": 1.02379966, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 3.7812521843568367, + "language_loss": 0.8814404, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90288275, + "num_input_tokens_seen": 191357120, + "step": 8893, + "time_per_iteration": 2.6453726291656494 + }, + { + "auxiliary_loss_clip": 0.01118107, + "auxiliary_loss_mlp": 0.01033216, + "balance_loss_clip": 1.04395831, + "balance_loss_mlp": 1.02127683, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.7129806119503859, + "language_loss": 0.73142493, + "learning_rate": 1.872149074536869e-06, + "loss": 0.75293815, + "num_input_tokens_seen": 191375395, + "step": 8894, + "time_per_iteration": 2.5361196994781494 + }, + { + "auxiliary_loss_clip": 0.01114292, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.04728734, + "balance_loss_mlp": 1.01707053, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 1.8031850936276277, + "language_loss": 0.74662995, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.76808, + "num_input_tokens_seen": 191395595, + "step": 8895, + "time_per_iteration": 2.585068702697754 + }, + { + "auxiliary_loss_clip": 0.01094284, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.04626513, + "balance_loss_mlp": 1.0217073, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.6381116366856154, + "language_loss": 0.76953894, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79083431, + "num_input_tokens_seen": 191413730, + "step": 8896, + "time_per_iteration": 2.624237060546875 + }, + { + "auxiliary_loss_clip": 0.01091524, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.04343402, + "balance_loss_mlp": 1.01652706, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.70072485989417, + "language_loss": 0.78512311, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.80633658, + "num_input_tokens_seen": 191432400, + "step": 8897, + "time_per_iteration": 2.564781904220581 + }, + { + "auxiliary_loss_clip": 0.01109226, + "auxiliary_loss_mlp": 0.01033489, + "balance_loss_clip": 1.04271221, + "balance_loss_mlp": 1.0203166, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 1.7891876614070545, + "language_loss": 0.75913537, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.78056252, + "num_input_tokens_seen": 191448855, + "step": 8898, + "time_per_iteration": 2.6212680339813232 + }, + { + "auxiliary_loss_clip": 0.01029837, + "auxiliary_loss_mlp": 0.01015231, + "balance_loss_clip": 1.01310802, + "balance_loss_mlp": 1.01419413, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.8395399477768403, + "language_loss": 0.57977152, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60022223, + "num_input_tokens_seen": 191519690, + "step": 8899, + "time_per_iteration": 3.3080899715423584 + }, + { + "auxiliary_loss_clip": 0.0110048, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.0467881, + "balance_loss_mlp": 1.0182997, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.4136987606274218, + "language_loss": 0.69869292, + "learning_rate": 1.869817171696868e-06, + "loss": 0.72000682, + "num_input_tokens_seen": 191539380, + "step": 8900, + "time_per_iteration": 2.610731363296509 + }, + { + "auxiliary_loss_clip": 0.01098597, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.04119945, + "balance_loss_mlp": 1.01815212, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 1.581507980827545, + "language_loss": 0.71686447, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73816335, + "num_input_tokens_seen": 191557400, + "step": 8901, + "time_per_iteration": 2.570887804031372 + }, + { + "auxiliary_loss_clip": 0.01091287, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.04002666, + "balance_loss_mlp": 1.01888824, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 2.139541338839038, + "language_loss": 0.77598834, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79722643, + "num_input_tokens_seen": 191575860, + "step": 8902, + "time_per_iteration": 2.591472625732422 + }, + { + "auxiliary_loss_clip": 0.01092873, + "auxiliary_loss_mlp": 0.0103927, + "balance_loss_clip": 1.04646516, + "balance_loss_mlp": 1.02622294, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 2.301797301434473, + "language_loss": 0.70288491, + "learning_rate": 1.868651286721281e-06, + "loss": 0.72420633, + "num_input_tokens_seen": 191595775, + "step": 8903, + "time_per_iteration": 2.5682475566864014 + }, + { + "auxiliary_loss_clip": 0.01109874, + "auxiliary_loss_mlp": 0.00779837, + "balance_loss_clip": 1.04192924, + "balance_loss_mlp": 1.00054073, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.5378109552915804, + "language_loss": 0.72012532, + "learning_rate": 1.86826266833795e-06, + "loss": 0.73902243, + "num_input_tokens_seen": 191617785, + "step": 8904, + "time_per_iteration": 2.5664148330688477 + }, + { + "auxiliary_loss_clip": 0.01097299, + "auxiliary_loss_mlp": 0.01040687, + "balance_loss_clip": 1.04340136, + "balance_loss_mlp": 1.02679932, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.710548423331449, + "language_loss": 0.73331082, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.75469065, + "num_input_tokens_seen": 191636900, + "step": 8905, + "time_per_iteration": 3.9191782474517822 + }, + { + "auxiliary_loss_clip": 0.0110441, + "auxiliary_loss_mlp": 0.01034349, + "balance_loss_clip": 1.04280376, + "balance_loss_mlp": 1.02289271, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.4559200380113257, + "language_loss": 0.83782768, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85921526, + "num_input_tokens_seen": 191656720, + "step": 8906, + "time_per_iteration": 2.5473387241363525 + }, + { + "auxiliary_loss_clip": 0.01111158, + "auxiliary_loss_mlp": 0.00780124, + "balance_loss_clip": 1.04330373, + "balance_loss_mlp": 1.0005722, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 2.240760466570145, + "language_loss": 0.7406863, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.75959909, + "num_input_tokens_seen": 191674445, + "step": 8907, + "time_per_iteration": 2.4991519451141357 + }, + { + "auxiliary_loss_clip": 0.01106465, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.04189539, + "balance_loss_mlp": 1.02189469, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 1.730272949702223, + "language_loss": 0.76477897, + "learning_rate": 1.866708244906912e-06, + "loss": 0.78620601, + "num_input_tokens_seen": 191695000, + "step": 8908, + "time_per_iteration": 2.519351005554199 + }, + { + "auxiliary_loss_clip": 0.01093241, + "auxiliary_loss_mlp": 0.00780517, + "balance_loss_clip": 1.04004431, + "balance_loss_mlp": 1.00053835, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 1.7348203563391607, + "language_loss": 0.73911375, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.75785136, + "num_input_tokens_seen": 191713295, + "step": 8909, + "time_per_iteration": 2.5494799613952637 + }, + { + "auxiliary_loss_clip": 0.01079447, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_clip": 1.04105461, + "balance_loss_mlp": 1.03212285, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 2.2867143724481584, + "language_loss": 0.83965445, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86089522, + "num_input_tokens_seen": 191732725, + "step": 8910, + "time_per_iteration": 2.5711164474487305 + }, + { + "auxiliary_loss_clip": 0.01100737, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.04254985, + "balance_loss_mlp": 1.02124119, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 2.3593210640362687, + "language_loss": 0.81570774, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.83706367, + "num_input_tokens_seen": 191753765, + "step": 8911, + "time_per_iteration": 2.595303535461426 + }, + { + "auxiliary_loss_clip": 0.0108181, + "auxiliary_loss_mlp": 0.01041905, + "balance_loss_clip": 1.04044938, + "balance_loss_mlp": 1.0290544, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 1.6843963285799055, + "language_loss": 0.68988538, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71112245, + "num_input_tokens_seen": 191773560, + "step": 8912, + "time_per_iteration": 2.6100144386291504 + }, + { + "auxiliary_loss_clip": 0.01095871, + "auxiliary_loss_mlp": 0.01036303, + "balance_loss_clip": 1.04113579, + "balance_loss_mlp": 1.02309465, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 1.91038521436243, + "language_loss": 0.71531749, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73663926, + "num_input_tokens_seen": 191791255, + "step": 8913, + "time_per_iteration": 2.53311824798584 + }, + { + "auxiliary_loss_clip": 0.01092429, + "auxiliary_loss_mlp": 0.01037655, + "balance_loss_clip": 1.04320204, + "balance_loss_mlp": 1.02454162, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.7728217878747317, + "language_loss": 0.72290915, + "learning_rate": 1.864376761688156e-06, + "loss": 0.74421, + "num_input_tokens_seen": 191809325, + "step": 8914, + "time_per_iteration": 2.5828256607055664 + }, + { + "auxiliary_loss_clip": 0.0109939, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.04404616, + "balance_loss_mlp": 1.02817917, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 2.7445882606720438, + "language_loss": 0.70716941, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72858548, + "num_input_tokens_seen": 191829795, + "step": 8915, + "time_per_iteration": 2.56619930267334 + }, + { + "auxiliary_loss_clip": 0.01092023, + "auxiliary_loss_mlp": 0.01042302, + "balance_loss_clip": 1.04025447, + "balance_loss_mlp": 1.02893281, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 1.6103019847708169, + "language_loss": 0.74914813, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77049136, + "num_input_tokens_seen": 191850840, + "step": 8916, + "time_per_iteration": 2.5814592838287354 + }, + { + "auxiliary_loss_clip": 0.01080695, + "auxiliary_loss_mlp": 0.0077825, + "balance_loss_clip": 1.04329133, + "balance_loss_mlp": 1.00048137, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 2.078842542202142, + "language_loss": 0.72146398, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74005342, + "num_input_tokens_seen": 191869520, + "step": 8917, + "time_per_iteration": 2.7057690620422363 + }, + { + "auxiliary_loss_clip": 0.01099094, + "auxiliary_loss_mlp": 0.01041983, + "balance_loss_clip": 1.04387617, + "balance_loss_mlp": 1.02848864, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 1.9014810574749477, + "language_loss": 0.71107954, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.7324903, + "num_input_tokens_seen": 191887240, + "step": 8918, + "time_per_iteration": 2.5470638275146484 + }, + { + "auxiliary_loss_clip": 0.01102615, + "auxiliary_loss_mlp": 0.01037833, + "balance_loss_clip": 1.04723001, + "balance_loss_mlp": 1.02513158, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 1.6562522184235808, + "language_loss": 0.75224996, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77365446, + "num_input_tokens_seen": 191905690, + "step": 8919, + "time_per_iteration": 2.553755283355713 + }, + { + "auxiliary_loss_clip": 0.01088573, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.04075015, + "balance_loss_mlp": 1.02119613, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 1.8179601610460558, + "language_loss": 0.71355563, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73477995, + "num_input_tokens_seen": 191920725, + "step": 8920, + "time_per_iteration": 4.051400899887085 + }, + { + "auxiliary_loss_clip": 0.01101619, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.0381465, + "balance_loss_mlp": 1.0208652, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.3008764015456804, + "language_loss": 0.68653846, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.70790893, + "num_input_tokens_seen": 191944645, + "step": 8921, + "time_per_iteration": 2.6771466732025146 + }, + { + "auxiliary_loss_clip": 0.01109603, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.04426348, + "balance_loss_mlp": 1.02338648, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 1.9442219718261018, + "language_loss": 0.81646687, + "learning_rate": 1.86126840594594e-06, + "loss": 0.83792758, + "num_input_tokens_seen": 191962265, + "step": 8922, + "time_per_iteration": 2.5119175910949707 + }, + { + "auxiliary_loss_clip": 0.01111013, + "auxiliary_loss_mlp": 0.01032959, + "balance_loss_clip": 1.04436159, + "balance_loss_mlp": 1.0205493, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 2.2051501029087253, + "language_loss": 0.76925051, + "learning_rate": 1.860879884996686e-06, + "loss": 0.79069018, + "num_input_tokens_seen": 191978850, + "step": 8923, + "time_per_iteration": 2.5059070587158203 + }, + { + "auxiliary_loss_clip": 0.01091033, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.03916383, + "balance_loss_mlp": 1.01953578, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.48504754770773, + "language_loss": 0.70376825, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72500265, + "num_input_tokens_seen": 192002000, + "step": 8924, + "time_per_iteration": 2.6205289363861084 + }, + { + "auxiliary_loss_clip": 0.01089063, + "auxiliary_loss_mlp": 0.01037111, + "balance_loss_clip": 1.04158664, + "balance_loss_mlp": 1.02249575, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 1.9497334352795628, + "language_loss": 0.87186563, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.89312738, + "num_input_tokens_seen": 192019100, + "step": 8925, + "time_per_iteration": 2.620544672012329 + }, + { + "auxiliary_loss_clip": 0.01119041, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.04066563, + "balance_loss_mlp": 1.01946783, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 1.498282684479171, + "language_loss": 0.783261, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80477738, + "num_input_tokens_seen": 192041660, + "step": 8926, + "time_per_iteration": 2.5569005012512207 + }, + { + "auxiliary_loss_clip": 0.01088963, + "auxiliary_loss_mlp": 0.01030182, + "balance_loss_clip": 1.04987216, + "balance_loss_mlp": 1.01884532, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 2.2775856401771715, + "language_loss": 0.67103958, + "learning_rate": 1.85932585410148e-06, + "loss": 0.692231, + "num_input_tokens_seen": 192063540, + "step": 8927, + "time_per_iteration": 2.7183644771575928 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.03992784, + "balance_loss_mlp": 1.01511228, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.6999659238341334, + "language_loss": 0.73435867, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.7557227, + "num_input_tokens_seen": 192081760, + "step": 8928, + "time_per_iteration": 2.603222370147705 + }, + { + "auxiliary_loss_clip": 0.01092477, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.03789651, + "balance_loss_mlp": 1.0186367, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 1.9138745181819625, + "language_loss": 0.63303912, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65427303, + "num_input_tokens_seen": 192101620, + "step": 8929, + "time_per_iteration": 2.660036563873291 + }, + { + "auxiliary_loss_clip": 0.01108277, + "auxiliary_loss_mlp": 0.01036036, + "balance_loss_clip": 1.04218817, + "balance_loss_mlp": 1.02353108, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 1.7643684921944434, + "language_loss": 0.65966773, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68111086, + "num_input_tokens_seen": 192121805, + "step": 8930, + "time_per_iteration": 2.5400941371917725 + }, + { + "auxiliary_loss_clip": 0.01067764, + "auxiliary_loss_mlp": 0.01027862, + "balance_loss_clip": 1.03855681, + "balance_loss_mlp": 1.01425457, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.4296234928148885, + "language_loss": 0.67157739, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69253367, + "num_input_tokens_seen": 192141765, + "step": 8931, + "time_per_iteration": 4.102942228317261 + }, + { + "auxiliary_loss_clip": 0.01071109, + "auxiliary_loss_mlp": 0.01036192, + "balance_loss_clip": 1.0388341, + "balance_loss_mlp": 1.02196991, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.683353923662543, + "language_loss": 0.75976634, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.78083932, + "num_input_tokens_seen": 192161560, + "step": 8932, + "time_per_iteration": 4.077298641204834 + }, + { + "auxiliary_loss_clip": 0.01084839, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.04212248, + "balance_loss_mlp": 1.02266657, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 1.8726887603578863, + "language_loss": 0.66151834, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68272948, + "num_input_tokens_seen": 192180190, + "step": 8933, + "time_per_iteration": 2.6580352783203125 + }, + { + "auxiliary_loss_clip": 0.01099655, + "auxiliary_loss_mlp": 0.00778642, + "balance_loss_clip": 1.04103637, + "balance_loss_mlp": 1.00051403, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.7611669850852223, + "language_loss": 0.82918358, + "learning_rate": 1.856606505975565e-06, + "loss": 0.84796649, + "num_input_tokens_seen": 192198855, + "step": 8934, + "time_per_iteration": 2.555018424987793 + }, + { + "auxiliary_loss_clip": 0.01079324, + "auxiliary_loss_mlp": 0.0103847, + "balance_loss_clip": 1.0377655, + "balance_loss_mlp": 1.02519596, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 1.7410241747375048, + "language_loss": 0.79830843, + "learning_rate": 1.856218049303999e-06, + "loss": 0.81948638, + "num_input_tokens_seen": 192216555, + "step": 8935, + "time_per_iteration": 2.538210391998291 + }, + { + "auxiliary_loss_clip": 0.01106761, + "auxiliary_loss_mlp": 0.0104283, + "balance_loss_clip": 1.0414964, + "balance_loss_mlp": 1.02934146, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.8642834481771409, + "language_loss": 0.84212875, + "learning_rate": 1.855829598084659e-06, + "loss": 0.86362469, + "num_input_tokens_seen": 192236910, + "step": 8936, + "time_per_iteration": 2.5349273681640625 + }, + { + "auxiliary_loss_clip": 0.01088021, + "auxiliary_loss_mlp": 0.01030052, + "balance_loss_clip": 1.04448879, + "balance_loss_mlp": 1.01786351, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.2807747292407416, + "language_loss": 0.72544259, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74662328, + "num_input_tokens_seen": 192260790, + "step": 8937, + "time_per_iteration": 2.753772258758545 + }, + { + "auxiliary_loss_clip": 0.01092567, + "auxiliary_loss_mlp": 0.01037302, + "balance_loss_clip": 1.03653181, + "balance_loss_mlp": 1.02256799, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.287488531050698, + "language_loss": 0.8158164, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83711517, + "num_input_tokens_seen": 192277230, + "step": 8938, + "time_per_iteration": 2.5228841304779053 + }, + { + "auxiliary_loss_clip": 0.01123495, + "auxiliary_loss_mlp": 0.01035619, + "balance_loss_clip": 1.04294205, + "balance_loss_mlp": 1.02304804, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.320667471427437, + "language_loss": 0.80650187, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.82809299, + "num_input_tokens_seen": 192292840, + "step": 8939, + "time_per_iteration": 2.4517457485198975 + }, + { + "auxiliary_loss_clip": 0.01014671, + "auxiliary_loss_mlp": 0.01009109, + "balance_loss_clip": 1.01711488, + "balance_loss_mlp": 1.00760138, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.71438120376376, + "language_loss": 0.52479446, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54503226, + "num_input_tokens_seen": 192358240, + "step": 8940, + "time_per_iteration": 3.123433828353882 + }, + { + "auxiliary_loss_clip": 0.01085918, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.04584217, + "balance_loss_mlp": 1.01880455, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 2.0095236378039325, + "language_loss": 0.71865386, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73982477, + "num_input_tokens_seen": 192377370, + "step": 8941, + "time_per_iteration": 2.556628704071045 + }, + { + "auxiliary_loss_clip": 0.01089592, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.0376302, + "balance_loss_mlp": 1.01781094, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.6172321610067404, + "language_loss": 0.79549807, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81669271, + "num_input_tokens_seen": 192396450, + "step": 8942, + "time_per_iteration": 2.5455162525177 + }, + { + "auxiliary_loss_clip": 0.01122324, + "auxiliary_loss_mlp": 0.01038143, + "balance_loss_clip": 1.04405189, + "balance_loss_mlp": 1.02483332, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 1.7832473706183904, + "language_loss": 0.70045048, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72205514, + "num_input_tokens_seen": 192417390, + "step": 8943, + "time_per_iteration": 2.553521156311035 + }, + { + "auxiliary_loss_clip": 0.01032773, + "auxiliary_loss_mlp": 0.01002448, + "balance_loss_clip": 1.01747704, + "balance_loss_mlp": 1.00123203, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8125518711054529, + "language_loss": 0.59611297, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61646521, + "num_input_tokens_seen": 192478060, + "step": 8944, + "time_per_iteration": 3.038417100906372 + }, + { + "auxiliary_loss_clip": 0.01079096, + "auxiliary_loss_mlp": 0.01038211, + "balance_loss_clip": 1.04629254, + "balance_loss_mlp": 1.02379227, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 1.977919892410662, + "language_loss": 0.78113586, + "learning_rate": 1.852333784891169e-06, + "loss": 0.80230892, + "num_input_tokens_seen": 192495985, + "step": 8945, + "time_per_iteration": 4.081629991531372 + }, + { + "auxiliary_loss_clip": 0.01109078, + "auxiliary_loss_mlp": 0.01038171, + "balance_loss_clip": 1.040851, + "balance_loss_mlp": 1.02501607, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 1.6601728007777674, + "language_loss": 0.68728119, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70875371, + "num_input_tokens_seen": 192515445, + "step": 8946, + "time_per_iteration": 2.5246975421905518 + }, + { + "auxiliary_loss_clip": 0.0107479, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.04235876, + "balance_loss_mlp": 1.02694035, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 2.0625930120245943, + "language_loss": 0.77192891, + "learning_rate": 1.851556998731498e-06, + "loss": 0.79307628, + "num_input_tokens_seen": 192536530, + "step": 8947, + "time_per_iteration": 2.6362810134887695 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.04302025, + "balance_loss_mlp": 1.0212841, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.7350772517842383, + "language_loss": 0.60196167, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62339079, + "num_input_tokens_seen": 192556075, + "step": 8948, + "time_per_iteration": 2.529583215713501 + }, + { + "auxiliary_loss_clip": 0.01079439, + "auxiliary_loss_mlp": 0.01037942, + "balance_loss_clip": 1.03809512, + "balance_loss_mlp": 1.02542531, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.687792885640704, + "language_loss": 0.7944892, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.81566298, + "num_input_tokens_seen": 192575535, + "step": 8949, + "time_per_iteration": 2.577078104019165 + }, + { + "auxiliary_loss_clip": 0.01077063, + "auxiliary_loss_mlp": 0.01037497, + "balance_loss_clip": 1.03638244, + "balance_loss_mlp": 1.02300167, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 1.8374049483108585, + "language_loss": 0.78062367, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80176926, + "num_input_tokens_seen": 192594490, + "step": 8950, + "time_per_iteration": 2.6614508628845215 + }, + { + "auxiliary_loss_clip": 0.01103499, + "auxiliary_loss_mlp": 0.01032289, + "balance_loss_clip": 1.04782462, + "balance_loss_mlp": 1.02060616, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.8133700238246484, + "language_loss": 0.73058838, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.75194621, + "num_input_tokens_seen": 192615650, + "step": 8951, + "time_per_iteration": 2.594892978668213 + }, + { + "auxiliary_loss_clip": 0.01119332, + "auxiliary_loss_mlp": 0.00779668, + "balance_loss_clip": 1.04270315, + "balance_loss_mlp": 1.00064087, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 2.8845143639189508, + "language_loss": 0.74871814, + "learning_rate": 1.849615132097085e-06, + "loss": 0.76770818, + "num_input_tokens_seen": 192633840, + "step": 8952, + "time_per_iteration": 2.444408655166626 + }, + { + "auxiliary_loss_clip": 0.01095386, + "auxiliary_loss_mlp": 0.01032445, + "balance_loss_clip": 1.04346836, + "balance_loss_mlp": 1.01951122, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.710546358495416, + "language_loss": 0.79744565, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81872404, + "num_input_tokens_seen": 192655890, + "step": 8953, + "time_per_iteration": 2.5976147651672363 + }, + { + "auxiliary_loss_clip": 0.01086202, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.04200339, + "balance_loss_mlp": 1.02110064, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 2.3298834673413897, + "language_loss": 0.80809975, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82931185, + "num_input_tokens_seen": 192673025, + "step": 8954, + "time_per_iteration": 2.5597896575927734 + }, + { + "auxiliary_loss_clip": 0.01119511, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.04404175, + "balance_loss_mlp": 1.01758504, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 1.9557170012623124, + "language_loss": 0.76144409, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.782947, + "num_input_tokens_seen": 192692190, + "step": 8955, + "time_per_iteration": 2.4787027835845947 + }, + { + "auxiliary_loss_clip": 0.01095902, + "auxiliary_loss_mlp": 0.01039341, + "balance_loss_clip": 1.04301655, + "balance_loss_mlp": 1.02632332, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.751230109663164, + "language_loss": 0.78521812, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80657053, + "num_input_tokens_seen": 192710380, + "step": 8956, + "time_per_iteration": 2.535402297973633 + }, + { + "auxiliary_loss_clip": 0.01013987, + "auxiliary_loss_mlp": 0.01002029, + "balance_loss_clip": 1.01658165, + "balance_loss_mlp": 1.00066435, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.863315823198414, + "language_loss": 0.63443756, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65459776, + "num_input_tokens_seen": 192768995, + "step": 8957, + "time_per_iteration": 3.022777557373047 + }, + { + "auxiliary_loss_clip": 0.0100215, + "auxiliary_loss_mlp": 0.01002058, + "balance_loss_clip": 1.01778018, + "balance_loss_mlp": 1.00081253, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7191699269255233, + "language_loss": 0.51628804, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.5363301, + "num_input_tokens_seen": 192825585, + "step": 8958, + "time_per_iteration": 3.1935698986053467 + }, + { + "auxiliary_loss_clip": 0.01113549, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.04735935, + "balance_loss_mlp": 1.01897371, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 1.5363286784557466, + "language_loss": 0.77134204, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79280829, + "num_input_tokens_seen": 192847335, + "step": 8959, + "time_per_iteration": 4.0312042236328125 + }, + { + "auxiliary_loss_clip": 0.01072703, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.04065204, + "balance_loss_mlp": 1.01934731, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.083603627907854, + "language_loss": 0.83926034, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.86030471, + "num_input_tokens_seen": 192862205, + "step": 8960, + "time_per_iteration": 2.552312135696411 + }, + { + "auxiliary_loss_clip": 0.01110725, + "auxiliary_loss_mlp": 0.01033732, + "balance_loss_clip": 1.04596865, + "balance_loss_mlp": 1.02062535, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.4426132832209628, + "language_loss": 0.78627533, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.80771995, + "num_input_tokens_seen": 192883695, + "step": 8961, + "time_per_iteration": 2.578784704208374 + }, + { + "auxiliary_loss_clip": 0.01085596, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.04092646, + "balance_loss_mlp": 1.02215731, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 1.6450163053973204, + "language_loss": 0.84676862, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86797637, + "num_input_tokens_seen": 192900190, + "step": 8962, + "time_per_iteration": 2.5961110591888428 + }, + { + "auxiliary_loss_clip": 0.0102294, + "auxiliary_loss_mlp": 0.01008141, + "balance_loss_clip": 1.02195644, + "balance_loss_mlp": 1.00690103, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7581817291046048, + "language_loss": 0.54217392, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.5624848, + "num_input_tokens_seen": 192958675, + "step": 8963, + "time_per_iteration": 2.99432635307312 + }, + { + "auxiliary_loss_clip": 0.01020652, + "auxiliary_loss_mlp": 0.0100115, + "balance_loss_clip": 1.01487243, + "balance_loss_mlp": 0.99992782, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8042665812933333, + "language_loss": 0.63348716, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65370518, + "num_input_tokens_seen": 193033135, + "step": 8964, + "time_per_iteration": 3.153294086456299 + }, + { + "auxiliary_loss_clip": 0.01064892, + "auxiliary_loss_mlp": 0.0103039, + "balance_loss_clip": 1.04273272, + "balance_loss_mlp": 1.01747346, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.538755772051385, + "language_loss": 0.70165098, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.7226038, + "num_input_tokens_seen": 193055570, + "step": 8965, + "time_per_iteration": 2.7504098415374756 + }, + { + "auxiliary_loss_clip": 0.01093906, + "auxiliary_loss_mlp": 0.00780603, + "balance_loss_clip": 1.04043531, + "balance_loss_mlp": 1.00068784, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 2.1183362927272236, + "language_loss": 0.81944919, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.83819437, + "num_input_tokens_seen": 193073120, + "step": 8966, + "time_per_iteration": 2.517101526260376 + }, + { + "auxiliary_loss_clip": 0.01118571, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.04438639, + "balance_loss_mlp": 1.01681638, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 2.0874942991531356, + "language_loss": 0.72049826, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74198401, + "num_input_tokens_seen": 193090105, + "step": 8967, + "time_per_iteration": 2.425278425216675 + }, + { + "auxiliary_loss_clip": 0.01093318, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.03803992, + "balance_loss_mlp": 1.02028775, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 1.6491316624762082, + "language_loss": 0.81734586, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.83859533, + "num_input_tokens_seen": 193109325, + "step": 8968, + "time_per_iteration": 2.536625862121582 + }, + { + "auxiliary_loss_clip": 0.01087034, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.04017889, + "balance_loss_mlp": 1.01804233, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.6175813175480847, + "language_loss": 0.74179852, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76298416, + "num_input_tokens_seen": 193130595, + "step": 8969, + "time_per_iteration": 2.603116035461426 + }, + { + "auxiliary_loss_clip": 0.01083233, + "auxiliary_loss_mlp": 0.00779535, + "balance_loss_clip": 1.03412974, + "balance_loss_mlp": 1.00048661, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 2.027881385663437, + "language_loss": 0.82360041, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84222811, + "num_input_tokens_seen": 193148930, + "step": 8970, + "time_per_iteration": 2.563687324523926 + }, + { + "auxiliary_loss_clip": 0.01094261, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.04287839, + "balance_loss_mlp": 1.01723647, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.6249962019950386, + "language_loss": 0.75502366, + "learning_rate": 1.842237354749146e-06, + "loss": 0.7762574, + "num_input_tokens_seen": 193170140, + "step": 8971, + "time_per_iteration": 4.143650054931641 + }, + { + "auxiliary_loss_clip": 0.01028178, + "auxiliary_loss_mlp": 0.01022889, + "balance_loss_clip": 1.01115274, + "balance_loss_mlp": 1.02133906, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8838684999356912, + "language_loss": 0.60329342, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62380409, + "num_input_tokens_seen": 193227235, + "step": 8972, + "time_per_iteration": 4.478744983673096 + }, + { + "auxiliary_loss_clip": 0.01106998, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.04083681, + "balance_loss_mlp": 1.02920032, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.3959579417561976, + "language_loss": 0.78630352, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80780101, + "num_input_tokens_seen": 193248435, + "step": 8973, + "time_per_iteration": 2.625455379486084 + }, + { + "auxiliary_loss_clip": 0.01114307, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.04327464, + "balance_loss_mlp": 1.01948857, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 2.1972086219135614, + "language_loss": 0.7383824, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.7598629, + "num_input_tokens_seen": 193267490, + "step": 8974, + "time_per_iteration": 2.5646438598632812 + }, + { + "auxiliary_loss_clip": 0.01036839, + "auxiliary_loss_mlp": 0.01001365, + "balance_loss_clip": 1.01151776, + "balance_loss_mlp": 1.0000056, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7916370911807894, + "language_loss": 0.51082939, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53121144, + "num_input_tokens_seen": 193326050, + "step": 8975, + "time_per_iteration": 3.0168814659118652 + }, + { + "auxiliary_loss_clip": 0.01105263, + "auxiliary_loss_mlp": 0.01037987, + "balance_loss_clip": 1.04022694, + "balance_loss_mlp": 1.02445102, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.4245211467537218, + "language_loss": 0.72186494, + "learning_rate": 1.840296189214344e-06, + "loss": 0.7432974, + "num_input_tokens_seen": 193348785, + "step": 8976, + "time_per_iteration": 2.5610158443450928 + }, + { + "auxiliary_loss_clip": 0.01103257, + "auxiliary_loss_mlp": 0.00778583, + "balance_loss_clip": 1.03850222, + "balance_loss_mlp": 1.00054765, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 1.8269061544429346, + "language_loss": 0.69968975, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.71850812, + "num_input_tokens_seen": 193367080, + "step": 8977, + "time_per_iteration": 2.5362131595611572 + }, + { + "auxiliary_loss_clip": 0.01049779, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.03922045, + "balance_loss_mlp": 1.01913714, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 1.632946950830686, + "language_loss": 0.72643685, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.74725741, + "num_input_tokens_seen": 193383715, + "step": 8978, + "time_per_iteration": 2.6060173511505127 + }, + { + "auxiliary_loss_clip": 0.01084868, + "auxiliary_loss_mlp": 0.01037756, + "balance_loss_clip": 1.04460382, + "balance_loss_mlp": 1.0236181, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 2.192766569406742, + "language_loss": 0.74499726, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76622355, + "num_input_tokens_seen": 193400560, + "step": 8979, + "time_per_iteration": 2.5180468559265137 + }, + { + "auxiliary_loss_clip": 0.01068923, + "auxiliary_loss_mlp": 0.01048696, + "balance_loss_clip": 1.03906214, + "balance_loss_mlp": 1.0346595, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 1.8912383304827163, + "language_loss": 0.77091855, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79209471, + "num_input_tokens_seen": 193418680, + "step": 8980, + "time_per_iteration": 2.6219706535339355 + }, + { + "auxiliary_loss_clip": 0.01117297, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.04084754, + "balance_loss_mlp": 1.02009559, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.9534513278907188, + "language_loss": 0.82139242, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.84289145, + "num_input_tokens_seen": 193439310, + "step": 8981, + "time_per_iteration": 2.5116164684295654 + }, + { + "auxiliary_loss_clip": 0.01109348, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.04024577, + "balance_loss_mlp": 1.01937175, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.8205172406161727, + "language_loss": 0.67156851, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.69298959, + "num_input_tokens_seen": 193458115, + "step": 8982, + "time_per_iteration": 2.499845027923584 + }, + { + "auxiliary_loss_clip": 0.01089981, + "auxiliary_loss_mlp": 0.0077749, + "balance_loss_clip": 1.04909825, + "balance_loss_mlp": 1.00057411, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.7815476162716233, + "language_loss": 0.82754982, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84622455, + "num_input_tokens_seen": 193477365, + "step": 8983, + "time_per_iteration": 2.593473434448242 + }, + { + "auxiliary_loss_clip": 0.01070461, + "auxiliary_loss_mlp": 0.01039526, + "balance_loss_clip": 1.03758121, + "balance_loss_mlp": 1.02609158, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 2.4227143735168664, + "language_loss": 0.70743388, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72853374, + "num_input_tokens_seen": 193495595, + "step": 8984, + "time_per_iteration": 4.203336715698242 + }, + { + "auxiliary_loss_clip": 0.0112326, + "auxiliary_loss_mlp": 0.0103192, + "balance_loss_clip": 1.04441535, + "balance_loss_mlp": 1.01762676, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.5836925335395131, + "language_loss": 0.79883492, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82038671, + "num_input_tokens_seen": 193514035, + "step": 8985, + "time_per_iteration": 2.4573049545288086 + }, + { + "auxiliary_loss_clip": 0.01078613, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.04247522, + "balance_loss_mlp": 1.01689267, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 1.4360547510313058, + "language_loss": 0.78544092, + "learning_rate": 1.83641431418363e-06, + "loss": 0.80652893, + "num_input_tokens_seen": 193535445, + "step": 8986, + "time_per_iteration": 2.6330885887145996 + }, + { + "auxiliary_loss_clip": 0.01102399, + "auxiliary_loss_mlp": 0.01041103, + "balance_loss_clip": 1.03843665, + "balance_loss_mlp": 1.0274117, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.795559482137697, + "language_loss": 0.77023989, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.79167491, + "num_input_tokens_seen": 193554780, + "step": 8987, + "time_per_iteration": 2.5025618076324463 + }, + { + "auxiliary_loss_clip": 0.01096866, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.04612744, + "balance_loss_mlp": 1.01732743, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 1.683826381520622, + "language_loss": 0.71214741, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.7334137, + "num_input_tokens_seen": 193573580, + "step": 8988, + "time_per_iteration": 2.55521559715271 + }, + { + "auxiliary_loss_clip": 0.01068989, + "auxiliary_loss_mlp": 0.01038061, + "balance_loss_clip": 1.03687239, + "balance_loss_mlp": 1.0241971, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.278784215331888, + "language_loss": 0.67399049, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69506097, + "num_input_tokens_seen": 193590490, + "step": 8989, + "time_per_iteration": 2.6570160388946533 + }, + { + "auxiliary_loss_clip": 0.01106654, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.04157364, + "balance_loss_mlp": 1.02267075, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.448483423801684, + "language_loss": 0.77653182, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79796034, + "num_input_tokens_seen": 193609900, + "step": 8990, + "time_per_iteration": 2.5411524772644043 + }, + { + "auxiliary_loss_clip": 0.01102653, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.04000306, + "balance_loss_mlp": 1.01894617, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.6017551209023546, + "language_loss": 0.68885064, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71018147, + "num_input_tokens_seen": 193629775, + "step": 8991, + "time_per_iteration": 2.5060043334960938 + }, + { + "auxiliary_loss_clip": 0.01057446, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.03233266, + "balance_loss_mlp": 1.01885355, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.6762348477797493, + "language_loss": 0.7629621, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78386265, + "num_input_tokens_seen": 193648070, + "step": 8992, + "time_per_iteration": 2.663022518157959 + }, + { + "auxiliary_loss_clip": 0.01092587, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.03773379, + "balance_loss_mlp": 1.02561188, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 5.067377167324964, + "language_loss": 0.76284885, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78417432, + "num_input_tokens_seen": 193665060, + "step": 8993, + "time_per_iteration": 2.5132789611816406 + }, + { + "auxiliary_loss_clip": 0.01102559, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.03830624, + "balance_loss_mlp": 1.01933825, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.655209224292126, + "language_loss": 0.70297706, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.7243197, + "num_input_tokens_seen": 193683620, + "step": 8994, + "time_per_iteration": 2.5414414405822754 + }, + { + "auxiliary_loss_clip": 0.01104711, + "auxiliary_loss_mlp": 0.0103198, + "balance_loss_clip": 1.0388931, + "balance_loss_mlp": 1.01791334, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 2.773144526056596, + "language_loss": 0.75235391, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.7737208, + "num_input_tokens_seen": 193702990, + "step": 8995, + "time_per_iteration": 2.508720636367798 + }, + { + "auxiliary_loss_clip": 0.01103203, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.04074347, + "balance_loss_mlp": 1.02124536, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.769913844903481, + "language_loss": 0.73602617, + "learning_rate": 1.832533059471282e-06, + "loss": 0.75738955, + "num_input_tokens_seen": 193721785, + "step": 8996, + "time_per_iteration": 2.509133815765381 + }, + { + "auxiliary_loss_clip": 0.0107142, + "auxiliary_loss_mlp": 0.01034713, + "balance_loss_clip": 1.03811729, + "balance_loss_mlp": 1.02269661, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 1.7164369096193197, + "language_loss": 0.73124826, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75230962, + "num_input_tokens_seen": 193740315, + "step": 8997, + "time_per_iteration": 2.5945827960968018 + }, + { + "auxiliary_loss_clip": 0.01117123, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.041857, + "balance_loss_mlp": 1.01996958, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 3.32853035554416, + "language_loss": 0.71868825, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.74018931, + "num_input_tokens_seen": 193757580, + "step": 8998, + "time_per_iteration": 3.9585671424865723 + }, + { + "auxiliary_loss_clip": 0.01083563, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.04006505, + "balance_loss_mlp": 1.02125859, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 2.0072910781724036, + "language_loss": 0.70586395, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72703695, + "num_input_tokens_seen": 193780965, + "step": 8999, + "time_per_iteration": 2.8291141986846924 + }, + { + "auxiliary_loss_clip": 0.0109611, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.04593599, + "balance_loss_mlp": 1.01734948, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.206618774537514, + "language_loss": 0.8056879, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.82695556, + "num_input_tokens_seen": 193797855, + "step": 9000, + "time_per_iteration": 2.5260419845581055 + }, + { + "auxiliary_loss_clip": 0.01072051, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.03823817, + "balance_loss_mlp": 1.01986396, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 2.0146210713259145, + "language_loss": 0.72810256, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.74916255, + "num_input_tokens_seen": 193817375, + "step": 9001, + "time_per_iteration": 2.6041643619537354 + }, + { + "auxiliary_loss_clip": 0.01086023, + "auxiliary_loss_mlp": 0.01035867, + "balance_loss_clip": 1.03626597, + "balance_loss_mlp": 1.02144289, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.4857618099253083, + "language_loss": 0.85122204, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87244093, + "num_input_tokens_seen": 193832205, + "step": 9002, + "time_per_iteration": 2.5565178394317627 + }, + { + "auxiliary_loss_clip": 0.01070506, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.03900218, + "balance_loss_mlp": 1.01750803, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 1.7964611942444704, + "language_loss": 0.78163749, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.80262917, + "num_input_tokens_seen": 193849830, + "step": 9003, + "time_per_iteration": 2.581904888153076 + }, + { + "auxiliary_loss_clip": 0.01104407, + "auxiliary_loss_mlp": 0.01030461, + "balance_loss_clip": 1.04111397, + "balance_loss_mlp": 1.01695442, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 1.7479775767976422, + "language_loss": 0.69362724, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.71497583, + "num_input_tokens_seen": 193869945, + "step": 9004, + "time_per_iteration": 2.5297434329986572 + }, + { + "auxiliary_loss_clip": 0.01032446, + "auxiliary_loss_mlp": 0.0100383, + "balance_loss_clip": 1.01700628, + "balance_loss_mlp": 1.00259006, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9507873703428759, + "language_loss": 0.59072959, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61109233, + "num_input_tokens_seen": 193930860, + "step": 9005, + "time_per_iteration": 3.1645686626434326 + }, + { + "auxiliary_loss_clip": 0.01119836, + "auxiliary_loss_mlp": 0.00779145, + "balance_loss_clip": 1.04315042, + "balance_loss_mlp": 1.00055075, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 2.503661858615993, + "language_loss": 0.77762699, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.79661679, + "num_input_tokens_seen": 193949075, + "step": 9006, + "time_per_iteration": 2.4798967838287354 + }, + { + "auxiliary_loss_clip": 0.01093643, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.04015505, + "balance_loss_mlp": 1.02164364, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.6273902704376957, + "language_loss": 0.83248681, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.85374981, + "num_input_tokens_seen": 193967630, + "step": 9007, + "time_per_iteration": 2.5367870330810547 + }, + { + "auxiliary_loss_clip": 0.01109697, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.04833651, + "balance_loss_mlp": 1.02090502, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 1.769523976864984, + "language_loss": 0.66833198, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.68977237, + "num_input_tokens_seen": 193988730, + "step": 9008, + "time_per_iteration": 2.551980495452881 + }, + { + "auxiliary_loss_clip": 0.01123176, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.04349542, + "balance_loss_mlp": 1.01987219, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 2.1392734309853743, + "language_loss": 0.7437492, + "learning_rate": 1.827488379924234e-06, + "loss": 0.76532358, + "num_input_tokens_seen": 194005160, + "step": 9009, + "time_per_iteration": 2.4375243186950684 + }, + { + "auxiliary_loss_clip": 0.01074373, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.04519367, + "balance_loss_mlp": 1.02201712, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.527981111402233, + "language_loss": 0.87615252, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.89725482, + "num_input_tokens_seen": 194021700, + "step": 9010, + "time_per_iteration": 4.152284622192383 + }, + { + "auxiliary_loss_clip": 0.01117558, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.04202282, + "balance_loss_mlp": 1.02106142, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 1.916604434348325, + "language_loss": 0.65476918, + "learning_rate": 1.826712372694122e-06, + "loss": 0.67628276, + "num_input_tokens_seen": 194042620, + "step": 9011, + "time_per_iteration": 3.913362979888916 + }, + { + "auxiliary_loss_clip": 0.01108997, + "auxiliary_loss_mlp": 0.01035496, + "balance_loss_clip": 1.04404783, + "balance_loss_mlp": 1.02279401, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 2.9076788680325305, + "language_loss": 0.79433954, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81578445, + "num_input_tokens_seen": 194061800, + "step": 9012, + "time_per_iteration": 2.587987184524536 + }, + { + "auxiliary_loss_clip": 0.01115896, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.04125905, + "balance_loss_mlp": 1.01943719, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 2.45168892639514, + "language_loss": 0.74331319, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76479733, + "num_input_tokens_seen": 194079890, + "step": 9013, + "time_per_iteration": 2.4541494846343994 + }, + { + "auxiliary_loss_clip": 0.01085163, + "auxiliary_loss_mlp": 0.01028804, + "balance_loss_clip": 1.04210329, + "balance_loss_mlp": 1.0156908, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 2.1217549578167363, + "language_loss": 0.72426325, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74540293, + "num_input_tokens_seen": 194097625, + "step": 9014, + "time_per_iteration": 2.537886142730713 + }, + { + "auxiliary_loss_clip": 0.01096929, + "auxiliary_loss_mlp": 0.01031673, + "balance_loss_clip": 1.04166389, + "balance_loss_mlp": 1.01890612, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.5417357201038617, + "language_loss": 0.8053478, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82663381, + "num_input_tokens_seen": 194116055, + "step": 9015, + "time_per_iteration": 2.5148675441741943 + }, + { + "auxiliary_loss_clip": 0.01112778, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.04505348, + "balance_loss_mlp": 1.02345538, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 2.1537406926877174, + "language_loss": 0.81404281, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83553588, + "num_input_tokens_seen": 194130365, + "step": 9016, + "time_per_iteration": 2.4955217838287354 + }, + { + "auxiliary_loss_clip": 0.01117713, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.04307175, + "balance_loss_mlp": 1.01567972, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 1.7458602782439685, + "language_loss": 0.81209755, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83355707, + "num_input_tokens_seen": 194148975, + "step": 9017, + "time_per_iteration": 2.48705792427063 + }, + { + "auxiliary_loss_clip": 0.01115706, + "auxiliary_loss_mlp": 0.01035596, + "balance_loss_clip": 1.04233098, + "balance_loss_mlp": 1.0226382, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 1.552297021775615, + "language_loss": 0.77496862, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79648167, + "num_input_tokens_seen": 194167185, + "step": 9018, + "time_per_iteration": 2.4355547428131104 + }, + { + "auxiliary_loss_clip": 0.01118101, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.03991246, + "balance_loss_mlp": 1.02251458, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.8039825740483209, + "language_loss": 0.66733825, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68887973, + "num_input_tokens_seen": 194192840, + "step": 9019, + "time_per_iteration": 2.691891670227051 + }, + { + "auxiliary_loss_clip": 0.01099589, + "auxiliary_loss_mlp": 0.01033212, + "balance_loss_clip": 1.03848469, + "balance_loss_mlp": 1.02052855, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 1.5870139414964746, + "language_loss": 0.69719082, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.71851885, + "num_input_tokens_seen": 194213150, + "step": 9020, + "time_per_iteration": 2.5603723526000977 + }, + { + "auxiliary_loss_clip": 0.01078879, + "auxiliary_loss_mlp": 0.01034977, + "balance_loss_clip": 1.03773665, + "balance_loss_mlp": 1.0227524, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.4363778407591015, + "language_loss": 0.80475461, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82589316, + "num_input_tokens_seen": 194234665, + "step": 9021, + "time_per_iteration": 2.699388265609741 + }, + { + "auxiliary_loss_clip": 0.01070706, + "auxiliary_loss_mlp": 0.01033664, + "balance_loss_clip": 1.0393846, + "balance_loss_mlp": 1.02040768, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 1.639121731758585, + "language_loss": 0.78925633, + "learning_rate": 1.822444805916788e-06, + "loss": 0.81030005, + "num_input_tokens_seen": 194253790, + "step": 9022, + "time_per_iteration": 2.6396095752716064 + }, + { + "auxiliary_loss_clip": 0.01086854, + "auxiliary_loss_mlp": 0.00780101, + "balance_loss_clip": 1.03927386, + "balance_loss_mlp": 1.00049472, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 3.05587403137694, + "language_loss": 0.82231033, + "learning_rate": 1.822056885403915e-06, + "loss": 0.84097987, + "num_input_tokens_seen": 194274950, + "step": 9023, + "time_per_iteration": 4.137202024459839 + }, + { + "auxiliary_loss_clip": 0.01107288, + "auxiliary_loss_mlp": 0.01025177, + "balance_loss_clip": 1.04706645, + "balance_loss_mlp": 1.01251101, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.6495033188072563, + "language_loss": 0.71329069, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73461533, + "num_input_tokens_seen": 194296155, + "step": 9024, + "time_per_iteration": 2.5473644733428955 + }, + { + "auxiliary_loss_clip": 0.01106303, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.03913748, + "balance_loss_mlp": 1.01999533, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.6195878122578131, + "language_loss": 0.6538136, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67520154, + "num_input_tokens_seen": 194318025, + "step": 9025, + "time_per_iteration": 2.5864338874816895 + }, + { + "auxiliary_loss_clip": 0.01090572, + "auxiliary_loss_mlp": 0.00777148, + "balance_loss_clip": 1.04308677, + "balance_loss_mlp": 1.00068092, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 1.8745281898325687, + "language_loss": 0.7427398, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.76141703, + "num_input_tokens_seen": 194336150, + "step": 9026, + "time_per_iteration": 2.536637783050537 + }, + { + "auxiliary_loss_clip": 0.01094339, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.03773355, + "balance_loss_mlp": 1.02199745, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 1.7664861569379797, + "language_loss": 0.78525805, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80657399, + "num_input_tokens_seen": 194355980, + "step": 9027, + "time_per_iteration": 2.5759236812591553 + }, + { + "auxiliary_loss_clip": 0.01016637, + "auxiliary_loss_mlp": 0.01002336, + "balance_loss_clip": 1.02194202, + "balance_loss_mlp": 1.00109601, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7471516075967832, + "language_loss": 0.56450379, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58469343, + "num_input_tokens_seen": 194422660, + "step": 9028, + "time_per_iteration": 3.157339096069336 + }, + { + "auxiliary_loss_clip": 0.01078686, + "auxiliary_loss_mlp": 0.01028362, + "balance_loss_clip": 1.04549551, + "balance_loss_mlp": 1.0148257, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 1.9261029167722599, + "language_loss": 0.78139168, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80246216, + "num_input_tokens_seen": 194438545, + "step": 9029, + "time_per_iteration": 2.6227171421051025 + }, + { + "auxiliary_loss_clip": 0.01075207, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.03686082, + "balance_loss_mlp": 1.01488876, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.3998494869754328, + "language_loss": 0.83349359, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85453385, + "num_input_tokens_seen": 194458060, + "step": 9030, + "time_per_iteration": 2.64140248298645 + }, + { + "auxiliary_loss_clip": 0.01116574, + "auxiliary_loss_mlp": 0.01031438, + "balance_loss_clip": 1.04243982, + "balance_loss_mlp": 1.01870632, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.5513577913618413, + "language_loss": 0.75120014, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.77268028, + "num_input_tokens_seen": 194477405, + "step": 9031, + "time_per_iteration": 2.5356130599975586 + }, + { + "auxiliary_loss_clip": 0.01097662, + "auxiliary_loss_mlp": 0.01033729, + "balance_loss_clip": 1.03867698, + "balance_loss_mlp": 1.02099752, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 1.678612022239516, + "language_loss": 0.85188812, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87320197, + "num_input_tokens_seen": 194497085, + "step": 9032, + "time_per_iteration": 2.5736076831817627 + }, + { + "auxiliary_loss_clip": 0.01100064, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.04155254, + "balance_loss_mlp": 1.01660538, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 1.7127694221011431, + "language_loss": 0.74215233, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.76345086, + "num_input_tokens_seen": 194516785, + "step": 9033, + "time_per_iteration": 2.542412519454956 + }, + { + "auxiliary_loss_clip": 0.01084193, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.04476333, + "balance_loss_mlp": 1.01921129, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 1.6299852758397844, + "language_loss": 0.75447381, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77563953, + "num_input_tokens_seen": 194536475, + "step": 9034, + "time_per_iteration": 2.6160480976104736 + }, + { + "auxiliary_loss_clip": 0.01076327, + "auxiliary_loss_mlp": 0.01027021, + "balance_loss_clip": 1.04032707, + "balance_loss_mlp": 1.01460528, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.685275537695002, + "language_loss": 0.84404218, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86507565, + "num_input_tokens_seen": 194554495, + "step": 9035, + "time_per_iteration": 2.550493001937866 + }, + { + "auxiliary_loss_clip": 0.01012027, + "auxiliary_loss_mlp": 0.01002177, + "balance_loss_clip": 1.01664865, + "balance_loss_mlp": 1.00103843, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7334522958728567, + "language_loss": 0.55900288, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57914484, + "num_input_tokens_seen": 194617620, + "step": 9036, + "time_per_iteration": 3.102189302444458 + }, + { + "auxiliary_loss_clip": 0.01062583, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.03934431, + "balance_loss_mlp": 1.0200007, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.7346189545365498, + "language_loss": 0.74840391, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.76936233, + "num_input_tokens_seen": 194637690, + "step": 9037, + "time_per_iteration": 2.738175630569458 + }, + { + "auxiliary_loss_clip": 0.01088537, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.0402379, + "balance_loss_mlp": 1.02119708, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.8230399533095931, + "language_loss": 0.67052627, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.69175452, + "num_input_tokens_seen": 194659520, + "step": 9038, + "time_per_iteration": 4.091614246368408 + }, + { + "auxiliary_loss_clip": 0.01103298, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.03813899, + "balance_loss_mlp": 1.01962113, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.7856770335157206, + "language_loss": 0.78029317, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80164409, + "num_input_tokens_seen": 194677645, + "step": 9039, + "time_per_iteration": 2.493242025375366 + }, + { + "auxiliary_loss_clip": 0.01077026, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.03724289, + "balance_loss_mlp": 1.02368546, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.7986272858836, + "language_loss": 0.76939988, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.79053164, + "num_input_tokens_seen": 194697400, + "step": 9040, + "time_per_iteration": 2.6935248374938965 + }, + { + "auxiliary_loss_clip": 0.01020898, + "auxiliary_loss_mlp": 0.01005218, + "balance_loss_clip": 1.01508367, + "balance_loss_mlp": 1.00391817, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6664848620141901, + "language_loss": 0.52462161, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54488277, + "num_input_tokens_seen": 194761205, + "step": 9041, + "time_per_iteration": 3.0649948120117188 + }, + { + "auxiliary_loss_clip": 0.01093878, + "auxiliary_loss_mlp": 0.0103311, + "balance_loss_clip": 1.04118836, + "balance_loss_mlp": 1.02042627, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.7002406755207484, + "language_loss": 0.76177651, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78304642, + "num_input_tokens_seen": 194782445, + "step": 9042, + "time_per_iteration": 2.542037010192871 + }, + { + "auxiliary_loss_clip": 0.01080433, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.03911376, + "balance_loss_mlp": 1.01919937, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.529268964932057, + "language_loss": 0.67240989, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69352823, + "num_input_tokens_seen": 194800325, + "step": 9043, + "time_per_iteration": 2.568281888961792 + }, + { + "auxiliary_loss_clip": 0.01074086, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.03432524, + "balance_loss_mlp": 1.01793098, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.5691886471576417, + "language_loss": 0.84005636, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86110795, + "num_input_tokens_seen": 194818675, + "step": 9044, + "time_per_iteration": 2.547966241836548 + }, + { + "auxiliary_loss_clip": 0.01122706, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.04357493, + "balance_loss_mlp": 1.01798415, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 1.5053560858487776, + "language_loss": 0.62019563, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.64174271, + "num_input_tokens_seen": 194836595, + "step": 9045, + "time_per_iteration": 2.468627691268921 + }, + { + "auxiliary_loss_clip": 0.01118515, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.04222465, + "balance_loss_mlp": 1.01896119, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.459408871790347, + "language_loss": 0.69826424, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.71977103, + "num_input_tokens_seen": 194857520, + "step": 9046, + "time_per_iteration": 2.4886889457702637 + }, + { + "auxiliary_loss_clip": 0.01115289, + "auxiliary_loss_mlp": 0.01028433, + "balance_loss_clip": 1.04142058, + "balance_loss_mlp": 1.01601779, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 1.5816061748555832, + "language_loss": 0.77521676, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79665393, + "num_input_tokens_seen": 194876020, + "step": 9047, + "time_per_iteration": 2.4403438568115234 + }, + { + "auxiliary_loss_clip": 0.01091968, + "auxiliary_loss_mlp": 0.01042406, + "balance_loss_clip": 1.0382719, + "balance_loss_mlp": 1.02805948, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.7409089972993403, + "language_loss": 0.72448951, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.74583328, + "num_input_tokens_seen": 194894650, + "step": 9048, + "time_per_iteration": 2.5097286701202393 + }, + { + "auxiliary_loss_clip": 0.01069087, + "auxiliary_loss_mlp": 0.01038944, + "balance_loss_clip": 1.03419495, + "balance_loss_mlp": 1.02405453, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 1.9910600804744203, + "language_loss": 0.93381411, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95489442, + "num_input_tokens_seen": 194911935, + "step": 9049, + "time_per_iteration": 4.034209728240967 + }, + { + "auxiliary_loss_clip": 0.0110216, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.03793931, + "balance_loss_mlp": 1.01980495, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 2.088446925354752, + "language_loss": 0.74412161, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.76545954, + "num_input_tokens_seen": 194931620, + "step": 9050, + "time_per_iteration": 2.5553176403045654 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01028669, + "balance_loss_clip": 1.04258132, + "balance_loss_mlp": 1.01551974, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 2.230695566743192, + "language_loss": 0.67268705, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69405735, + "num_input_tokens_seen": 194952560, + "step": 9051, + "time_per_iteration": 3.927534818649292 + }, + { + "auxiliary_loss_clip": 0.01082031, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.03891528, + "balance_loss_mlp": 1.0208056, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 1.5937844788047941, + "language_loss": 0.67508066, + "learning_rate": 1.810810185460011e-06, + "loss": 0.69623375, + "num_input_tokens_seen": 194973915, + "step": 9052, + "time_per_iteration": 2.635026454925537 + }, + { + "auxiliary_loss_clip": 0.01119706, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.04293919, + "balance_loss_mlp": 1.01989174, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.8064645142648992, + "language_loss": 0.92955017, + "learning_rate": 1.810422473773436e-06, + "loss": 0.95107818, + "num_input_tokens_seen": 194990170, + "step": 9053, + "time_per_iteration": 2.476576805114746 + }, + { + "auxiliary_loss_clip": 0.0109285, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.04084659, + "balance_loss_mlp": 1.02219296, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 2.255512898023149, + "language_loss": 0.83863187, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85991693, + "num_input_tokens_seen": 195006395, + "step": 9054, + "time_per_iteration": 2.574559450149536 + }, + { + "auxiliary_loss_clip": 0.01089891, + "auxiliary_loss_mlp": 0.0103536, + "balance_loss_clip": 1.03971434, + "balance_loss_mlp": 1.02150166, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 2.0735788834740174, + "language_loss": 0.68450785, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.70576036, + "num_input_tokens_seen": 195025080, + "step": 9055, + "time_per_iteration": 2.5419602394104004 + }, + { + "auxiliary_loss_clip": 0.01003887, + "auxiliary_loss_mlp": 0.01006728, + "balance_loss_clip": 1.01447654, + "balance_loss_mlp": 1.0055356, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.7325403050412669, + "language_loss": 0.57673115, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59683728, + "num_input_tokens_seen": 195085725, + "step": 9056, + "time_per_iteration": 3.107933282852173 + }, + { + "auxiliary_loss_clip": 0.01081425, + "auxiliary_loss_mlp": 0.01034808, + "balance_loss_clip": 1.03968549, + "balance_loss_mlp": 1.02115858, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 1.8900938113673744, + "language_loss": 0.69672889, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71789122, + "num_input_tokens_seen": 195102585, + "step": 9057, + "time_per_iteration": 2.536865234375 + }, + { + "auxiliary_loss_clip": 0.011042, + "auxiliary_loss_mlp": 0.01036575, + "balance_loss_clip": 1.03987265, + "balance_loss_mlp": 1.02339053, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 2.084231568448756, + "language_loss": 0.74814045, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.76954818, + "num_input_tokens_seen": 195120055, + "step": 9058, + "time_per_iteration": 2.6031816005706787 + }, + { + "auxiliary_loss_clip": 0.01022878, + "auxiliary_loss_mlp": 0.01002445, + "balance_loss_clip": 1.03018928, + "balance_loss_mlp": 1.00113988, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.7891625960564215, + "language_loss": 0.62608814, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64634132, + "num_input_tokens_seen": 195181045, + "step": 9059, + "time_per_iteration": 3.2016448974609375 + }, + { + "auxiliary_loss_clip": 0.01105476, + "auxiliary_loss_mlp": 0.01035011, + "balance_loss_clip": 1.04072547, + "balance_loss_mlp": 1.02174878, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.7602327041229548, + "language_loss": 0.79458493, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81598985, + "num_input_tokens_seen": 195198840, + "step": 9060, + "time_per_iteration": 2.474339723587036 + }, + { + "auxiliary_loss_clip": 0.01108202, + "auxiliary_loss_mlp": 0.01034172, + "balance_loss_clip": 1.04040456, + "balance_loss_mlp": 1.02041507, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 2.0454414774652445, + "language_loss": 0.7960031, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.8174268, + "num_input_tokens_seen": 195218720, + "step": 9061, + "time_per_iteration": 2.514564275741577 + }, + { + "auxiliary_loss_clip": 0.0110595, + "auxiliary_loss_mlp": 0.01029024, + "balance_loss_clip": 1.04192817, + "balance_loss_mlp": 1.01636362, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.617909691956428, + "language_loss": 0.87140167, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.8927514, + "num_input_tokens_seen": 195235770, + "step": 9062, + "time_per_iteration": 4.0412561893463135 + }, + { + "auxiliary_loss_clip": 0.01094862, + "auxiliary_loss_mlp": 0.01039185, + "balance_loss_clip": 1.03947031, + "balance_loss_mlp": 1.02322268, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.752689680406049, + "language_loss": 0.82161796, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84295839, + "num_input_tokens_seen": 195254870, + "step": 9063, + "time_per_iteration": 2.5153136253356934 + }, + { + "auxiliary_loss_clip": 0.01117662, + "auxiliary_loss_mlp": 0.01036631, + "balance_loss_clip": 1.04068995, + "balance_loss_mlp": 1.0224216, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 1.6726697561653534, + "language_loss": 0.63886184, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.6604048, + "num_input_tokens_seen": 195273390, + "step": 9064, + "time_per_iteration": 2.4549875259399414 + }, + { + "auxiliary_loss_clip": 0.01122019, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.04322398, + "balance_loss_mlp": 1.01878119, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.7260763406910111, + "language_loss": 0.80192947, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.82348269, + "num_input_tokens_seen": 195295635, + "step": 9065, + "time_per_iteration": 2.536384344100952 + }, + { + "auxiliary_loss_clip": 0.01080658, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.03806341, + "balance_loss_mlp": 1.02019691, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 2.008307075802582, + "language_loss": 0.78218204, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80331123, + "num_input_tokens_seen": 195312545, + "step": 9066, + "time_per_iteration": 2.535959243774414 + }, + { + "auxiliary_loss_clip": 0.01105939, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.03986418, + "balance_loss_mlp": 1.01823354, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 2.595918914716557, + "language_loss": 0.7582323, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.77961141, + "num_input_tokens_seen": 195332955, + "step": 9067, + "time_per_iteration": 2.5060200691223145 + }, + { + "auxiliary_loss_clip": 0.01085609, + "auxiliary_loss_mlp": 0.01039591, + "balance_loss_clip": 1.04196668, + "balance_loss_mlp": 1.02341473, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 2.10697195476301, + "language_loss": 0.63929844, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.66055048, + "num_input_tokens_seen": 195355930, + "step": 9068, + "time_per_iteration": 2.6957337856292725 + }, + { + "auxiliary_loss_clip": 0.01079267, + "auxiliary_loss_mlp": 0.01038947, + "balance_loss_clip": 1.04342294, + "balance_loss_mlp": 1.02602434, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.5600711190904437, + "language_loss": 0.72444654, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74562865, + "num_input_tokens_seen": 195376445, + "step": 9069, + "time_per_iteration": 2.6031429767608643 + }, + { + "auxiliary_loss_clip": 0.01116853, + "auxiliary_loss_mlp": 0.01028989, + "balance_loss_clip": 1.04409575, + "balance_loss_mlp": 1.01727033, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 1.909572190195641, + "language_loss": 0.73602444, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.75748289, + "num_input_tokens_seen": 195393725, + "step": 9070, + "time_per_iteration": 2.4630160331726074 + }, + { + "auxiliary_loss_clip": 0.01100389, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.03927195, + "balance_loss_mlp": 1.01926994, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 1.728067610319141, + "language_loss": 0.60916376, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.63049346, + "num_input_tokens_seen": 195411380, + "step": 9071, + "time_per_iteration": 2.5674450397491455 + }, + { + "auxiliary_loss_clip": 0.0103673, + "auxiliary_loss_mlp": 0.01008913, + "balance_loss_clip": 1.0112921, + "balance_loss_mlp": 1.00785184, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.7029958887458033, + "language_loss": 0.57096499, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59142137, + "num_input_tokens_seen": 195482015, + "step": 9072, + "time_per_iteration": 3.1334991455078125 + }, + { + "auxiliary_loss_clip": 0.01086374, + "auxiliary_loss_mlp": 0.01035582, + "balance_loss_clip": 1.03635526, + "balance_loss_mlp": 1.02147961, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1.9603807946327447, + "language_loss": 0.69697481, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.71819437, + "num_input_tokens_seen": 195500440, + "step": 9073, + "time_per_iteration": 2.5086681842803955 + }, + { + "auxiliary_loss_clip": 0.01090609, + "auxiliary_loss_mlp": 0.01035381, + "balance_loss_clip": 1.03710735, + "balance_loss_mlp": 1.02345467, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 1.7058093675047938, + "language_loss": 0.71336341, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73462331, + "num_input_tokens_seen": 195520860, + "step": 9074, + "time_per_iteration": 2.5629935264587402 + }, + { + "auxiliary_loss_clip": 0.01103203, + "auxiliary_loss_mlp": 0.01035529, + "balance_loss_clip": 1.03893256, + "balance_loss_mlp": 1.0223031, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 2.6146628613393816, + "language_loss": 0.68899393, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.71038115, + "num_input_tokens_seen": 195538615, + "step": 9075, + "time_per_iteration": 2.5431716442108154 + }, + { + "auxiliary_loss_clip": 0.01104429, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.04168296, + "balance_loss_mlp": 1.01895261, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 1.6926721136956269, + "language_loss": 0.80549955, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.8268503, + "num_input_tokens_seen": 195557460, + "step": 9076, + "time_per_iteration": 2.5223312377929688 + }, + { + "auxiliary_loss_clip": 0.011058, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.03995824, + "balance_loss_mlp": 1.01967537, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.6555350372595592, + "language_loss": 0.79970592, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82108682, + "num_input_tokens_seen": 195577985, + "step": 9077, + "time_per_iteration": 4.047502756118774 + }, + { + "auxiliary_loss_clip": 0.0110051, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.03917205, + "balance_loss_mlp": 1.01942587, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 1.894238518577005, + "language_loss": 0.67647582, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.69780362, + "num_input_tokens_seen": 195597620, + "step": 9078, + "time_per_iteration": 2.550739049911499 + }, + { + "auxiliary_loss_clip": 0.01109873, + "auxiliary_loss_mlp": 0.01034241, + "balance_loss_clip": 1.04154336, + "balance_loss_mlp": 1.02065682, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 1.8468251683262842, + "language_loss": 0.80614364, + "learning_rate": 1.800344536188764e-06, + "loss": 0.82758474, + "num_input_tokens_seen": 195615910, + "step": 9079, + "time_per_iteration": 2.5157032012939453 + }, + { + "auxiliary_loss_clip": 0.01121227, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.04185867, + "balance_loss_mlp": 1.02120781, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 1.6250412989172773, + "language_loss": 0.75791788, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77948761, + "num_input_tokens_seen": 195635620, + "step": 9080, + "time_per_iteration": 2.509351968765259 + }, + { + "auxiliary_loss_clip": 0.01083773, + "auxiliary_loss_mlp": 0.01033218, + "balance_loss_clip": 1.04247427, + "balance_loss_mlp": 1.0196104, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 2.5313179506053665, + "language_loss": 0.83466905, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85583901, + "num_input_tokens_seen": 195652495, + "step": 9081, + "time_per_iteration": 2.5934481620788574 + }, + { + "auxiliary_loss_clip": 0.01119761, + "auxiliary_loss_mlp": 0.01029103, + "balance_loss_clip": 1.04099703, + "balance_loss_mlp": 1.01591873, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.587279804384638, + "language_loss": 0.6961087, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.71759737, + "num_input_tokens_seen": 195671965, + "step": 9082, + "time_per_iteration": 2.4794790744781494 + }, + { + "auxiliary_loss_clip": 0.0111162, + "auxiliary_loss_mlp": 0.01026862, + "balance_loss_clip": 1.03848565, + "balance_loss_mlp": 1.01380253, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.5365838081598642, + "language_loss": 0.66570497, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.6870898, + "num_input_tokens_seen": 195694725, + "step": 9083, + "time_per_iteration": 2.6104798316955566 + }, + { + "auxiliary_loss_clip": 0.01088187, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.03902233, + "balance_loss_mlp": 1.01745641, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.5591530590376184, + "language_loss": 0.78839791, + "learning_rate": 1.798407050044766e-06, + "loss": 0.8095783, + "num_input_tokens_seen": 195714090, + "step": 9084, + "time_per_iteration": 2.581435441970825 + }, + { + "auxiliary_loss_clip": 0.01106717, + "auxiliary_loss_mlp": 0.0103006, + "balance_loss_clip": 1.04016316, + "balance_loss_mlp": 1.01772213, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 1.6391898469510031, + "language_loss": 0.74987864, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77124643, + "num_input_tokens_seen": 195733585, + "step": 9085, + "time_per_iteration": 2.5202181339263916 + }, + { + "auxiliary_loss_clip": 0.0109495, + "auxiliary_loss_mlp": 0.01034675, + "balance_loss_clip": 1.03759813, + "balance_loss_mlp": 1.02146649, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 2.6708084047764484, + "language_loss": 0.74579656, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.76709282, + "num_input_tokens_seen": 195752820, + "step": 9086, + "time_per_iteration": 2.5556633472442627 + }, + { + "auxiliary_loss_clip": 0.01105921, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.04442954, + "balance_loss_mlp": 1.01579285, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.858594621293695, + "language_loss": 0.77230805, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79365206, + "num_input_tokens_seen": 195773740, + "step": 9087, + "time_per_iteration": 2.57173490524292 + }, + { + "auxiliary_loss_clip": 0.01107877, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.04070973, + "balance_loss_mlp": 1.02658439, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.7475981304184451, + "language_loss": 0.77945274, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.80093998, + "num_input_tokens_seen": 195792125, + "step": 9088, + "time_per_iteration": 3.9667201042175293 + }, + { + "auxiliary_loss_clip": 0.00990799, + "auxiliary_loss_mlp": 0.01006302, + "balance_loss_clip": 1.02214718, + "balance_loss_mlp": 1.00496662, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7256127711200637, + "language_loss": 0.57723439, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.5972054, + "num_input_tokens_seen": 195854935, + "step": 9089, + "time_per_iteration": 3.3960487842559814 + }, + { + "auxiliary_loss_clip": 0.01080469, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.03545165, + "balance_loss_mlp": 1.02040172, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 2.303141692724602, + "language_loss": 0.76927555, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.79041648, + "num_input_tokens_seen": 195874715, + "step": 9090, + "time_per_iteration": 4.38240385055542 + }, + { + "auxiliary_loss_clip": 0.01102576, + "auxiliary_loss_mlp": 0.01042154, + "balance_loss_clip": 1.03686881, + "balance_loss_mlp": 1.02724147, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 1.9385398322316094, + "language_loss": 0.74510801, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.76655531, + "num_input_tokens_seen": 195892610, + "step": 9091, + "time_per_iteration": 2.6056618690490723 + }, + { + "auxiliary_loss_clip": 0.01099007, + "auxiliary_loss_mlp": 0.01040081, + "balance_loss_clip": 1.04156756, + "balance_loss_mlp": 1.02673006, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.8631650745749775, + "language_loss": 0.78329861, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.80468947, + "num_input_tokens_seen": 195911085, + "step": 9092, + "time_per_iteration": 2.5692572593688965 + }, + { + "auxiliary_loss_clip": 0.01118386, + "auxiliary_loss_mlp": 0.01034949, + "balance_loss_clip": 1.04047203, + "balance_loss_mlp": 1.02091837, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 2.1848289904732026, + "language_loss": 0.7534014, + "learning_rate": 1.794920057818476e-06, + "loss": 0.77493477, + "num_input_tokens_seen": 195929845, + "step": 9093, + "time_per_iteration": 2.437397003173828 + }, + { + "auxiliary_loss_clip": 0.01107541, + "auxiliary_loss_mlp": 0.01035416, + "balance_loss_clip": 1.03831017, + "balance_loss_mlp": 1.02063358, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 2.47315706242933, + "language_loss": 0.68827981, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.70970935, + "num_input_tokens_seen": 195946350, + "step": 9094, + "time_per_iteration": 2.498081684112549 + }, + { + "auxiliary_loss_clip": 0.01096584, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.04172862, + "balance_loss_mlp": 1.02101076, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 2.791637293165317, + "language_loss": 0.67918885, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.70048821, + "num_input_tokens_seen": 195959840, + "step": 9095, + "time_per_iteration": 2.5432376861572266 + }, + { + "auxiliary_loss_clip": 0.01081157, + "auxiliary_loss_mlp": 0.01036509, + "balance_loss_clip": 1.04041123, + "balance_loss_mlp": 1.02455235, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.6299039721037056, + "language_loss": 0.66538858, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.68656522, + "num_input_tokens_seen": 195981125, + "step": 9096, + "time_per_iteration": 2.6109724044799805 + }, + { + "auxiliary_loss_clip": 0.01011775, + "auxiliary_loss_mlp": 0.01003995, + "balance_loss_clip": 1.01351559, + "balance_loss_mlp": 1.00286222, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.738005427499236, + "language_loss": 0.57543331, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59559101, + "num_input_tokens_seen": 196038880, + "step": 9097, + "time_per_iteration": 3.2659971714019775 + }, + { + "auxiliary_loss_clip": 0.01038376, + "auxiliary_loss_mlp": 0.01000251, + "balance_loss_clip": 1.02757049, + "balance_loss_mlp": 0.9989875, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9084254325376868, + "language_loss": 0.64743853, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.6678248, + "num_input_tokens_seen": 196099215, + "step": 9098, + "time_per_iteration": 3.0056493282318115 + }, + { + "auxiliary_loss_clip": 0.01108314, + "auxiliary_loss_mlp": 0.01040052, + "balance_loss_clip": 1.04011679, + "balance_loss_mlp": 1.02658141, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 2.984623768745805, + "language_loss": 0.73477781, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75626153, + "num_input_tokens_seen": 196120370, + "step": 9099, + "time_per_iteration": 2.5574231147766113 + }, + { + "auxiliary_loss_clip": 0.01091775, + "auxiliary_loss_mlp": 0.01033608, + "balance_loss_clip": 1.0395577, + "balance_loss_mlp": 1.02181256, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.7626508583066285, + "language_loss": 0.72341287, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.7446667, + "num_input_tokens_seen": 196139075, + "step": 9100, + "time_per_iteration": 2.5990545749664307 + }, + { + "auxiliary_loss_clip": 0.01098564, + "auxiliary_loss_mlp": 0.00778006, + "balance_loss_clip": 1.03804183, + "balance_loss_mlp": 1.00050116, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 1.600064196409353, + "language_loss": 0.67562419, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.69438994, + "num_input_tokens_seen": 196159990, + "step": 9101, + "time_per_iteration": 2.618515968322754 + }, + { + "auxiliary_loss_clip": 0.01117881, + "auxiliary_loss_mlp": 0.01033438, + "balance_loss_clip": 1.04132891, + "balance_loss_mlp": 1.02014613, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 1.8279016812901012, + "language_loss": 0.77826166, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.79977489, + "num_input_tokens_seen": 196180570, + "step": 9102, + "time_per_iteration": 3.999450922012329 + }, + { + "auxiliary_loss_clip": 0.01083312, + "auxiliary_loss_mlp": 0.01038635, + "balance_loss_clip": 1.03858352, + "balance_loss_mlp": 1.0250634, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.5498565358355725, + "language_loss": 0.72191298, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74313241, + "num_input_tokens_seen": 196200300, + "step": 9103, + "time_per_iteration": 2.583484172821045 + }, + { + "auxiliary_loss_clip": 0.01080912, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.03785479, + "balance_loss_mlp": 1.01891077, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.262450183206754, + "language_loss": 0.65676373, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.67788798, + "num_input_tokens_seen": 196228525, + "step": 9104, + "time_per_iteration": 2.840561628341675 + }, + { + "auxiliary_loss_clip": 0.01103224, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.04129589, + "balance_loss_mlp": 1.01854873, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.7408767094865505, + "language_loss": 0.81442159, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83577794, + "num_input_tokens_seen": 196247690, + "step": 9105, + "time_per_iteration": 2.493154525756836 + }, + { + "auxiliary_loss_clip": 0.01113804, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.03906465, + "balance_loss_mlp": 1.01835442, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.6555005225863288, + "language_loss": 0.80694574, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82838863, + "num_input_tokens_seen": 196268555, + "step": 9106, + "time_per_iteration": 2.4877054691314697 + }, + { + "auxiliary_loss_clip": 0.01105646, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.04004073, + "balance_loss_mlp": 1.02254546, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 1.913084479988653, + "language_loss": 0.69445604, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71585894, + "num_input_tokens_seen": 196285585, + "step": 9107, + "time_per_iteration": 2.436495304107666 + }, + { + "auxiliary_loss_clip": 0.01108692, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.03984332, + "balance_loss_mlp": 1.01862025, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 1.834317841816304, + "language_loss": 0.63570476, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65710622, + "num_input_tokens_seen": 196305085, + "step": 9108, + "time_per_iteration": 2.4891598224639893 + }, + { + "auxiliary_loss_clip": 0.01113689, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.03898191, + "balance_loss_mlp": 1.01943946, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.8931104655526947, + "language_loss": 0.7508353, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77229202, + "num_input_tokens_seen": 196323945, + "step": 9109, + "time_per_iteration": 2.4471399784088135 + }, + { + "auxiliary_loss_clip": 0.01085764, + "auxiliary_loss_mlp": 0.01035552, + "balance_loss_clip": 1.03822112, + "balance_loss_mlp": 1.02164602, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 2.090752762874654, + "language_loss": 0.77305746, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.79427063, + "num_input_tokens_seen": 196342200, + "step": 9110, + "time_per_iteration": 2.524799346923828 + }, + { + "auxiliary_loss_clip": 0.01098424, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.03757334, + "balance_loss_mlp": 1.02447963, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.6453499484418048, + "language_loss": 0.71195185, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.7333045, + "num_input_tokens_seen": 196362940, + "step": 9111, + "time_per_iteration": 2.532341241836548 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.0103814, + "balance_loss_clip": 1.03801739, + "balance_loss_mlp": 1.02556968, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 1.5708025229136655, + "language_loss": 0.7111752, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73258579, + "num_input_tokens_seen": 196383070, + "step": 9112, + "time_per_iteration": 2.491527557373047 + }, + { + "auxiliary_loss_clip": 0.01063511, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.03749204, + "balance_loss_mlp": 1.0194912, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 2.106109233701406, + "language_loss": 0.87545246, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.89640844, + "num_input_tokens_seen": 196398485, + "step": 9113, + "time_per_iteration": 2.563586950302124 + }, + { + "auxiliary_loss_clip": 0.01068971, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.04083657, + "balance_loss_mlp": 1.01605475, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 1.8262083479260478, + "language_loss": 0.73062158, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75160921, + "num_input_tokens_seen": 196417725, + "step": 9114, + "time_per_iteration": 2.645136594772339 + }, + { + "auxiliary_loss_clip": 0.01089049, + "auxiliary_loss_mlp": 0.00778895, + "balance_loss_clip": 1.03517735, + "balance_loss_mlp": 1.00048113, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 1.699455994830301, + "language_loss": 0.72309041, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74176985, + "num_input_tokens_seen": 196437840, + "step": 9115, + "time_per_iteration": 2.554471254348755 + }, + { + "auxiliary_loss_clip": 0.01079076, + "auxiliary_loss_mlp": 0.00784086, + "balance_loss_clip": 1.03540432, + "balance_loss_mlp": 1.00055194, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 3.0392767638638833, + "language_loss": 0.71745825, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.73608983, + "num_input_tokens_seen": 196457300, + "step": 9116, + "time_per_iteration": 2.570301055908203 + }, + { + "auxiliary_loss_clip": 0.0109202, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.03866792, + "balance_loss_mlp": 1.0210259, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 1.831736325316943, + "language_loss": 0.76199234, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78325313, + "num_input_tokens_seen": 196476720, + "step": 9117, + "time_per_iteration": 4.118849277496338 + }, + { + "auxiliary_loss_clip": 0.01072489, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.03727889, + "balance_loss_mlp": 1.02071762, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.608966523139845, + "language_loss": 0.62744117, + "learning_rate": 1.785237306671674e-06, + "loss": 0.64849943, + "num_input_tokens_seen": 196496765, + "step": 9118, + "time_per_iteration": 2.679600954055786 + }, + { + "auxiliary_loss_clip": 0.0111964, + "auxiliary_loss_mlp": 0.01034913, + "balance_loss_clip": 1.04235554, + "balance_loss_mlp": 1.02076876, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 1.8077619388272952, + "language_loss": 0.79115003, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81269562, + "num_input_tokens_seen": 196516220, + "step": 9119, + "time_per_iteration": 2.437008857727051 + }, + { + "auxiliary_loss_clip": 0.01088306, + "auxiliary_loss_mlp": 0.00777235, + "balance_loss_clip": 1.04052639, + "balance_loss_mlp": 1.00047243, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.6639332567077498, + "language_loss": 0.8252663, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84392166, + "num_input_tokens_seen": 196533860, + "step": 9120, + "time_per_iteration": 2.564718246459961 + }, + { + "auxiliary_loss_clip": 0.01086781, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.04268312, + "balance_loss_mlp": 1.02075052, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 1.8427964101374965, + "language_loss": 0.80481839, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82602894, + "num_input_tokens_seen": 196551305, + "step": 9121, + "time_per_iteration": 2.5649173259735107 + }, + { + "auxiliary_loss_clip": 0.01072688, + "auxiliary_loss_mlp": 0.01037771, + "balance_loss_clip": 1.03675938, + "balance_loss_mlp": 1.02424037, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 1.7488541856164557, + "language_loss": 0.6105932, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63169777, + "num_input_tokens_seen": 196569420, + "step": 9122, + "time_per_iteration": 2.602536201477051 + }, + { + "auxiliary_loss_clip": 0.01098432, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.043926, + "balance_loss_mlp": 1.01874411, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.7573495670477277, + "language_loss": 0.71690816, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.73819005, + "num_input_tokens_seen": 196590610, + "step": 9123, + "time_per_iteration": 2.5634117126464844 + }, + { + "auxiliary_loss_clip": 0.01116408, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.04015279, + "balance_loss_mlp": 1.0193882, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 1.9136437632850238, + "language_loss": 0.83482903, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.85631156, + "num_input_tokens_seen": 196606495, + "step": 9124, + "time_per_iteration": 2.398930549621582 + }, + { + "auxiliary_loss_clip": 0.01095095, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.04533219, + "balance_loss_mlp": 1.01855481, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.5351450925972767, + "language_loss": 0.80110234, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82236671, + "num_input_tokens_seen": 196626365, + "step": 9125, + "time_per_iteration": 2.5828540325164795 + }, + { + "auxiliary_loss_clip": 0.01107175, + "auxiliary_loss_mlp": 0.01027021, + "balance_loss_clip": 1.04084373, + "balance_loss_mlp": 1.01386058, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 3.0606151099691106, + "language_loss": 0.75035864, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.77170062, + "num_input_tokens_seen": 196644465, + "step": 9126, + "time_per_iteration": 2.4639127254486084 + }, + { + "auxiliary_loss_clip": 0.01105443, + "auxiliary_loss_mlp": 0.01036681, + "balance_loss_clip": 1.03810513, + "balance_loss_mlp": 1.02152359, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.3772695154877743, + "language_loss": 0.67038572, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.69180691, + "num_input_tokens_seen": 196659160, + "step": 9127, + "time_per_iteration": 2.425903558731079 + }, + { + "auxiliary_loss_clip": 0.01076585, + "auxiliary_loss_mlp": 0.01038997, + "balance_loss_clip": 1.03655708, + "balance_loss_mlp": 1.0248524, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.8447291734302858, + "language_loss": 0.83357036, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85472608, + "num_input_tokens_seen": 196677410, + "step": 9128, + "time_per_iteration": 3.9779253005981445 + }, + { + "auxiliary_loss_clip": 0.01072459, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.03464818, + "balance_loss_mlp": 1.02082539, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 2.0017747803100794, + "language_loss": 0.73788595, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.75896126, + "num_input_tokens_seen": 196696765, + "step": 9129, + "time_per_iteration": 3.969512462615967 + }, + { + "auxiliary_loss_clip": 0.01080618, + "auxiliary_loss_mlp": 0.01032104, + "balance_loss_clip": 1.04056239, + "balance_loss_mlp": 1.01736975, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 2.2532207184077753, + "language_loss": 0.62545079, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.64657807, + "num_input_tokens_seen": 196714895, + "step": 9130, + "time_per_iteration": 2.51059627532959 + }, + { + "auxiliary_loss_clip": 0.01119053, + "auxiliary_loss_mlp": 0.00779855, + "balance_loss_clip": 1.04100037, + "balance_loss_mlp": 1.00054634, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 1.9105519234002244, + "language_loss": 0.62973791, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.64872694, + "num_input_tokens_seen": 196735510, + "step": 9131, + "time_per_iteration": 2.4850692749023438 + }, + { + "auxiliary_loss_clip": 0.01106628, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.03817344, + "balance_loss_mlp": 1.02033734, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.7730810477569552, + "language_loss": 0.74746978, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76888829, + "num_input_tokens_seen": 196752855, + "step": 9132, + "time_per_iteration": 2.4604263305664062 + }, + { + "auxiliary_loss_clip": 0.01103407, + "auxiliary_loss_mlp": 0.01033218, + "balance_loss_clip": 1.03541756, + "balance_loss_mlp": 1.02043891, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.5455535841392838, + "language_loss": 0.81355548, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83492172, + "num_input_tokens_seen": 196772230, + "step": 9133, + "time_per_iteration": 2.5067145824432373 + }, + { + "auxiliary_loss_clip": 0.01095738, + "auxiliary_loss_mlp": 0.00779809, + "balance_loss_clip": 1.04185629, + "balance_loss_mlp": 1.00050223, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 2.8572169195949657, + "language_loss": 0.70104015, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.7197957, + "num_input_tokens_seen": 196790405, + "step": 9134, + "time_per_iteration": 2.5060536861419678 + }, + { + "auxiliary_loss_clip": 0.01083428, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.03891206, + "balance_loss_mlp": 1.01834941, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 2.3247278216553227, + "language_loss": 0.60963535, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63077718, + "num_input_tokens_seen": 196813785, + "step": 9135, + "time_per_iteration": 2.798015594482422 + }, + { + "auxiliary_loss_clip": 0.01108389, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.04058135, + "balance_loss_mlp": 1.02442193, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 1.621284163673247, + "language_loss": 0.72280502, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74427259, + "num_input_tokens_seen": 196834390, + "step": 9136, + "time_per_iteration": 2.510739326477051 + }, + { + "auxiliary_loss_clip": 0.01055392, + "auxiliary_loss_mlp": 0.01038038, + "balance_loss_clip": 1.0366801, + "balance_loss_mlp": 1.02279103, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 2.150397847710687, + "language_loss": 0.6840117, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70494604, + "num_input_tokens_seen": 196853290, + "step": 9137, + "time_per_iteration": 2.623727560043335 + }, + { + "auxiliary_loss_clip": 0.01029173, + "auxiliary_loss_mlp": 0.01004429, + "balance_loss_clip": 1.01301086, + "balance_loss_mlp": 1.00317121, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.738408896043037, + "language_loss": 0.65278959, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67312562, + "num_input_tokens_seen": 196913120, + "step": 9138, + "time_per_iteration": 3.08752179145813 + }, + { + "auxiliary_loss_clip": 0.01105975, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.04013109, + "balance_loss_mlp": 1.01946318, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 1.721759196132981, + "language_loss": 0.7530849, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.7744785, + "num_input_tokens_seen": 196931530, + "step": 9139, + "time_per_iteration": 2.4891281127929688 + }, + { + "auxiliary_loss_clip": 0.01106773, + "auxiliary_loss_mlp": 0.01027908, + "balance_loss_clip": 1.04008865, + "balance_loss_mlp": 1.01459229, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 2.3467820454198414, + "language_loss": 0.71452582, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73587263, + "num_input_tokens_seen": 196949430, + "step": 9140, + "time_per_iteration": 2.438797950744629 + }, + { + "auxiliary_loss_clip": 0.01090437, + "auxiliary_loss_mlp": 0.01038885, + "balance_loss_clip": 1.03823698, + "balance_loss_mlp": 1.02468073, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 1.5769503163687173, + "language_loss": 0.76217079, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78346395, + "num_input_tokens_seen": 196968265, + "step": 9141, + "time_per_iteration": 4.071706533432007 + }, + { + "auxiliary_loss_clip": 0.01077993, + "auxiliary_loss_mlp": 0.01031494, + "balance_loss_clip": 1.03779268, + "balance_loss_mlp": 1.01802313, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 3.5262210632840434, + "language_loss": 0.75118554, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.77228034, + "num_input_tokens_seen": 196984930, + "step": 9142, + "time_per_iteration": 2.5213260650634766 + }, + { + "auxiliary_loss_clip": 0.01095864, + "auxiliary_loss_mlp": 0.01039581, + "balance_loss_clip": 1.04196513, + "balance_loss_mlp": 1.02478075, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 2.2130451407335676, + "language_loss": 0.7688356, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.79019004, + "num_input_tokens_seen": 197002320, + "step": 9143, + "time_per_iteration": 2.5143415927886963 + }, + { + "auxiliary_loss_clip": 0.01088321, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.03633654, + "balance_loss_mlp": 1.02380419, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 2.3315388845196683, + "language_loss": 0.7925086, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.81377703, + "num_input_tokens_seen": 197020825, + "step": 9144, + "time_per_iteration": 2.5635287761688232 + }, + { + "auxiliary_loss_clip": 0.01107282, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.04062295, + "balance_loss_mlp": 1.01655817, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 1.7750501888665176, + "language_loss": 0.71220309, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73357707, + "num_input_tokens_seen": 197040450, + "step": 9145, + "time_per_iteration": 2.609837770462036 + }, + { + "auxiliary_loss_clip": 0.01105179, + "auxiliary_loss_mlp": 0.01038867, + "balance_loss_clip": 1.03844261, + "balance_loss_mlp": 1.02624846, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.5784478187536122, + "language_loss": 0.70195919, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72339964, + "num_input_tokens_seen": 197063930, + "step": 9146, + "time_per_iteration": 2.6340017318725586 + }, + { + "auxiliary_loss_clip": 0.01093216, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.03790307, + "balance_loss_mlp": 1.01970375, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.9021895845585632, + "language_loss": 0.64284205, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66409314, + "num_input_tokens_seen": 197082660, + "step": 9147, + "time_per_iteration": 2.5407562255859375 + }, + { + "auxiliary_loss_clip": 0.01118328, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.0420177, + "balance_loss_mlp": 1.02191019, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 2.005561458294525, + "language_loss": 0.80791456, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.82944727, + "num_input_tokens_seen": 197100675, + "step": 9148, + "time_per_iteration": 2.4265477657318115 + }, + { + "auxiliary_loss_clip": 0.01094073, + "auxiliary_loss_mlp": 0.01037901, + "balance_loss_clip": 1.03888357, + "balance_loss_mlp": 1.02508557, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 2.0276907510681923, + "language_loss": 0.78787553, + "learning_rate": 1.773237789559453e-06, + "loss": 0.80919522, + "num_input_tokens_seen": 197121320, + "step": 9149, + "time_per_iteration": 2.5356132984161377 + }, + { + "auxiliary_loss_clip": 0.01083523, + "auxiliary_loss_mlp": 0.01030848, + "balance_loss_clip": 1.04270065, + "balance_loss_mlp": 1.01780057, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 1.990246946373643, + "language_loss": 0.71891922, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74006289, + "num_input_tokens_seen": 197138965, + "step": 9150, + "time_per_iteration": 2.5562422275543213 + }, + { + "auxiliary_loss_clip": 0.01098412, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.03697562, + "balance_loss_mlp": 1.01520419, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 2.6785010730275243, + "language_loss": 0.75028467, + "learning_rate": 1.772463906245477e-06, + "loss": 0.77156746, + "num_input_tokens_seen": 197156460, + "step": 9151, + "time_per_iteration": 2.596449613571167 + }, + { + "auxiliary_loss_clip": 0.01093745, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.04185796, + "balance_loss_mlp": 1.01677477, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 1.9818561647717376, + "language_loss": 0.76722634, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.7884655, + "num_input_tokens_seen": 197175140, + "step": 9152, + "time_per_iteration": 2.5623090267181396 + }, + { + "auxiliary_loss_clip": 0.01092609, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.03945243, + "balance_loss_mlp": 1.02146626, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 2.17604482513302, + "language_loss": 0.82853448, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84980094, + "num_input_tokens_seen": 197194345, + "step": 9153, + "time_per_iteration": 2.5895745754241943 + }, + { + "auxiliary_loss_clip": 0.01105398, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.04132116, + "balance_loss_mlp": 1.02054644, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 1.7448215628126078, + "language_loss": 0.74097008, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.76236272, + "num_input_tokens_seen": 197215535, + "step": 9154, + "time_per_iteration": 2.5770320892333984 + }, + { + "auxiliary_loss_clip": 0.0110139, + "auxiliary_loss_mlp": 0.01036955, + "balance_loss_clip": 1.03950894, + "balance_loss_mlp": 1.0220542, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.6040906212417723, + "language_loss": 0.72715348, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74853694, + "num_input_tokens_seen": 197234945, + "step": 9155, + "time_per_iteration": 2.5813794136047363 + }, + { + "auxiliary_loss_clip": 0.01019778, + "auxiliary_loss_mlp": 0.01006418, + "balance_loss_clip": 1.01199257, + "balance_loss_mlp": 1.00525022, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7558752093278022, + "language_loss": 0.55362141, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57388335, + "num_input_tokens_seen": 197302285, + "step": 9156, + "time_per_iteration": 4.678028106689453 + }, + { + "auxiliary_loss_clip": 0.01105708, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.03867173, + "balance_loss_mlp": 1.02353179, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.6879531568231816, + "language_loss": 0.82917577, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.85060465, + "num_input_tokens_seen": 197321575, + "step": 9157, + "time_per_iteration": 2.476391553878784 + }, + { + "auxiliary_loss_clip": 0.01124945, + "auxiliary_loss_mlp": 0.01034888, + "balance_loss_clip": 1.04426289, + "balance_loss_mlp": 1.02002263, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 2.1893102699275193, + "language_loss": 0.75811672, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77971506, + "num_input_tokens_seen": 197340255, + "step": 9158, + "time_per_iteration": 2.533538579940796 + }, + { + "auxiliary_loss_clip": 0.01084393, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.04571247, + "balance_loss_mlp": 1.02116609, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.5500404014963354, + "language_loss": 0.69946766, + "learning_rate": 1.769368719290979e-06, + "loss": 0.72065622, + "num_input_tokens_seen": 197360360, + "step": 9159, + "time_per_iteration": 2.6505727767944336 + }, + { + "auxiliary_loss_clip": 0.01072303, + "auxiliary_loss_mlp": 0.00781035, + "balance_loss_clip": 1.03583479, + "balance_loss_mlp": 1.00068808, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 1.5153440413151205, + "language_loss": 0.68243515, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.7009685, + "num_input_tokens_seen": 197381905, + "step": 9160, + "time_per_iteration": 2.670353889465332 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.04089808, + "balance_loss_mlp": 1.01799345, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 1.9684035863285854, + "language_loss": 0.7110492, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.7325114, + "num_input_tokens_seen": 197398555, + "step": 9161, + "time_per_iteration": 2.4335145950317383 + }, + { + "auxiliary_loss_clip": 0.01105933, + "auxiliary_loss_mlp": 0.01042135, + "balance_loss_clip": 1.04021287, + "balance_loss_mlp": 1.02861023, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 1.5774414850628562, + "language_loss": 0.69455498, + "learning_rate": 1.768208168081359e-06, + "loss": 0.7160356, + "num_input_tokens_seen": 197419630, + "step": 9162, + "time_per_iteration": 2.521819591522217 + }, + { + "auxiliary_loss_clip": 0.01116622, + "auxiliary_loss_mlp": 0.01040321, + "balance_loss_clip": 1.0411334, + "balance_loss_mlp": 1.02647448, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.6553985364701462, + "language_loss": 0.85569215, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87726164, + "num_input_tokens_seen": 197438480, + "step": 9163, + "time_per_iteration": 2.5141422748565674 + }, + { + "auxiliary_loss_clip": 0.01086386, + "auxiliary_loss_mlp": 0.01030542, + "balance_loss_clip": 1.04094076, + "balance_loss_mlp": 1.01815653, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 1.600719152211844, + "language_loss": 0.80660868, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.82777798, + "num_input_tokens_seen": 197456755, + "step": 9164, + "time_per_iteration": 2.587387800216675 + }, + { + "auxiliary_loss_clip": 0.01101675, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.04425907, + "balance_loss_mlp": 1.01598656, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 5.563106636748619, + "language_loss": 0.73255289, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75386882, + "num_input_tokens_seen": 197475530, + "step": 9165, + "time_per_iteration": 2.5309908390045166 + }, + { + "auxiliary_loss_clip": 0.01098258, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.03604424, + "balance_loss_mlp": 1.0234741, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 1.8399017242598643, + "language_loss": 0.78852338, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.80988455, + "num_input_tokens_seen": 197490835, + "step": 9166, + "time_per_iteration": 2.514575481414795 + }, + { + "auxiliary_loss_clip": 0.0108733, + "auxiliary_loss_mlp": 0.01028479, + "balance_loss_clip": 1.03898525, + "balance_loss_mlp": 1.01568174, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.063875918047984, + "language_loss": 0.76256746, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78372562, + "num_input_tokens_seen": 197508770, + "step": 9167, + "time_per_iteration": 4.028958797454834 + }, + { + "auxiliary_loss_clip": 0.01106141, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.0403254, + "balance_loss_mlp": 1.01353669, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 2.3393596160960572, + "language_loss": 0.79977405, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.82110852, + "num_input_tokens_seen": 197527340, + "step": 9168, + "time_per_iteration": 2.5214364528656006 + }, + { + "auxiliary_loss_clip": 0.01109792, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.04228282, + "balance_loss_mlp": 1.02236533, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.9029849325307056, + "language_loss": 0.69417214, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.71562958, + "num_input_tokens_seen": 197547280, + "step": 9169, + "time_per_iteration": 3.9857683181762695 + }, + { + "auxiliary_loss_clip": 0.01100711, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.03834057, + "balance_loss_mlp": 1.01503897, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 2.5817310487204845, + "language_loss": 0.8574301, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87871295, + "num_input_tokens_seen": 197565045, + "step": 9170, + "time_per_iteration": 2.5382516384124756 + }, + { + "auxiliary_loss_clip": 0.01022052, + "auxiliary_loss_mlp": 0.0100934, + "balance_loss_clip": 1.02317309, + "balance_loss_mlp": 1.00824964, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7807236115022959, + "language_loss": 0.59895837, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.61927235, + "num_input_tokens_seen": 197625005, + "step": 9171, + "time_per_iteration": 3.140933036804199 + }, + { + "auxiliary_loss_clip": 0.01082134, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.03718758, + "balance_loss_mlp": 1.02271545, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.4660932265456612, + "language_loss": 0.70381343, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72499764, + "num_input_tokens_seen": 197645050, + "step": 9172, + "time_per_iteration": 2.561021089553833 + }, + { + "auxiliary_loss_clip": 0.01113419, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.0386126, + "balance_loss_mlp": 1.02325964, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.9716673658821415, + "language_loss": 0.76315284, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.78465366, + "num_input_tokens_seen": 197663910, + "step": 9173, + "time_per_iteration": 2.4683685302734375 + }, + { + "auxiliary_loss_clip": 0.01079776, + "auxiliary_loss_mlp": 0.01034029, + "balance_loss_clip": 1.03888226, + "balance_loss_mlp": 1.02085638, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 1.7655376245804446, + "language_loss": 0.7507844, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.77192247, + "num_input_tokens_seen": 197681580, + "step": 9174, + "time_per_iteration": 2.5839126110076904 + }, + { + "auxiliary_loss_clip": 0.01097694, + "auxiliary_loss_mlp": 0.01034736, + "balance_loss_clip": 1.03930354, + "balance_loss_mlp": 1.02108097, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.6920780240866962, + "language_loss": 0.72375453, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.7450788, + "num_input_tokens_seen": 197702095, + "step": 9175, + "time_per_iteration": 2.5833590030670166 + }, + { + "auxiliary_loss_clip": 0.01105176, + "auxiliary_loss_mlp": 0.01035179, + "balance_loss_clip": 1.03984344, + "balance_loss_mlp": 1.0221796, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.8501389094615037, + "language_loss": 0.69192517, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71332866, + "num_input_tokens_seen": 197720720, + "step": 9176, + "time_per_iteration": 2.4982094764709473 + }, + { + "auxiliary_loss_clip": 0.01108837, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.04262686, + "balance_loss_mlp": 1.0168674, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.874877164610153, + "language_loss": 0.7087543, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.73014444, + "num_input_tokens_seen": 197741820, + "step": 9177, + "time_per_iteration": 2.5430185794830322 + }, + { + "auxiliary_loss_clip": 0.01106671, + "auxiliary_loss_mlp": 0.01030172, + "balance_loss_clip": 1.04155588, + "balance_loss_mlp": 1.01709437, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 1.9892516647270948, + "language_loss": 0.80115205, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82252049, + "num_input_tokens_seen": 197759160, + "step": 9178, + "time_per_iteration": 2.5194714069366455 + }, + { + "auxiliary_loss_clip": 0.01060868, + "auxiliary_loss_mlp": 0.010427, + "balance_loss_clip": 1.03909206, + "balance_loss_mlp": 1.02772176, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.64946562382608, + "language_loss": 0.74999326, + "learning_rate": 1.761633217089826e-06, + "loss": 0.771029, + "num_input_tokens_seen": 197779760, + "step": 9179, + "time_per_iteration": 2.649186134338379 + }, + { + "auxiliary_loss_clip": 0.01108367, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.04219985, + "balance_loss_mlp": 1.02173209, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.8884235826219604, + "language_loss": 0.70282441, + "learning_rate": 1.761246535912924e-06, + "loss": 0.7242564, + "num_input_tokens_seen": 197801545, + "step": 9180, + "time_per_iteration": 4.117374658584595 + }, + { + "auxiliary_loss_clip": 0.01101634, + "auxiliary_loss_mlp": 0.01039154, + "balance_loss_clip": 1.03708255, + "balance_loss_mlp": 1.02445543, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 2.1526085735987754, + "language_loss": 0.67376161, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69516945, + "num_input_tokens_seen": 197820760, + "step": 9181, + "time_per_iteration": 2.453310251235962 + }, + { + "auxiliary_loss_clip": 0.01121457, + "auxiliary_loss_mlp": 0.01035355, + "balance_loss_clip": 1.04230785, + "balance_loss_mlp": 1.02113318, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 2.55002588403152, + "language_loss": 0.79004532, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.81161344, + "num_input_tokens_seen": 197840195, + "step": 9182, + "time_per_iteration": 2.4443359375 + }, + { + "auxiliary_loss_clip": 0.0108524, + "auxiliary_loss_mlp": 0.01031492, + "balance_loss_clip": 1.04285705, + "balance_loss_mlp": 1.01830125, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 1.9555105219588143, + "language_loss": 0.82945639, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.85062373, + "num_input_tokens_seen": 197859475, + "step": 9183, + "time_per_iteration": 2.5513663291931152 + }, + { + "auxiliary_loss_clip": 0.01097979, + "auxiliary_loss_mlp": 0.01030304, + "balance_loss_clip": 1.04266405, + "balance_loss_mlp": 1.01783514, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.335718359865023, + "language_loss": 0.67263347, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.69391632, + "num_input_tokens_seen": 197879395, + "step": 9184, + "time_per_iteration": 2.509280204772949 + }, + { + "auxiliary_loss_clip": 0.01107038, + "auxiliary_loss_mlp": 0.01026893, + "balance_loss_clip": 1.04120624, + "balance_loss_mlp": 1.01353574, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.5318485470120131, + "language_loss": 0.76411086, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78545016, + "num_input_tokens_seen": 197900815, + "step": 9185, + "time_per_iteration": 2.514237403869629 + }, + { + "auxiliary_loss_clip": 0.01080424, + "auxiliary_loss_mlp": 0.01038923, + "balance_loss_clip": 1.03792369, + "balance_loss_mlp": 1.02533317, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.6228321761931424, + "language_loss": 0.73983657, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.76103008, + "num_input_tokens_seen": 197918985, + "step": 9186, + "time_per_iteration": 2.576178789138794 + }, + { + "auxiliary_loss_clip": 0.0109096, + "auxiliary_loss_mlp": 0.0103927, + "balance_loss_clip": 1.04237854, + "balance_loss_mlp": 1.0264256, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 2.609100748385845, + "language_loss": 0.66526282, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68656516, + "num_input_tokens_seen": 197937725, + "step": 9187, + "time_per_iteration": 2.5852842330932617 + }, + { + "auxiliary_loss_clip": 0.01098024, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.04319823, + "balance_loss_mlp": 1.01892519, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.770885431369122, + "language_loss": 0.77942699, + "learning_rate": 1.758153413657318e-06, + "loss": 0.80072892, + "num_input_tokens_seen": 197955635, + "step": 9188, + "time_per_iteration": 2.5298502445220947 + }, + { + "auxiliary_loss_clip": 0.01090343, + "auxiliary_loss_mlp": 0.01032146, + "balance_loss_clip": 1.03690934, + "balance_loss_mlp": 1.01836514, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 2.057583358003744, + "language_loss": 0.81450272, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83572763, + "num_input_tokens_seen": 197974490, + "step": 9189, + "time_per_iteration": 2.549921751022339 + }, + { + "auxiliary_loss_clip": 0.01105028, + "auxiliary_loss_mlp": 0.00778966, + "balance_loss_clip": 1.04539251, + "balance_loss_mlp": 1.00062943, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.6706897542536003, + "language_loss": 0.76772618, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78656614, + "num_input_tokens_seen": 197995735, + "step": 9190, + "time_per_iteration": 2.555598020553589 + }, + { + "auxiliary_loss_clip": 0.01123042, + "auxiliary_loss_mlp": 0.01040572, + "balance_loss_clip": 1.04229021, + "balance_loss_mlp": 1.02555132, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 2.4086762118701026, + "language_loss": 0.79314518, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81478131, + "num_input_tokens_seen": 198009685, + "step": 9191, + "time_per_iteration": 2.430757761001587 + }, + { + "auxiliary_loss_clip": 0.01050224, + "auxiliary_loss_mlp": 0.01039219, + "balance_loss_clip": 1.03385007, + "balance_loss_mlp": 1.02570677, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 2.020162398912156, + "language_loss": 0.68935078, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.71024525, + "num_input_tokens_seen": 198026845, + "step": 9192, + "time_per_iteration": 2.587289333343506 + }, + { + "auxiliary_loss_clip": 0.01104152, + "auxiliary_loss_mlp": 0.01035985, + "balance_loss_clip": 1.04056144, + "balance_loss_mlp": 1.02468407, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.5713703940231383, + "language_loss": 0.77511835, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79651976, + "num_input_tokens_seen": 198045275, + "step": 9193, + "time_per_iteration": 2.4938783645629883 + }, + { + "auxiliary_loss_clip": 0.01081378, + "auxiliary_loss_mlp": 0.01039668, + "balance_loss_clip": 1.03514051, + "balance_loss_mlp": 1.02680564, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 2.066148756922934, + "language_loss": 0.78680611, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80801654, + "num_input_tokens_seen": 198065760, + "step": 9194, + "time_per_iteration": 2.541266441345215 + }, + { + "auxiliary_loss_clip": 0.01086151, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.04136622, + "balance_loss_mlp": 1.01788878, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 2.7485939878889067, + "language_loss": 0.69781792, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.71899217, + "num_input_tokens_seen": 198087595, + "step": 9195, + "time_per_iteration": 2.6922056674957275 + }, + { + "auxiliary_loss_clip": 0.01103764, + "auxiliary_loss_mlp": 0.01034901, + "balance_loss_clip": 1.04274714, + "balance_loss_mlp": 1.02105522, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 1.899151660849669, + "language_loss": 0.74021649, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76160312, + "num_input_tokens_seen": 198104620, + "step": 9196, + "time_per_iteration": 3.9732065200805664 + }, + { + "auxiliary_loss_clip": 0.01101706, + "auxiliary_loss_mlp": 0.01039332, + "balance_loss_clip": 1.03968906, + "balance_loss_mlp": 1.02588522, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.6677252795448336, + "language_loss": 0.77082938, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.79223979, + "num_input_tokens_seen": 198123565, + "step": 9197, + "time_per_iteration": 2.5867080688476562 + }, + { + "auxiliary_loss_clip": 0.0109507, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.03950548, + "balance_loss_mlp": 1.01805758, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.5151205237243892, + "language_loss": 0.76182538, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78308052, + "num_input_tokens_seen": 198148270, + "step": 9198, + "time_per_iteration": 2.7584619522094727 + }, + { + "auxiliary_loss_clip": 0.01115267, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.04025531, + "balance_loss_mlp": 1.01865351, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.4706455079424041, + "language_loss": 0.79330885, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81477469, + "num_input_tokens_seen": 198168810, + "step": 9199, + "time_per_iteration": 2.4893972873687744 + }, + { + "auxiliary_loss_clip": 0.01079218, + "auxiliary_loss_mlp": 0.01040842, + "balance_loss_clip": 1.04091012, + "balance_loss_mlp": 1.02689481, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.7137617212319778, + "language_loss": 0.63614225, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.65734285, + "num_input_tokens_seen": 198186200, + "step": 9200, + "time_per_iteration": 2.5208206176757812 + }, + { + "auxiliary_loss_clip": 0.01101228, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.04303098, + "balance_loss_mlp": 1.01818943, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 2.879344971807889, + "language_loss": 0.66383064, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68517119, + "num_input_tokens_seen": 198207050, + "step": 9201, + "time_per_iteration": 2.562274694442749 + }, + { + "auxiliary_loss_clip": 0.01105506, + "auxiliary_loss_mlp": 0.01035989, + "balance_loss_clip": 1.04017401, + "balance_loss_mlp": 1.02209544, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 2.0042205530307355, + "language_loss": 0.60201788, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.62343287, + "num_input_tokens_seen": 198224565, + "step": 9202, + "time_per_iteration": 2.4919886589050293 + }, + { + "auxiliary_loss_clip": 0.01104143, + "auxiliary_loss_mlp": 0.00779003, + "balance_loss_clip": 1.04066801, + "balance_loss_mlp": 1.0006454, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 1.7100448142338436, + "language_loss": 0.64723033, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.66606176, + "num_input_tokens_seen": 198244790, + "step": 9203, + "time_per_iteration": 2.4906156063079834 + }, + { + "auxiliary_loss_clip": 0.01106722, + "auxiliary_loss_mlp": 0.01031083, + "balance_loss_clip": 1.04158974, + "balance_loss_mlp": 1.01817298, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 1.6096063324095269, + "language_loss": 0.63812453, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.65950263, + "num_input_tokens_seen": 198264375, + "step": 9204, + "time_per_iteration": 2.5132346153259277 + }, + { + "auxiliary_loss_clip": 0.01102352, + "auxiliary_loss_mlp": 0.01031817, + "balance_loss_clip": 1.03899431, + "balance_loss_mlp": 1.01966357, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 1.5328619863137083, + "language_loss": 0.77263117, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79397285, + "num_input_tokens_seen": 198283895, + "step": 9205, + "time_per_iteration": 2.4941303730010986 + }, + { + "auxiliary_loss_clip": 0.01061297, + "auxiliary_loss_mlp": 0.01042529, + "balance_loss_clip": 1.03522825, + "balance_loss_mlp": 1.02912998, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.3740122182094447, + "language_loss": 0.72682238, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74786061, + "num_input_tokens_seen": 198310035, + "step": 9206, + "time_per_iteration": 2.733731985092163 + }, + { + "auxiliary_loss_clip": 0.01075209, + "auxiliary_loss_mlp": 0.01034871, + "balance_loss_clip": 1.03915, + "balance_loss_mlp": 1.02182937, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 1.8946634245307485, + "language_loss": 0.758605, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77970582, + "num_input_tokens_seen": 198327810, + "step": 9207, + "time_per_iteration": 4.146893501281738 + }, + { + "auxiliary_loss_clip": 0.01088413, + "auxiliary_loss_mlp": 0.01035368, + "balance_loss_clip": 1.04185939, + "balance_loss_mlp": 1.02173662, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 2.687840505111995, + "language_loss": 0.61475122, + "learning_rate": 1.750423192272189e-06, + "loss": 0.63598907, + "num_input_tokens_seen": 198343150, + "step": 9208, + "time_per_iteration": 3.919584035873413 + }, + { + "auxiliary_loss_clip": 0.011179, + "auxiliary_loss_mlp": 0.01033245, + "balance_loss_clip": 1.04124129, + "balance_loss_mlp": 1.02083516, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.349807259570041, + "language_loss": 0.64433825, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66584969, + "num_input_tokens_seen": 198360925, + "step": 9209, + "time_per_iteration": 2.4354405403137207 + }, + { + "auxiliary_loss_clip": 0.01082377, + "auxiliary_loss_mlp": 0.01036992, + "balance_loss_clip": 1.03850055, + "balance_loss_mlp": 1.0234735, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 2.0565174411934253, + "language_loss": 0.82662362, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.8478173, + "num_input_tokens_seen": 198379265, + "step": 9210, + "time_per_iteration": 2.579228401184082 + }, + { + "auxiliary_loss_clip": 0.01094109, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.03816915, + "balance_loss_mlp": 1.01708341, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 1.7463750827661504, + "language_loss": 0.73090172, + "learning_rate": 1.74926398270663e-06, + "loss": 0.75213116, + "num_input_tokens_seen": 198399490, + "step": 9211, + "time_per_iteration": 2.565182685852051 + }, + { + "auxiliary_loss_clip": 0.01087926, + "auxiliary_loss_mlp": 0.01040362, + "balance_loss_clip": 1.03816795, + "balance_loss_mlp": 1.02590227, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 1.8217122878549783, + "language_loss": 0.66436338, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68564624, + "num_input_tokens_seen": 198419110, + "step": 9212, + "time_per_iteration": 2.5884103775024414 + }, + { + "auxiliary_loss_clip": 0.01086892, + "auxiliary_loss_mlp": 0.01033619, + "balance_loss_clip": 1.03799629, + "balance_loss_mlp": 1.01877153, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.48753934672067, + "language_loss": 0.51700836, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53821349, + "num_input_tokens_seen": 198441360, + "step": 9213, + "time_per_iteration": 2.6679532527923584 + }, + { + "auxiliary_loss_clip": 0.01089248, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_clip": 1.04117131, + "balance_loss_mlp": 1.01992059, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 1.899639597140007, + "language_loss": 0.85721362, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.8784374, + "num_input_tokens_seen": 198459835, + "step": 9214, + "time_per_iteration": 2.612727642059326 + }, + { + "auxiliary_loss_clip": 0.01106719, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.04115772, + "balance_loss_mlp": 1.01795077, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.715380146980994, + "language_loss": 0.70102763, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72239769, + "num_input_tokens_seen": 198478955, + "step": 9215, + "time_per_iteration": 2.5462141036987305 + }, + { + "auxiliary_loss_clip": 0.01091735, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03968751, + "balance_loss_mlp": 1.01712251, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.7737109008137109, + "language_loss": 0.72747707, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.74870473, + "num_input_tokens_seen": 198499030, + "step": 9216, + "time_per_iteration": 2.5531907081604004 + }, + { + "auxiliary_loss_clip": 0.01090614, + "auxiliary_loss_mlp": 0.01034556, + "balance_loss_clip": 1.03867733, + "balance_loss_mlp": 1.02096581, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 1.7882205078234656, + "language_loss": 0.71838796, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.7396397, + "num_input_tokens_seen": 198520265, + "step": 9217, + "time_per_iteration": 2.564373016357422 + }, + { + "auxiliary_loss_clip": 0.01097552, + "auxiliary_loss_mlp": 0.01030546, + "balance_loss_clip": 1.03987527, + "balance_loss_mlp": 1.01794577, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 2.035867409822138, + "language_loss": 0.78844202, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.8097229, + "num_input_tokens_seen": 198539645, + "step": 9218, + "time_per_iteration": 2.497901439666748 + }, + { + "auxiliary_loss_clip": 0.01080965, + "auxiliary_loss_mlp": 0.01037129, + "balance_loss_clip": 1.04139018, + "balance_loss_mlp": 1.0216198, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 1.5912847223586926, + "language_loss": 0.72165167, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74283254, + "num_input_tokens_seen": 198558710, + "step": 9219, + "time_per_iteration": 4.07722020149231 + }, + { + "auxiliary_loss_clip": 0.01106695, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.04478467, + "balance_loss_mlp": 1.02616024, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.5292799372259482, + "language_loss": 0.71628213, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73774904, + "num_input_tokens_seen": 198577050, + "step": 9220, + "time_per_iteration": 2.466636896133423 + }, + { + "auxiliary_loss_clip": 0.01116874, + "auxiliary_loss_mlp": 0.01029364, + "balance_loss_clip": 1.04248166, + "balance_loss_mlp": 1.01699603, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.603728200896238, + "language_loss": 0.79406881, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81553119, + "num_input_tokens_seen": 198595290, + "step": 9221, + "time_per_iteration": 2.508007526397705 + }, + { + "auxiliary_loss_clip": 0.01084255, + "auxiliary_loss_mlp": 0.01036002, + "balance_loss_clip": 1.04107738, + "balance_loss_mlp": 1.0226804, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.71273680902452, + "language_loss": 0.83991504, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.8611176, + "num_input_tokens_seen": 198614110, + "step": 9222, + "time_per_iteration": 2.569549083709717 + }, + { + "auxiliary_loss_clip": 0.01095921, + "auxiliary_loss_mlp": 0.00780453, + "balance_loss_clip": 1.04429078, + "balance_loss_mlp": 1.00068831, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 1.779201483993442, + "language_loss": 0.75553381, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.7742976, + "num_input_tokens_seen": 198633880, + "step": 9223, + "time_per_iteration": 2.6545822620391846 + }, + { + "auxiliary_loss_clip": 0.01089096, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.03746164, + "balance_loss_mlp": 1.01799572, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.654842526743467, + "language_loss": 0.81837451, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.83958721, + "num_input_tokens_seen": 198653505, + "step": 9224, + "time_per_iteration": 2.570343494415283 + }, + { + "auxiliary_loss_clip": 0.01103901, + "auxiliary_loss_mlp": 0.01039199, + "balance_loss_clip": 1.04258847, + "balance_loss_mlp": 1.0249896, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 2.924712296735478, + "language_loss": 0.57292801, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59435898, + "num_input_tokens_seen": 198671890, + "step": 9225, + "time_per_iteration": 2.465592861175537 + }, + { + "auxiliary_loss_clip": 0.011098, + "auxiliary_loss_mlp": 0.01040961, + "balance_loss_clip": 1.04142785, + "balance_loss_mlp": 1.02723408, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.7076493355117588, + "language_loss": 0.67659676, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69810438, + "num_input_tokens_seen": 198691995, + "step": 9226, + "time_per_iteration": 2.4971272945404053 + }, + { + "auxiliary_loss_clip": 0.01082783, + "auxiliary_loss_mlp": 0.01035597, + "balance_loss_clip": 1.03682566, + "balance_loss_mlp": 1.0223825, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.4226819068687537, + "language_loss": 0.74622321, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76740706, + "num_input_tokens_seen": 198712440, + "step": 9227, + "time_per_iteration": 2.5563652515411377 + }, + { + "auxiliary_loss_clip": 0.01086514, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.04072285, + "balance_loss_mlp": 1.02244318, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 1.7024379315647222, + "language_loss": 0.73279512, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.75402629, + "num_input_tokens_seen": 198731515, + "step": 9228, + "time_per_iteration": 2.5901026725769043 + }, + { + "auxiliary_loss_clip": 0.01119724, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.04320979, + "balance_loss_mlp": 1.01994228, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 1.8819798815413604, + "language_loss": 0.75824916, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.779778, + "num_input_tokens_seen": 198749750, + "step": 9229, + "time_per_iteration": 2.438626289367676 + }, + { + "auxiliary_loss_clip": 0.01106636, + "auxiliary_loss_mlp": 0.00780636, + "balance_loss_clip": 1.04112196, + "balance_loss_mlp": 1.00065458, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.3236273080605032, + "language_loss": 0.68636477, + "learning_rate": 1.741924325613172e-06, + "loss": 0.70523751, + "num_input_tokens_seen": 198768320, + "step": 9230, + "time_per_iteration": 2.460205554962158 + }, + { + "auxiliary_loss_clip": 0.01074993, + "auxiliary_loss_mlp": 0.01037882, + "balance_loss_clip": 1.04173183, + "balance_loss_mlp": 1.02432156, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.4380402272907897, + "language_loss": 0.68247604, + "learning_rate": 1.741538124855163e-06, + "loss": 0.70360482, + "num_input_tokens_seen": 198787230, + "step": 9231, + "time_per_iteration": 2.63240647315979 + }, + { + "auxiliary_loss_clip": 0.01121724, + "auxiliary_loss_mlp": 0.01035835, + "balance_loss_clip": 1.04160237, + "balance_loss_mlp": 1.02056408, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.7247195959441262, + "language_loss": 0.78360164, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80517721, + "num_input_tokens_seen": 198806720, + "step": 9232, + "time_per_iteration": 2.4751672744750977 + }, + { + "auxiliary_loss_clip": 0.01077286, + "auxiliary_loss_mlp": 0.01039847, + "balance_loss_clip": 1.03732443, + "balance_loss_mlp": 1.02740765, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.868661665832895, + "language_loss": 0.82469606, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.8458674, + "num_input_tokens_seen": 198826235, + "step": 9233, + "time_per_iteration": 2.610532760620117 + }, + { + "auxiliary_loss_clip": 0.01107132, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.03935003, + "balance_loss_mlp": 1.0208565, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 1.9009873106337758, + "language_loss": 0.74874985, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77016509, + "num_input_tokens_seen": 198842655, + "step": 9234, + "time_per_iteration": 2.467454433441162 + }, + { + "auxiliary_loss_clip": 0.01093309, + "auxiliary_loss_mlp": 0.01027402, + "balance_loss_clip": 1.03771067, + "balance_loss_mlp": 1.01448524, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 1.8331524637345673, + "language_loss": 0.64860666, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.66981375, + "num_input_tokens_seen": 198861210, + "step": 9235, + "time_per_iteration": 4.066441059112549 + }, + { + "auxiliary_loss_clip": 0.01068323, + "auxiliary_loss_mlp": 0.0103844, + "balance_loss_clip": 1.03497648, + "balance_loss_mlp": 1.024189, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.6786027778018175, + "language_loss": 0.67977679, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.70084441, + "num_input_tokens_seen": 198880045, + "step": 9236, + "time_per_iteration": 2.5944149494171143 + }, + { + "auxiliary_loss_clip": 0.01114178, + "auxiliary_loss_mlp": 0.01026073, + "balance_loss_clip": 1.04104543, + "balance_loss_mlp": 1.01308477, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 1.8044176316727805, + "language_loss": 0.86101902, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88242155, + "num_input_tokens_seen": 198900210, + "step": 9237, + "time_per_iteration": 2.4905622005462646 + }, + { + "auxiliary_loss_clip": 0.01103918, + "auxiliary_loss_mlp": 0.01035727, + "balance_loss_clip": 1.03964496, + "balance_loss_mlp": 1.02239323, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 1.7466273296123922, + "language_loss": 0.73228812, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75368452, + "num_input_tokens_seen": 198919055, + "step": 9238, + "time_per_iteration": 2.505208969116211 + }, + { + "auxiliary_loss_clip": 0.01105297, + "auxiliary_loss_mlp": 0.01035355, + "balance_loss_clip": 1.03877127, + "balance_loss_mlp": 1.02198577, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 1.7718463834648075, + "language_loss": 0.78341162, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80481815, + "num_input_tokens_seen": 198943505, + "step": 9239, + "time_per_iteration": 2.7422568798065186 + }, + { + "auxiliary_loss_clip": 0.01099755, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.04400873, + "balance_loss_mlp": 1.02085876, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 2.909052275421406, + "language_loss": 0.80204511, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82338107, + "num_input_tokens_seen": 198963590, + "step": 9240, + "time_per_iteration": 2.5336532592773438 + }, + { + "auxiliary_loss_clip": 0.01092663, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.03866982, + "balance_loss_mlp": 1.01976562, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 1.6955885057970665, + "language_loss": 0.65210652, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67336637, + "num_input_tokens_seen": 198982680, + "step": 9241, + "time_per_iteration": 2.5456349849700928 + }, + { + "auxiliary_loss_clip": 0.01106903, + "auxiliary_loss_mlp": 0.00780362, + "balance_loss_clip": 1.0403893, + "balance_loss_mlp": 1.00078428, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 2.210930230615193, + "language_loss": 0.73202395, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.75089657, + "num_input_tokens_seen": 199000185, + "step": 9242, + "time_per_iteration": 2.458486795425415 + }, + { + "auxiliary_loss_clip": 0.01099951, + "auxiliary_loss_mlp": 0.01032471, + "balance_loss_clip": 1.04111719, + "balance_loss_mlp": 1.01851809, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 2.0197453483179038, + "language_loss": 0.64184648, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.6631707, + "num_input_tokens_seen": 199018380, + "step": 9243, + "time_per_iteration": 2.5198066234588623 + }, + { + "auxiliary_loss_clip": 0.01093115, + "auxiliary_loss_mlp": 0.00779455, + "balance_loss_clip": 1.04385209, + "balance_loss_mlp": 1.00068641, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 1.9618191590220142, + "language_loss": 0.75548482, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77421045, + "num_input_tokens_seen": 199037115, + "step": 9244, + "time_per_iteration": 2.5300848484039307 + }, + { + "auxiliary_loss_clip": 0.01088247, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.03660905, + "balance_loss_mlp": 1.0192709, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.2204909745079613, + "language_loss": 0.75075567, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.77195168, + "num_input_tokens_seen": 199053375, + "step": 9245, + "time_per_iteration": 2.5011208057403564 + }, + { + "auxiliary_loss_clip": 0.01095539, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.0407145, + "balance_loss_mlp": 1.01717925, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 1.8688731923645474, + "language_loss": 0.79824197, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.81951022, + "num_input_tokens_seen": 199070930, + "step": 9246, + "time_per_iteration": 4.055572032928467 + }, + { + "auxiliary_loss_clip": 0.01118172, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.04184794, + "balance_loss_mlp": 1.02235401, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 2.1333204894513274, + "language_loss": 0.74150485, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76304698, + "num_input_tokens_seen": 199088675, + "step": 9247, + "time_per_iteration": 3.837332010269165 + }, + { + "auxiliary_loss_clip": 0.01090557, + "auxiliary_loss_mlp": 0.01037667, + "balance_loss_clip": 1.04223704, + "balance_loss_mlp": 1.02373767, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 3.072340831102055, + "language_loss": 0.76336503, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.78464723, + "num_input_tokens_seen": 199103075, + "step": 9248, + "time_per_iteration": 2.4634597301483154 + }, + { + "auxiliary_loss_clip": 0.01002373, + "auxiliary_loss_mlp": 0.0101745, + "balance_loss_clip": 1.01323473, + "balance_loss_mlp": 1.0158999, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.9326988346757178, + "language_loss": 0.59425795, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61445618, + "num_input_tokens_seen": 199160325, + "step": 9249, + "time_per_iteration": 3.2572782039642334 + }, + { + "auxiliary_loss_clip": 0.01115831, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.04008746, + "balance_loss_mlp": 1.01501489, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 2.032307795013662, + "language_loss": 0.7987175, + "learning_rate": 1.734202189316832e-06, + "loss": 0.8201642, + "num_input_tokens_seen": 199179760, + "step": 9250, + "time_per_iteration": 2.455143690109253 + }, + { + "auxiliary_loss_clip": 0.01097361, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.04034638, + "balance_loss_mlp": 1.01945937, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 2.530557714749168, + "language_loss": 0.69266272, + "learning_rate": 1.733816187358836e-06, + "loss": 0.71397579, + "num_input_tokens_seen": 199196695, + "step": 9251, + "time_per_iteration": 2.4834001064300537 + }, + { + "auxiliary_loss_clip": 0.01104412, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.0392251, + "balance_loss_mlp": 1.02159047, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.4356812225421856, + "language_loss": 0.75465035, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77604079, + "num_input_tokens_seen": 199217845, + "step": 9252, + "time_per_iteration": 2.5266635417938232 + }, + { + "auxiliary_loss_clip": 0.01106856, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.04030156, + "balance_loss_mlp": 1.02816772, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.6903378060132759, + "language_loss": 0.72977388, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.75127274, + "num_input_tokens_seen": 199239250, + "step": 9253, + "time_per_iteration": 2.550400495529175 + }, + { + "auxiliary_loss_clip": 0.01089939, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.04258347, + "balance_loss_mlp": 1.01829481, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 2.2341789860743235, + "language_loss": 0.83002907, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.8512364, + "num_input_tokens_seen": 199258320, + "step": 9254, + "time_per_iteration": 2.560373067855835 + }, + { + "auxiliary_loss_clip": 0.01009794, + "auxiliary_loss_mlp": 0.01001992, + "balance_loss_clip": 1.01173735, + "balance_loss_mlp": 1.00078833, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.8631110262052163, + "language_loss": 0.64773756, + "learning_rate": 1.732272280610387e-06, + "loss": 0.66785538, + "num_input_tokens_seen": 199314840, + "step": 9255, + "time_per_iteration": 2.931689500808716 + }, + { + "auxiliary_loss_clip": 0.01109319, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.04387558, + "balance_loss_mlp": 1.02181458, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.7686840986770445, + "language_loss": 0.69374335, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.7151804, + "num_input_tokens_seen": 199335405, + "step": 9256, + "time_per_iteration": 2.491339683532715 + }, + { + "auxiliary_loss_clip": 0.01080028, + "auxiliary_loss_mlp": 0.01035712, + "balance_loss_clip": 1.03836131, + "balance_loss_mlp": 1.02357054, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.6734769700026066, + "language_loss": 0.75631863, + "learning_rate": 1.73150038809119e-06, + "loss": 0.77747607, + "num_input_tokens_seen": 199354345, + "step": 9257, + "time_per_iteration": 2.5530436038970947 + }, + { + "auxiliary_loss_clip": 0.01072036, + "auxiliary_loss_mlp": 0.01038141, + "balance_loss_clip": 1.03826046, + "balance_loss_mlp": 1.02551627, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 2.1831305137775425, + "language_loss": 0.61184752, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63294929, + "num_input_tokens_seen": 199372250, + "step": 9258, + "time_per_iteration": 2.549044132232666 + }, + { + "auxiliary_loss_clip": 0.01084156, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.03869677, + "balance_loss_mlp": 1.0216881, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 2.4548452288542957, + "language_loss": 0.79328382, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.81448293, + "num_input_tokens_seen": 199392815, + "step": 9259, + "time_per_iteration": 4.227951765060425 + }, + { + "auxiliary_loss_clip": 0.01090791, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.04476595, + "balance_loss_mlp": 1.01908207, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 2.663759364643233, + "language_loss": 0.81606507, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.83730078, + "num_input_tokens_seen": 199412375, + "step": 9260, + "time_per_iteration": 2.5687355995178223 + }, + { + "auxiliary_loss_clip": 0.01116996, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.04079843, + "balance_loss_mlp": 1.02592003, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.4563889734153799, + "language_loss": 0.6893599, + "learning_rate": 1.729956725348256e-06, + "loss": 0.71092761, + "num_input_tokens_seen": 199431490, + "step": 9261, + "time_per_iteration": 2.455390453338623 + }, + { + "auxiliary_loss_clip": 0.01008894, + "auxiliary_loss_mlp": 0.01008667, + "balance_loss_clip": 1.01299107, + "balance_loss_mlp": 1.00742137, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7339201105782515, + "language_loss": 0.61076093, + "learning_rate": 1.729570835226108e-06, + "loss": 0.6309365, + "num_input_tokens_seen": 199495855, + "step": 9262, + "time_per_iteration": 3.127674102783203 + }, + { + "auxiliary_loss_clip": 0.01107265, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.03898454, + "balance_loss_mlp": 1.02182496, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.583542829623885, + "language_loss": 0.64424407, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66566527, + "num_input_tokens_seen": 199515870, + "step": 9263, + "time_per_iteration": 2.513731002807617 + }, + { + "auxiliary_loss_clip": 0.01094103, + "auxiliary_loss_mlp": 0.01036067, + "balance_loss_clip": 1.03721333, + "balance_loss_mlp": 1.02268004, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 1.9422521802920185, + "language_loss": 0.73405981, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75536156, + "num_input_tokens_seen": 199535745, + "step": 9264, + "time_per_iteration": 2.5249881744384766 + }, + { + "auxiliary_loss_clip": 0.01092151, + "auxiliary_loss_mlp": 0.01032481, + "balance_loss_clip": 1.04420948, + "balance_loss_mlp": 1.0196836, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 1.8149015085378841, + "language_loss": 0.76012391, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78137028, + "num_input_tokens_seen": 199554035, + "step": 9265, + "time_per_iteration": 2.529029130935669 + }, + { + "auxiliary_loss_clip": 0.01090566, + "auxiliary_loss_mlp": 0.01035711, + "balance_loss_clip": 1.04343843, + "balance_loss_mlp": 1.02395749, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.4577502533725906, + "language_loss": 0.71163094, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73289371, + "num_input_tokens_seen": 199576120, + "step": 9266, + "time_per_iteration": 2.558650493621826 + }, + { + "auxiliary_loss_clip": 0.01092536, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.03723621, + "balance_loss_mlp": 1.02162528, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.7630051463071472, + "language_loss": 0.68233591, + "learning_rate": 1.727641538728533e-06, + "loss": 0.70360923, + "num_input_tokens_seen": 199593780, + "step": 9267, + "time_per_iteration": 2.505279541015625 + }, + { + "auxiliary_loss_clip": 0.01101381, + "auxiliary_loss_mlp": 0.01040067, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.02805042, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 1.7759911221232416, + "language_loss": 0.74283338, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76424783, + "num_input_tokens_seen": 199613220, + "step": 9268, + "time_per_iteration": 2.4949119091033936 + }, + { + "auxiliary_loss_clip": 0.01100569, + "auxiliary_loss_mlp": 0.00778388, + "balance_loss_clip": 1.04228377, + "balance_loss_mlp": 1.00063443, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 5.91038309572778, + "language_loss": 0.75202835, + "learning_rate": 1.726869892322104e-06, + "loss": 0.77081788, + "num_input_tokens_seen": 199632085, + "step": 9269, + "time_per_iteration": 2.5097103118896484 + }, + { + "auxiliary_loss_clip": 0.01080783, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.0362798, + "balance_loss_mlp": 1.02665782, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.752304719484006, + "language_loss": 0.82920396, + "learning_rate": 1.726484084647256e-06, + "loss": 0.85041732, + "num_input_tokens_seen": 199649295, + "step": 9270, + "time_per_iteration": 2.577744722366333 + }, + { + "auxiliary_loss_clip": 0.01078539, + "auxiliary_loss_mlp": 0.01034754, + "balance_loss_clip": 1.03960729, + "balance_loss_mlp": 1.02154529, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 2.039160541877292, + "language_loss": 0.79870939, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.81984228, + "num_input_tokens_seen": 199668870, + "step": 9271, + "time_per_iteration": 2.607126235961914 + }, + { + "auxiliary_loss_clip": 0.010959, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.03912878, + "balance_loss_mlp": 1.01856589, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 2.7505060381085005, + "language_loss": 0.9043057, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92558718, + "num_input_tokens_seen": 199684870, + "step": 9272, + "time_per_iteration": 2.5241966247558594 + }, + { + "auxiliary_loss_clip": 0.01081829, + "auxiliary_loss_mlp": 0.01035763, + "balance_loss_clip": 1.03838217, + "balance_loss_mlp": 1.02266216, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 3.1245126381532513, + "language_loss": 0.8360104, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.85718632, + "num_input_tokens_seen": 199701975, + "step": 9273, + "time_per_iteration": 2.5700020790100098 + }, + { + "auxiliary_loss_clip": 0.01107853, + "auxiliary_loss_mlp": 0.01044497, + "balance_loss_clip": 1.0403105, + "balance_loss_mlp": 1.03023911, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 2.0113604283432736, + "language_loss": 0.74207306, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76359653, + "num_input_tokens_seen": 199721865, + "step": 9274, + "time_per_iteration": 3.970804452896118 + }, + { + "auxiliary_loss_clip": 0.01098581, + "auxiliary_loss_mlp": 0.01037471, + "balance_loss_clip": 1.04348898, + "balance_loss_mlp": 1.02238536, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 3.1529550296440334, + "language_loss": 0.77492416, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.79628468, + "num_input_tokens_seen": 199736455, + "step": 9275, + "time_per_iteration": 2.478365421295166 + }, + { + "auxiliary_loss_clip": 0.01095198, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.04570842, + "balance_loss_mlp": 1.02034461, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 1.6467697340085854, + "language_loss": 0.74755067, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.76883817, + "num_input_tokens_seen": 199753125, + "step": 9276, + "time_per_iteration": 2.497954845428467 + }, + { + "auxiliary_loss_clip": 0.01093211, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.03752685, + "balance_loss_mlp": 1.02029276, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.8156550063238388, + "language_loss": 0.75466031, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77592182, + "num_input_tokens_seen": 199771365, + "step": 9277, + "time_per_iteration": 2.521918296813965 + }, + { + "auxiliary_loss_clip": 0.01113667, + "auxiliary_loss_mlp": 0.01034808, + "balance_loss_clip": 1.04093957, + "balance_loss_mlp": 1.02258301, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.5628479388314578, + "language_loss": 0.71591604, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.73740071, + "num_input_tokens_seen": 199790035, + "step": 9278, + "time_per_iteration": 2.4639861583709717 + }, + { + "auxiliary_loss_clip": 0.01081133, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.04056311, + "balance_loss_mlp": 1.02147543, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.495962257206422, + "language_loss": 0.75562453, + "learning_rate": 1.723012284057868e-06, + "loss": 0.77679193, + "num_input_tokens_seen": 199811125, + "step": 9279, + "time_per_iteration": 2.59468150138855 + }, + { + "auxiliary_loss_clip": 0.01091525, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.03641486, + "balance_loss_mlp": 1.01882541, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 1.6981623161557238, + "language_loss": 0.67529666, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.6965313, + "num_input_tokens_seen": 199829915, + "step": 9280, + "time_per_iteration": 2.542447805404663 + }, + { + "auxiliary_loss_clip": 0.01103634, + "auxiliary_loss_mlp": 0.01034959, + "balance_loss_clip": 1.03740191, + "balance_loss_mlp": 1.02180445, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.7770168701942677, + "language_loss": 0.73322701, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75461298, + "num_input_tokens_seen": 199850670, + "step": 9281, + "time_per_iteration": 2.5163285732269287 + }, + { + "auxiliary_loss_clip": 0.01083285, + "auxiliary_loss_mlp": 0.00779232, + "balance_loss_clip": 1.03815842, + "balance_loss_mlp": 1.0006516, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 3.6943782207616516, + "language_loss": 0.75311273, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77173793, + "num_input_tokens_seen": 199867645, + "step": 9282, + "time_per_iteration": 2.5148966312408447 + }, + { + "auxiliary_loss_clip": 0.01055072, + "auxiliary_loss_mlp": 0.01027882, + "balance_loss_clip": 1.03591621, + "balance_loss_mlp": 1.01497209, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 1.6332340247689752, + "language_loss": 0.66520387, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68603343, + "num_input_tokens_seen": 199886320, + "step": 9283, + "time_per_iteration": 2.600102186203003 + }, + { + "auxiliary_loss_clip": 0.01082147, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.04025006, + "balance_loss_mlp": 1.01647758, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 1.736971701433833, + "language_loss": 0.83185786, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85296154, + "num_input_tokens_seen": 199904895, + "step": 9284, + "time_per_iteration": 2.525642156600952 + }, + { + "auxiliary_loss_clip": 0.01094415, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.03983152, + "balance_loss_mlp": 1.01942468, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.4818812691141, + "language_loss": 0.84902525, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87029779, + "num_input_tokens_seen": 199921090, + "step": 9285, + "time_per_iteration": 3.983140230178833 + }, + { + "auxiliary_loss_clip": 0.01095487, + "auxiliary_loss_mlp": 0.01035272, + "balance_loss_clip": 1.04445159, + "balance_loss_mlp": 1.02245712, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 2.2899737188898897, + "language_loss": 0.739434, + "learning_rate": 1.720312582354912e-06, + "loss": 0.76074159, + "num_input_tokens_seen": 199939925, + "step": 9286, + "time_per_iteration": 3.9933645725250244 + }, + { + "auxiliary_loss_clip": 0.011176, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.04173601, + "balance_loss_mlp": 1.01922727, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 1.521674727756836, + "language_loss": 0.73988008, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76137811, + "num_input_tokens_seen": 199960015, + "step": 9287, + "time_per_iteration": 2.5026416778564453 + }, + { + "auxiliary_loss_clip": 0.01085357, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.0401082, + "balance_loss_mlp": 1.02059257, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 1.812618440616331, + "language_loss": 0.75078499, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77198637, + "num_input_tokens_seen": 199980505, + "step": 9288, + "time_per_iteration": 2.5750415325164795 + }, + { + "auxiliary_loss_clip": 0.0109793, + "auxiliary_loss_mlp": 0.01040972, + "balance_loss_clip": 1.04142201, + "balance_loss_mlp": 1.02694106, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 1.8684444398752031, + "language_loss": 0.77912235, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.80051136, + "num_input_tokens_seen": 199999020, + "step": 9289, + "time_per_iteration": 2.490391254425049 + }, + { + "auxiliary_loss_clip": 0.01089832, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.04026651, + "balance_loss_mlp": 1.01961005, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 1.9931515947304326, + "language_loss": 0.61641121, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63764417, + "num_input_tokens_seen": 200019020, + "step": 9290, + "time_per_iteration": 2.589698314666748 + }, + { + "auxiliary_loss_clip": 0.01069887, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.03727591, + "balance_loss_mlp": 1.01799047, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 1.99513969450353, + "language_loss": 0.68211615, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70313239, + "num_input_tokens_seen": 200038110, + "step": 9291, + "time_per_iteration": 2.6348633766174316 + }, + { + "auxiliary_loss_clip": 0.01087587, + "auxiliary_loss_mlp": 0.01040363, + "balance_loss_clip": 1.03834689, + "balance_loss_mlp": 1.02665949, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 1.9033052917186968, + "language_loss": 0.84039527, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.86167479, + "num_input_tokens_seen": 200056210, + "step": 9292, + "time_per_iteration": 2.5760200023651123 + }, + { + "auxiliary_loss_clip": 0.01090803, + "auxiliary_loss_mlp": 0.01042442, + "balance_loss_clip": 1.03965688, + "balance_loss_mlp": 1.02976382, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 1.9843063886465395, + "language_loss": 0.73565292, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.75698531, + "num_input_tokens_seen": 200075620, + "step": 9293, + "time_per_iteration": 2.585282564163208 + }, + { + "auxiliary_loss_clip": 0.01086915, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.04291773, + "balance_loss_mlp": 1.02358222, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 1.917885591815324, + "language_loss": 0.72830331, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.74953365, + "num_input_tokens_seen": 200095945, + "step": 9294, + "time_per_iteration": 2.566697597503662 + }, + { + "auxiliary_loss_clip": 0.01096271, + "auxiliary_loss_mlp": 0.00778572, + "balance_loss_clip": 1.04185438, + "balance_loss_mlp": 1.00055933, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 3.2071246328439478, + "language_loss": 0.68850303, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70725149, + "num_input_tokens_seen": 200114185, + "step": 9295, + "time_per_iteration": 2.5137336254119873 + }, + { + "auxiliary_loss_clip": 0.01118034, + "auxiliary_loss_mlp": 0.01033104, + "balance_loss_clip": 1.04238164, + "balance_loss_mlp": 1.02000856, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.6830333382786253, + "language_loss": 0.8088147, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.83032608, + "num_input_tokens_seen": 200135030, + "step": 9296, + "time_per_iteration": 2.498904228210449 + }, + { + "auxiliary_loss_clip": 0.01105676, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.04156971, + "balance_loss_mlp": 1.02053773, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.5418736161327142, + "language_loss": 0.65418136, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.6755777, + "num_input_tokens_seen": 200154290, + "step": 9297, + "time_per_iteration": 2.4731290340423584 + }, + { + "auxiliary_loss_clip": 0.01085182, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.04076934, + "balance_loss_mlp": 1.02447581, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 4.353326276851129, + "language_loss": 0.75283599, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77406657, + "num_input_tokens_seen": 200171555, + "step": 9298, + "time_per_iteration": 2.5365984439849854 + }, + { + "auxiliary_loss_clip": 0.01022225, + "auxiliary_loss_mlp": 0.01011753, + "balance_loss_clip": 1.01473808, + "balance_loss_mlp": 1.01031613, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.6898499808688568, + "language_loss": 0.52403605, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.5443759, + "num_input_tokens_seen": 200237010, + "step": 9299, + "time_per_iteration": 4.640992879867554 + }, + { + "auxiliary_loss_clip": 0.01101531, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.03911531, + "balance_loss_mlp": 1.02251053, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 1.9641336693452203, + "language_loss": 0.68984199, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.7112059, + "num_input_tokens_seen": 200260820, + "step": 9300, + "time_per_iteration": 2.578544855117798 + }, + { + "auxiliary_loss_clip": 0.0106684, + "auxiliary_loss_mlp": 0.01056868, + "balance_loss_clip": 1.03424251, + "balance_loss_mlp": 1.04112053, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 2.5914567381755673, + "language_loss": 0.82179344, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.84303057, + "num_input_tokens_seen": 200278035, + "step": 9301, + "time_per_iteration": 2.5620477199554443 + }, + { + "auxiliary_loss_clip": 0.0111653, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.04094195, + "balance_loss_mlp": 1.0186826, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 1.9838914674656247, + "language_loss": 0.67614442, + "learning_rate": 1.714143795138756e-06, + "loss": 0.69762802, + "num_input_tokens_seen": 200297255, + "step": 9302, + "time_per_iteration": 2.4601659774780273 + }, + { + "auxiliary_loss_clip": 0.01087106, + "auxiliary_loss_mlp": 0.01029089, + "balance_loss_clip": 1.03883791, + "balance_loss_mlp": 1.01439667, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 1.7643076344158408, + "language_loss": 0.7068975, + "learning_rate": 1.713758337453878e-06, + "loss": 0.72805953, + "num_input_tokens_seen": 200317505, + "step": 9303, + "time_per_iteration": 2.56658935546875 + }, + { + "auxiliary_loss_clip": 0.01048328, + "auxiliary_loss_mlp": 0.01040544, + "balance_loss_clip": 1.03522325, + "balance_loss_mlp": 1.02629852, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.5437345924679553, + "language_loss": 0.72553188, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74642062, + "num_input_tokens_seen": 200338350, + "step": 9304, + "time_per_iteration": 2.640552043914795 + }, + { + "auxiliary_loss_clip": 0.01104772, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.03918827, + "balance_loss_mlp": 1.0174787, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 2.574699500793046, + "language_loss": 0.77849531, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.79984838, + "num_input_tokens_seen": 200353965, + "step": 9305, + "time_per_iteration": 2.457414150238037 + }, + { + "auxiliary_loss_clip": 0.01069891, + "auxiliary_loss_mlp": 0.01028578, + "balance_loss_clip": 1.0419302, + "balance_loss_mlp": 1.01618588, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.7003098950157054, + "language_loss": 0.69356662, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.71455139, + "num_input_tokens_seen": 200373595, + "step": 9306, + "time_per_iteration": 2.584599733352661 + }, + { + "auxiliary_loss_clip": 0.01029315, + "auxiliary_loss_mlp": 0.00999423, + "balance_loss_clip": 1.0271194, + "balance_loss_mlp": 0.99796295, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.912729018959205, + "language_loss": 0.60291362, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62320101, + "num_input_tokens_seen": 200429155, + "step": 9307, + "time_per_iteration": 3.182514190673828 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.04290819, + "balance_loss_mlp": 1.02424192, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.8699753921918938, + "language_loss": 0.74462932, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76603103, + "num_input_tokens_seen": 200448290, + "step": 9308, + "time_per_iteration": 2.490313768386841 + }, + { + "auxiliary_loss_clip": 0.01053342, + "auxiliary_loss_mlp": 0.01040782, + "balance_loss_clip": 1.03301692, + "balance_loss_mlp": 1.02619076, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 1.6853228988856277, + "language_loss": 0.69468719, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71562839, + "num_input_tokens_seen": 200466555, + "step": 9309, + "time_per_iteration": 2.660897970199585 + }, + { + "auxiliary_loss_clip": 0.01098506, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.04697764, + "balance_loss_mlp": 1.01946986, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 1.937708834969653, + "language_loss": 0.75094032, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77227211, + "num_input_tokens_seen": 200485980, + "step": 9310, + "time_per_iteration": 2.5810277462005615 + }, + { + "auxiliary_loss_clip": 0.01113526, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.04496145, + "balance_loss_mlp": 1.02305007, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 2.711967732391968, + "language_loss": 0.69641823, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71792674, + "num_input_tokens_seen": 200504555, + "step": 9311, + "time_per_iteration": 2.545013904571533 + }, + { + "auxiliary_loss_clip": 0.01105245, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.04048085, + "balance_loss_mlp": 1.01713586, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.8467369688724573, + "language_loss": 0.72413164, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74548721, + "num_input_tokens_seen": 200522700, + "step": 9312, + "time_per_iteration": 2.455768585205078 + }, + { + "auxiliary_loss_clip": 0.01081713, + "auxiliary_loss_mlp": 0.01034118, + "balance_loss_clip": 1.04454732, + "balance_loss_mlp": 1.02093911, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 2.0213994092993803, + "language_loss": 0.88676202, + "learning_rate": 1.709904360003822e-06, + "loss": 0.90792036, + "num_input_tokens_seen": 200541910, + "step": 9313, + "time_per_iteration": 2.566394329071045 + }, + { + "auxiliary_loss_clip": 0.01082397, + "auxiliary_loss_mlp": 0.01039983, + "balance_loss_clip": 1.04077995, + "balance_loss_mlp": 1.0264765, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.7542980308830367, + "language_loss": 0.77739775, + "learning_rate": 1.709519022520204e-06, + "loss": 0.79862154, + "num_input_tokens_seen": 200562600, + "step": 9314, + "time_per_iteration": 4.09737753868103 + }, + { + "auxiliary_loss_clip": 0.01083923, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.04189873, + "balance_loss_mlp": 1.01637721, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 1.725925785043537, + "language_loss": 0.70239872, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72353458, + "num_input_tokens_seen": 200584795, + "step": 9315, + "time_per_iteration": 2.642232656478882 + }, + { + "auxiliary_loss_clip": 0.01099104, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.04084754, + "balance_loss_mlp": 1.02124083, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 2.3466157697495342, + "language_loss": 0.66817617, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.68951464, + "num_input_tokens_seen": 200606945, + "step": 9316, + "time_per_iteration": 2.5871376991271973 + }, + { + "auxiliary_loss_clip": 0.01080465, + "auxiliary_loss_mlp": 0.01040429, + "balance_loss_clip": 1.03617787, + "balance_loss_mlp": 1.02412152, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 1.755190536486853, + "language_loss": 0.86403739, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.88524634, + "num_input_tokens_seen": 200626340, + "step": 9317, + "time_per_iteration": 2.5591933727264404 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.0104064, + "balance_loss_clip": 1.04104567, + "balance_loss_mlp": 1.02541077, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.7553215166881713, + "language_loss": 0.77102286, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.7925303, + "num_input_tokens_seen": 200644520, + "step": 9318, + "time_per_iteration": 2.5606868267059326 + }, + { + "auxiliary_loss_clip": 0.01102805, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.0395155, + "balance_loss_mlp": 1.02323949, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 2.0032960414796546, + "language_loss": 0.7625218, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78390718, + "num_input_tokens_seen": 200664845, + "step": 9319, + "time_per_iteration": 2.5377800464630127 + }, + { + "auxiliary_loss_clip": 0.01103737, + "auxiliary_loss_mlp": 0.01037138, + "balance_loss_clip": 1.04062605, + "balance_loss_mlp": 1.02430534, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.3141254376372462, + "language_loss": 0.85132855, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87273729, + "num_input_tokens_seen": 200686535, + "step": 9320, + "time_per_iteration": 2.5374696254730225 + }, + { + "auxiliary_loss_clip": 0.01028231, + "auxiliary_loss_mlp": 0.01001496, + "balance_loss_clip": 1.01253211, + "balance_loss_mlp": 1.00014341, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7613338293110075, + "language_loss": 0.52622485, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54652214, + "num_input_tokens_seen": 200736965, + "step": 9321, + "time_per_iteration": 2.8539934158325195 + }, + { + "auxiliary_loss_clip": 0.01093869, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.0395925, + "balance_loss_mlp": 1.01948094, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.7512591917987006, + "language_loss": 0.74331373, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76457858, + "num_input_tokens_seen": 200757420, + "step": 9322, + "time_per_iteration": 2.5240020751953125 + }, + { + "auxiliary_loss_clip": 0.01116583, + "auxiliary_loss_mlp": 0.01039749, + "balance_loss_clip": 1.04081357, + "balance_loss_mlp": 1.02565861, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.6386026453852902, + "language_loss": 0.73673928, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.75830257, + "num_input_tokens_seen": 200779520, + "step": 9323, + "time_per_iteration": 2.5664193630218506 + }, + { + "auxiliary_loss_clip": 0.01098692, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.04614127, + "balance_loss_mlp": 1.01757002, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.6210714051402144, + "language_loss": 0.61352491, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.63482839, + "num_input_tokens_seen": 200799485, + "step": 9324, + "time_per_iteration": 2.519693374633789 + }, + { + "auxiliary_loss_clip": 0.01068208, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.03529477, + "balance_loss_mlp": 1.0210259, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 2.651532342142055, + "language_loss": 0.87609637, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89713061, + "num_input_tokens_seen": 200817540, + "step": 9325, + "time_per_iteration": 4.1239097118377686 + }, + { + "auxiliary_loss_clip": 0.01096069, + "auxiliary_loss_mlp": 0.01032719, + "balance_loss_clip": 1.03896105, + "balance_loss_mlp": 1.01808619, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 1.8715085336868573, + "language_loss": 0.7361629, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.75745082, + "num_input_tokens_seen": 200838380, + "step": 9326, + "time_per_iteration": 3.9401166439056396 + }, + { + "auxiliary_loss_clip": 0.01096577, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.0368495, + "balance_loss_mlp": 1.01568902, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 1.7930087848277274, + "language_loss": 0.77959871, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80087447, + "num_input_tokens_seen": 200855640, + "step": 9327, + "time_per_iteration": 2.51155161857605 + }, + { + "auxiliary_loss_clip": 0.01103822, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.04401231, + "balance_loss_mlp": 1.02429903, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 1.356905068509738, + "language_loss": 0.78480852, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80623037, + "num_input_tokens_seen": 200876585, + "step": 9328, + "time_per_iteration": 2.5410964488983154 + }, + { + "auxiliary_loss_clip": 0.01116132, + "auxiliary_loss_mlp": 0.01034911, + "balance_loss_clip": 1.03969145, + "balance_loss_mlp": 1.02154779, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.708109772133597, + "language_loss": 0.73773026, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.75924075, + "num_input_tokens_seen": 200898175, + "step": 9329, + "time_per_iteration": 2.471771717071533 + }, + { + "auxiliary_loss_clip": 0.01095525, + "auxiliary_loss_mlp": 0.00781933, + "balance_loss_clip": 1.03888118, + "balance_loss_mlp": 1.00073934, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.687060134715222, + "language_loss": 0.83453941, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.85331392, + "num_input_tokens_seen": 200917515, + "step": 9330, + "time_per_iteration": 2.5128958225250244 + }, + { + "auxiliary_loss_clip": 0.01034566, + "auxiliary_loss_mlp": 0.0100374, + "balance_loss_clip": 1.00933194, + "balance_loss_mlp": 1.00242889, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7425270161079834, + "language_loss": 0.57877588, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.599159, + "num_input_tokens_seen": 200978615, + "step": 9331, + "time_per_iteration": 3.028409481048584 + }, + { + "auxiliary_loss_clip": 0.01074927, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.03952336, + "balance_loss_mlp": 1.01985168, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 2.543857163323522, + "language_loss": 0.81805015, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.83913302, + "num_input_tokens_seen": 200997745, + "step": 9332, + "time_per_iteration": 2.5723209381103516 + }, + { + "auxiliary_loss_clip": 0.01106845, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.0399518, + "balance_loss_mlp": 1.02789855, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 1.7845036997940813, + "language_loss": 0.82140809, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.84292042, + "num_input_tokens_seen": 201016370, + "step": 9333, + "time_per_iteration": 2.445894718170166 + }, + { + "auxiliary_loss_clip": 0.0111741, + "auxiliary_loss_mlp": 0.01031106, + "balance_loss_clip": 1.04032993, + "balance_loss_mlp": 1.0175581, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.8068891989590958, + "language_loss": 0.72846437, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.74994957, + "num_input_tokens_seen": 201034310, + "step": 9334, + "time_per_iteration": 2.451359510421753 + }, + { + "auxiliary_loss_clip": 0.01094502, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_clip": 1.04175854, + "balance_loss_mlp": 1.02418983, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 1.9111185582024228, + "language_loss": 0.7116437, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73296463, + "num_input_tokens_seen": 201052030, + "step": 9335, + "time_per_iteration": 2.4623754024505615 + }, + { + "auxiliary_loss_clip": 0.01095757, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.03796649, + "balance_loss_mlp": 1.01923966, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.8285113154508108, + "language_loss": 0.76608229, + "learning_rate": 1.701044410566205e-06, + "loss": 0.78737336, + "num_input_tokens_seen": 201068445, + "step": 9336, + "time_per_iteration": 2.501359701156616 + }, + { + "auxiliary_loss_clip": 0.01106045, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.0408473, + "balance_loss_mlp": 1.02100968, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.519291145203388, + "language_loss": 0.65058565, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.67199063, + "num_input_tokens_seen": 201082140, + "step": 9337, + "time_per_iteration": 2.501108407974243 + }, + { + "auxiliary_loss_clip": 0.01018636, + "auxiliary_loss_mlp": 0.01004656, + "balance_loss_clip": 1.01288676, + "balance_loss_mlp": 1.00327337, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.9028773799307366, + "language_loss": 0.62648898, + "learning_rate": 1.700274261035102e-06, + "loss": 0.6467219, + "num_input_tokens_seen": 201137245, + "step": 9338, + "time_per_iteration": 4.592622518539429 + }, + { + "auxiliary_loss_clip": 0.01087234, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.03774321, + "balance_loss_mlp": 1.02137613, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.721288560810694, + "language_loss": 0.65562618, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.67684627, + "num_input_tokens_seen": 201157270, + "step": 9339, + "time_per_iteration": 2.647664785385132 + }, + { + "auxiliary_loss_clip": 0.0110144, + "auxiliary_loss_mlp": 0.01039003, + "balance_loss_clip": 1.03817976, + "balance_loss_mlp": 1.02430439, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 1.8335804634480417, + "language_loss": 0.69521511, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.71661961, + "num_input_tokens_seen": 201174530, + "step": 9340, + "time_per_iteration": 2.470161199569702 + }, + { + "auxiliary_loss_clip": 0.01075133, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.04116488, + "balance_loss_mlp": 1.0166949, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.7048968767494106, + "language_loss": 0.77598095, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.79703194, + "num_input_tokens_seen": 201194905, + "step": 9341, + "time_per_iteration": 2.569662570953369 + }, + { + "auxiliary_loss_clip": 0.01069596, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.03711796, + "balance_loss_mlp": 1.02038991, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.7616449453089351, + "language_loss": 0.79708469, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81812882, + "num_input_tokens_seen": 201213715, + "step": 9342, + "time_per_iteration": 2.5970067977905273 + }, + { + "auxiliary_loss_clip": 0.01087472, + "auxiliary_loss_mlp": 0.01034504, + "balance_loss_clip": 1.03596735, + "balance_loss_mlp": 1.02030599, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 2.4726732194073406, + "language_loss": 0.76311004, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78432977, + "num_input_tokens_seen": 201231415, + "step": 9343, + "time_per_iteration": 2.5477468967437744 + }, + { + "auxiliary_loss_clip": 0.01075999, + "auxiliary_loss_mlp": 0.0104146, + "balance_loss_clip": 1.03995824, + "balance_loss_mlp": 1.0268929, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 1.5434459574128396, + "language_loss": 0.68969709, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.7108717, + "num_input_tokens_seen": 201249625, + "step": 9344, + "time_per_iteration": 2.578456401824951 + }, + { + "auxiliary_loss_clip": 0.01120209, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.04186499, + "balance_loss_mlp": 1.02782035, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.394742418342836, + "language_loss": 0.66157317, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68320268, + "num_input_tokens_seen": 201271205, + "step": 9345, + "time_per_iteration": 2.5272390842437744 + }, + { + "auxiliary_loss_clip": 0.01098461, + "auxiliary_loss_mlp": 0.01028804, + "balance_loss_clip": 1.04245281, + "balance_loss_mlp": 1.01570249, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 2.887067195695619, + "language_loss": 0.87209845, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89337111, + "num_input_tokens_seen": 201287700, + "step": 9346, + "time_per_iteration": 2.4946236610412598 + }, + { + "auxiliary_loss_clip": 0.01093969, + "auxiliary_loss_mlp": 0.01039213, + "balance_loss_clip": 1.03962684, + "balance_loss_mlp": 1.02405536, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.1756111514903353, + "language_loss": 0.59291816, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61425, + "num_input_tokens_seen": 201307530, + "step": 9347, + "time_per_iteration": 2.546563148498535 + }, + { + "auxiliary_loss_clip": 0.01109022, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.04159546, + "balance_loss_mlp": 1.02199697, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.202826563521858, + "language_loss": 0.69460559, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.71606636, + "num_input_tokens_seen": 201326210, + "step": 9348, + "time_per_iteration": 2.47279953956604 + }, + { + "auxiliary_loss_clip": 0.01072595, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.03666246, + "balance_loss_mlp": 1.01682401, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 1.8567671369802317, + "language_loss": 0.79212308, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.8131665, + "num_input_tokens_seen": 201346120, + "step": 9349, + "time_per_iteration": 2.5647430419921875 + }, + { + "auxiliary_loss_clip": 0.01066363, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.03918004, + "balance_loss_mlp": 1.01766264, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 2.05775972128526, + "language_loss": 0.66679895, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.68778169, + "num_input_tokens_seen": 201365700, + "step": 9350, + "time_per_iteration": 2.6609325408935547 + }, + { + "auxiliary_loss_clip": 0.01066854, + "auxiliary_loss_mlp": 0.01038803, + "balance_loss_clip": 1.03620052, + "balance_loss_mlp": 1.02397323, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 1.9111347331461692, + "language_loss": 0.78524262, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80629921, + "num_input_tokens_seen": 201382795, + "step": 9351, + "time_per_iteration": 2.53749418258667 + }, + { + "auxiliary_loss_clip": 0.01096786, + "auxiliary_loss_mlp": 0.00782062, + "balance_loss_clip": 1.0372777, + "balance_loss_mlp": 1.00068676, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 5.9289398313801405, + "language_loss": 0.59483498, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.6136235, + "num_input_tokens_seen": 201402780, + "step": 9352, + "time_per_iteration": 2.5356128215789795 + }, + { + "auxiliary_loss_clip": 0.01103524, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.03924417, + "balance_loss_mlp": 1.02062571, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.2757303188688032, + "language_loss": 0.71945453, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.74082631, + "num_input_tokens_seen": 201424140, + "step": 9353, + "time_per_iteration": 3.9559500217437744 + }, + { + "auxiliary_loss_clip": 0.01097631, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.04075432, + "balance_loss_mlp": 1.01838589, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 2.954146471685363, + "language_loss": 0.75954449, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.7808435, + "num_input_tokens_seen": 201439645, + "step": 9354, + "time_per_iteration": 2.474313259124756 + }, + { + "auxiliary_loss_clip": 0.01086394, + "auxiliary_loss_mlp": 0.01036099, + "balance_loss_clip": 1.03822613, + "balance_loss_mlp": 1.02199686, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 1.8839044570786492, + "language_loss": 0.72590309, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.74712807, + "num_input_tokens_seen": 201459970, + "step": 9355, + "time_per_iteration": 2.5554637908935547 + }, + { + "auxiliary_loss_clip": 0.01108494, + "auxiliary_loss_mlp": 0.01031524, + "balance_loss_clip": 1.04587519, + "balance_loss_mlp": 1.01796365, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 3.3898321611831657, + "language_loss": 0.73341668, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75481689, + "num_input_tokens_seen": 201480055, + "step": 9356, + "time_per_iteration": 2.525449752807617 + }, + { + "auxiliary_loss_clip": 0.01117469, + "auxiliary_loss_mlp": 0.01037932, + "balance_loss_clip": 1.04173851, + "balance_loss_mlp": 1.02398419, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 2.0224932981179053, + "language_loss": 0.83618098, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85773504, + "num_input_tokens_seen": 201497645, + "step": 9357, + "time_per_iteration": 2.4407761096954346 + }, + { + "auxiliary_loss_clip": 0.01108113, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.04209375, + "balance_loss_mlp": 1.02047086, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 2.168255583316525, + "language_loss": 0.7263571, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74777186, + "num_input_tokens_seen": 201515455, + "step": 9358, + "time_per_iteration": 2.439659595489502 + }, + { + "auxiliary_loss_clip": 0.0111608, + "auxiliary_loss_mlp": 0.01039935, + "balance_loss_clip": 1.04016066, + "balance_loss_mlp": 1.02579069, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 2.171814598165084, + "language_loss": 0.77446914, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79602933, + "num_input_tokens_seen": 201534500, + "step": 9359, + "time_per_iteration": 2.502164125442505 + }, + { + "auxiliary_loss_clip": 0.01094319, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.03901529, + "balance_loss_mlp": 1.0186367, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 2.216634594282929, + "language_loss": 0.70749545, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.72875041, + "num_input_tokens_seen": 201553280, + "step": 9360, + "time_per_iteration": 2.554283857345581 + }, + { + "auxiliary_loss_clip": 0.0098931, + "auxiliary_loss_mlp": 0.01001198, + "balance_loss_clip": 1.00877142, + "balance_loss_mlp": 0.99982721, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7761666190405964, + "language_loss": 0.55545962, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57536471, + "num_input_tokens_seen": 201610030, + "step": 9361, + "time_per_iteration": 3.0604088306427 + }, + { + "auxiliary_loss_clip": 0.01092789, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.04036677, + "balance_loss_mlp": 1.02165341, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.4099499426569733, + "language_loss": 0.81834769, + "learning_rate": 1.691036046141018e-06, + "loss": 0.83962321, + "num_input_tokens_seen": 201628370, + "step": 9362, + "time_per_iteration": 2.5234804153442383 + }, + { + "auxiliary_loss_clip": 0.01081709, + "auxiliary_loss_mlp": 0.00779182, + "balance_loss_clip": 1.03781831, + "balance_loss_mlp": 1.0007143, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.5499987246127391, + "language_loss": 0.74649417, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.7651031, + "num_input_tokens_seen": 201649790, + "step": 9363, + "time_per_iteration": 2.6863410472869873 + }, + { + "auxiliary_loss_clip": 0.01110479, + "auxiliary_loss_mlp": 0.01036139, + "balance_loss_clip": 1.04055965, + "balance_loss_mlp": 1.02203679, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.7516005771775127, + "language_loss": 0.82860672, + "learning_rate": 1.690266496731839e-06, + "loss": 0.85007286, + "num_input_tokens_seen": 201669175, + "step": 9364, + "time_per_iteration": 2.540658473968506 + }, + { + "auxiliary_loss_clip": 0.01082093, + "auxiliary_loss_mlp": 0.01038018, + "balance_loss_clip": 1.0398984, + "balance_loss_mlp": 1.02545309, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.5993207666541873, + "language_loss": 0.65378082, + "learning_rate": 1.689881739637642e-06, + "loss": 0.67498195, + "num_input_tokens_seen": 201687000, + "step": 9365, + "time_per_iteration": 5.608316421508789 + }, + { + "auxiliary_loss_clip": 0.01095961, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.03909826, + "balance_loss_mlp": 1.02043688, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 2.905348074503702, + "language_loss": 0.81920666, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.84051549, + "num_input_tokens_seen": 201703335, + "step": 9366, + "time_per_iteration": 2.478684186935425 + }, + { + "auxiliary_loss_clip": 0.01116301, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.04207671, + "balance_loss_mlp": 1.01936865, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.4341249102675007, + "language_loss": 0.73091567, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75239962, + "num_input_tokens_seen": 201723495, + "step": 9367, + "time_per_iteration": 2.4521920680999756 + }, + { + "auxiliary_loss_clip": 0.01015954, + "auxiliary_loss_mlp": 0.01006957, + "balance_loss_clip": 1.01003432, + "balance_loss_mlp": 1.00566912, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6314343942919807, + "language_loss": 0.53468823, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55491734, + "num_input_tokens_seen": 201792615, + "step": 9368, + "time_per_iteration": 3.186318874359131 + }, + { + "auxiliary_loss_clip": 0.0111822, + "auxiliary_loss_mlp": 0.01038076, + "balance_loss_clip": 1.04290056, + "balance_loss_mlp": 1.02458167, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.5888621606133488, + "language_loss": 0.69027066, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71183366, + "num_input_tokens_seen": 201812520, + "step": 9369, + "time_per_iteration": 2.4595141410827637 + }, + { + "auxiliary_loss_clip": 0.01078857, + "auxiliary_loss_mlp": 0.0103783, + "balance_loss_clip": 1.03251541, + "balance_loss_mlp": 1.02394247, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 1.7774194240599999, + "language_loss": 0.76348633, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.78465319, + "num_input_tokens_seen": 201834185, + "step": 9370, + "time_per_iteration": 2.646710157394409 + }, + { + "auxiliary_loss_clip": 0.01094619, + "auxiliary_loss_mlp": 0.0104017, + "balance_loss_clip": 1.03926575, + "balance_loss_mlp": 1.02560282, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 2.0931807385049046, + "language_loss": 0.75960672, + "learning_rate": 1.687573444537108e-06, + "loss": 0.7809546, + "num_input_tokens_seen": 201851305, + "step": 9371, + "time_per_iteration": 2.5399081707000732 + }, + { + "auxiliary_loss_clip": 0.01103419, + "auxiliary_loss_mlp": 0.01037365, + "balance_loss_clip": 1.03909945, + "balance_loss_mlp": 1.0250448, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 1.7658444397580044, + "language_loss": 0.7634809, + "learning_rate": 1.687188770067285e-06, + "loss": 0.78488874, + "num_input_tokens_seen": 201870350, + "step": 9372, + "time_per_iteration": 2.4865591526031494 + }, + { + "auxiliary_loss_clip": 0.01093863, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.04035068, + "balance_loss_mlp": 1.01776052, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 1.7958054660474314, + "language_loss": 0.71429646, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.73555338, + "num_input_tokens_seen": 201886800, + "step": 9373, + "time_per_iteration": 2.4878973960876465 + }, + { + "auxiliary_loss_clip": 0.01086695, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.04110098, + "balance_loss_mlp": 1.0165844, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.3577822923322795, + "language_loss": 0.8297689, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.85094649, + "num_input_tokens_seen": 201904730, + "step": 9374, + "time_per_iteration": 2.574108600616455 + }, + { + "auxiliary_loss_clip": 0.01103083, + "auxiliary_loss_mlp": 0.01028814, + "balance_loss_clip": 1.0374006, + "balance_loss_mlp": 1.01620722, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.795146844449404, + "language_loss": 0.66032219, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68164122, + "num_input_tokens_seen": 201924850, + "step": 9375, + "time_per_iteration": 2.5572903156280518 + }, + { + "auxiliary_loss_clip": 0.01081679, + "auxiliary_loss_mlp": 0.00779072, + "balance_loss_clip": 1.04026556, + "balance_loss_mlp": 1.00072157, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 3.1147110309488273, + "language_loss": 0.81263268, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.83124024, + "num_input_tokens_seen": 201939500, + "step": 9376, + "time_per_iteration": 2.5040855407714844 + }, + { + "auxiliary_loss_clip": 0.01099208, + "auxiliary_loss_mlp": 0.01033573, + "balance_loss_clip": 1.03924596, + "balance_loss_mlp": 1.01938117, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.5789774007679611, + "language_loss": 0.69129324, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71262097, + "num_input_tokens_seen": 201963000, + "step": 9377, + "time_per_iteration": 2.7307350635528564 + }, + { + "auxiliary_loss_clip": 0.0107326, + "auxiliary_loss_mlp": 0.01032459, + "balance_loss_clip": 1.03635621, + "balance_loss_mlp": 1.01978076, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.6503791962724985, + "language_loss": 0.7483809, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76943803, + "num_input_tokens_seen": 201983145, + "step": 9378, + "time_per_iteration": 4.0777106285095215 + }, + { + "auxiliary_loss_clip": 0.01121979, + "auxiliary_loss_mlp": 0.01035866, + "balance_loss_clip": 1.03973579, + "balance_loss_mlp": 1.02113128, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 2.3336134405220403, + "language_loss": 0.8208679, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84244633, + "num_input_tokens_seen": 202000335, + "step": 9379, + "time_per_iteration": 2.466536521911621 + }, + { + "auxiliary_loss_clip": 0.01093352, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.03600299, + "balance_loss_mlp": 1.02138305, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 29.115151500785927, + "language_loss": 0.71889842, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.74018377, + "num_input_tokens_seen": 202018275, + "step": 9380, + "time_per_iteration": 2.552446126937866 + }, + { + "auxiliary_loss_clip": 0.01081939, + "auxiliary_loss_mlp": 0.01035896, + "balance_loss_clip": 1.04081762, + "balance_loss_mlp": 1.02159667, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 11.028864980676847, + "language_loss": 0.74123496, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.76241326, + "num_input_tokens_seen": 202034330, + "step": 9381, + "time_per_iteration": 2.569207191467285 + }, + { + "auxiliary_loss_clip": 0.01070586, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.03451538, + "balance_loss_mlp": 1.02838755, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 2.184535453895943, + "language_loss": 0.72066593, + "learning_rate": 1.683342680176499e-06, + "loss": 0.74179524, + "num_input_tokens_seen": 202053100, + "step": 9382, + "time_per_iteration": 2.629441022872925 + }, + { + "auxiliary_loss_clip": 0.01032908, + "auxiliary_loss_mlp": 0.01004034, + "balance_loss_clip": 1.00712228, + "balance_loss_mlp": 1.00272274, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7345425021180814, + "language_loss": 0.54474592, + "learning_rate": 1.682958136989022e-06, + "loss": 0.56511533, + "num_input_tokens_seen": 202120125, + "step": 9383, + "time_per_iteration": 3.139288902282715 + }, + { + "auxiliary_loss_clip": 0.01104914, + "auxiliary_loss_mlp": 0.01028601, + "balance_loss_clip": 1.03857338, + "balance_loss_mlp": 1.01421213, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 2.1403042806846586, + "language_loss": 0.70690972, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.7282449, + "num_input_tokens_seen": 202138030, + "step": 9384, + "time_per_iteration": 2.436194896697998 + }, + { + "auxiliary_loss_clip": 0.01096254, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.03930497, + "balance_loss_mlp": 1.02077794, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 4.74359108786476, + "language_loss": 0.76141202, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78272033, + "num_input_tokens_seen": 202155580, + "step": 9385, + "time_per_iteration": 2.519178628921509 + }, + { + "auxiliary_loss_clip": 0.0110318, + "auxiliary_loss_mlp": 0.01039931, + "balance_loss_clip": 1.03725147, + "balance_loss_mlp": 1.02635336, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 2.1459301118703564, + "language_loss": 0.82368672, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84511787, + "num_input_tokens_seen": 202170365, + "step": 9386, + "time_per_iteration": 2.4534382820129395 + }, + { + "auxiliary_loss_clip": 0.01110158, + "auxiliary_loss_mlp": 0.01038513, + "balance_loss_clip": 1.04300714, + "balance_loss_mlp": 1.02402329, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 2.351820097177839, + "language_loss": 0.70326614, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72475284, + "num_input_tokens_seen": 202189095, + "step": 9387, + "time_per_iteration": 2.435744047164917 + }, + { + "auxiliary_loss_clip": 0.01107108, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.03891683, + "balance_loss_mlp": 1.02021921, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.4570194871801712, + "language_loss": 0.74547422, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76688081, + "num_input_tokens_seen": 202213500, + "step": 9388, + "time_per_iteration": 2.587777853012085 + }, + { + "auxiliary_loss_clip": 0.01102389, + "auxiliary_loss_mlp": 0.01031374, + "balance_loss_clip": 1.03913355, + "balance_loss_mlp": 1.02004349, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.639523002144538, + "language_loss": 0.82232487, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.8436625, + "num_input_tokens_seen": 202231920, + "step": 9389, + "time_per_iteration": 2.4620988368988037 + }, + { + "auxiliary_loss_clip": 0.0108374, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.03568053, + "balance_loss_mlp": 1.02082825, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 2.2150844784183152, + "language_loss": 0.64422178, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66541201, + "num_input_tokens_seen": 202247600, + "step": 9390, + "time_per_iteration": 2.5156898498535156 + }, + { + "auxiliary_loss_clip": 0.01094305, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.04106951, + "balance_loss_mlp": 1.02101731, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.9917076135645535, + "language_loss": 0.92117333, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.9424448, + "num_input_tokens_seen": 202265350, + "step": 9391, + "time_per_iteration": 2.4820170402526855 + }, + { + "auxiliary_loss_clip": 0.01114476, + "auxiliary_loss_mlp": 0.01036129, + "balance_loss_clip": 1.04362476, + "balance_loss_mlp": 1.02163315, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 14.174589579843452, + "language_loss": 0.60143054, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62293661, + "num_input_tokens_seen": 202284285, + "step": 9392, + "time_per_iteration": 2.512420654296875 + }, + { + "auxiliary_loss_clip": 0.01067556, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.03366494, + "balance_loss_mlp": 1.01628256, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 3.653709242172947, + "language_loss": 0.81624293, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.8372246, + "num_input_tokens_seen": 202303450, + "step": 9393, + "time_per_iteration": 4.029984951019287 + }, + { + "auxiliary_loss_clip": 0.01093965, + "auxiliary_loss_mlp": 0.01032379, + "balance_loss_clip": 1.04054797, + "balance_loss_mlp": 1.01936102, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.7112145960869292, + "language_loss": 0.87060952, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.891873, + "num_input_tokens_seen": 202322315, + "step": 9394, + "time_per_iteration": 2.5184600353240967 + }, + { + "auxiliary_loss_clip": 0.01106168, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.04131913, + "balance_loss_mlp": 1.02177274, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 1.8338407416644804, + "language_loss": 0.84957963, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.87098885, + "num_input_tokens_seen": 202339905, + "step": 9395, + "time_per_iteration": 2.4612433910369873 + }, + { + "auxiliary_loss_clip": 0.01022752, + "auxiliary_loss_mlp": 0.01000531, + "balance_loss_clip": 1.00687122, + "balance_loss_mlp": 0.99924308, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.7891596904655186, + "language_loss": 0.58259112, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60282391, + "num_input_tokens_seen": 202397320, + "step": 9396, + "time_per_iteration": 3.0456273555755615 + }, + { + "auxiliary_loss_clip": 0.01095602, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.03892756, + "balance_loss_mlp": 1.01551449, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 2.094040922041608, + "language_loss": 0.70133048, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72256958, + "num_input_tokens_seen": 202416865, + "step": 9397, + "time_per_iteration": 2.5575335025787354 + }, + { + "auxiliary_loss_clip": 0.01083189, + "auxiliary_loss_mlp": 0.01030537, + "balance_loss_clip": 1.04034114, + "balance_loss_mlp": 1.01828861, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 1.8305187943385772, + "language_loss": 0.67003191, + "learning_rate": 1.67719144001275e-06, + "loss": 0.6911692, + "num_input_tokens_seen": 202436210, + "step": 9398, + "time_per_iteration": 2.564967155456543 + }, + { + "auxiliary_loss_clip": 0.01019814, + "auxiliary_loss_mlp": 0.01000163, + "balance_loss_clip": 1.0125438, + "balance_loss_mlp": 0.99882776, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.7629324144258872, + "language_loss": 0.58170462, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60190427, + "num_input_tokens_seen": 202492925, + "step": 9399, + "time_per_iteration": 3.0295050144195557 + }, + { + "auxiliary_loss_clip": 0.01075128, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.03390789, + "balance_loss_mlp": 1.02019286, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 2.1766322002223486, + "language_loss": 0.73212469, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.7532289, + "num_input_tokens_seen": 202511905, + "step": 9400, + "time_per_iteration": 2.6605114936828613 + }, + { + "auxiliary_loss_clip": 0.01089444, + "auxiliary_loss_mlp": 0.01036066, + "balance_loss_clip": 1.04037666, + "balance_loss_mlp": 1.0214684, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 2.225060894863654, + "language_loss": 0.60637355, + "learning_rate": 1.676038429548412e-06, + "loss": 0.62762868, + "num_input_tokens_seen": 202529815, + "step": 9401, + "time_per_iteration": 2.526677370071411 + }, + { + "auxiliary_loss_clip": 0.01075484, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.03486371, + "balance_loss_mlp": 1.0174123, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 2.087328897728121, + "language_loss": 0.81506264, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83612239, + "num_input_tokens_seen": 202547710, + "step": 9402, + "time_per_iteration": 2.5203795433044434 + }, + { + "auxiliary_loss_clip": 0.0106273, + "auxiliary_loss_mlp": 0.01047571, + "balance_loss_clip": 1.03018987, + "balance_loss_mlp": 1.03335512, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.5374350337747393, + "language_loss": 0.77743149, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.79853445, + "num_input_tokens_seen": 202568835, + "step": 9403, + "time_per_iteration": 4.168110609054565 + }, + { + "auxiliary_loss_clip": 0.01065985, + "auxiliary_loss_mlp": 0.01039712, + "balance_loss_clip": 1.03489923, + "balance_loss_mlp": 1.02545488, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.6030998161597876, + "language_loss": 0.69084072, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.71189767, + "num_input_tokens_seen": 202587385, + "step": 9404, + "time_per_iteration": 2.589371919631958 + }, + { + "auxiliary_loss_clip": 0.01080216, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.03797126, + "balance_loss_mlp": 1.02106857, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 2.2512683831680587, + "language_loss": 0.67109734, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.69223011, + "num_input_tokens_seen": 202604815, + "step": 9405, + "time_per_iteration": 3.8953075408935547 + }, + { + "auxiliary_loss_clip": 0.01086061, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.03858364, + "balance_loss_mlp": 1.02424145, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.8182358668950613, + "language_loss": 0.74475616, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76598346, + "num_input_tokens_seen": 202623775, + "step": 9406, + "time_per_iteration": 2.6251168251037598 + }, + { + "auxiliary_loss_clip": 0.01058021, + "auxiliary_loss_mlp": 0.01043919, + "balance_loss_clip": 1.0350858, + "balance_loss_mlp": 1.02808785, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 1.649672670956682, + "language_loss": 0.79705191, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81807131, + "num_input_tokens_seen": 202643375, + "step": 9407, + "time_per_iteration": 2.6244752407073975 + }, + { + "auxiliary_loss_clip": 0.01077044, + "auxiliary_loss_mlp": 0.01041602, + "balance_loss_clip": 1.03638947, + "balance_loss_mlp": 1.02749348, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.2846761871609351, + "language_loss": 0.70876223, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.72994864, + "num_input_tokens_seen": 202668400, + "step": 9408, + "time_per_iteration": 2.6649887561798096 + }, + { + "auxiliary_loss_clip": 0.01057198, + "auxiliary_loss_mlp": 0.01037407, + "balance_loss_clip": 1.03679538, + "balance_loss_mlp": 1.0243063, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 2.3716085011980863, + "language_loss": 0.80956477, + "learning_rate": 1.672964276570308e-06, + "loss": 0.8305108, + "num_input_tokens_seen": 202685125, + "step": 9409, + "time_per_iteration": 2.546924114227295 + }, + { + "auxiliary_loss_clip": 0.01075357, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.03697324, + "balance_loss_mlp": 1.01653075, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.858625772791942, + "language_loss": 0.78088737, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80193472, + "num_input_tokens_seen": 202703830, + "step": 9410, + "time_per_iteration": 2.5497310161590576 + }, + { + "auxiliary_loss_clip": 0.01116369, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.03980482, + "balance_loss_mlp": 1.02326012, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 7.446181681339198, + "language_loss": 0.82800347, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.84953153, + "num_input_tokens_seen": 202719835, + "step": 9411, + "time_per_iteration": 2.405588150024414 + }, + { + "auxiliary_loss_clip": 0.01109557, + "auxiliary_loss_mlp": 0.01035838, + "balance_loss_clip": 1.04035711, + "balance_loss_mlp": 1.02169394, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 2.8678530545453293, + "language_loss": 0.67435604, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69581002, + "num_input_tokens_seen": 202736795, + "step": 9412, + "time_per_iteration": 2.4469382762908936 + }, + { + "auxiliary_loss_clip": 0.01100958, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.03900814, + "balance_loss_mlp": 1.01996875, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 1.7325844297024373, + "language_loss": 0.58400619, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60532981, + "num_input_tokens_seen": 202756900, + "step": 9413, + "time_per_iteration": 2.5351545810699463 + }, + { + "auxiliary_loss_clip": 0.01039719, + "auxiliary_loss_mlp": 0.01043233, + "balance_loss_clip": 1.03396344, + "balance_loss_mlp": 1.02951181, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.4950254959108706, + "language_loss": 0.69468296, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71551245, + "num_input_tokens_seen": 202775145, + "step": 9414, + "time_per_iteration": 2.752764940261841 + }, + { + "auxiliary_loss_clip": 0.01046374, + "auxiliary_loss_mlp": 0.01031255, + "balance_loss_clip": 1.04226613, + "balance_loss_mlp": 1.01994228, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 1.5689700704933616, + "language_loss": 0.78476363, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80553991, + "num_input_tokens_seen": 202794505, + "step": 9415, + "time_per_iteration": 3.084099292755127 + }, + { + "auxiliary_loss_clip": 0.01017551, + "auxiliary_loss_mlp": 0.01000621, + "balance_loss_clip": 1.01169515, + "balance_loss_mlp": 0.99913043, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.6880353143695048, + "language_loss": 0.4913716, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51155329, + "num_input_tokens_seen": 202858580, + "step": 9416, + "time_per_iteration": 3.231870174407959 + }, + { + "auxiliary_loss_clip": 0.01105681, + "auxiliary_loss_mlp": 0.00778479, + "balance_loss_clip": 1.04046166, + "balance_loss_mlp": 1.00063443, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 2.172518967719053, + "language_loss": 0.63093424, + "learning_rate": 1.6698909172706e-06, + "loss": 0.64977586, + "num_input_tokens_seen": 202878565, + "step": 9417, + "time_per_iteration": 4.145868539810181 + }, + { + "auxiliary_loss_clip": 0.01094911, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.03759074, + "balance_loss_mlp": 1.01949406, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.8383635581712696, + "language_loss": 0.68847793, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.70975226, + "num_input_tokens_seen": 202897350, + "step": 9418, + "time_per_iteration": 2.525287628173828 + }, + { + "auxiliary_loss_clip": 0.01102261, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.03611207, + "balance_loss_mlp": 1.01968968, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 1.7344435819400916, + "language_loss": 0.64671314, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.66808105, + "num_input_tokens_seen": 202916745, + "step": 9419, + "time_per_iteration": 2.5749728679656982 + }, + { + "auxiliary_loss_clip": 0.00980493, + "auxiliary_loss_mlp": 0.01001849, + "balance_loss_clip": 1.02113509, + "balance_loss_mlp": 1.00051999, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7307536742843144, + "language_loss": 0.59746456, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61728793, + "num_input_tokens_seen": 202982375, + "step": 9420, + "time_per_iteration": 3.3665356636047363 + }, + { + "auxiliary_loss_clip": 0.01094499, + "auxiliary_loss_mlp": 0.00777418, + "balance_loss_clip": 1.03804779, + "balance_loss_mlp": 1.00068271, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 3.4817096128869083, + "language_loss": 0.7416178, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.760337, + "num_input_tokens_seen": 203002430, + "step": 9421, + "time_per_iteration": 3.6940691471099854 + }, + { + "auxiliary_loss_clip": 0.01080835, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.03623271, + "balance_loss_mlp": 1.02114046, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 1.9559146145023203, + "language_loss": 0.72359157, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.74474359, + "num_input_tokens_seen": 203019425, + "step": 9422, + "time_per_iteration": 2.5585556030273438 + }, + { + "auxiliary_loss_clip": 0.01100022, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.03861618, + "balance_loss_mlp": 1.02404594, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 2.0356192255274093, + "language_loss": 0.81664658, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.83800226, + "num_input_tokens_seen": 203039035, + "step": 9423, + "time_per_iteration": 2.490614652633667 + }, + { + "auxiliary_loss_clip": 0.0108896, + "auxiliary_loss_mlp": 0.0103514, + "balance_loss_clip": 1.03953981, + "balance_loss_mlp": 1.02209818, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 3.6150233903554683, + "language_loss": 0.80418873, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82542968, + "num_input_tokens_seen": 203059320, + "step": 9424, + "time_per_iteration": 2.5020620822906494 + }, + { + "auxiliary_loss_clip": 0.01118925, + "auxiliary_loss_mlp": 0.0077928, + "balance_loss_clip": 1.04055691, + "balance_loss_mlp": 1.00065362, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 2.564330603796423, + "language_loss": 0.7878592, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80684125, + "num_input_tokens_seen": 203078490, + "step": 9425, + "time_per_iteration": 2.497753143310547 + }, + { + "auxiliary_loss_clip": 0.01088672, + "auxiliary_loss_mlp": 0.01032089, + "balance_loss_clip": 1.04097962, + "balance_loss_mlp": 1.01980996, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 2.0296281377014953, + "language_loss": 0.58980584, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61101347, + "num_input_tokens_seen": 203096065, + "step": 9426, + "time_per_iteration": 2.5102851390838623 + }, + { + "auxiliary_loss_clip": 0.01106139, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.03941488, + "balance_loss_mlp": 1.01765335, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 1.796206811815177, + "language_loss": 0.82112414, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.84248096, + "num_input_tokens_seen": 203115270, + "step": 9427, + "time_per_iteration": 2.4644699096679688 + }, + { + "auxiliary_loss_clip": 0.01113368, + "auxiliary_loss_mlp": 0.01039309, + "balance_loss_clip": 1.04111814, + "balance_loss_mlp": 1.0263989, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 2.125590270236602, + "language_loss": 0.86064696, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88217378, + "num_input_tokens_seen": 203134290, + "step": 9428, + "time_per_iteration": 2.4551732540130615 + }, + { + "auxiliary_loss_clip": 0.01098909, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.04163599, + "balance_loss_mlp": 1.02226639, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 3.609021184077942, + "language_loss": 0.73840356, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.75975227, + "num_input_tokens_seen": 203152935, + "step": 9429, + "time_per_iteration": 2.5118939876556396 + }, + { + "auxiliary_loss_clip": 0.01097656, + "auxiliary_loss_mlp": 0.00778307, + "balance_loss_clip": 1.04087269, + "balance_loss_mlp": 1.00060236, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 1.8205326441061975, + "language_loss": 0.75889879, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77765846, + "num_input_tokens_seen": 203170110, + "step": 9430, + "time_per_iteration": 2.4916088581085205 + }, + { + "auxiliary_loss_clip": 0.01114744, + "auxiliary_loss_mlp": 0.01037135, + "balance_loss_clip": 1.03901601, + "balance_loss_mlp": 1.02436805, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 1.9401905578720708, + "language_loss": 0.72547269, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.74699152, + "num_input_tokens_seen": 203188825, + "step": 9431, + "time_per_iteration": 2.461052179336548 + }, + { + "auxiliary_loss_clip": 0.01065318, + "auxiliary_loss_mlp": 0.01028101, + "balance_loss_clip": 1.03592169, + "balance_loss_mlp": 1.01690173, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.6855312487177037, + "language_loss": 0.73577082, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75670499, + "num_input_tokens_seen": 203206860, + "step": 9432, + "time_per_iteration": 4.047347784042358 + }, + { + "auxiliary_loss_clip": 0.0106487, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.03491437, + "balance_loss_mlp": 1.02045584, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.451088614947256, + "language_loss": 0.7793355, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80031335, + "num_input_tokens_seen": 203225625, + "step": 9433, + "time_per_iteration": 2.5833349227905273 + }, + { + "auxiliary_loss_clip": 0.01119104, + "auxiliary_loss_mlp": 0.01036789, + "balance_loss_clip": 1.04115546, + "balance_loss_mlp": 1.02208424, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 2.3265658008367014, + "language_loss": 0.63420534, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65576428, + "num_input_tokens_seen": 203242920, + "step": 9434, + "time_per_iteration": 2.462963819503784 + }, + { + "auxiliary_loss_clip": 0.01102738, + "auxiliary_loss_mlp": 0.01029957, + "balance_loss_clip": 1.03923178, + "balance_loss_mlp": 1.0181613, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 3.558208126694677, + "language_loss": 0.66670179, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68802869, + "num_input_tokens_seen": 203261995, + "step": 9435, + "time_per_iteration": 2.507568836212158 + }, + { + "auxiliary_loss_clip": 0.01087486, + "auxiliary_loss_mlp": 0.00777052, + "balance_loss_clip": 1.03451324, + "balance_loss_mlp": 1.0004406, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.4413560649446744, + "language_loss": 0.71698755, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.7356329, + "num_input_tokens_seen": 203280670, + "step": 9436, + "time_per_iteration": 2.5607750415802 + }, + { + "auxiliary_loss_clip": 0.01115893, + "auxiliary_loss_mlp": 0.01029627, + "balance_loss_clip": 1.03940606, + "balance_loss_mlp": 1.01690793, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 1.4831179980093574, + "language_loss": 0.74134529, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76280046, + "num_input_tokens_seen": 203304800, + "step": 9437, + "time_per_iteration": 2.533665895462036 + }, + { + "auxiliary_loss_clip": 0.01110695, + "auxiliary_loss_mlp": 0.01037511, + "balance_loss_clip": 1.04507685, + "balance_loss_mlp": 1.02385581, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 4.586062492112935, + "language_loss": 0.61103177, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63251376, + "num_input_tokens_seen": 203324060, + "step": 9438, + "time_per_iteration": 2.553173065185547 + }, + { + "auxiliary_loss_clip": 0.01093587, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.03759265, + "balance_loss_mlp": 1.01976085, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.5419524357195797, + "language_loss": 0.75169981, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77295387, + "num_input_tokens_seen": 203344360, + "step": 9439, + "time_per_iteration": 2.565565347671509 + }, + { + "auxiliary_loss_clip": 0.01090119, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.04070902, + "balance_loss_mlp": 1.01821721, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 2.1955986691178753, + "language_loss": 0.83503217, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.85625279, + "num_input_tokens_seen": 203362115, + "step": 9440, + "time_per_iteration": 2.5131170749664307 + }, + { + "auxiliary_loss_clip": 0.0109035, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.04280412, + "balance_loss_mlp": 1.01937723, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 2.970678133934226, + "language_loss": 0.75842035, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77965122, + "num_input_tokens_seen": 203380550, + "step": 9441, + "time_per_iteration": 2.5338056087493896 + }, + { + "auxiliary_loss_clip": 0.01066992, + "auxiliary_loss_mlp": 0.01035535, + "balance_loss_clip": 1.03539598, + "balance_loss_mlp": 1.02189708, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 2.170151432542838, + "language_loss": 0.83286309, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85388839, + "num_input_tokens_seen": 203396590, + "step": 9442, + "time_per_iteration": 4.396801233291626 + }, + { + "auxiliary_loss_clip": 0.01079323, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.03894317, + "balance_loss_mlp": 1.02189159, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 1.9325878290285396, + "language_loss": 0.74638987, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76752347, + "num_input_tokens_seen": 203414280, + "step": 9443, + "time_per_iteration": 2.5884387493133545 + }, + { + "auxiliary_loss_clip": 0.01090962, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.04151821, + "balance_loss_mlp": 1.02077508, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 15.964238316934368, + "language_loss": 0.77076805, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79201061, + "num_input_tokens_seen": 203433280, + "step": 9444, + "time_per_iteration": 3.9045515060424805 + }, + { + "auxiliary_loss_clip": 0.01078897, + "auxiliary_loss_mlp": 0.01039595, + "balance_loss_clip": 1.03926492, + "balance_loss_mlp": 1.02637458, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.6698607109285892, + "language_loss": 0.8117913, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.83297622, + "num_input_tokens_seen": 203449935, + "step": 9445, + "time_per_iteration": 2.536237955093384 + }, + { + "auxiliary_loss_clip": 0.01111728, + "auxiliary_loss_mlp": 0.01030335, + "balance_loss_clip": 1.03750467, + "balance_loss_mlp": 1.01746058, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.7257507590376844, + "language_loss": 0.70936918, + "learning_rate": 1.658756760280259e-06, + "loss": 0.73078978, + "num_input_tokens_seen": 203473025, + "step": 9446, + "time_per_iteration": 2.5391793251037598 + }, + { + "auxiliary_loss_clip": 0.01083297, + "auxiliary_loss_mlp": 0.01030667, + "balance_loss_clip": 1.03807449, + "balance_loss_mlp": 1.01745844, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 1.7918222530505385, + "language_loss": 0.73676616, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75790578, + "num_input_tokens_seen": 203492895, + "step": 9447, + "time_per_iteration": 2.577991008758545 + }, + { + "auxiliary_loss_clip": 0.01095719, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.03916073, + "balance_loss_mlp": 1.01923656, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 1.9090742499401847, + "language_loss": 0.75600755, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77728343, + "num_input_tokens_seen": 203513710, + "step": 9448, + "time_per_iteration": 2.6002845764160156 + }, + { + "auxiliary_loss_clip": 0.01077417, + "auxiliary_loss_mlp": 0.01047603, + "balance_loss_clip": 1.04152215, + "balance_loss_mlp": 1.03289819, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 3.9000155296748638, + "language_loss": 0.764036, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78528619, + "num_input_tokens_seen": 203531630, + "step": 9449, + "time_per_iteration": 2.6387405395507812 + }, + { + "auxiliary_loss_clip": 0.01092247, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.03796792, + "balance_loss_mlp": 1.02940083, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.8773256139476746, + "language_loss": 0.74969959, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.77105463, + "num_input_tokens_seen": 203551885, + "step": 9450, + "time_per_iteration": 2.5935237407684326 + }, + { + "auxiliary_loss_clip": 0.01098666, + "auxiliary_loss_mlp": 0.01041225, + "balance_loss_clip": 1.04010057, + "balance_loss_mlp": 1.02849901, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 1.7835323244515542, + "language_loss": 0.66870308, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69010198, + "num_input_tokens_seen": 203572250, + "step": 9451, + "time_per_iteration": 2.540926694869995 + }, + { + "auxiliary_loss_clip": 0.01097976, + "auxiliary_loss_mlp": 0.01035854, + "balance_loss_clip": 1.04162991, + "balance_loss_mlp": 1.02101231, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 2.134714426876444, + "language_loss": 0.72405922, + "learning_rate": 1.656454488573026e-06, + "loss": 0.74539757, + "num_input_tokens_seen": 203590605, + "step": 9452, + "time_per_iteration": 2.567732334136963 + }, + { + "auxiliary_loss_clip": 0.01072739, + "auxiliary_loss_mlp": 0.01030336, + "balance_loss_clip": 1.03696907, + "balance_loss_mlp": 1.01821232, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.4604787835322857, + "language_loss": 0.70019937, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72123009, + "num_input_tokens_seen": 203610080, + "step": 9453, + "time_per_iteration": 2.5927844047546387 + }, + { + "auxiliary_loss_clip": 0.01074982, + "auxiliary_loss_mlp": 0.00776834, + "balance_loss_clip": 1.03964078, + "balance_loss_mlp": 1.00052214, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 1.6693168278268553, + "language_loss": 0.6951369, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71365505, + "num_input_tokens_seen": 203630060, + "step": 9454, + "time_per_iteration": 2.639882802963257 + }, + { + "auxiliary_loss_clip": 0.01091509, + "auxiliary_loss_mlp": 0.01030733, + "balance_loss_clip": 1.03745794, + "balance_loss_mlp": 1.01875281, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 2.1794462253836904, + "language_loss": 0.60818696, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.62940937, + "num_input_tokens_seen": 203649065, + "step": 9455, + "time_per_iteration": 2.545353889465332 + }, + { + "auxiliary_loss_clip": 0.01079057, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.04187667, + "balance_loss_mlp": 1.02318907, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 1.9365470811141872, + "language_loss": 0.73136652, + "learning_rate": 1.6549199011198e-06, + "loss": 0.7525245, + "num_input_tokens_seen": 203667545, + "step": 9456, + "time_per_iteration": 4.2139482498168945 + }, + { + "auxiliary_loss_clip": 0.01096742, + "auxiliary_loss_mlp": 0.0103383, + "balance_loss_clip": 1.04058337, + "balance_loss_mlp": 1.02214742, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.6286602000541983, + "language_loss": 0.76840627, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.78971201, + "num_input_tokens_seen": 203686025, + "step": 9457, + "time_per_iteration": 2.535165548324585 + }, + { + "auxiliary_loss_clip": 0.01106867, + "auxiliary_loss_mlp": 0.01038314, + "balance_loss_clip": 1.04031324, + "balance_loss_mlp": 1.02411044, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 2.130854745358702, + "language_loss": 0.66387045, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68532223, + "num_input_tokens_seen": 203705540, + "step": 9458, + "time_per_iteration": 2.576923131942749 + }, + { + "auxiliary_loss_clip": 0.01107528, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.04087985, + "balance_loss_mlp": 1.01901865, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.4836150302899553, + "language_loss": 0.68432301, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.70572507, + "num_input_tokens_seen": 203723670, + "step": 9459, + "time_per_iteration": 2.495619058609009 + }, + { + "auxiliary_loss_clip": 0.01094387, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.04466414, + "balance_loss_mlp": 1.01752114, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 2.6441649992716476, + "language_loss": 0.7739737, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.79522461, + "num_input_tokens_seen": 203739705, + "step": 9460, + "time_per_iteration": 2.5425009727478027 + }, + { + "auxiliary_loss_clip": 0.01063785, + "auxiliary_loss_mlp": 0.01038373, + "balance_loss_clip": 1.04143739, + "balance_loss_mlp": 1.02544427, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.7557753419034754, + "language_loss": 0.72239977, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74342132, + "num_input_tokens_seen": 203759000, + "step": 9461, + "time_per_iteration": 2.6721818447113037 + }, + { + "auxiliary_loss_clip": 0.01108065, + "auxiliary_loss_mlp": 0.01035529, + "balance_loss_clip": 1.04146004, + "balance_loss_mlp": 1.02198076, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 2.0306228557785957, + "language_loss": 0.73077583, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75221175, + "num_input_tokens_seen": 203774295, + "step": 9462, + "time_per_iteration": 2.483259439468384 + }, + { + "auxiliary_loss_clip": 0.01104763, + "auxiliary_loss_mlp": 0.01027423, + "balance_loss_clip": 1.04117942, + "balance_loss_mlp": 1.01579404, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.879072729455511, + "language_loss": 0.72607529, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.74739718, + "num_input_tokens_seen": 203792710, + "step": 9463, + "time_per_iteration": 2.502007484436035 + }, + { + "auxiliary_loss_clip": 0.01103562, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.03938937, + "balance_loss_mlp": 1.02239203, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 1.6870168593844759, + "language_loss": 0.74091959, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76230133, + "num_input_tokens_seen": 203811645, + "step": 9464, + "time_per_iteration": 2.55359148979187 + }, + { + "auxiliary_loss_clip": 0.01111071, + "auxiliary_loss_mlp": 0.00779355, + "balance_loss_clip": 1.04410815, + "balance_loss_mlp": 1.000494, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 1.769660508790602, + "language_loss": 0.84323192, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.86213619, + "num_input_tokens_seen": 203830040, + "step": 9465, + "time_per_iteration": 2.506274938583374 + }, + { + "auxiliary_loss_clip": 0.01091975, + "auxiliary_loss_mlp": 0.01031084, + "balance_loss_clip": 1.03672528, + "balance_loss_mlp": 1.01829278, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 1.6205015566012335, + "language_loss": 0.72101516, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74224573, + "num_input_tokens_seen": 203851245, + "step": 9466, + "time_per_iteration": 2.552924394607544 + }, + { + "auxiliary_loss_clip": 0.01012171, + "auxiliary_loss_mlp": 0.01011964, + "balance_loss_clip": 1.01473498, + "balance_loss_mlp": 1.01068866, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7136169982730888, + "language_loss": 0.55360389, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57384515, + "num_input_tokens_seen": 203916400, + "step": 9467, + "time_per_iteration": 3.165024995803833 + }, + { + "auxiliary_loss_clip": 0.0110356, + "auxiliary_loss_mlp": 0.0103707, + "balance_loss_clip": 1.03952265, + "balance_loss_mlp": 1.02233529, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 2.0135385430032136, + "language_loss": 0.63934481, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.6607511, + "num_input_tokens_seen": 203935870, + "step": 9468, + "time_per_iteration": 2.4992141723632812 + }, + { + "auxiliary_loss_clip": 0.01076692, + "auxiliary_loss_mlp": 0.01036774, + "balance_loss_clip": 1.04569268, + "balance_loss_mlp": 1.02273762, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 3.073879514966335, + "language_loss": 0.79506242, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81619704, + "num_input_tokens_seen": 203954950, + "step": 9469, + "time_per_iteration": 2.6233291625976562 + }, + { + "auxiliary_loss_clip": 0.01081622, + "auxiliary_loss_mlp": 0.01042977, + "balance_loss_clip": 1.03799653, + "balance_loss_mlp": 1.02755094, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 2.0029674554466883, + "language_loss": 0.69418216, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71542817, + "num_input_tokens_seen": 203972715, + "step": 9470, + "time_per_iteration": 2.5358221530914307 + }, + { + "auxiliary_loss_clip": 0.01094198, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.04446721, + "balance_loss_mlp": 1.0194416, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.8625336860816522, + "language_loss": 0.74761277, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.76888323, + "num_input_tokens_seen": 203990775, + "step": 9471, + "time_per_iteration": 3.996042251586914 + }, + { + "auxiliary_loss_clip": 0.01080819, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.04471493, + "balance_loss_mlp": 1.0238502, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.9089221384102186, + "language_loss": 0.57199448, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.593189, + "num_input_tokens_seen": 204008845, + "step": 9472, + "time_per_iteration": 2.5407848358154297 + }, + { + "auxiliary_loss_clip": 0.01081619, + "auxiliary_loss_mlp": 0.01030279, + "balance_loss_clip": 1.03866649, + "balance_loss_mlp": 1.01703477, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 1.8229204346989407, + "language_loss": 0.73551911, + "learning_rate": 1.648400251450638e-06, + "loss": 0.75663805, + "num_input_tokens_seen": 204023755, + "step": 9473, + "time_per_iteration": 2.528855800628662 + }, + { + "auxiliary_loss_clip": 0.01011334, + "auxiliary_loss_mlp": 0.01004846, + "balance_loss_clip": 1.01323795, + "balance_loss_mlp": 1.00378537, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6643820292021884, + "language_loss": 0.57619989, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.5963617, + "num_input_tokens_seen": 204091255, + "step": 9474, + "time_per_iteration": 3.1388726234436035 + }, + { + "auxiliary_loss_clip": 0.0110454, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.04095721, + "balance_loss_mlp": 1.02106166, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 1.974610340818634, + "language_loss": 0.53845769, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.55985069, + "num_input_tokens_seen": 204113285, + "step": 9475, + "time_per_iteration": 2.6227529048919678 + }, + { + "auxiliary_loss_clip": 0.01120946, + "auxiliary_loss_mlp": 0.0103783, + "balance_loss_clip": 1.04392171, + "balance_loss_mlp": 1.02434707, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.5653650758841813, + "language_loss": 0.7969045, + "learning_rate": 1.647250122983675e-06, + "loss": 0.81849223, + "num_input_tokens_seen": 204133045, + "step": 9476, + "time_per_iteration": 2.5134217739105225 + }, + { + "auxiliary_loss_clip": 0.0109753, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.04461002, + "balance_loss_mlp": 1.02069187, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 1.9254784409755665, + "language_loss": 0.66371632, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.6850282, + "num_input_tokens_seen": 204152590, + "step": 9477, + "time_per_iteration": 2.549880266189575 + }, + { + "auxiliary_loss_clip": 0.01086259, + "auxiliary_loss_mlp": 0.01033241, + "balance_loss_clip": 1.04178381, + "balance_loss_mlp": 1.01953173, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.5829538631276852, + "language_loss": 0.70742726, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.7286222, + "num_input_tokens_seen": 204171815, + "step": 9478, + "time_per_iteration": 2.6134653091430664 + }, + { + "auxiliary_loss_clip": 0.01086397, + "auxiliary_loss_mlp": 0.01029484, + "balance_loss_clip": 1.04140365, + "balance_loss_mlp": 1.0174861, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 1.7115212843528569, + "language_loss": 0.69079643, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71195519, + "num_input_tokens_seen": 204188535, + "step": 9479, + "time_per_iteration": 2.555166721343994 + }, + { + "auxiliary_loss_clip": 0.01078164, + "auxiliary_loss_mlp": 0.01028909, + "balance_loss_clip": 1.0416503, + "balance_loss_mlp": 1.0167973, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.39808027966487, + "language_loss": 0.71106356, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.73213422, + "num_input_tokens_seen": 204208365, + "step": 9480, + "time_per_iteration": 2.573410749435425 + }, + { + "auxiliary_loss_clip": 0.01088462, + "auxiliary_loss_mlp": 0.00777654, + "balance_loss_clip": 1.0418191, + "balance_loss_mlp": 1.00046229, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 2.3116953499140287, + "language_loss": 0.72352248, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.74218369, + "num_input_tokens_seen": 204226560, + "step": 9481, + "time_per_iteration": 4.078419923782349 + }, + { + "auxiliary_loss_clip": 0.01110007, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.04427147, + "balance_loss_mlp": 1.02065969, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 1.7441931894931266, + "language_loss": 0.77987689, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.8013165, + "num_input_tokens_seen": 204245410, + "step": 9482, + "time_per_iteration": 2.5091540813446045 + }, + { + "auxiliary_loss_clip": 0.01095574, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.04182649, + "balance_loss_mlp": 1.01529932, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 2.0658873358229037, + "language_loss": 0.77954298, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.80078125, + "num_input_tokens_seen": 204264840, + "step": 9483, + "time_per_iteration": 3.834172010421753 + }, + { + "auxiliary_loss_clip": 0.01096779, + "auxiliary_loss_mlp": 0.01038262, + "balance_loss_clip": 1.04347467, + "balance_loss_mlp": 1.02620435, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 2.0353745210797154, + "language_loss": 0.81071687, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.83206725, + "num_input_tokens_seen": 204284335, + "step": 9484, + "time_per_iteration": 2.554288387298584 + }, + { + "auxiliary_loss_clip": 0.01119058, + "auxiliary_loss_mlp": 0.00777953, + "balance_loss_clip": 1.04275429, + "balance_loss_mlp": 1.00052989, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 2.0888537485097403, + "language_loss": 0.6076926, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.62666267, + "num_input_tokens_seen": 204302590, + "step": 9485, + "time_per_iteration": 2.5004465579986572 + }, + { + "auxiliary_loss_clip": 0.01105009, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.04084957, + "balance_loss_mlp": 1.01721263, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 1.916840586985496, + "language_loss": 0.65173614, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67309302, + "num_input_tokens_seen": 204323055, + "step": 9486, + "time_per_iteration": 2.519488573074341 + }, + { + "auxiliary_loss_clip": 0.01028321, + "auxiliary_loss_mlp": 0.01001797, + "balance_loss_clip": 1.02067065, + "balance_loss_mlp": 1.00068271, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6666179275426098, + "language_loss": 0.47957009, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.49987128, + "num_input_tokens_seen": 204386160, + "step": 9487, + "time_per_iteration": 3.1626651287078857 + }, + { + "auxiliary_loss_clip": 0.01083559, + "auxiliary_loss_mlp": 0.00778258, + "balance_loss_clip": 1.04114413, + "balance_loss_mlp": 1.00048923, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 1.6194578553802659, + "language_loss": 0.86571002, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.88432819, + "num_input_tokens_seen": 204406315, + "step": 9488, + "time_per_iteration": 2.597043037414551 + }, + { + "auxiliary_loss_clip": 0.01080328, + "auxiliary_loss_mlp": 0.0103463, + "balance_loss_clip": 1.03570127, + "balance_loss_mlp": 1.0207777, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.404388842102599, + "language_loss": 0.78916943, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81031907, + "num_input_tokens_seen": 204427645, + "step": 9489, + "time_per_iteration": 2.599061965942383 + }, + { + "auxiliary_loss_clip": 0.01095412, + "auxiliary_loss_mlp": 0.01026123, + "balance_loss_clip": 1.04098547, + "balance_loss_mlp": 1.01473212, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.7452588381191672, + "language_loss": 0.70238715, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72360247, + "num_input_tokens_seen": 204445910, + "step": 9490, + "time_per_iteration": 2.5393197536468506 + }, + { + "auxiliary_loss_clip": 0.01083651, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.04005396, + "balance_loss_mlp": 1.01900172, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.7157030747039208, + "language_loss": 0.76345766, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78460455, + "num_input_tokens_seen": 204464680, + "step": 9491, + "time_per_iteration": 2.5785417556762695 + }, + { + "auxiliary_loss_clip": 0.01013487, + "auxiliary_loss_mlp": 0.00753553, + "balance_loss_clip": 1.01819015, + "balance_loss_mlp": 1.00038862, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.8095663332016251, + "language_loss": 0.57348299, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59115338, + "num_input_tokens_seen": 204525580, + "step": 9492, + "time_per_iteration": 3.0999205112457275 + }, + { + "auxiliary_loss_clip": 0.01096193, + "auxiliary_loss_mlp": 0.00778114, + "balance_loss_clip": 1.04664409, + "balance_loss_mlp": 1.00055885, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 2.148188641066089, + "language_loss": 0.71714962, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73589265, + "num_input_tokens_seen": 204541320, + "step": 9493, + "time_per_iteration": 2.5893118381500244 + }, + { + "auxiliary_loss_clip": 0.01121501, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.0434866, + "balance_loss_mlp": 1.01677656, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.5944204174439978, + "language_loss": 0.77579892, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.79731423, + "num_input_tokens_seen": 204560275, + "step": 9494, + "time_per_iteration": 2.488208055496216 + }, + { + "auxiliary_loss_clip": 0.01121302, + "auxiliary_loss_mlp": 0.01037852, + "balance_loss_clip": 1.04168868, + "balance_loss_mlp": 1.02351713, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.657885919581032, + "language_loss": 0.80320203, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82479358, + "num_input_tokens_seen": 204579430, + "step": 9495, + "time_per_iteration": 2.492647409439087 + }, + { + "auxiliary_loss_clip": 0.01076494, + "auxiliary_loss_mlp": 0.01043358, + "balance_loss_clip": 1.03758168, + "balance_loss_mlp": 1.02715743, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 2.1034957789022655, + "language_loss": 0.66306782, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68426639, + "num_input_tokens_seen": 204597710, + "step": 9496, + "time_per_iteration": 4.249013423919678 + }, + { + "auxiliary_loss_clip": 0.01121228, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.04274106, + "balance_loss_mlp": 1.02319479, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 2.2147336162970404, + "language_loss": 0.69317603, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71475565, + "num_input_tokens_seen": 204616140, + "step": 9497, + "time_per_iteration": 2.4408788681030273 + }, + { + "auxiliary_loss_clip": 0.01105743, + "auxiliary_loss_mlp": 0.00779249, + "balance_loss_clip": 1.04388142, + "balance_loss_mlp": 1.00056338, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 1.622947173050663, + "language_loss": 0.80928981, + "learning_rate": 1.638819551358182e-06, + "loss": 0.82813978, + "num_input_tokens_seen": 204636470, + "step": 9498, + "time_per_iteration": 2.530721426010132 + }, + { + "auxiliary_loss_clip": 0.01117293, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.04064655, + "balance_loss_mlp": 1.02098751, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 2.149458396007251, + "language_loss": 0.66970569, + "learning_rate": 1.638436499891469e-06, + "loss": 0.69123173, + "num_input_tokens_seen": 204656640, + "step": 9499, + "time_per_iteration": 2.49385666847229 + }, + { + "auxiliary_loss_clip": 0.01087352, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.03917778, + "balance_loss_mlp": 1.02416599, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 1.8545823055513817, + "language_loss": 0.71478713, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.73603344, + "num_input_tokens_seen": 204675475, + "step": 9500, + "time_per_iteration": 2.499195098876953 + }, + { + "auxiliary_loss_clip": 0.01090725, + "auxiliary_loss_mlp": 0.01032089, + "balance_loss_clip": 1.0390842, + "balance_loss_mlp": 1.01846313, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 2.0064187854504265, + "language_loss": 0.7664851, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78771323, + "num_input_tokens_seen": 204695385, + "step": 9501, + "time_per_iteration": 2.6021764278411865 + }, + { + "auxiliary_loss_clip": 0.01098519, + "auxiliary_loss_mlp": 0.01034507, + "balance_loss_clip": 1.0399152, + "balance_loss_mlp": 1.02165079, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.9630026078316545, + "language_loss": 0.75017715, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.77150738, + "num_input_tokens_seen": 204714730, + "step": 9502, + "time_per_iteration": 2.506822347640991 + }, + { + "auxiliary_loss_clip": 0.01085802, + "auxiliary_loss_mlp": 0.01027363, + "balance_loss_clip": 1.04169202, + "balance_loss_mlp": 1.01475096, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 1.6127107530407252, + "language_loss": 0.82502019, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84615189, + "num_input_tokens_seen": 204735025, + "step": 9503, + "time_per_iteration": 2.548038959503174 + }, + { + "auxiliary_loss_clip": 0.01084385, + "auxiliary_loss_mlp": 0.01031766, + "balance_loss_clip": 1.04155827, + "balance_loss_mlp": 1.01977336, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.0851842776276523, + "language_loss": 0.85881388, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.87997544, + "num_input_tokens_seen": 204751365, + "step": 9504, + "time_per_iteration": 2.511873245239258 + }, + { + "auxiliary_loss_clip": 0.01072827, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.03775787, + "balance_loss_mlp": 1.01726007, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.72778046462293, + "language_loss": 0.75354075, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.77457511, + "num_input_tokens_seen": 204768980, + "step": 9505, + "time_per_iteration": 2.5514047145843506 + }, + { + "auxiliary_loss_clip": 0.01115369, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.0403347, + "balance_loss_mlp": 1.02243626, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 1.553350535316159, + "language_loss": 0.81799126, + "learning_rate": 1.635755524332509e-06, + "loss": 0.83949083, + "num_input_tokens_seen": 204788110, + "step": 9506, + "time_per_iteration": 2.4463508129119873 + }, + { + "auxiliary_loss_clip": 0.01076696, + "auxiliary_loss_mlp": 0.00779261, + "balance_loss_clip": 1.03548479, + "balance_loss_mlp": 1.00053263, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.7039177718077902, + "language_loss": 0.772241, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79080051, + "num_input_tokens_seen": 204807240, + "step": 9507, + "time_per_iteration": 2.5370585918426514 + }, + { + "auxiliary_loss_clip": 0.01096682, + "auxiliary_loss_mlp": 0.01037644, + "balance_loss_clip": 1.03925121, + "balance_loss_mlp": 1.0234344, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 1.39650954325912, + "language_loss": 0.68499815, + "learning_rate": 1.63498965540751e-06, + "loss": 0.70634139, + "num_input_tokens_seen": 204826415, + "step": 9508, + "time_per_iteration": 2.53788161277771 + }, + { + "auxiliary_loss_clip": 0.01118792, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.04097629, + "balance_loss_mlp": 1.01755667, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.1750025552007135, + "language_loss": 0.79677022, + "learning_rate": 1.634606741699593e-06, + "loss": 0.81826878, + "num_input_tokens_seen": 204844305, + "step": 9509, + "time_per_iteration": 2.4193274974823 + }, + { + "auxiliary_loss_clip": 0.01101813, + "auxiliary_loss_mlp": 0.01030426, + "balance_loss_clip": 1.04059815, + "balance_loss_mlp": 1.01713407, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 1.7697998919483457, + "language_loss": 0.72450316, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74582553, + "num_input_tokens_seen": 204861765, + "step": 9510, + "time_per_iteration": 2.4485418796539307 + }, + { + "auxiliary_loss_clip": 0.01093405, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.03869224, + "balance_loss_mlp": 1.02052081, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.4329169995219928, + "language_loss": 0.69559479, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71685791, + "num_input_tokens_seen": 204882505, + "step": 9511, + "time_per_iteration": 4.04326605796814 + }, + { + "auxiliary_loss_clip": 0.01095552, + "auxiliary_loss_mlp": 0.01034633, + "balance_loss_clip": 1.04069507, + "balance_loss_mlp": 1.02235997, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 1.8190479608580647, + "language_loss": 0.6172384, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63854027, + "num_input_tokens_seen": 204899830, + "step": 9512, + "time_per_iteration": 2.4714879989624023 + }, + { + "auxiliary_loss_clip": 0.01092959, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.03933835, + "balance_loss_mlp": 1.01611388, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 2.7952385200649035, + "language_loss": 0.75616425, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.7773779, + "num_input_tokens_seen": 204918100, + "step": 9513, + "time_per_iteration": 2.5325393676757812 + }, + { + "auxiliary_loss_clip": 0.01027585, + "auxiliary_loss_mlp": 0.01003387, + "balance_loss_clip": 1.01134372, + "balance_loss_mlp": 1.00237334, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.915900930163454, + "language_loss": 0.66818315, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68849283, + "num_input_tokens_seen": 204972925, + "step": 9514, + "time_per_iteration": 3.0405101776123047 + }, + { + "auxiliary_loss_clip": 0.01112754, + "auxiliary_loss_mlp": 0.01042245, + "balance_loss_clip": 1.04362488, + "balance_loss_mlp": 1.02905405, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 2.0749967267250344, + "language_loss": 0.81177014, + "learning_rate": 1.63230955093099e-06, + "loss": 0.83332014, + "num_input_tokens_seen": 204990910, + "step": 9515, + "time_per_iteration": 2.5144340991973877 + }, + { + "auxiliary_loss_clip": 0.01096188, + "auxiliary_loss_mlp": 0.01031106, + "balance_loss_clip": 1.03690481, + "balance_loss_mlp": 1.01843357, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 2.59021822597038, + "language_loss": 0.85895991, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88023287, + "num_input_tokens_seen": 205010500, + "step": 9516, + "time_per_iteration": 2.5541632175445557 + }, + { + "auxiliary_loss_clip": 0.01087954, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.04453778, + "balance_loss_mlp": 1.01782393, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.7548810700504514, + "language_loss": 0.87432539, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.89551491, + "num_input_tokens_seen": 205028560, + "step": 9517, + "time_per_iteration": 2.567539691925049 + }, + { + "auxiliary_loss_clip": 0.01072732, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.04101324, + "balance_loss_mlp": 1.01575112, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.8125341451275108, + "language_loss": 0.85539901, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.8764146, + "num_input_tokens_seen": 205048650, + "step": 9518, + "time_per_iteration": 2.6519622802734375 + }, + { + "auxiliary_loss_clip": 0.01100489, + "auxiliary_loss_mlp": 0.01031512, + "balance_loss_clip": 1.03895986, + "balance_loss_mlp": 1.01901913, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 2.2016113394557757, + "language_loss": 0.78932476, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.81064475, + "num_input_tokens_seen": 205066480, + "step": 9519, + "time_per_iteration": 2.47554349899292 + }, + { + "auxiliary_loss_clip": 0.01114713, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.04070914, + "balance_loss_mlp": 1.02029717, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.625081408598239, + "language_loss": 0.82907724, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85054719, + "num_input_tokens_seen": 205087475, + "step": 9520, + "time_per_iteration": 2.4977352619171143 + }, + { + "auxiliary_loss_clip": 0.0109611, + "auxiliary_loss_mlp": 0.01039633, + "balance_loss_clip": 1.04096496, + "balance_loss_mlp": 1.02638328, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 2.204453617777292, + "language_loss": 0.72250772, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74386513, + "num_input_tokens_seen": 205106495, + "step": 9521, + "time_per_iteration": 4.096383571624756 + }, + { + "auxiliary_loss_clip": 0.01113014, + "auxiliary_loss_mlp": 0.00777044, + "balance_loss_clip": 1.0389719, + "balance_loss_mlp": 1.00056052, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 1.6296299345864862, + "language_loss": 0.78268218, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.80158275, + "num_input_tokens_seen": 205128285, + "step": 9522, + "time_per_iteration": 2.5411014556884766 + }, + { + "auxiliary_loss_clip": 0.01090706, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.04383039, + "balance_loss_mlp": 1.02286386, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.5679252470511658, + "language_loss": 0.71606982, + "learning_rate": 1.629247411248102e-06, + "loss": 0.73731685, + "num_input_tokens_seen": 205146595, + "step": 9523, + "time_per_iteration": 3.8450844287872314 + }, + { + "auxiliary_loss_clip": 0.01090467, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.03806138, + "balance_loss_mlp": 1.01685071, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 1.659715414913846, + "language_loss": 0.69858497, + "learning_rate": 1.628864706900738e-06, + "loss": 0.71977723, + "num_input_tokens_seen": 205164295, + "step": 9524, + "time_per_iteration": 2.535489082336426 + }, + { + "auxiliary_loss_clip": 0.01105067, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.0410248, + "balance_loss_mlp": 1.01860285, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.4142248122489731, + "language_loss": 0.65533805, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67669326, + "num_input_tokens_seen": 205185380, + "step": 9525, + "time_per_iteration": 2.5880672931671143 + }, + { + "auxiliary_loss_clip": 0.01088963, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.03851628, + "balance_loss_mlp": 1.01803064, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 1.8597036605466901, + "language_loss": 0.7226662, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74385321, + "num_input_tokens_seen": 205204895, + "step": 9526, + "time_per_iteration": 2.5291049480438232 + }, + { + "auxiliary_loss_clip": 0.01101429, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.03956223, + "balance_loss_mlp": 1.02019358, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.5885396695193186, + "language_loss": 0.80092496, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.82226133, + "num_input_tokens_seen": 205223440, + "step": 9527, + "time_per_iteration": 2.54056978225708 + }, + { + "auxiliary_loss_clip": 0.0110201, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.03824854, + "balance_loss_mlp": 1.0227201, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 1.7859627700003307, + "language_loss": 0.72786444, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.7492401, + "num_input_tokens_seen": 205242800, + "step": 9528, + "time_per_iteration": 2.5129806995391846 + }, + { + "auxiliary_loss_clip": 0.01113869, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.03973651, + "balance_loss_mlp": 1.02332878, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 2.1977207394155496, + "language_loss": 0.86239558, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.88389039, + "num_input_tokens_seen": 205259465, + "step": 9529, + "time_per_iteration": 2.462265729904175 + }, + { + "auxiliary_loss_clip": 0.01018014, + "auxiliary_loss_mlp": 0.01001302, + "balance_loss_clip": 1.01189566, + "balance_loss_mlp": 1.00022912, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7653802781096999, + "language_loss": 0.56106794, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58126104, + "num_input_tokens_seen": 205314100, + "step": 9530, + "time_per_iteration": 2.930708169937134 + }, + { + "auxiliary_loss_clip": 0.01097422, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.04221416, + "balance_loss_mlp": 1.01720834, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 2.116954760450828, + "language_loss": 0.66400892, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68527341, + "num_input_tokens_seen": 205333420, + "step": 9531, + "time_per_iteration": 2.525686502456665 + }, + { + "auxiliary_loss_clip": 0.01097695, + "auxiliary_loss_mlp": 0.01037166, + "balance_loss_clip": 1.03573585, + "balance_loss_mlp": 1.02323055, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 1.9778152276847685, + "language_loss": 0.75506496, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.77641356, + "num_input_tokens_seen": 205350995, + "step": 9532, + "time_per_iteration": 2.6411263942718506 + }, + { + "auxiliary_loss_clip": 0.0111182, + "auxiliary_loss_mlp": 0.01028318, + "balance_loss_clip": 1.03871226, + "balance_loss_mlp": 1.01552081, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 1.7412030089512305, + "language_loss": 0.7869215, + "learning_rate": 1.625421002822686e-06, + "loss": 0.80832291, + "num_input_tokens_seen": 205372675, + "step": 9533, + "time_per_iteration": 2.4899511337280273 + }, + { + "auxiliary_loss_clip": 0.01100519, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.03937495, + "balance_loss_mlp": 1.0172286, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 1.6399917419603325, + "language_loss": 0.85472959, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.8760246, + "num_input_tokens_seen": 205392590, + "step": 9534, + "time_per_iteration": 2.5098721981048584 + }, + { + "auxiliary_loss_clip": 0.01094245, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.04026294, + "balance_loss_mlp": 1.01820207, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 1.9112743107992691, + "language_loss": 0.75338578, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77464986, + "num_input_tokens_seen": 205414885, + "step": 9535, + "time_per_iteration": 4.211116075515747 + }, + { + "auxiliary_loss_clip": 0.01098853, + "auxiliary_loss_mlp": 0.01029688, + "balance_loss_clip": 1.04027462, + "balance_loss_mlp": 1.01708782, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.6806666349129151, + "language_loss": 0.70868659, + "learning_rate": 1.624273356614346e-06, + "loss": 0.729972, + "num_input_tokens_seen": 205434440, + "step": 9536, + "time_per_iteration": 2.5702033042907715 + }, + { + "auxiliary_loss_clip": 0.01072132, + "auxiliary_loss_mlp": 0.01036153, + "balance_loss_clip": 1.03370845, + "balance_loss_mlp": 1.02270007, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 2.0091232464349678, + "language_loss": 0.69798428, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.7190671, + "num_input_tokens_seen": 205454225, + "step": 9537, + "time_per_iteration": 2.623673915863037 + }, + { + "auxiliary_loss_clip": 0.0111341, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.03961825, + "balance_loss_mlp": 1.01679897, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.6938170939300579, + "language_loss": 0.62846828, + "learning_rate": 1.623508330355902e-06, + "loss": 0.64989555, + "num_input_tokens_seen": 205474750, + "step": 9538, + "time_per_iteration": 2.512571334838867 + }, + { + "auxiliary_loss_clip": 0.01101366, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.03911304, + "balance_loss_mlp": 1.02090645, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.6789462574895813, + "language_loss": 0.83221292, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85355949, + "num_input_tokens_seen": 205495495, + "step": 9539, + "time_per_iteration": 2.5064926147460938 + }, + { + "auxiliary_loss_clip": 0.01085343, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.0483532, + "balance_loss_mlp": 1.01728773, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 2.019076160013828, + "language_loss": 0.73098093, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.7521311, + "num_input_tokens_seen": 205510070, + "step": 9540, + "time_per_iteration": 2.612989664077759 + }, + { + "auxiliary_loss_clip": 0.01095367, + "auxiliary_loss_mlp": 0.00776667, + "balance_loss_clip": 1.03683674, + "balance_loss_mlp": 1.00045252, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 1.8195663342592958, + "language_loss": 0.80177557, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82049596, + "num_input_tokens_seen": 205530190, + "step": 9541, + "time_per_iteration": 2.526320457458496 + }, + { + "auxiliary_loss_clip": 0.01094292, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.03911948, + "balance_loss_mlp": 1.01840544, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.299735275632943, + "language_loss": 0.65051675, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.67177308, + "num_input_tokens_seen": 205547380, + "step": 9542, + "time_per_iteration": 2.471209764480591 + }, + { + "auxiliary_loss_clip": 0.01093164, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.03905845, + "balance_loss_mlp": 1.01733661, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 2.346319001011611, + "language_loss": 0.82866818, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.84989214, + "num_input_tokens_seen": 205566540, + "step": 9543, + "time_per_iteration": 2.4815030097961426 + }, + { + "auxiliary_loss_clip": 0.01075694, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.03693485, + "balance_loss_mlp": 1.01790726, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 1.6983056496651903, + "language_loss": 0.73222017, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75329804, + "num_input_tokens_seen": 205584200, + "step": 9544, + "time_per_iteration": 2.5622804164886475 + }, + { + "auxiliary_loss_clip": 0.01068949, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.04032636, + "balance_loss_mlp": 1.01589704, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 1.718821589078253, + "language_loss": 0.76751167, + "learning_rate": 1.620831188925733e-06, + "loss": 0.7884894, + "num_input_tokens_seen": 205604675, + "step": 9545, + "time_per_iteration": 2.602388381958008 + }, + { + "auxiliary_loss_clip": 0.01092686, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.03964353, + "balance_loss_mlp": 1.02262795, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 2.021076591121935, + "language_loss": 0.56694961, + "learning_rate": 1.620448797546459e-06, + "loss": 0.5882256, + "num_input_tokens_seen": 205624680, + "step": 9546, + "time_per_iteration": 2.6082303524017334 + }, + { + "auxiliary_loss_clip": 0.01087544, + "auxiliary_loss_mlp": 0.01033691, + "balance_loss_clip": 1.03551531, + "balance_loss_mlp": 1.02088237, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.392531561983179, + "language_loss": 0.7631067, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.7843191, + "num_input_tokens_seen": 205641950, + "step": 9547, + "time_per_iteration": 2.510037899017334 + }, + { + "auxiliary_loss_clip": 0.01101483, + "auxiliary_loss_mlp": 0.01033905, + "balance_loss_clip": 1.03671217, + "balance_loss_mlp": 1.02050555, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 2.371372035036336, + "language_loss": 0.74104482, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76239872, + "num_input_tokens_seen": 205660130, + "step": 9548, + "time_per_iteration": 2.479172945022583 + }, + { + "auxiliary_loss_clip": 0.01087313, + "auxiliary_loss_mlp": 0.01035327, + "balance_loss_clip": 1.03855658, + "balance_loss_mlp": 1.02301896, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 2.5903249361160885, + "language_loss": 0.69288611, + "learning_rate": 1.619301709822355e-06, + "loss": 0.71411252, + "num_input_tokens_seen": 205678895, + "step": 9549, + "time_per_iteration": 2.577852964401245 + }, + { + "auxiliary_loss_clip": 0.01065407, + "auxiliary_loss_mlp": 0.0102844, + "balance_loss_clip": 1.04209518, + "balance_loss_mlp": 1.01641822, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.5593439387438803, + "language_loss": 0.79398513, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81492358, + "num_input_tokens_seen": 205698450, + "step": 9550, + "time_per_iteration": 4.0729005336761475 + }, + { + "auxiliary_loss_clip": 0.01086349, + "auxiliary_loss_mlp": 0.01038501, + "balance_loss_clip": 1.03654289, + "balance_loss_mlp": 1.02379084, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 2.177416882416277, + "language_loss": 0.67906213, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70031065, + "num_input_tokens_seen": 205714870, + "step": 9551, + "time_per_iteration": 2.4943549633026123 + }, + { + "auxiliary_loss_clip": 0.01077687, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.03912854, + "balance_loss_mlp": 1.01977515, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 2.276241743268633, + "language_loss": 0.71884394, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.7399404, + "num_input_tokens_seen": 205736045, + "step": 9552, + "time_per_iteration": 2.5904488563537598 + }, + { + "auxiliary_loss_clip": 0.01100608, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.04273236, + "balance_loss_mlp": 1.01780677, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 2.50741042273489, + "language_loss": 0.80223382, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82354558, + "num_input_tokens_seen": 205754445, + "step": 9553, + "time_per_iteration": 2.506561279296875 + }, + { + "auxiliary_loss_clip": 0.01105987, + "auxiliary_loss_mlp": 0.01030763, + "balance_loss_clip": 1.04006779, + "balance_loss_mlp": 1.01864529, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 2.0475942894459958, + "language_loss": 0.83456278, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85593021, + "num_input_tokens_seen": 205770595, + "step": 9554, + "time_per_iteration": 2.4557688236236572 + }, + { + "auxiliary_loss_clip": 0.01108157, + "auxiliary_loss_mlp": 0.00778656, + "balance_loss_clip": 1.03983641, + "balance_loss_mlp": 1.00063467, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 1.4838378649300006, + "language_loss": 0.71395165, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.7328198, + "num_input_tokens_seen": 205791935, + "step": 9555, + "time_per_iteration": 2.5534420013427734 + }, + { + "auxiliary_loss_clip": 0.01093935, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.04074764, + "balance_loss_mlp": 1.01713896, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.508810224376507, + "language_loss": 0.72738838, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.7486276, + "num_input_tokens_seen": 205807260, + "step": 9556, + "time_per_iteration": 2.483790636062622 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.04085195, + "balance_loss_mlp": 1.02068698, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 1.7495948967863255, + "language_loss": 0.73928714, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76067042, + "num_input_tokens_seen": 205826885, + "step": 9557, + "time_per_iteration": 2.519240379333496 + }, + { + "auxiliary_loss_clip": 0.01104841, + "auxiliary_loss_mlp": 0.01032408, + "balance_loss_clip": 1.03928709, + "balance_loss_mlp": 1.0198133, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.5434846337241153, + "language_loss": 0.67882574, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.70019829, + "num_input_tokens_seen": 205844630, + "step": 9558, + "time_per_iteration": 2.476402997970581 + }, + { + "auxiliary_loss_clip": 0.01090025, + "auxiliary_loss_mlp": 0.01046714, + "balance_loss_clip": 1.03907526, + "balance_loss_mlp": 1.02926219, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 2.0016406511859333, + "language_loss": 0.71144855, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73281592, + "num_input_tokens_seen": 205860960, + "step": 9559, + "time_per_iteration": 2.4716312885284424 + }, + { + "auxiliary_loss_clip": 0.0109182, + "auxiliary_loss_mlp": 0.00775999, + "balance_loss_clip": 1.03962505, + "balance_loss_mlp": 1.00054395, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 3.412607174535102, + "language_loss": 0.79324955, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.8119278, + "num_input_tokens_seen": 205880675, + "step": 9560, + "time_per_iteration": 2.5431106090545654 + }, + { + "auxiliary_loss_clip": 0.01051367, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.03732109, + "balance_loss_mlp": 1.01890182, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 1.83963253555009, + "language_loss": 0.64456832, + "learning_rate": 1.614714662090588e-06, + "loss": 0.66540056, + "num_input_tokens_seen": 205900050, + "step": 9561, + "time_per_iteration": 4.173317909240723 + }, + { + "auxiliary_loss_clip": 0.0111272, + "auxiliary_loss_mlp": 0.01036801, + "balance_loss_clip": 1.04119575, + "balance_loss_mlp": 1.02312183, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 1.6308837145687451, + "language_loss": 0.71016687, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73166203, + "num_input_tokens_seen": 205918855, + "step": 9562, + "time_per_iteration": 3.8250656127929688 + }, + { + "auxiliary_loss_clip": 0.01063095, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.03612614, + "balance_loss_mlp": 1.02385116, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.658181475275185, + "language_loss": 0.84154034, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86253059, + "num_input_tokens_seen": 205936970, + "step": 9563, + "time_per_iteration": 2.593865156173706 + }, + { + "auxiliary_loss_clip": 0.01072237, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.04139161, + "balance_loss_mlp": 1.02428091, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 1.7918888181082828, + "language_loss": 0.57140005, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59249699, + "num_input_tokens_seen": 205954630, + "step": 9564, + "time_per_iteration": 2.6822597980499268 + }, + { + "auxiliary_loss_clip": 0.01086004, + "auxiliary_loss_mlp": 0.01033433, + "balance_loss_clip": 1.03522563, + "balance_loss_mlp": 1.02004552, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.928420744291761, + "language_loss": 0.76059902, + "learning_rate": 1.613186112465078e-06, + "loss": 0.78179336, + "num_input_tokens_seen": 205971510, + "step": 9565, + "time_per_iteration": 2.4903311729431152 + }, + { + "auxiliary_loss_clip": 0.01006305, + "auxiliary_loss_mlp": 0.0101197, + "balance_loss_clip": 1.01777291, + "balance_loss_mlp": 1.01066518, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7431885865424409, + "language_loss": 0.60662502, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62680775, + "num_input_tokens_seen": 206035125, + "step": 9566, + "time_per_iteration": 3.2547318935394287 + }, + { + "auxiliary_loss_clip": 0.01089645, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.04024291, + "balance_loss_mlp": 1.02115631, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 1.882285102658365, + "language_loss": 0.75253445, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.7737689, + "num_input_tokens_seen": 206052075, + "step": 9567, + "time_per_iteration": 2.541869878768921 + }, + { + "auxiliary_loss_clip": 0.01103636, + "auxiliary_loss_mlp": 0.01029927, + "balance_loss_clip": 1.04001617, + "balance_loss_mlp": 1.01774335, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.4112169512363053, + "language_loss": 0.74571711, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.76705277, + "num_input_tokens_seen": 206069970, + "step": 9568, + "time_per_iteration": 2.474351167678833 + }, + { + "auxiliary_loss_clip": 0.01116605, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.04086685, + "balance_loss_mlp": 1.0212996, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.684431711942267, + "language_loss": 0.71268547, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73419219, + "num_input_tokens_seen": 206088950, + "step": 9569, + "time_per_iteration": 2.4618186950683594 + }, + { + "auxiliary_loss_clip": 0.01108817, + "auxiliary_loss_mlp": 0.01040216, + "balance_loss_clip": 1.04308116, + "balance_loss_mlp": 1.0267688, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 2.0815525241676682, + "language_loss": 0.55210435, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.57359463, + "num_input_tokens_seen": 206107780, + "step": 9570, + "time_per_iteration": 2.4742066860198975 + }, + { + "auxiliary_loss_clip": 0.01113486, + "auxiliary_loss_mlp": 0.01035368, + "balance_loss_clip": 1.03986621, + "balance_loss_mlp": 1.02324998, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.6802167982193075, + "language_loss": 0.64567429, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.66716278, + "num_input_tokens_seen": 206127445, + "step": 9571, + "time_per_iteration": 2.476836919784546 + }, + { + "auxiliary_loss_clip": 0.01105506, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.03918743, + "balance_loss_mlp": 1.01973212, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.6032555005507196, + "language_loss": 0.66839975, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.68978512, + "num_input_tokens_seen": 206152005, + "step": 9572, + "time_per_iteration": 2.7529947757720947 + }, + { + "auxiliary_loss_clip": 0.01093962, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.04116786, + "balance_loss_mlp": 1.0196414, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 1.8376726581744767, + "language_loss": 0.72516394, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74642962, + "num_input_tokens_seen": 206169875, + "step": 9573, + "time_per_iteration": 2.536311149597168 + }, + { + "auxiliary_loss_clip": 0.01112289, + "auxiliary_loss_mlp": 0.01029186, + "balance_loss_clip": 1.04337406, + "balance_loss_mlp": 1.01762843, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.8439738470789484, + "language_loss": 0.76362514, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.7850399, + "num_input_tokens_seen": 206192635, + "step": 9574, + "time_per_iteration": 4.325340509414673 + }, + { + "auxiliary_loss_clip": 0.01067772, + "auxiliary_loss_mlp": 0.01039243, + "balance_loss_clip": 1.03611946, + "balance_loss_mlp": 1.0246098, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 3.2205288992097825, + "language_loss": 0.66586494, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.68693507, + "num_input_tokens_seen": 206211485, + "step": 9575, + "time_per_iteration": 2.7025575637817383 + }, + { + "auxiliary_loss_clip": 0.01091731, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.03986835, + "balance_loss_mlp": 1.01988685, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.7676908460362246, + "language_loss": 0.80073762, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.82197869, + "num_input_tokens_seen": 206231740, + "step": 9576, + "time_per_iteration": 2.526667594909668 + }, + { + "auxiliary_loss_clip": 0.01094409, + "auxiliary_loss_mlp": 0.01027937, + "balance_loss_clip": 1.04242182, + "balance_loss_mlp": 1.01639736, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.6291851394184365, + "language_loss": 0.69461477, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71583819, + "num_input_tokens_seen": 206250975, + "step": 9577, + "time_per_iteration": 2.5463948249816895 + }, + { + "auxiliary_loss_clip": 0.01106292, + "auxiliary_loss_mlp": 0.01034176, + "balance_loss_clip": 1.04009032, + "balance_loss_mlp": 1.02232039, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 2.026136219848313, + "language_loss": 0.66671348, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68811822, + "num_input_tokens_seen": 206268800, + "step": 9578, + "time_per_iteration": 2.4925968647003174 + }, + { + "auxiliary_loss_clip": 0.01086416, + "auxiliary_loss_mlp": 0.01028192, + "balance_loss_clip": 1.03849936, + "balance_loss_mlp": 1.01611626, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.6032720348387475, + "language_loss": 0.72426236, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.74540848, + "num_input_tokens_seen": 206287190, + "step": 9579, + "time_per_iteration": 2.5076324939727783 + }, + { + "auxiliary_loss_clip": 0.01099649, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.04221141, + "balance_loss_mlp": 1.01792717, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 4.232239786152724, + "language_loss": 0.64605182, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66736823, + "num_input_tokens_seen": 206307020, + "step": 9580, + "time_per_iteration": 2.550455093383789 + }, + { + "auxiliary_loss_clip": 0.01089796, + "auxiliary_loss_mlp": 0.0103694, + "balance_loss_clip": 1.03681993, + "balance_loss_mlp": 1.0231179, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.8646418740840838, + "language_loss": 0.85675812, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87802547, + "num_input_tokens_seen": 206324095, + "step": 9581, + "time_per_iteration": 2.510399341583252 + }, + { + "auxiliary_loss_clip": 0.01122946, + "auxiliary_loss_mlp": 0.01037207, + "balance_loss_clip": 1.04498506, + "balance_loss_mlp": 1.0237658, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 2.83656510150433, + "language_loss": 0.67684197, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.69844353, + "num_input_tokens_seen": 206343210, + "step": 9582, + "time_per_iteration": 2.4390416145324707 + }, + { + "auxiliary_loss_clip": 0.01021647, + "auxiliary_loss_mlp": 0.01002028, + "balance_loss_clip": 1.01349831, + "balance_loss_mlp": 1.00086021, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6408510218399661, + "language_loss": 0.57172561, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59196234, + "num_input_tokens_seen": 206415935, + "step": 9583, + "time_per_iteration": 3.2433764934539795 + }, + { + "auxiliary_loss_clip": 0.01090024, + "auxiliary_loss_mlp": 0.01029262, + "balance_loss_clip": 1.04052258, + "balance_loss_mlp": 1.01692986, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 2.8161570197803423, + "language_loss": 0.8221516, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84334439, + "num_input_tokens_seen": 206431900, + "step": 9584, + "time_per_iteration": 2.509136915206909 + }, + { + "auxiliary_loss_clip": 0.01037806, + "auxiliary_loss_mlp": 0.01001853, + "balance_loss_clip": 1.0129056, + "balance_loss_mlp": 1.00064874, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6245630748379312, + "language_loss": 0.4954285, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51582509, + "num_input_tokens_seen": 206501200, + "step": 9585, + "time_per_iteration": 3.1214816570281982 + }, + { + "auxiliary_loss_clip": 0.01088245, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.03807473, + "balance_loss_mlp": 1.01530313, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 1.7415038130233271, + "language_loss": 0.84914744, + "learning_rate": 1.605165098835465e-06, + "loss": 0.87030435, + "num_input_tokens_seen": 206520575, + "step": 9586, + "time_per_iteration": 2.537534236907959 + }, + { + "auxiliary_loss_clip": 0.01101315, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.04024601, + "balance_loss_mlp": 1.01903105, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 1.7521194205543693, + "language_loss": 0.79943752, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82077861, + "num_input_tokens_seen": 206538060, + "step": 9587, + "time_per_iteration": 2.446286678314209 + }, + { + "auxiliary_loss_clip": 0.01086103, + "auxiliary_loss_mlp": 0.01039944, + "balance_loss_clip": 1.03732681, + "balance_loss_mlp": 1.02634239, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.7450794652101806, + "language_loss": 0.6586532, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.67991364, + "num_input_tokens_seen": 206557320, + "step": 9588, + "time_per_iteration": 2.5214056968688965 + }, + { + "auxiliary_loss_clip": 0.01089362, + "auxiliary_loss_mlp": 0.01038541, + "balance_loss_clip": 1.03780484, + "balance_loss_mlp": 1.0246172, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 2.818811972209651, + "language_loss": 0.7888242, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.81010318, + "num_input_tokens_seen": 206575780, + "step": 9589, + "time_per_iteration": 3.9928102493286133 + }, + { + "auxiliary_loss_clip": 0.01109993, + "auxiliary_loss_mlp": 0.01025425, + "balance_loss_clip": 1.03787673, + "balance_loss_mlp": 1.01376617, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 2.0428460903985006, + "language_loss": 0.79927373, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.82062793, + "num_input_tokens_seen": 206594100, + "step": 9590, + "time_per_iteration": 2.4812026023864746 + }, + { + "auxiliary_loss_clip": 0.01058967, + "auxiliary_loss_mlp": 0.00777666, + "balance_loss_clip": 1.04163933, + "balance_loss_mlp": 1.0006423, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 2.8575213119040597, + "language_loss": 0.63165253, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.65001887, + "num_input_tokens_seen": 206613325, + "step": 9591, + "time_per_iteration": 2.6433265209198 + }, + { + "auxiliary_loss_clip": 0.01117457, + "auxiliary_loss_mlp": 0.00778529, + "balance_loss_clip": 1.04231191, + "balance_loss_mlp": 1.00066674, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 2.0394469002634357, + "language_loss": 0.78287756, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.80183744, + "num_input_tokens_seen": 206634265, + "step": 9592, + "time_per_iteration": 2.5208168029785156 + }, + { + "auxiliary_loss_clip": 0.01000389, + "auxiliary_loss_mlp": 0.01052205, + "balance_loss_clip": 1.01989245, + "balance_loss_mlp": 1.05075645, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7851660316938427, + "language_loss": 0.59728569, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61781162, + "num_input_tokens_seen": 206696990, + "step": 9593, + "time_per_iteration": 3.6338002681732178 + }, + { + "auxiliary_loss_clip": 0.0110807, + "auxiliary_loss_mlp": 0.01043181, + "balance_loss_clip": 1.03927135, + "balance_loss_mlp": 1.02919817, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 1.5731316537544695, + "language_loss": 0.71045589, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73196846, + "num_input_tokens_seen": 206717815, + "step": 9594, + "time_per_iteration": 2.869375228881836 + }, + { + "auxiliary_loss_clip": 0.01081721, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.03918314, + "balance_loss_mlp": 1.02343774, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 2.052222952404336, + "language_loss": 0.71098137, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73214757, + "num_input_tokens_seen": 206735985, + "step": 9595, + "time_per_iteration": 2.5588762760162354 + }, + { + "auxiliary_loss_clip": 0.01116399, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.04129028, + "balance_loss_mlp": 1.02328837, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.1245867770881928, + "language_loss": 0.69613647, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.71766579, + "num_input_tokens_seen": 206753370, + "step": 9596, + "time_per_iteration": 2.5182836055755615 + }, + { + "auxiliary_loss_clip": 0.01094092, + "auxiliary_loss_mlp": 0.01037168, + "balance_loss_clip": 1.04218185, + "balance_loss_mlp": 1.02265978, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 3.03014010026178, + "language_loss": 0.67809772, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69941032, + "num_input_tokens_seen": 206777645, + "step": 9597, + "time_per_iteration": 2.729184627532959 + }, + { + "auxiliary_loss_clip": 0.01091187, + "auxiliary_loss_mlp": 0.0103407, + "balance_loss_clip": 1.03806853, + "balance_loss_mlp": 1.02154732, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 2.283065285809821, + "language_loss": 0.81867152, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.8399241, + "num_input_tokens_seen": 206794865, + "step": 9598, + "time_per_iteration": 2.539562463760376 + }, + { + "auxiliary_loss_clip": 0.01073382, + "auxiliary_loss_mlp": 0.01045315, + "balance_loss_clip": 1.0325563, + "balance_loss_mlp": 1.02947235, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.5867362834877816, + "language_loss": 0.72707868, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74826568, + "num_input_tokens_seen": 206814095, + "step": 9599, + "time_per_iteration": 2.537041425704956 + }, + { + "auxiliary_loss_clip": 0.01111161, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.03955579, + "balance_loss_mlp": 1.01932013, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 2.016167403532889, + "language_loss": 0.7813279, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80275011, + "num_input_tokens_seen": 206832245, + "step": 9600, + "time_per_iteration": 4.340920925140381 + }, + { + "auxiliary_loss_clip": 0.01107823, + "auxiliary_loss_mlp": 0.00779114, + "balance_loss_clip": 1.04209948, + "balance_loss_mlp": 1.00062597, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.5920476271538089, + "language_loss": 0.72461587, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74348527, + "num_input_tokens_seen": 206851535, + "step": 9601, + "time_per_iteration": 2.5349295139312744 + }, + { + "auxiliary_loss_clip": 0.01063581, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.04277086, + "balance_loss_mlp": 1.01978374, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.6718395689767631, + "language_loss": 0.68658811, + "learning_rate": 1.599058274973348e-06, + "loss": 0.70754695, + "num_input_tokens_seen": 206870595, + "step": 9602, + "time_per_iteration": 4.033069610595703 + }, + { + "auxiliary_loss_clip": 0.01085185, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.0367229, + "balance_loss_mlp": 1.02068543, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.4785351603237573, + "language_loss": 0.7305364, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75172144, + "num_input_tokens_seen": 206892320, + "step": 9603, + "time_per_iteration": 2.534745693206787 + }, + { + "auxiliary_loss_clip": 0.01102515, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.04037666, + "balance_loss_mlp": 1.01605225, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.9098379588827552, + "language_loss": 0.76915663, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.79047126, + "num_input_tokens_seen": 206912485, + "step": 9604, + "time_per_iteration": 2.498476505279541 + }, + { + "auxiliary_loss_clip": 0.01084541, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.03984094, + "balance_loss_mlp": 1.0220474, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 1.7411128683165304, + "language_loss": 0.83251661, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85372216, + "num_input_tokens_seen": 206929100, + "step": 9605, + "time_per_iteration": 2.5051980018615723 + }, + { + "auxiliary_loss_clip": 0.01093199, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.04323614, + "balance_loss_mlp": 1.01672065, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.5772622298334147, + "language_loss": 0.77622396, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.79747665, + "num_input_tokens_seen": 206947020, + "step": 9606, + "time_per_iteration": 2.573115110397339 + }, + { + "auxiliary_loss_clip": 0.01115032, + "auxiliary_loss_mlp": 0.01033311, + "balance_loss_clip": 1.04093039, + "balance_loss_mlp": 1.02090764, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.7688588204392168, + "language_loss": 0.74102014, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76250362, + "num_input_tokens_seen": 206964065, + "step": 9607, + "time_per_iteration": 2.421590566635132 + }, + { + "auxiliary_loss_clip": 0.01078064, + "auxiliary_loss_mlp": 0.01032919, + "balance_loss_clip": 1.04305327, + "balance_loss_mlp": 1.02004409, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.569471614650451, + "language_loss": 0.69378746, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71489727, + "num_input_tokens_seen": 206981940, + "step": 9608, + "time_per_iteration": 2.553236484527588 + }, + { + "auxiliary_loss_clip": 0.01083021, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.03734064, + "balance_loss_mlp": 1.01745057, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 1.803193826332401, + "language_loss": 0.76413184, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78527224, + "num_input_tokens_seen": 207002365, + "step": 9609, + "time_per_iteration": 2.619452714920044 + }, + { + "auxiliary_loss_clip": 0.01086985, + "auxiliary_loss_mlp": 0.01031599, + "balance_loss_clip": 1.04167485, + "balance_loss_mlp": 1.01982069, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 2.01903749206305, + "language_loss": 0.77436322, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79554904, + "num_input_tokens_seen": 207021195, + "step": 9610, + "time_per_iteration": 2.6225061416625977 + }, + { + "auxiliary_loss_clip": 0.01080722, + "auxiliary_loss_mlp": 0.01029102, + "balance_loss_clip": 1.03613508, + "balance_loss_mlp": 1.01561904, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 3.0758847433053007, + "language_loss": 0.6874842, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.70858252, + "num_input_tokens_seen": 207037465, + "step": 9611, + "time_per_iteration": 2.522970676422119 + }, + { + "auxiliary_loss_clip": 0.01101936, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.03714991, + "balance_loss_mlp": 1.01587296, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 2.0712668593344055, + "language_loss": 0.83088213, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.85218835, + "num_input_tokens_seen": 207054230, + "step": 9612, + "time_per_iteration": 2.4890003204345703 + }, + { + "auxiliary_loss_clip": 0.01115726, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.04137444, + "balance_loss_mlp": 1.01995873, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.591344332809051, + "language_loss": 0.79551816, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81700742, + "num_input_tokens_seen": 207073150, + "step": 9613, + "time_per_iteration": 2.451122283935547 + }, + { + "auxiliary_loss_clip": 0.01102735, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.03859639, + "balance_loss_mlp": 1.01756191, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 5.608810592783442, + "language_loss": 0.77564979, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.79696453, + "num_input_tokens_seen": 207090375, + "step": 9614, + "time_per_iteration": 3.9649035930633545 + }, + { + "auxiliary_loss_clip": 0.01084579, + "auxiliary_loss_mlp": 0.0103342, + "balance_loss_clip": 1.0396744, + "balance_loss_mlp": 1.02067614, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.549592811477571, + "language_loss": 0.80971247, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.83089244, + "num_input_tokens_seen": 207106030, + "step": 9615, + "time_per_iteration": 2.495459794998169 + }, + { + "auxiliary_loss_clip": 0.01102946, + "auxiliary_loss_mlp": 0.01031274, + "balance_loss_clip": 1.0370214, + "balance_loss_mlp": 1.01782775, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.6492389865742545, + "language_loss": 0.67201078, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69335294, + "num_input_tokens_seen": 207125435, + "step": 9616, + "time_per_iteration": 2.5120301246643066 + }, + { + "auxiliary_loss_clip": 0.01103333, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.04076433, + "balance_loss_mlp": 1.01836252, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.6526428295132343, + "language_loss": 0.77801883, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.79936492, + "num_input_tokens_seen": 207145095, + "step": 9617, + "time_per_iteration": 2.520404815673828 + }, + { + "auxiliary_loss_clip": 0.01092764, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.03987837, + "balance_loss_mlp": 1.02188349, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 1.5728331157210156, + "language_loss": 0.75022578, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77150863, + "num_input_tokens_seen": 207166045, + "step": 9618, + "time_per_iteration": 2.5871386528015137 + }, + { + "auxiliary_loss_clip": 0.01112976, + "auxiliary_loss_mlp": 0.010293, + "balance_loss_clip": 1.04020667, + "balance_loss_mlp": 1.01699805, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 1.5730246951721771, + "language_loss": 0.82127512, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.84269786, + "num_input_tokens_seen": 207185290, + "step": 9619, + "time_per_iteration": 2.4411847591400146 + }, + { + "auxiliary_loss_clip": 0.01098052, + "auxiliary_loss_mlp": 0.01028434, + "balance_loss_clip": 1.04210865, + "balance_loss_mlp": 1.01604819, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 2.0043117949261564, + "language_loss": 0.72761548, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.74888033, + "num_input_tokens_seen": 207205505, + "step": 9620, + "time_per_iteration": 2.5572471618652344 + }, + { + "auxiliary_loss_clip": 0.01095947, + "auxiliary_loss_mlp": 0.01032146, + "balance_loss_clip": 1.03972006, + "balance_loss_mlp": 1.01915765, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 1.6593793795718912, + "language_loss": 0.77451915, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79580009, + "num_input_tokens_seen": 207225315, + "step": 9621, + "time_per_iteration": 2.591223955154419 + }, + { + "auxiliary_loss_clip": 0.01055424, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.03301263, + "balance_loss_mlp": 1.02215922, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.403598660659966, + "language_loss": 0.70164609, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72257286, + "num_input_tokens_seen": 207247690, + "step": 9622, + "time_per_iteration": 2.678739309310913 + }, + { + "auxiliary_loss_clip": 0.01027311, + "auxiliary_loss_mlp": 0.01020943, + "balance_loss_clip": 1.02240682, + "balance_loss_mlp": 1.01954794, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7764256773852548, + "language_loss": 0.55991912, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.58040166, + "num_input_tokens_seen": 207301735, + "step": 9623, + "time_per_iteration": 3.120877265930176 + }, + { + "auxiliary_loss_clip": 0.01081419, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.04064894, + "balance_loss_mlp": 1.02491689, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 2.2690952112510945, + "language_loss": 0.71064138, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73184943, + "num_input_tokens_seen": 207321240, + "step": 9624, + "time_per_iteration": 2.625325918197632 + }, + { + "auxiliary_loss_clip": 0.01085397, + "auxiliary_loss_mlp": 0.01039581, + "balance_loss_clip": 1.03936291, + "balance_loss_mlp": 1.0257467, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 2.2411044040511054, + "language_loss": 0.82594937, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84719914, + "num_input_tokens_seen": 207339540, + "step": 9625, + "time_per_iteration": 2.5551493167877197 + }, + { + "auxiliary_loss_clip": 0.01113301, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.03957498, + "balance_loss_mlp": 1.01835263, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.475541015488528, + "language_loss": 0.70386624, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72531915, + "num_input_tokens_seen": 207360470, + "step": 9626, + "time_per_iteration": 2.482407808303833 + }, + { + "auxiliary_loss_clip": 0.01091519, + "auxiliary_loss_mlp": 0.01035229, + "balance_loss_clip": 1.03874254, + "balance_loss_mlp": 1.02296281, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.4313619698246214, + "language_loss": 0.71563363, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.73690116, + "num_input_tokens_seen": 207383080, + "step": 9627, + "time_per_iteration": 2.6077895164489746 + }, + { + "auxiliary_loss_clip": 0.01102609, + "auxiliary_loss_mlp": 0.01028874, + "balance_loss_clip": 1.03814054, + "balance_loss_mlp": 1.0160706, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 2.593425039698886, + "language_loss": 0.83885008, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86016494, + "num_input_tokens_seen": 207401000, + "step": 9628, + "time_per_iteration": 4.010119199752808 + }, + { + "auxiliary_loss_clip": 0.01091891, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.03642726, + "balance_loss_mlp": 1.01847053, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 2.433837347813214, + "language_loss": 0.72371435, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74494767, + "num_input_tokens_seen": 207419230, + "step": 9629, + "time_per_iteration": 2.56303071975708 + }, + { + "auxiliary_loss_clip": 0.01093808, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.0402627, + "balance_loss_mlp": 1.02296758, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 2.379013065490363, + "language_loss": 0.74485636, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76616728, + "num_input_tokens_seen": 207437615, + "step": 9630, + "time_per_iteration": 2.5247397422790527 + }, + { + "auxiliary_loss_clip": 0.01080932, + "auxiliary_loss_mlp": 0.00778388, + "balance_loss_clip": 1.03792894, + "balance_loss_mlp": 1.00071669, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.686694549292198, + "language_loss": 0.79001492, + "learning_rate": 1.587999618060523e-06, + "loss": 0.80860806, + "num_input_tokens_seen": 207457270, + "step": 9631, + "time_per_iteration": 2.5558578968048096 + }, + { + "auxiliary_loss_clip": 0.01113855, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.03885496, + "balance_loss_mlp": 1.01899171, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.5596976215801064, + "language_loss": 0.75405455, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.77551013, + "num_input_tokens_seen": 207477890, + "step": 9632, + "time_per_iteration": 2.486999750137329 + }, + { + "auxiliary_loss_clip": 0.01087393, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.03879845, + "balance_loss_mlp": 1.01649249, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 1.9082458530071327, + "language_loss": 0.79888749, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.82006341, + "num_input_tokens_seen": 207497670, + "step": 9633, + "time_per_iteration": 2.5147950649261475 + }, + { + "auxiliary_loss_clip": 0.0108912, + "auxiliary_loss_mlp": 0.01042682, + "balance_loss_clip": 1.03954124, + "balance_loss_mlp": 1.02862084, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.6315735869991843, + "language_loss": 0.77711588, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.7984339, + "num_input_tokens_seen": 207516105, + "step": 9634, + "time_per_iteration": 2.567228078842163 + }, + { + "auxiliary_loss_clip": 0.01099783, + "auxiliary_loss_mlp": 0.01041254, + "balance_loss_clip": 1.03940773, + "balance_loss_mlp": 1.02812314, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.7003434823943375, + "language_loss": 0.63472641, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.65613681, + "num_input_tokens_seen": 207533685, + "step": 9635, + "time_per_iteration": 2.5061686038970947 + }, + { + "auxiliary_loss_clip": 0.01089512, + "auxiliary_loss_mlp": 0.01040065, + "balance_loss_clip": 1.03831077, + "balance_loss_mlp": 1.02757788, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.38822139147432, + "language_loss": 0.77192175, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79321754, + "num_input_tokens_seen": 207552840, + "step": 9636, + "time_per_iteration": 2.5364794731140137 + }, + { + "auxiliary_loss_clip": 0.01085786, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.03488064, + "balance_loss_mlp": 1.02408981, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 1.740378532677386, + "language_loss": 0.68488151, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70610118, + "num_input_tokens_seen": 207572095, + "step": 9637, + "time_per_iteration": 2.520073413848877 + }, + { + "auxiliary_loss_clip": 0.01075504, + "auxiliary_loss_mlp": 0.01037103, + "balance_loss_clip": 1.03915715, + "balance_loss_mlp": 1.02375734, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 2.427592166887527, + "language_loss": 0.72359931, + "learning_rate": 1.585332242234043e-06, + "loss": 0.74472535, + "num_input_tokens_seen": 207587495, + "step": 9638, + "time_per_iteration": 4.100676774978638 + }, + { + "auxiliary_loss_clip": 0.01104796, + "auxiliary_loss_mlp": 0.01035438, + "balance_loss_clip": 1.0415374, + "balance_loss_mlp": 1.02293897, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 1.6325275080847765, + "language_loss": 0.72238922, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.74379158, + "num_input_tokens_seen": 207606795, + "step": 9639, + "time_per_iteration": 2.47624135017395 + }, + { + "auxiliary_loss_clip": 0.01094329, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.03817749, + "balance_loss_mlp": 1.02609754, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 2.8186264533039975, + "language_loss": 0.69801271, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.71934783, + "num_input_tokens_seen": 207623620, + "step": 9640, + "time_per_iteration": 2.4874370098114014 + }, + { + "auxiliary_loss_clip": 0.01095587, + "auxiliary_loss_mlp": 0.01048361, + "balance_loss_clip": 1.03977501, + "balance_loss_mlp": 1.03434157, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 2.787519195534317, + "language_loss": 0.77757865, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.79901814, + "num_input_tokens_seen": 207639380, + "step": 9641, + "time_per_iteration": 3.6939098834991455 + }, + { + "auxiliary_loss_clip": 0.0111694, + "auxiliary_loss_mlp": 0.01041546, + "balance_loss_clip": 1.041821, + "balance_loss_mlp": 1.02858758, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.8405624053398397, + "language_loss": 0.73761159, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.7591964, + "num_input_tokens_seen": 207657915, + "step": 9642, + "time_per_iteration": 2.431302547454834 + }, + { + "auxiliary_loss_clip": 0.01098062, + "auxiliary_loss_mlp": 0.01039732, + "balance_loss_clip": 1.04056776, + "balance_loss_mlp": 1.02688098, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 1.718046159264889, + "language_loss": 0.73137003, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75274801, + "num_input_tokens_seen": 207678620, + "step": 9643, + "time_per_iteration": 2.588982343673706 + }, + { + "auxiliary_loss_clip": 0.01119619, + "auxiliary_loss_mlp": 0.01037363, + "balance_loss_clip": 1.0418942, + "balance_loss_mlp": 1.02430964, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 2.096821285981066, + "language_loss": 0.67136163, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69293147, + "num_input_tokens_seen": 207696980, + "step": 9644, + "time_per_iteration": 2.437227725982666 + }, + { + "auxiliary_loss_clip": 0.01112033, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.04174328, + "balance_loss_mlp": 1.02434325, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 3.322895114205684, + "language_loss": 0.85263985, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87414479, + "num_input_tokens_seen": 207714065, + "step": 9645, + "time_per_iteration": 2.5455682277679443 + }, + { + "auxiliary_loss_clip": 0.01118625, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.04285145, + "balance_loss_mlp": 1.02123868, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 1.821076314277748, + "language_loss": 0.75177604, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77330232, + "num_input_tokens_seen": 207734720, + "step": 9646, + "time_per_iteration": 2.614586114883423 + }, + { + "auxiliary_loss_clip": 0.01098693, + "auxiliary_loss_mlp": 0.01037629, + "balance_loss_clip": 1.04238594, + "balance_loss_mlp": 1.0231992, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 1.9508620720972647, + "language_loss": 0.59661454, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61797774, + "num_input_tokens_seen": 207755435, + "step": 9647, + "time_per_iteration": 2.6859850883483887 + }, + { + "auxiliary_loss_clip": 0.01076631, + "auxiliary_loss_mlp": 0.01051577, + "balance_loss_clip": 1.03945112, + "balance_loss_mlp": 1.03492343, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.5851631037146705, + "language_loss": 0.8416546, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86293668, + "num_input_tokens_seen": 207773570, + "step": 9648, + "time_per_iteration": 2.5678584575653076 + }, + { + "auxiliary_loss_clip": 0.01026419, + "auxiliary_loss_mlp": 0.01007853, + "balance_loss_clip": 1.01106501, + "balance_loss_mlp": 1.00656533, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8578842386217211, + "language_loss": 0.62954807, + "learning_rate": 1.581142210256242e-06, + "loss": 0.64989078, + "num_input_tokens_seen": 207830095, + "step": 9649, + "time_per_iteration": 3.087411880493164 + }, + { + "auxiliary_loss_clip": 0.01078327, + "auxiliary_loss_mlp": 0.01038692, + "balance_loss_clip": 1.03523684, + "balance_loss_mlp": 1.02508998, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 1.9349445989774523, + "language_loss": 0.81939793, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84056813, + "num_input_tokens_seen": 207848555, + "step": 9650, + "time_per_iteration": 2.536487102508545 + }, + { + "auxiliary_loss_clip": 0.01084049, + "auxiliary_loss_mlp": 0.01036971, + "balance_loss_clip": 1.03879929, + "balance_loss_mlp": 1.02313662, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 2.6926777737381324, + "language_loss": 0.77520961, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79641986, + "num_input_tokens_seen": 207867060, + "step": 9651, + "time_per_iteration": 2.519301652908325 + }, + { + "auxiliary_loss_clip": 0.0110085, + "auxiliary_loss_mlp": 0.01038204, + "balance_loss_clip": 1.04220009, + "balance_loss_mlp": 1.02429843, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 1.8409394462697983, + "language_loss": 0.74116623, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76255679, + "num_input_tokens_seen": 207884520, + "step": 9652, + "time_per_iteration": 2.5092344284057617 + }, + { + "auxiliary_loss_clip": 0.01091451, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.03946328, + "balance_loss_mlp": 1.017658, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 2.6688395375281315, + "language_loss": 0.77175605, + "learning_rate": 1.579619037747193e-06, + "loss": 0.79298961, + "num_input_tokens_seen": 207905370, + "step": 9653, + "time_per_iteration": 4.053754091262817 + }, + { + "auxiliary_loss_clip": 0.01118074, + "auxiliary_loss_mlp": 0.01032158, + "balance_loss_clip": 1.04164493, + "balance_loss_mlp": 1.01759624, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 11.61742982128721, + "language_loss": 0.74471188, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76621413, + "num_input_tokens_seen": 207923790, + "step": 9654, + "time_per_iteration": 2.4306223392486572 + }, + { + "auxiliary_loss_clip": 0.01058384, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.04624557, + "balance_loss_mlp": 1.02229154, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 2.8256367847601185, + "language_loss": 0.70167959, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72260857, + "num_input_tokens_seen": 207942335, + "step": 9655, + "time_per_iteration": 2.6524150371551514 + }, + { + "auxiliary_loss_clip": 0.01121866, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.0408783, + "balance_loss_mlp": 1.0171771, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 2.420628020786068, + "language_loss": 0.69251728, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71404946, + "num_input_tokens_seen": 207961975, + "step": 9656, + "time_per_iteration": 2.4641189575195312 + }, + { + "auxiliary_loss_clip": 0.01101747, + "auxiliary_loss_mlp": 0.01037073, + "balance_loss_clip": 1.04155445, + "balance_loss_mlp": 1.02364385, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.842760966266575, + "language_loss": 0.71717435, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.73856258, + "num_input_tokens_seen": 207979520, + "step": 9657, + "time_per_iteration": 2.4499495029449463 + }, + { + "auxiliary_loss_clip": 0.01108816, + "auxiliary_loss_mlp": 0.01037167, + "balance_loss_clip": 1.04099679, + "balance_loss_mlp": 1.02232528, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 2.3237768668698, + "language_loss": 0.71009374, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.73155355, + "num_input_tokens_seen": 207998375, + "step": 9658, + "time_per_iteration": 2.5138745307922363 + }, + { + "auxiliary_loss_clip": 0.01029754, + "auxiliary_loss_mlp": 0.01003094, + "balance_loss_clip": 1.01433659, + "balance_loss_mlp": 1.00164533, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6582872837447481, + "language_loss": 0.53787875, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55820727, + "num_input_tokens_seen": 208060605, + "step": 9659, + "time_per_iteration": 3.077192544937134 + }, + { + "auxiliary_loss_clip": 0.01109971, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.04036415, + "balance_loss_mlp": 1.02924633, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 1.9625223183328004, + "language_loss": 0.62189305, + "learning_rate": 1.576954100136366e-06, + "loss": 0.64342248, + "num_input_tokens_seen": 208080320, + "step": 9660, + "time_per_iteration": 2.553596258163452 + }, + { + "auxiliary_loss_clip": 0.01106242, + "auxiliary_loss_mlp": 0.0103697, + "balance_loss_clip": 1.03769445, + "balance_loss_mlp": 1.02292681, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.784214573421935, + "language_loss": 0.65737134, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67880344, + "num_input_tokens_seen": 208099305, + "step": 9661, + "time_per_iteration": 2.4992361068725586 + }, + { + "auxiliary_loss_clip": 0.01065935, + "auxiliary_loss_mlp": 0.01029883, + "balance_loss_clip": 1.03806138, + "balance_loss_mlp": 1.01762867, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.5306388768912482, + "language_loss": 0.74780536, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.7687636, + "num_input_tokens_seen": 208116960, + "step": 9662, + "time_per_iteration": 2.560955762863159 + }, + { + "auxiliary_loss_clip": 0.01036211, + "auxiliary_loss_mlp": 0.01001338, + "balance_loss_clip": 1.01131809, + "balance_loss_mlp": 1.00009859, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8738648361945578, + "language_loss": 0.58378994, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60416543, + "num_input_tokens_seen": 208182190, + "step": 9663, + "time_per_iteration": 3.093243360519409 + }, + { + "auxiliary_loss_clip": 0.01097665, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.04204643, + "balance_loss_mlp": 1.01935577, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.32806635262535, + "language_loss": 0.81921744, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84051645, + "num_input_tokens_seen": 208197015, + "step": 9664, + "time_per_iteration": 2.5163564682006836 + }, + { + "auxiliary_loss_clip": 0.01089307, + "auxiliary_loss_mlp": 0.00780403, + "balance_loss_clip": 1.03551841, + "balance_loss_mlp": 1.00064743, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 1.8511614438001618, + "language_loss": 0.81372255, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.83241963, + "num_input_tokens_seen": 208215795, + "step": 9665, + "time_per_iteration": 2.5750539302825928 + }, + { + "auxiliary_loss_clip": 0.01101574, + "auxiliary_loss_mlp": 0.01035841, + "balance_loss_clip": 1.04396677, + "balance_loss_mlp": 1.02095199, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.6234885797303573, + "language_loss": 0.81097102, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83234513, + "num_input_tokens_seen": 208234655, + "step": 9666, + "time_per_iteration": 2.5411341190338135 + }, + { + "auxiliary_loss_clip": 0.01104098, + "auxiliary_loss_mlp": 0.01035969, + "balance_loss_clip": 1.04417694, + "balance_loss_mlp": 1.02367282, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 1.9410005206390517, + "language_loss": 0.79892761, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82032824, + "num_input_tokens_seen": 208251300, + "step": 9667, + "time_per_iteration": 2.480682134628296 + }, + { + "auxiliary_loss_clip": 0.01112606, + "auxiliary_loss_mlp": 0.01038506, + "balance_loss_clip": 1.04038072, + "balance_loss_mlp": 1.02359259, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.454042544861398, + "language_loss": 0.78637385, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80788499, + "num_input_tokens_seen": 208272685, + "step": 9668, + "time_per_iteration": 4.045397996902466 + }, + { + "auxiliary_loss_clip": 0.01095345, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.040272, + "balance_loss_mlp": 1.0207324, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 2.0230439426649567, + "language_loss": 0.64757508, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66886622, + "num_input_tokens_seen": 208294315, + "step": 9669, + "time_per_iteration": 2.698145866394043 + }, + { + "auxiliary_loss_clip": 0.01071539, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.04206443, + "balance_loss_mlp": 1.02241421, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 2.516267008112728, + "language_loss": 0.73014379, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75122118, + "num_input_tokens_seen": 208315610, + "step": 9670, + "time_per_iteration": 2.621323585510254 + }, + { + "auxiliary_loss_clip": 0.01086726, + "auxiliary_loss_mlp": 0.01038572, + "balance_loss_clip": 1.03942788, + "balance_loss_mlp": 1.0255897, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 2.157557227114685, + "language_loss": 0.79220921, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81346214, + "num_input_tokens_seen": 208334725, + "step": 9671, + "time_per_iteration": 2.552872896194458 + }, + { + "auxiliary_loss_clip": 0.01081279, + "auxiliary_loss_mlp": 0.01040529, + "balance_loss_clip": 1.04326236, + "balance_loss_mlp": 1.02566385, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 1.9854855703269967, + "language_loss": 0.61251032, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.63372844, + "num_input_tokens_seen": 208353825, + "step": 9672, + "time_per_iteration": 2.6200709342956543 + }, + { + "auxiliary_loss_clip": 0.01066742, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.03869987, + "balance_loss_mlp": 1.02565205, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 1.6050715227685481, + "language_loss": 0.81451184, + "learning_rate": 1.572007019492342e-06, + "loss": 0.8355791, + "num_input_tokens_seen": 208374160, + "step": 9673, + "time_per_iteration": 2.6080169677734375 + }, + { + "auxiliary_loss_clip": 0.01086604, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.0430603, + "balance_loss_mlp": 1.02532125, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 2.0250432109547476, + "language_loss": 0.87857729, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.89984143, + "num_input_tokens_seen": 208392105, + "step": 9674, + "time_per_iteration": 2.5529122352600098 + }, + { + "auxiliary_loss_clip": 0.01116929, + "auxiliary_loss_mlp": 0.00778745, + "balance_loss_clip": 1.04049039, + "balance_loss_mlp": 1.00080895, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 2.4417057774630195, + "language_loss": 0.78980112, + "learning_rate": 1.571246172811984e-06, + "loss": 0.8087579, + "num_input_tokens_seen": 208411755, + "step": 9675, + "time_per_iteration": 2.48893404006958 + }, + { + "auxiliary_loss_clip": 0.01104694, + "auxiliary_loss_mlp": 0.01036118, + "balance_loss_clip": 1.03870082, + "balance_loss_mlp": 1.02183652, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 2.083732129442358, + "language_loss": 0.70376027, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72516835, + "num_input_tokens_seen": 208429995, + "step": 9676, + "time_per_iteration": 2.4870996475219727 + }, + { + "auxiliary_loss_clip": 0.01062513, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.04255676, + "balance_loss_mlp": 1.01825809, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.2834222931819173, + "language_loss": 0.62779444, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.64873815, + "num_input_tokens_seen": 208443655, + "step": 9677, + "time_per_iteration": 2.675199508666992 + }, + { + "auxiliary_loss_clip": 0.01026764, + "auxiliary_loss_mlp": 0.01004777, + "balance_loss_clip": 1.02679193, + "balance_loss_mlp": 1.00307262, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8041667395616673, + "language_loss": 0.54214782, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56246322, + "num_input_tokens_seen": 208498405, + "step": 9678, + "time_per_iteration": 4.673716306686401 + }, + { + "auxiliary_loss_clip": 0.0101585, + "auxiliary_loss_mlp": 0.01009257, + "balance_loss_clip": 1.00885451, + "balance_loss_mlp": 1.00782096, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7414915287042612, + "language_loss": 0.5621568, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58240783, + "num_input_tokens_seen": 208559075, + "step": 9679, + "time_per_iteration": 2.9607415199279785 + }, + { + "auxiliary_loss_clip": 0.01115344, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.03901625, + "balance_loss_mlp": 1.019508, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 1.9956786268871318, + "language_loss": 0.6607877, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.68225747, + "num_input_tokens_seen": 208577770, + "step": 9680, + "time_per_iteration": 2.4595658779144287 + }, + { + "auxiliary_loss_clip": 0.01093746, + "auxiliary_loss_mlp": 0.01030584, + "balance_loss_clip": 1.03899419, + "balance_loss_mlp": 1.01830578, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 2.0090797828413884, + "language_loss": 0.83198845, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85323167, + "num_input_tokens_seen": 208595110, + "step": 9681, + "time_per_iteration": 3.854262351989746 + }, + { + "auxiliary_loss_clip": 0.01115492, + "auxiliary_loss_mlp": 0.01033408, + "balance_loss_clip": 1.03962195, + "balance_loss_mlp": 1.02043247, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 2.111124561372128, + "language_loss": 0.75684875, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77833778, + "num_input_tokens_seen": 208612080, + "step": 9682, + "time_per_iteration": 2.4529056549072266 + }, + { + "auxiliary_loss_clip": 0.01054838, + "auxiliary_loss_mlp": 0.01035251, + "balance_loss_clip": 1.03311431, + "balance_loss_mlp": 1.01998055, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 2.369980512417403, + "language_loss": 0.75598776, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77688861, + "num_input_tokens_seen": 208630235, + "step": 9683, + "time_per_iteration": 2.6308023929595947 + }, + { + "auxiliary_loss_clip": 0.0109394, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.04199266, + "balance_loss_mlp": 1.01659322, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 1.7117204975848694, + "language_loss": 0.73861229, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.75985473, + "num_input_tokens_seen": 208647925, + "step": 9684, + "time_per_iteration": 2.529846668243408 + }, + { + "auxiliary_loss_clip": 0.01090042, + "auxiliary_loss_mlp": 0.01037327, + "balance_loss_clip": 1.03777003, + "balance_loss_mlp": 1.02393973, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 2.1693552026126195, + "language_loss": 0.78429401, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80556774, + "num_input_tokens_seen": 208666180, + "step": 9685, + "time_per_iteration": 2.590876817703247 + }, + { + "auxiliary_loss_clip": 0.01116857, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.04041755, + "balance_loss_mlp": 1.02555752, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.7497237997495945, + "language_loss": 0.74821961, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.76977205, + "num_input_tokens_seen": 208684240, + "step": 9686, + "time_per_iteration": 2.420776605606079 + }, + { + "auxiliary_loss_clip": 0.01027725, + "auxiliary_loss_mlp": 0.01005385, + "balance_loss_clip": 1.0123868, + "balance_loss_mlp": 1.00416303, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.8359067676563634, + "language_loss": 0.5769999, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59733105, + "num_input_tokens_seen": 208736090, + "step": 9687, + "time_per_iteration": 2.869591474533081 + }, + { + "auxiliary_loss_clip": 0.01080092, + "auxiliary_loss_mlp": 0.01038382, + "balance_loss_clip": 1.04014146, + "balance_loss_mlp": 1.02329612, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 1.9811184472461207, + "language_loss": 0.6990658, + "learning_rate": 1.566302259738727e-06, + "loss": 0.72025061, + "num_input_tokens_seen": 208754600, + "step": 9688, + "time_per_iteration": 2.5414443016052246 + }, + { + "auxiliary_loss_clip": 0.0110621, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.03924775, + "balance_loss_mlp": 1.02058935, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 4.036872971886579, + "language_loss": 0.65095735, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67235315, + "num_input_tokens_seen": 208773140, + "step": 9689, + "time_per_iteration": 2.4923088550567627 + }, + { + "auxiliary_loss_clip": 0.01095736, + "auxiliary_loss_mlp": 0.00779148, + "balance_loss_clip": 1.0408572, + "balance_loss_mlp": 1.00064611, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.6452079598570324, + "language_loss": 0.73196208, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75071096, + "num_input_tokens_seen": 208793410, + "step": 9690, + "time_per_iteration": 2.5385828018188477 + }, + { + "auxiliary_loss_clip": 0.01094859, + "auxiliary_loss_mlp": 0.01038713, + "balance_loss_clip": 1.03790212, + "balance_loss_mlp": 1.023597, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 10.512570853121264, + "language_loss": 0.75885093, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78018665, + "num_input_tokens_seen": 208811920, + "step": 9691, + "time_per_iteration": 2.536896228790283 + }, + { + "auxiliary_loss_clip": 0.01105478, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.03688872, + "balance_loss_mlp": 1.01861966, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 3.530745320950223, + "language_loss": 0.80605531, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.82742923, + "num_input_tokens_seen": 208834720, + "step": 9692, + "time_per_iteration": 2.5951457023620605 + }, + { + "auxiliary_loss_clip": 0.01027335, + "auxiliary_loss_mlp": 0.01001988, + "balance_loss_clip": 1.01214743, + "balance_loss_mlp": 1.00071263, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7588439007176143, + "language_loss": 0.56952178, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.58981502, + "num_input_tokens_seen": 208898415, + "step": 9693, + "time_per_iteration": 4.611676454544067 + }, + { + "auxiliary_loss_clip": 0.01105477, + "auxiliary_loss_mlp": 0.00778648, + "balance_loss_clip": 1.0391494, + "balance_loss_mlp": 1.00069618, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 1.5345329521355286, + "language_loss": 0.79249966, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81134093, + "num_input_tokens_seen": 208919045, + "step": 9694, + "time_per_iteration": 2.585702657699585 + }, + { + "auxiliary_loss_clip": 0.01086292, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.04022753, + "balance_loss_mlp": 1.02859616, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.81353559027647, + "language_loss": 0.75987411, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78116053, + "num_input_tokens_seen": 208939375, + "step": 9695, + "time_per_iteration": 2.533637762069702 + }, + { + "auxiliary_loss_clip": 0.0102436, + "auxiliary_loss_mlp": 0.01000449, + "balance_loss_clip": 1.00920486, + "balance_loss_mlp": 0.99929851, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7692139421320816, + "language_loss": 0.54965138, + "learning_rate": 1.563261231127095e-06, + "loss": 0.56989944, + "num_input_tokens_seen": 209004760, + "step": 9696, + "time_per_iteration": 3.15163516998291 + }, + { + "auxiliary_loss_clip": 0.01082606, + "auxiliary_loss_mlp": 0.01027973, + "balance_loss_clip": 1.04312038, + "balance_loss_mlp": 1.01526546, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 4.378981973782233, + "language_loss": 0.76386249, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78496826, + "num_input_tokens_seen": 209022930, + "step": 9697, + "time_per_iteration": 2.539430618286133 + }, + { + "auxiliary_loss_clip": 0.01118486, + "auxiliary_loss_mlp": 0.01035347, + "balance_loss_clip": 1.03936613, + "balance_loss_mlp": 1.0211072, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.6894655824154006, + "language_loss": 0.77578819, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.79732645, + "num_input_tokens_seen": 209043740, + "step": 9698, + "time_per_iteration": 2.4688808917999268 + }, + { + "auxiliary_loss_clip": 0.01077084, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.03721476, + "balance_loss_mlp": 1.02703726, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.5830107211082507, + "language_loss": 0.83591926, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85709935, + "num_input_tokens_seen": 209068885, + "step": 9699, + "time_per_iteration": 2.7060294151306152 + }, + { + "auxiliary_loss_clip": 0.01091813, + "auxiliary_loss_mlp": 0.01036831, + "balance_loss_clip": 1.03507483, + "balance_loss_mlp": 1.02232909, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.0513650329393096, + "language_loss": 0.66382205, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68510854, + "num_input_tokens_seen": 209087340, + "step": 9700, + "time_per_iteration": 2.536890983581543 + }, + { + "auxiliary_loss_clip": 0.01105572, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.03754365, + "balance_loss_mlp": 1.02237272, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.5450527233706404, + "language_loss": 0.71327567, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73468763, + "num_input_tokens_seen": 209108840, + "step": 9701, + "time_per_iteration": 2.5165045261383057 + }, + { + "auxiliary_loss_clip": 0.0109361, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.03999352, + "balance_loss_mlp": 1.02235556, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 2.169667994862254, + "language_loss": 0.85238266, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.8736701, + "num_input_tokens_seen": 209127985, + "step": 9702, + "time_per_iteration": 2.5281476974487305 + }, + { + "auxiliary_loss_clip": 0.01097855, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.03647184, + "balance_loss_mlp": 1.02503586, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.5207621302607253, + "language_loss": 0.77870297, + "learning_rate": 1.560601200301392e-06, + "loss": 0.80005693, + "num_input_tokens_seen": 209146885, + "step": 9703, + "time_per_iteration": 2.475783109664917 + }, + { + "auxiliary_loss_clip": 0.01119079, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.04115784, + "balance_loss_mlp": 1.02156866, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.675642562250935, + "language_loss": 0.71382886, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.7353735, + "num_input_tokens_seen": 209166130, + "step": 9704, + "time_per_iteration": 2.4425466060638428 + }, + { + "auxiliary_loss_clip": 0.01090815, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.03993773, + "balance_loss_mlp": 1.02246428, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 2.0729023578169765, + "language_loss": 0.81665963, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83791399, + "num_input_tokens_seen": 209183350, + "step": 9705, + "time_per_iteration": 2.470659017562866 + }, + { + "auxiliary_loss_clip": 0.01073242, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.03749287, + "balance_loss_mlp": 1.01949215, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 2.3764403842446495, + "language_loss": 0.80562705, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82668304, + "num_input_tokens_seen": 209203945, + "step": 9706, + "time_per_iteration": 2.554105758666992 + }, + { + "auxiliary_loss_clip": 0.0110148, + "auxiliary_loss_mlp": 0.01039347, + "balance_loss_clip": 1.0365696, + "balance_loss_mlp": 1.02538741, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 1.8333455700807135, + "language_loss": 0.74901187, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.77042019, + "num_input_tokens_seen": 209227080, + "step": 9707, + "time_per_iteration": 2.6908843517303467 + }, + { + "auxiliary_loss_clip": 0.01081519, + "auxiliary_loss_mlp": 0.0102712, + "balance_loss_clip": 1.03651083, + "balance_loss_mlp": 1.01507366, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.8871870072807728, + "language_loss": 0.81179792, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83288431, + "num_input_tokens_seen": 209248170, + "step": 9708, + "time_per_iteration": 4.269970178604126 + }, + { + "auxiliary_loss_clip": 0.01102188, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.04383004, + "balance_loss_mlp": 1.01563573, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.4878667280301776, + "language_loss": 0.78647894, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80778879, + "num_input_tokens_seen": 209267730, + "step": 9709, + "time_per_iteration": 2.5017013549804688 + }, + { + "auxiliary_loss_clip": 0.01015668, + "auxiliary_loss_mlp": 0.01002836, + "balance_loss_clip": 1.00900269, + "balance_loss_mlp": 1.00164413, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7595334193028319, + "language_loss": 0.56586719, + "learning_rate": 1.557941985915844e-06, + "loss": 0.5860523, + "num_input_tokens_seen": 209332510, + "step": 9710, + "time_per_iteration": 3.0988104343414307 + }, + { + "auxiliary_loss_clip": 0.01083273, + "auxiliary_loss_mlp": 0.01030081, + "balance_loss_clip": 1.03937757, + "balance_loss_mlp": 1.0185833, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.486814104248169, + "language_loss": 0.65344757, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67458111, + "num_input_tokens_seen": 209353355, + "step": 9711, + "time_per_iteration": 2.6007440090179443 + }, + { + "auxiliary_loss_clip": 0.01120774, + "auxiliary_loss_mlp": 0.01038456, + "balance_loss_clip": 1.04020047, + "balance_loss_mlp": 1.02373946, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.684044380675812, + "language_loss": 0.79099584, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.81258816, + "num_input_tokens_seen": 209370960, + "step": 9712, + "time_per_iteration": 2.4589974880218506 + }, + { + "auxiliary_loss_clip": 0.01078255, + "auxiliary_loss_mlp": 0.0078086, + "balance_loss_clip": 1.03305173, + "balance_loss_mlp": 1.00062871, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.712917491439956, + "language_loss": 0.73214722, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.75073832, + "num_input_tokens_seen": 209390955, + "step": 9713, + "time_per_iteration": 2.6051666736602783 + }, + { + "auxiliary_loss_clip": 0.01096886, + "auxiliary_loss_mlp": 0.01033683, + "balance_loss_clip": 1.03821957, + "balance_loss_mlp": 1.01894927, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 2.460947943327774, + "language_loss": 0.69520551, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71651125, + "num_input_tokens_seen": 209410260, + "step": 9714, + "time_per_iteration": 2.5401930809020996 + }, + { + "auxiliary_loss_clip": 0.01115814, + "auxiliary_loss_mlp": 0.01031022, + "balance_loss_clip": 1.03856969, + "balance_loss_mlp": 1.01746178, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 2.4553747182126657, + "language_loss": 0.8039723, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.8254407, + "num_input_tokens_seen": 209429920, + "step": 9715, + "time_per_iteration": 2.441950559616089 + }, + { + "auxiliary_loss_clip": 0.01093505, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.03530931, + "balance_loss_mlp": 1.01738, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 2.0429963631742094, + "language_loss": 0.7289536, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.75020146, + "num_input_tokens_seen": 209449470, + "step": 9716, + "time_per_iteration": 2.5186972618103027 + }, + { + "auxiliary_loss_clip": 0.01088749, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.03629124, + "balance_loss_mlp": 1.01614749, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.7242991872522957, + "language_loss": 0.7487061, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.76988196, + "num_input_tokens_seen": 209467695, + "step": 9717, + "time_per_iteration": 4.053596496582031 + }, + { + "auxiliary_loss_clip": 0.01105456, + "auxiliary_loss_mlp": 0.01035184, + "balance_loss_clip": 1.03910816, + "balance_loss_mlp": 1.02188623, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 2.021156971207099, + "language_loss": 0.79696274, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.81836915, + "num_input_tokens_seen": 209484250, + "step": 9718, + "time_per_iteration": 2.4441168308258057 + }, + { + "auxiliary_loss_clip": 0.01092291, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.03718662, + "balance_loss_mlp": 1.02137804, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.633075519720901, + "language_loss": 0.67518544, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.69645983, + "num_input_tokens_seen": 209502830, + "step": 9719, + "time_per_iteration": 2.535953998565674 + }, + { + "auxiliary_loss_clip": 0.01118958, + "auxiliary_loss_mlp": 0.01038279, + "balance_loss_clip": 1.04141021, + "balance_loss_mlp": 1.02517152, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 2.092076539056656, + "language_loss": 0.75365269, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.77522504, + "num_input_tokens_seen": 209525995, + "step": 9720, + "time_per_iteration": 3.78159761428833 + }, + { + "auxiliary_loss_clip": 0.01077883, + "auxiliary_loss_mlp": 0.01033919, + "balance_loss_clip": 1.03913093, + "balance_loss_mlp": 1.02113366, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.6917709229768083, + "language_loss": 0.82940996, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.850528, + "num_input_tokens_seen": 209545895, + "step": 9721, + "time_per_iteration": 2.583878755569458 + }, + { + "auxiliary_loss_clip": 0.01032709, + "auxiliary_loss_mlp": 0.01004925, + "balance_loss_clip": 1.00786614, + "balance_loss_mlp": 1.00382257, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9308910245215909, + "language_loss": 0.71262193, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73299825, + "num_input_tokens_seen": 209602315, + "step": 9722, + "time_per_iteration": 3.071228265762329 + }, + { + "auxiliary_loss_clip": 0.01096448, + "auxiliary_loss_mlp": 0.01041439, + "balance_loss_clip": 1.03516638, + "balance_loss_mlp": 1.02808762, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.039135148151056, + "language_loss": 0.89122522, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91260409, + "num_input_tokens_seen": 209617615, + "step": 9723, + "time_per_iteration": 2.445185899734497 + }, + { + "auxiliary_loss_clip": 0.01091448, + "auxiliary_loss_mlp": 0.01034548, + "balance_loss_clip": 1.04159236, + "balance_loss_mlp": 1.02193522, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.4092574642258269, + "language_loss": 0.68467969, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70593965, + "num_input_tokens_seen": 209637005, + "step": 9724, + "time_per_iteration": 2.5141286849975586 + }, + { + "auxiliary_loss_clip": 0.01108518, + "auxiliary_loss_mlp": 0.01037287, + "balance_loss_clip": 1.04194689, + "balance_loss_mlp": 1.02357185, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.9546685034137103, + "language_loss": 0.86116576, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88262385, + "num_input_tokens_seen": 209653170, + "step": 9725, + "time_per_iteration": 2.439493179321289 + }, + { + "auxiliary_loss_clip": 0.01094669, + "auxiliary_loss_mlp": 0.0104007, + "balance_loss_clip": 1.04145014, + "balance_loss_mlp": 1.02659309, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 1.7297805780646653, + "language_loss": 0.82946151, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85080892, + "num_input_tokens_seen": 209671275, + "step": 9726, + "time_per_iteration": 2.4836113452911377 + }, + { + "auxiliary_loss_clip": 0.01054489, + "auxiliary_loss_mlp": 0.00779624, + "balance_loss_clip": 1.04024553, + "balance_loss_mlp": 1.00064278, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 1.8886151662061015, + "language_loss": 0.67061603, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.6889571, + "num_input_tokens_seen": 209690380, + "step": 9727, + "time_per_iteration": 2.6532695293426514 + }, + { + "auxiliary_loss_clip": 0.01081709, + "auxiliary_loss_mlp": 0.01048423, + "balance_loss_clip": 1.03570807, + "balance_loss_mlp": 1.03408217, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 2.156437558792005, + "language_loss": 0.81360972, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.83491099, + "num_input_tokens_seen": 209708845, + "step": 9728, + "time_per_iteration": 2.542699098587036 + }, + { + "auxiliary_loss_clip": 0.01100808, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.0371871, + "balance_loss_mlp": 1.02699673, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 2.643221179380592, + "language_loss": 0.7763536, + "learning_rate": 1.550728272957027e-06, + "loss": 0.79775912, + "num_input_tokens_seen": 209729000, + "step": 9729, + "time_per_iteration": 2.471367359161377 + }, + { + "auxiliary_loss_clip": 0.01099787, + "auxiliary_loss_mlp": 0.01037043, + "balance_loss_clip": 1.03643692, + "balance_loss_mlp": 1.02244592, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 1.7867405457257834, + "language_loss": 0.70636308, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72773135, + "num_input_tokens_seen": 209747435, + "step": 9730, + "time_per_iteration": 2.522115468978882 + }, + { + "auxiliary_loss_clip": 0.01121554, + "auxiliary_loss_mlp": 0.01038401, + "balance_loss_clip": 1.04214919, + "balance_loss_mlp": 1.02381015, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 1.802997444280294, + "language_loss": 0.78830838, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.80990791, + "num_input_tokens_seen": 209764910, + "step": 9731, + "time_per_iteration": 2.4585158824920654 + }, + { + "auxiliary_loss_clip": 0.01101404, + "auxiliary_loss_mlp": 0.01040256, + "balance_loss_clip": 1.03793395, + "balance_loss_mlp": 1.02521777, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 2.047368441825972, + "language_loss": 0.7020238, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72344041, + "num_input_tokens_seen": 209786115, + "step": 9732, + "time_per_iteration": 4.057812690734863 + }, + { + "auxiliary_loss_clip": 0.01066721, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.03489351, + "balance_loss_mlp": 1.02478242, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 2.370619067077715, + "language_loss": 0.52900052, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.55007339, + "num_input_tokens_seen": 209806095, + "step": 9733, + "time_per_iteration": 2.6080374717712402 + }, + { + "auxiliary_loss_clip": 0.01101584, + "auxiliary_loss_mlp": 0.0103493, + "balance_loss_clip": 1.03863406, + "balance_loss_mlp": 1.0204345, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 2.135677781081377, + "language_loss": 0.88060874, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.90197396, + "num_input_tokens_seen": 209823650, + "step": 9734, + "time_per_iteration": 2.48236083984375 + }, + { + "auxiliary_loss_clip": 0.01086295, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.03459907, + "balance_loss_mlp": 1.02309132, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.5641692638134235, + "language_loss": 0.72371835, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.74493885, + "num_input_tokens_seen": 209843220, + "step": 9735, + "time_per_iteration": 2.515368700027466 + }, + { + "auxiliary_loss_clip": 0.01106842, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.03990173, + "balance_loss_mlp": 1.02561069, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 2.67234541113475, + "language_loss": 0.75087619, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.77235103, + "num_input_tokens_seen": 209854880, + "step": 9736, + "time_per_iteration": 2.4415769577026367 + }, + { + "auxiliary_loss_clip": 0.01084821, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.03813243, + "balance_loss_mlp": 1.0226202, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 1.498127505244645, + "language_loss": 0.70614052, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72735929, + "num_input_tokens_seen": 209877870, + "step": 9737, + "time_per_iteration": 2.7381591796875 + }, + { + "auxiliary_loss_clip": 0.01077483, + "auxiliary_loss_mlp": 0.01040187, + "balance_loss_clip": 1.03736305, + "balance_loss_mlp": 1.02619767, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 2.061939373345591, + "language_loss": 0.8272531, + "learning_rate": 1.547313391573169e-06, + "loss": 0.8484298, + "num_input_tokens_seen": 209896690, + "step": 9738, + "time_per_iteration": 2.547473430633545 + }, + { + "auxiliary_loss_clip": 0.01121259, + "auxiliary_loss_mlp": 0.00780512, + "balance_loss_clip": 1.04187787, + "balance_loss_mlp": 1.00073266, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.6973430396535751, + "language_loss": 0.68315887, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70217657, + "num_input_tokens_seen": 209914640, + "step": 9739, + "time_per_iteration": 2.4551990032196045 + }, + { + "auxiliary_loss_clip": 0.01116258, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.03782201, + "balance_loss_mlp": 1.01485467, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 2.254614039886703, + "language_loss": 0.58797175, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.60942525, + "num_input_tokens_seen": 209933375, + "step": 9740, + "time_per_iteration": 2.4307990074157715 + }, + { + "auxiliary_loss_clip": 0.0109336, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.04181576, + "balance_loss_mlp": 1.01742983, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 1.8949869314941787, + "language_loss": 0.75200963, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77325374, + "num_input_tokens_seen": 209952055, + "step": 9741, + "time_per_iteration": 2.522043228149414 + }, + { + "auxiliary_loss_clip": 0.01085293, + "auxiliary_loss_mlp": 0.01032001, + "balance_loss_clip": 1.03828239, + "balance_loss_mlp": 1.01914978, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 3.386509273148141, + "language_loss": 0.7603218, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.78149474, + "num_input_tokens_seen": 209971190, + "step": 9742, + "time_per_iteration": 2.5566999912261963 + }, + { + "auxiliary_loss_clip": 0.01092386, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.03891063, + "balance_loss_mlp": 1.02039564, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.6694670895703154, + "language_loss": 0.74981284, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77107233, + "num_input_tokens_seen": 209990695, + "step": 9743, + "time_per_iteration": 2.524501323699951 + }, + { + "auxiliary_loss_clip": 0.01093797, + "auxiliary_loss_mlp": 0.010288, + "balance_loss_clip": 1.04139197, + "balance_loss_mlp": 1.01642632, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.9602508900578381, + "language_loss": 0.81354326, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83476925, + "num_input_tokens_seen": 210010210, + "step": 9744, + "time_per_iteration": 2.567532539367676 + }, + { + "auxiliary_loss_clip": 0.01093332, + "auxiliary_loss_mlp": 0.01027342, + "balance_loss_clip": 1.03960741, + "balance_loss_mlp": 1.01394296, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 2.139537075988154, + "language_loss": 0.7167818, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73798853, + "num_input_tokens_seen": 210030030, + "step": 9745, + "time_per_iteration": 2.5676441192626953 + }, + { + "auxiliary_loss_clip": 0.01018403, + "auxiliary_loss_mlp": 0.01002724, + "balance_loss_clip": 1.01371765, + "balance_loss_mlp": 1.00143707, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7276613419023222, + "language_loss": 0.5326876, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55289888, + "num_input_tokens_seen": 210094840, + "step": 9746, + "time_per_iteration": 3.1705334186553955 + }, + { + "auxiliary_loss_clip": 0.01094055, + "auxiliary_loss_mlp": 0.01036344, + "balance_loss_clip": 1.04140568, + "balance_loss_mlp": 1.02211082, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 2.7764818111980256, + "language_loss": 0.73305851, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75436252, + "num_input_tokens_seen": 210114660, + "step": 9747, + "time_per_iteration": 4.024813413619995 + }, + { + "auxiliary_loss_clip": 0.0109008, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.03546476, + "balance_loss_mlp": 1.02313304, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 4.405993275533584, + "language_loss": 0.81529492, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83658493, + "num_input_tokens_seen": 210132770, + "step": 9748, + "time_per_iteration": 2.511265754699707 + }, + { + "auxiliary_loss_clip": 0.01106701, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.03933954, + "balance_loss_mlp": 1.01964772, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 1.7024526140970804, + "language_loss": 0.71879375, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74019295, + "num_input_tokens_seen": 210151895, + "step": 9749, + "time_per_iteration": 2.4738612174987793 + }, + { + "auxiliary_loss_clip": 0.01098113, + "auxiliary_loss_mlp": 0.01032246, + "balance_loss_clip": 1.04437494, + "balance_loss_mlp": 1.01901364, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 1.9751182977430157, + "language_loss": 0.74662215, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.76792574, + "num_input_tokens_seen": 210168040, + "step": 9750, + "time_per_iteration": 2.5150680541992188 + }, + { + "auxiliary_loss_clip": 0.01083618, + "auxiliary_loss_mlp": 0.01036923, + "balance_loss_clip": 1.04309559, + "balance_loss_mlp": 1.02287364, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.6789745189810732, + "language_loss": 0.70827484, + "learning_rate": 1.542383242598344e-06, + "loss": 0.72948027, + "num_input_tokens_seen": 210187720, + "step": 9751, + "time_per_iteration": 2.538007974624634 + }, + { + "auxiliary_loss_clip": 0.01120441, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.04128265, + "balance_loss_mlp": 1.02073932, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 1.690643525953594, + "language_loss": 0.74764943, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76921451, + "num_input_tokens_seen": 210206080, + "step": 9752, + "time_per_iteration": 2.434854030609131 + }, + { + "auxiliary_loss_clip": 0.01103787, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.03987098, + "balance_loss_mlp": 1.01962042, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 1.7819117545744227, + "language_loss": 0.77839327, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79975861, + "num_input_tokens_seen": 210225660, + "step": 9753, + "time_per_iteration": 2.472032308578491 + }, + { + "auxiliary_loss_clip": 0.01113257, + "auxiliary_loss_mlp": 0.01025819, + "balance_loss_clip": 1.04070711, + "balance_loss_mlp": 1.01342106, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.7302297110832952, + "language_loss": 0.71466178, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.73605245, + "num_input_tokens_seen": 210242725, + "step": 9754, + "time_per_iteration": 2.415393352508545 + }, + { + "auxiliary_loss_clip": 0.01093109, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_clip": 1.03668916, + "balance_loss_mlp": 1.01744509, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 1.6794936671346423, + "language_loss": 0.72537243, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74661946, + "num_input_tokens_seen": 210263225, + "step": 9755, + "time_per_iteration": 2.497483015060425 + }, + { + "auxiliary_loss_clip": 0.01015175, + "auxiliary_loss_mlp": 0.01004809, + "balance_loss_clip": 1.01940513, + "balance_loss_mlp": 1.00356293, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7480122121354947, + "language_loss": 0.56878197, + "learning_rate": 1.540487810607967e-06, + "loss": 0.58898187, + "num_input_tokens_seen": 210322310, + "step": 9756, + "time_per_iteration": 3.107726573944092 + }, + { + "auxiliary_loss_clip": 0.01111915, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.03814602, + "balance_loss_mlp": 1.02066112, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 1.867241034696243, + "language_loss": 0.76243776, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78388655, + "num_input_tokens_seen": 210340845, + "step": 9757, + "time_per_iteration": 3.948456048965454 + }, + { + "auxiliary_loss_clip": 0.01018427, + "auxiliary_loss_mlp": 0.01003649, + "balance_loss_clip": 1.01189566, + "balance_loss_mlp": 1.00240922, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8465519137952121, + "language_loss": 0.60458589, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62480664, + "num_input_tokens_seen": 210397815, + "step": 9758, + "time_per_iteration": 3.062680244445801 + }, + { + "auxiliary_loss_clip": 0.01121692, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.04151249, + "balance_loss_mlp": 1.01872611, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 2.0033957885749545, + "language_loss": 0.7180348, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.73957813, + "num_input_tokens_seen": 210413900, + "step": 9759, + "time_per_iteration": 2.431164264678955 + }, + { + "auxiliary_loss_clip": 0.01094487, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.03843737, + "balance_loss_mlp": 1.02149701, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.5946595376329376, + "language_loss": 0.72768945, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.74897575, + "num_input_tokens_seen": 210434110, + "step": 9760, + "time_per_iteration": 3.8270375728607178 + }, + { + "auxiliary_loss_clip": 0.01102903, + "auxiliary_loss_mlp": 0.01029933, + "balance_loss_clip": 1.03921044, + "balance_loss_mlp": 1.01669455, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 2.662842499763982, + "language_loss": 0.72708327, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74841166, + "num_input_tokens_seen": 210451685, + "step": 9761, + "time_per_iteration": 2.4420006275177 + }, + { + "auxiliary_loss_clip": 0.01102097, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.04269254, + "balance_loss_mlp": 1.01454687, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.8593023672750886, + "language_loss": 0.74946439, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.77077073, + "num_input_tokens_seen": 210470825, + "step": 9762, + "time_per_iteration": 2.5357792377471924 + }, + { + "auxiliary_loss_clip": 0.01079597, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.03764212, + "balance_loss_mlp": 1.02235961, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.2714283744364938, + "language_loss": 0.7216481, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74280888, + "num_input_tokens_seen": 210500075, + "step": 9763, + "time_per_iteration": 2.9554238319396973 + }, + { + "auxiliary_loss_clip": 0.01100827, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.03604305, + "balance_loss_mlp": 1.01635313, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.5112815611350268, + "language_loss": 0.80277514, + "learning_rate": 1.53745602625755e-06, + "loss": 0.82407349, + "num_input_tokens_seen": 210518150, + "step": 9764, + "time_per_iteration": 2.5018253326416016 + }, + { + "auxiliary_loss_clip": 0.01089162, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.04079509, + "balance_loss_mlp": 1.02013731, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 1.7414682159588941, + "language_loss": 0.79335982, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.81458509, + "num_input_tokens_seen": 210537760, + "step": 9765, + "time_per_iteration": 2.4939932823181152 + }, + { + "auxiliary_loss_clip": 0.01088943, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.04258478, + "balance_loss_mlp": 1.02088475, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 1.8177590584519139, + "language_loss": 0.83873892, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.85997379, + "num_input_tokens_seen": 210555515, + "step": 9766, + "time_per_iteration": 2.4984633922576904 + }, + { + "auxiliary_loss_clip": 0.01109203, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.04125679, + "balance_loss_mlp": 1.01972365, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 1.6371663217185213, + "language_loss": 0.69670022, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71812177, + "num_input_tokens_seen": 210575000, + "step": 9767, + "time_per_iteration": 2.518648386001587 + }, + { + "auxiliary_loss_clip": 0.01101111, + "auxiliary_loss_mlp": 0.00781093, + "balance_loss_clip": 1.03687572, + "balance_loss_mlp": 1.00068498, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 1.7614477474101418, + "language_loss": 0.63364279, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65246481, + "num_input_tokens_seen": 210595185, + "step": 9768, + "time_per_iteration": 2.556774377822876 + }, + { + "auxiliary_loss_clip": 0.0103683, + "auxiliary_loss_mlp": 0.00754274, + "balance_loss_clip": 1.01169336, + "balance_loss_mlp": 1.00035381, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.72691469446199, + "language_loss": 0.53865921, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.55657023, + "num_input_tokens_seen": 210653210, + "step": 9769, + "time_per_iteration": 3.034421682357788 + }, + { + "auxiliary_loss_clip": 0.01080688, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.03600407, + "balance_loss_mlp": 1.02022481, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.8858242403018695, + "language_loss": 0.70988649, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.73102546, + "num_input_tokens_seen": 210673750, + "step": 9770, + "time_per_iteration": 2.565248489379883 + }, + { + "auxiliary_loss_clip": 0.01071274, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.03806543, + "balance_loss_mlp": 1.01919007, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 1.8270595955681292, + "language_loss": 0.67441374, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69545019, + "num_input_tokens_seen": 210692960, + "step": 9771, + "time_per_iteration": 2.634545087814331 + }, + { + "auxiliary_loss_clip": 0.01072187, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.03652811, + "balance_loss_mlp": 1.0195154, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.6911314500466332, + "language_loss": 0.65978307, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.68085325, + "num_input_tokens_seen": 210714040, + "step": 9772, + "time_per_iteration": 4.180710315704346 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.01042281, + "balance_loss_clip": 1.04111862, + "balance_loss_mlp": 1.02694464, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.8569934663043872, + "language_loss": 0.74286628, + "learning_rate": 1.534046611017519e-06, + "loss": 0.76449001, + "num_input_tokens_seen": 210733710, + "step": 9773, + "time_per_iteration": 2.4874789714813232 + }, + { + "auxiliary_loss_clip": 0.01085095, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.04091144, + "balance_loss_mlp": 1.0217737, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 2.0757769566019775, + "language_loss": 0.53783727, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55904901, + "num_input_tokens_seen": 210753580, + "step": 9774, + "time_per_iteration": 2.62060546875 + }, + { + "auxiliary_loss_clip": 0.01109168, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.041857, + "balance_loss_mlp": 1.02354813, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.7818544639011926, + "language_loss": 0.65211177, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.67357337, + "num_input_tokens_seen": 210773495, + "step": 9775, + "time_per_iteration": 2.6606812477111816 + }, + { + "auxiliary_loss_clip": 0.01100926, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.03922272, + "balance_loss_mlp": 1.02030945, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 1.6873751981330174, + "language_loss": 0.73321986, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.75456834, + "num_input_tokens_seen": 210793645, + "step": 9776, + "time_per_iteration": 2.523648977279663 + }, + { + "auxiliary_loss_clip": 0.01115169, + "auxiliary_loss_mlp": 0.01033037, + "balance_loss_clip": 1.03891134, + "balance_loss_mlp": 1.02007318, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 1.8205170223293148, + "language_loss": 0.7400797, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76156175, + "num_input_tokens_seen": 210813415, + "step": 9777, + "time_per_iteration": 2.4419753551483154 + }, + { + "auxiliary_loss_clip": 0.01081682, + "auxiliary_loss_mlp": 0.01033408, + "balance_loss_clip": 1.04006386, + "balance_loss_mlp": 1.02124274, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.4687240499633996, + "language_loss": 0.74071968, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76187062, + "num_input_tokens_seen": 210833850, + "step": 9778, + "time_per_iteration": 2.597987651824951 + }, + { + "auxiliary_loss_clip": 0.01072022, + "auxiliary_loss_mlp": 0.01033235, + "balance_loss_clip": 1.0331378, + "balance_loss_mlp": 1.01857185, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 2.6872967851251595, + "language_loss": 0.69979346, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.720846, + "num_input_tokens_seen": 210853115, + "step": 9779, + "time_per_iteration": 2.550074577331543 + }, + { + "auxiliary_loss_clip": 0.01117363, + "auxiliary_loss_mlp": 0.00780524, + "balance_loss_clip": 1.03910089, + "balance_loss_mlp": 1.00082588, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 1.8317385999664797, + "language_loss": 0.66976047, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.68873936, + "num_input_tokens_seen": 210872090, + "step": 9780, + "time_per_iteration": 2.4577929973602295 + }, + { + "auxiliary_loss_clip": 0.01091596, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.04570985, + "balance_loss_mlp": 1.02100945, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 1.9940608311508357, + "language_loss": 0.72450083, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74576241, + "num_input_tokens_seen": 210888490, + "step": 9781, + "time_per_iteration": 2.4944403171539307 + }, + { + "auxiliary_loss_clip": 0.01088288, + "auxiliary_loss_mlp": 0.00779526, + "balance_loss_clip": 1.03807497, + "balance_loss_mlp": 1.00067818, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.4726990773577722, + "language_loss": 0.70411348, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72279167, + "num_input_tokens_seen": 210908220, + "step": 9782, + "time_per_iteration": 2.4973669052124023 + }, + { + "auxiliary_loss_clip": 0.01101787, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.03547728, + "balance_loss_mlp": 1.02294183, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 2.134609586856023, + "language_loss": 0.70454615, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72593462, + "num_input_tokens_seen": 210923945, + "step": 9783, + "time_per_iteration": 2.428279161453247 + }, + { + "auxiliary_loss_clip": 0.01081921, + "auxiliary_loss_mlp": 0.01033356, + "balance_loss_clip": 1.04142141, + "balance_loss_mlp": 1.01928306, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 1.9288902734845377, + "language_loss": 0.68823349, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.70938629, + "num_input_tokens_seen": 210941955, + "step": 9784, + "time_per_iteration": 2.5486321449279785 + }, + { + "auxiliary_loss_clip": 0.01076435, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.04390943, + "balance_loss_mlp": 1.01866412, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 3.367680793808389, + "language_loss": 0.69288754, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.71396834, + "num_input_tokens_seen": 210963105, + "step": 9785, + "time_per_iteration": 2.690014600753784 + }, + { + "auxiliary_loss_clip": 0.01105343, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.04157662, + "balance_loss_mlp": 1.01692438, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 1.986033743970311, + "language_loss": 0.77817357, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79951966, + "num_input_tokens_seen": 210978720, + "step": 9786, + "time_per_iteration": 2.4495882987976074 + }, + { + "auxiliary_loss_clip": 0.0110185, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.04092598, + "balance_loss_mlp": 1.02106035, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 2.07452056190118, + "language_loss": 0.79322278, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81458431, + "num_input_tokens_seen": 210998750, + "step": 9787, + "time_per_iteration": 4.193073511123657 + }, + { + "auxiliary_loss_clip": 0.01081863, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.04283857, + "balance_loss_mlp": 1.01980472, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.6204311770435211, + "language_loss": 0.66552436, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68666112, + "num_input_tokens_seen": 211017550, + "step": 9788, + "time_per_iteration": 2.5728275775909424 + }, + { + "auxiliary_loss_clip": 0.01087424, + "auxiliary_loss_mlp": 0.01039113, + "balance_loss_clip": 1.03708339, + "balance_loss_mlp": 1.02377725, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.635795854232808, + "language_loss": 0.80742621, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82869166, + "num_input_tokens_seen": 211034135, + "step": 9789, + "time_per_iteration": 2.5444366931915283 + }, + { + "auxiliary_loss_clip": 0.01084611, + "auxiliary_loss_mlp": 0.00780038, + "balance_loss_clip": 1.03566539, + "balance_loss_mlp": 1.00063455, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.5436104202948493, + "language_loss": 0.70565581, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72430229, + "num_input_tokens_seen": 211053850, + "step": 9790, + "time_per_iteration": 2.5045416355133057 + }, + { + "auxiliary_loss_clip": 0.01080752, + "auxiliary_loss_mlp": 0.01035197, + "balance_loss_clip": 1.04174328, + "balance_loss_mlp": 1.02241206, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 1.6545613252382045, + "language_loss": 0.83572179, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85688126, + "num_input_tokens_seen": 211072165, + "step": 9791, + "time_per_iteration": 2.574833869934082 + }, + { + "auxiliary_loss_clip": 0.01104138, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.03955972, + "balance_loss_mlp": 1.02741623, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.670617556998778, + "language_loss": 0.76391405, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78538096, + "num_input_tokens_seen": 211089630, + "step": 9792, + "time_per_iteration": 2.4865713119506836 + }, + { + "auxiliary_loss_clip": 0.01060193, + "auxiliary_loss_mlp": 0.01049779, + "balance_loss_clip": 1.03002739, + "balance_loss_mlp": 1.03398395, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 2.77171519667674, + "language_loss": 0.69208896, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71318877, + "num_input_tokens_seen": 211106120, + "step": 9793, + "time_per_iteration": 2.564401149749756 + }, + { + "auxiliary_loss_clip": 0.011131, + "auxiliary_loss_mlp": 0.01034115, + "balance_loss_clip": 1.039451, + "balance_loss_mlp": 1.02053738, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 1.8016611167227916, + "language_loss": 0.60350704, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.6249792, + "num_input_tokens_seen": 211122450, + "step": 9794, + "time_per_iteration": 2.4176409244537354 + }, + { + "auxiliary_loss_clip": 0.01087254, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.03986287, + "balance_loss_mlp": 1.02530289, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.7910316187987596, + "language_loss": 0.64971244, + "learning_rate": 1.525718531219257e-06, + "loss": 0.6709789, + "num_input_tokens_seen": 211141765, + "step": 9795, + "time_per_iteration": 2.554813861846924 + }, + { + "auxiliary_loss_clip": 0.01079266, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.03689039, + "balance_loss_mlp": 1.02014709, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.592583144759424, + "language_loss": 0.74167717, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.7627936, + "num_input_tokens_seen": 211160475, + "step": 9796, + "time_per_iteration": 4.092076301574707 + }, + { + "auxiliary_loss_clip": 0.01091834, + "auxiliary_loss_mlp": 0.01037198, + "balance_loss_clip": 1.03633189, + "balance_loss_mlp": 1.02403784, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.7093985018907667, + "language_loss": 0.83015299, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85144335, + "num_input_tokens_seen": 211180480, + "step": 9797, + "time_per_iteration": 2.5514087677001953 + }, + { + "auxiliary_loss_clip": 0.01088794, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.0355382, + "balance_loss_mlp": 1.01970196, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 2.966749848675761, + "language_loss": 0.79213667, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.81335664, + "num_input_tokens_seen": 211198000, + "step": 9798, + "time_per_iteration": 2.523935079574585 + }, + { + "auxiliary_loss_clip": 0.01113215, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.03961682, + "balance_loss_mlp": 1.02186704, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 2.505751870585887, + "language_loss": 0.74288988, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.76436079, + "num_input_tokens_seen": 211214765, + "step": 9799, + "time_per_iteration": 3.7834973335266113 + }, + { + "auxiliary_loss_clip": 0.01083052, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.03597474, + "balance_loss_mlp": 1.01756644, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 1.989498853324379, + "language_loss": 0.76367402, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78482538, + "num_input_tokens_seen": 211232335, + "step": 9800, + "time_per_iteration": 2.544144868850708 + }, + { + "auxiliary_loss_clip": 0.0106939, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.03618813, + "balance_loss_mlp": 1.02717447, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 6.270360923564982, + "language_loss": 0.78471339, + "learning_rate": 1.523448741022722e-06, + "loss": 0.80581993, + "num_input_tokens_seen": 211249985, + "step": 9801, + "time_per_iteration": 2.564791440963745 + }, + { + "auxiliary_loss_clip": 0.01084049, + "auxiliary_loss_mlp": 0.01028373, + "balance_loss_clip": 1.04219413, + "balance_loss_mlp": 1.0153017, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 1.796210558968994, + "language_loss": 0.66604251, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68716669, + "num_input_tokens_seen": 211268425, + "step": 9802, + "time_per_iteration": 2.5716569423675537 + }, + { + "auxiliary_loss_clip": 0.01103508, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.03993785, + "balance_loss_mlp": 1.01759005, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.5928579196229917, + "language_loss": 0.77928722, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.80062938, + "num_input_tokens_seen": 211286680, + "step": 9803, + "time_per_iteration": 2.4561550617218018 + }, + { + "auxiliary_loss_clip": 0.01105052, + "auxiliary_loss_mlp": 0.01036966, + "balance_loss_clip": 1.03981018, + "balance_loss_mlp": 1.02359641, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 2.009962816591263, + "language_loss": 0.72883618, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75025636, + "num_input_tokens_seen": 211307700, + "step": 9804, + "time_per_iteration": 2.493649482727051 + }, + { + "auxiliary_loss_clip": 0.01092002, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.04162705, + "balance_loss_mlp": 1.0169009, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.5188822146376513, + "language_loss": 0.74966991, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.77089232, + "num_input_tokens_seen": 211324835, + "step": 9805, + "time_per_iteration": 2.519794464111328 + }, + { + "auxiliary_loss_clip": 0.01112816, + "auxiliary_loss_mlp": 0.00780643, + "balance_loss_clip": 1.04027414, + "balance_loss_mlp": 1.00075102, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 1.8104819883112655, + "language_loss": 0.77837002, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.79730463, + "num_input_tokens_seen": 211344130, + "step": 9806, + "time_per_iteration": 2.4768338203430176 + }, + { + "auxiliary_loss_clip": 0.01115871, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.03974736, + "balance_loss_mlp": 1.01981616, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 1.8099535494206138, + "language_loss": 0.76691449, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.78840494, + "num_input_tokens_seen": 211362915, + "step": 9807, + "time_per_iteration": 2.434293270111084 + }, + { + "auxiliary_loss_clip": 0.01111194, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.04306257, + "balance_loss_mlp": 1.01565027, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 1.7441794362495833, + "language_loss": 0.74811417, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76951504, + "num_input_tokens_seen": 211380700, + "step": 9808, + "time_per_iteration": 2.4590697288513184 + }, + { + "auxiliary_loss_clip": 0.01067633, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.03494477, + "balance_loss_mlp": 1.02137995, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 2.0230839672850665, + "language_loss": 0.71956944, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.7406069, + "num_input_tokens_seen": 211400095, + "step": 9809, + "time_per_iteration": 2.604153871536255 + }, + { + "auxiliary_loss_clip": 0.01097308, + "auxiliary_loss_mlp": 0.01036006, + "balance_loss_clip": 1.04020238, + "balance_loss_mlp": 1.02174854, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 2.113970156220005, + "language_loss": 0.82424778, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84558094, + "num_input_tokens_seen": 211417810, + "step": 9810, + "time_per_iteration": 2.5452656745910645 + }, + { + "auxiliary_loss_clip": 0.01107121, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.04127431, + "balance_loss_mlp": 1.01973748, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.6082086703922196, + "language_loss": 0.81138438, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.8327868, + "num_input_tokens_seen": 211436020, + "step": 9811, + "time_per_iteration": 3.9759302139282227 + }, + { + "auxiliary_loss_clip": 0.01111516, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.04348326, + "balance_loss_mlp": 1.01623547, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 1.6945315864871797, + "language_loss": 0.76796627, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.7893877, + "num_input_tokens_seen": 211454335, + "step": 9812, + "time_per_iteration": 2.5061490535736084 + }, + { + "auxiliary_loss_clip": 0.01081073, + "auxiliary_loss_mlp": 0.01029465, + "balance_loss_clip": 1.0381062, + "balance_loss_mlp": 1.0185219, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 1.844285165335025, + "language_loss": 0.70454276, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72564816, + "num_input_tokens_seen": 211472775, + "step": 9813, + "time_per_iteration": 2.551196575164795 + }, + { + "auxiliary_loss_clip": 0.01090663, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.04119229, + "balance_loss_mlp": 1.02323139, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 2.692838915421561, + "language_loss": 0.72232664, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74359739, + "num_input_tokens_seen": 211492195, + "step": 9814, + "time_per_iteration": 2.5671634674072266 + }, + { + "auxiliary_loss_clip": 0.0109116, + "auxiliary_loss_mlp": 0.01033717, + "balance_loss_clip": 1.04130483, + "balance_loss_mlp": 1.02046657, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 2.2406494299945563, + "language_loss": 0.78705716, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80830586, + "num_input_tokens_seen": 211510220, + "step": 9815, + "time_per_iteration": 2.515735387802124 + }, + { + "auxiliary_loss_clip": 0.01094662, + "auxiliary_loss_mlp": 0.00781384, + "balance_loss_clip": 1.04147613, + "balance_loss_mlp": 1.00082278, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 2.051118529317716, + "language_loss": 0.76185882, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.78061926, + "num_input_tokens_seen": 211526260, + "step": 9816, + "time_per_iteration": 2.5919673442840576 + }, + { + "auxiliary_loss_clip": 0.01117453, + "auxiliary_loss_mlp": 0.0103678, + "balance_loss_clip": 1.04287052, + "balance_loss_mlp": 1.02358937, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 1.837062046272354, + "language_loss": 0.80661058, + "learning_rate": 1.517399156051309e-06, + "loss": 0.82815289, + "num_input_tokens_seen": 211542890, + "step": 9817, + "time_per_iteration": 2.4143033027648926 + }, + { + "auxiliary_loss_clip": 0.01065982, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_clip": 1.03585672, + "balance_loss_mlp": 1.02958918, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.695046308245383, + "language_loss": 0.76460397, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78571081, + "num_input_tokens_seen": 211562685, + "step": 9818, + "time_per_iteration": 2.588061571121216 + }, + { + "auxiliary_loss_clip": 0.01078914, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.0405606, + "balance_loss_mlp": 1.02063286, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 1.9754092090089348, + "language_loss": 0.66827333, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68939209, + "num_input_tokens_seen": 211579960, + "step": 9819, + "time_per_iteration": 2.539430856704712 + }, + { + "auxiliary_loss_clip": 0.01117426, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.04136038, + "balance_loss_mlp": 1.02328539, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 1.5433478090288915, + "language_loss": 0.77818167, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.79972076, + "num_input_tokens_seen": 211599310, + "step": 9820, + "time_per_iteration": 2.4948439598083496 + }, + { + "auxiliary_loss_clip": 0.01009, + "auxiliary_loss_mlp": 0.0101601, + "balance_loss_clip": 1.0159061, + "balance_loss_mlp": 1.01463306, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.987173445823823, + "language_loss": 0.65104365, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67129374, + "num_input_tokens_seen": 211658790, + "step": 9821, + "time_per_iteration": 3.1063895225524902 + }, + { + "auxiliary_loss_clip": 0.01077991, + "auxiliary_loss_mlp": 0.01041167, + "balance_loss_clip": 1.03808033, + "balance_loss_mlp": 1.02798223, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 1.9481433840183469, + "language_loss": 0.61087751, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63206905, + "num_input_tokens_seen": 211677240, + "step": 9822, + "time_per_iteration": 2.5492191314697266 + }, + { + "auxiliary_loss_clip": 0.01119354, + "auxiliary_loss_mlp": 0.01039634, + "balance_loss_clip": 1.04170418, + "balance_loss_mlp": 1.02617502, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 2.4020634982954823, + "language_loss": 0.82741845, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.84900826, + "num_input_tokens_seen": 211695485, + "step": 9823, + "time_per_iteration": 2.428460121154785 + }, + { + "auxiliary_loss_clip": 0.01095978, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.04085028, + "balance_loss_mlp": 1.01735663, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 2.480240897219307, + "language_loss": 0.73275131, + "learning_rate": 1.514753932336165e-06, + "loss": 0.7540139, + "num_input_tokens_seen": 211713090, + "step": 9824, + "time_per_iteration": 2.522005796432495 + }, + { + "auxiliary_loss_clip": 0.01095052, + "auxiliary_loss_mlp": 0.00782069, + "balance_loss_clip": 1.03989482, + "balance_loss_mlp": 1.00079966, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.2341063716900824, + "language_loss": 0.83327734, + "learning_rate": 1.514376116721693e-06, + "loss": 0.85204858, + "num_input_tokens_seen": 211732510, + "step": 9825, + "time_per_iteration": 4.068004608154297 + }, + { + "auxiliary_loss_clip": 0.0110031, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.03909719, + "balance_loss_mlp": 1.02079725, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 2.7301021802150283, + "language_loss": 0.7700727, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.79139423, + "num_input_tokens_seen": 211748695, + "step": 9826, + "time_per_iteration": 2.4961817264556885 + }, + { + "auxiliary_loss_clip": 0.01091701, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.03990531, + "balance_loss_mlp": 1.01593935, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.5504096854046843, + "language_loss": 0.7211802, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74237955, + "num_input_tokens_seen": 211768545, + "step": 9827, + "time_per_iteration": 2.5209968090057373 + }, + { + "auxiliary_loss_clip": 0.01072386, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.03957582, + "balance_loss_mlp": 1.0206635, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.7225109137520382, + "language_loss": 0.79931062, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.82036042, + "num_input_tokens_seen": 211786665, + "step": 9828, + "time_per_iteration": 2.5711605548858643 + }, + { + "auxiliary_loss_clip": 0.01064888, + "auxiliary_loss_mlp": 0.01037795, + "balance_loss_clip": 1.0402956, + "balance_loss_mlp": 1.02441359, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 2.1496498702905713, + "language_loss": 0.88602948, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90705633, + "num_input_tokens_seen": 211801215, + "step": 9829, + "time_per_iteration": 2.5327091217041016 + }, + { + "auxiliary_loss_clip": 0.01025374, + "auxiliary_loss_mlp": 0.0100425, + "balance_loss_clip": 1.01997197, + "balance_loss_mlp": 1.00295663, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7601423558971176, + "language_loss": 0.57806545, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59836167, + "num_input_tokens_seen": 211857005, + "step": 9830, + "time_per_iteration": 3.0130836963653564 + }, + { + "auxiliary_loss_clip": 0.01113389, + "auxiliary_loss_mlp": 0.00781037, + "balance_loss_clip": 1.04227889, + "balance_loss_mlp": 1.0008707, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.1249125626260277, + "language_loss": 0.76022089, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.77916515, + "num_input_tokens_seen": 211876675, + "step": 9831, + "time_per_iteration": 2.470785140991211 + }, + { + "auxiliary_loss_clip": 0.01086178, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.04114223, + "balance_loss_mlp": 1.01798332, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.8532339081655982, + "language_loss": 0.7769748, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.79815292, + "num_input_tokens_seen": 211895725, + "step": 9832, + "time_per_iteration": 2.510551929473877 + }, + { + "auxiliary_loss_clip": 0.01102866, + "auxiliary_loss_mlp": 0.01027332, + "balance_loss_clip": 1.03865016, + "balance_loss_mlp": 1.01487422, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 2.071235155510558, + "language_loss": 0.83062816, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85193014, + "num_input_tokens_seen": 211913860, + "step": 9833, + "time_per_iteration": 2.4994964599609375 + }, + { + "auxiliary_loss_clip": 0.0110547, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.03853238, + "balance_loss_mlp": 1.02217746, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.5243341579987684, + "language_loss": 0.7388556, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76026416, + "num_input_tokens_seen": 211932880, + "step": 9834, + "time_per_iteration": 2.4690544605255127 + }, + { + "auxiliary_loss_clip": 0.01114967, + "auxiliary_loss_mlp": 0.01038317, + "balance_loss_clip": 1.03896189, + "balance_loss_mlp": 1.02537096, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 3.0575452637956637, + "language_loss": 0.77956831, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.80110121, + "num_input_tokens_seen": 211948625, + "step": 9835, + "time_per_iteration": 2.408681869506836 + }, + { + "auxiliary_loss_clip": 0.01095111, + "auxiliary_loss_mlp": 0.01036258, + "balance_loss_clip": 1.03785074, + "balance_loss_mlp": 1.02308559, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 2.0936067805812537, + "language_loss": 0.73637611, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.75768983, + "num_input_tokens_seen": 211965355, + "step": 9836, + "time_per_iteration": 4.11375617980957 + }, + { + "auxiliary_loss_clip": 0.01082905, + "auxiliary_loss_mlp": 0.01033783, + "balance_loss_clip": 1.039047, + "balance_loss_mlp": 1.02011526, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 1.9729913781438106, + "language_loss": 0.82543349, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.84660041, + "num_input_tokens_seen": 211982245, + "step": 9837, + "time_per_iteration": 2.5364599227905273 + }, + { + "auxiliary_loss_clip": 0.01078616, + "auxiliary_loss_mlp": 0.01035727, + "balance_loss_clip": 1.0359267, + "balance_loss_mlp": 1.02125478, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.6849292525651927, + "language_loss": 0.7942481, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81539154, + "num_input_tokens_seen": 212000250, + "step": 9838, + "time_per_iteration": 3.953974485397339 + }, + { + "auxiliary_loss_clip": 0.01071316, + "auxiliary_loss_mlp": 0.01036908, + "balance_loss_clip": 1.04136825, + "balance_loss_mlp": 1.02377737, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 3.162177885672784, + "language_loss": 0.69730508, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.71838731, + "num_input_tokens_seen": 212017505, + "step": 9839, + "time_per_iteration": 2.565134286880493 + }, + { + "auxiliary_loss_clip": 0.01093001, + "auxiliary_loss_mlp": 0.01040295, + "balance_loss_clip": 1.04138899, + "balance_loss_mlp": 1.02761722, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 2.2973153270813502, + "language_loss": 0.65384209, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.67517507, + "num_input_tokens_seen": 212034595, + "step": 9840, + "time_per_iteration": 2.5331873893737793 + }, + { + "auxiliary_loss_clip": 0.01088812, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.03548634, + "balance_loss_mlp": 1.01870418, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.8195780161190829, + "language_loss": 0.8186332, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83984447, + "num_input_tokens_seen": 212055775, + "step": 9841, + "time_per_iteration": 2.5485637187957764 + }, + { + "auxiliary_loss_clip": 0.01090015, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.04063964, + "balance_loss_mlp": 1.01898718, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.780653103222048, + "language_loss": 0.69044709, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71166098, + "num_input_tokens_seen": 212074000, + "step": 9842, + "time_per_iteration": 2.512906789779663 + }, + { + "auxiliary_loss_clip": 0.01091758, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.03592038, + "balance_loss_mlp": 1.02150989, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 2.2076696749296616, + "language_loss": 0.82418311, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.84544802, + "num_input_tokens_seen": 212091415, + "step": 9843, + "time_per_iteration": 2.518486261367798 + }, + { + "auxiliary_loss_clip": 0.0108988, + "auxiliary_loss_mlp": 0.01035651, + "balance_loss_clip": 1.03413224, + "balance_loss_mlp": 1.02101243, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.3156035938468458, + "language_loss": 0.81269944, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.83395475, + "num_input_tokens_seen": 212105255, + "step": 9844, + "time_per_iteration": 2.5144922733306885 + }, + { + "auxiliary_loss_clip": 0.01066076, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.03781986, + "balance_loss_mlp": 1.01675808, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 1.9636021942113528, + "language_loss": 0.7451241, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.76608503, + "num_input_tokens_seen": 212122765, + "step": 9845, + "time_per_iteration": 2.550882339477539 + }, + { + "auxiliary_loss_clip": 0.01080073, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.04012752, + "balance_loss_mlp": 1.01868224, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 1.8041832328288203, + "language_loss": 0.63701177, + "learning_rate": 1.506446264718213e-06, + "loss": 0.65815353, + "num_input_tokens_seen": 212143960, + "step": 9846, + "time_per_iteration": 2.719536066055298 + }, + { + "auxiliary_loss_clip": 0.01071746, + "auxiliary_loss_mlp": 0.00776337, + "balance_loss_clip": 1.0353179, + "balance_loss_mlp": 1.00063121, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 2.1504348644297075, + "language_loss": 0.76225156, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78073239, + "num_input_tokens_seen": 212162005, + "step": 9847, + "time_per_iteration": 2.5738162994384766 + }, + { + "auxiliary_loss_clip": 0.01090547, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.03646374, + "balance_loss_mlp": 1.01763463, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.881164258569692, + "language_loss": 0.62375772, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.64497739, + "num_input_tokens_seen": 212181635, + "step": 9848, + "time_per_iteration": 2.5251028537750244 + }, + { + "auxiliary_loss_clip": 0.0110541, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.03986371, + "balance_loss_mlp": 1.02573049, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 1.6566095091923652, + "language_loss": 0.75738722, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.77882612, + "num_input_tokens_seen": 212201615, + "step": 9849, + "time_per_iteration": 2.490712881088257 + }, + { + "auxiliary_loss_clip": 0.01092826, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.03736436, + "balance_loss_mlp": 1.02546215, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 1.7065403680155016, + "language_loss": 0.75014412, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77146184, + "num_input_tokens_seen": 212219355, + "step": 9850, + "time_per_iteration": 2.5347745418548584 + }, + { + "auxiliary_loss_clip": 0.01079454, + "auxiliary_loss_mlp": 0.01035358, + "balance_loss_clip": 1.03783584, + "balance_loss_mlp": 1.02195287, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 2.3198347512283655, + "language_loss": 0.75559592, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.77674401, + "num_input_tokens_seen": 212236710, + "step": 9851, + "time_per_iteration": 4.087177515029907 + }, + { + "auxiliary_loss_clip": 0.01091551, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.03850186, + "balance_loss_mlp": 1.02033114, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 2.1948554662064574, + "language_loss": 0.70986426, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.73111308, + "num_input_tokens_seen": 212256195, + "step": 9852, + "time_per_iteration": 2.5560786724090576 + }, + { + "auxiliary_loss_clip": 0.01098975, + "auxiliary_loss_mlp": 0.00780413, + "balance_loss_clip": 1.04061902, + "balance_loss_mlp": 1.00077343, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 2.103962340035673, + "language_loss": 0.80373716, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82253104, + "num_input_tokens_seen": 212274085, + "step": 9853, + "time_per_iteration": 2.502601385116577 + }, + { + "auxiliary_loss_clip": 0.01088499, + "auxiliary_loss_mlp": 0.01026882, + "balance_loss_clip": 1.03671134, + "balance_loss_mlp": 1.0144726, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 1.969098888102129, + "language_loss": 0.6744256, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69557935, + "num_input_tokens_seen": 212295530, + "step": 9854, + "time_per_iteration": 2.578989267349243 + }, + { + "auxiliary_loss_clip": 0.01082191, + "auxiliary_loss_mlp": 0.01027435, + "balance_loss_clip": 1.03917754, + "balance_loss_mlp": 1.01414299, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.767630894155531, + "language_loss": 0.88515872, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.90625501, + "num_input_tokens_seen": 212313770, + "step": 9855, + "time_per_iteration": 2.563281774520874 + }, + { + "auxiliary_loss_clip": 0.01096155, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.03814232, + "balance_loss_mlp": 1.01657784, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 2.442250140027775, + "language_loss": 0.86565894, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.88691521, + "num_input_tokens_seen": 212331525, + "step": 9856, + "time_per_iteration": 2.4740328788757324 + }, + { + "auxiliary_loss_clip": 0.01102194, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.0364728, + "balance_loss_mlp": 1.02472973, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 1.8918313644726905, + "language_loss": 0.77663773, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79803109, + "num_input_tokens_seen": 212347295, + "step": 9857, + "time_per_iteration": 2.539706230163574 + }, + { + "auxiliary_loss_clip": 0.01073475, + "auxiliary_loss_mlp": 0.01048005, + "balance_loss_clip": 1.03333759, + "balance_loss_mlp": 1.03246033, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 2.175316946671878, + "language_loss": 0.64498115, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66619593, + "num_input_tokens_seen": 212365750, + "step": 9858, + "time_per_iteration": 2.5468385219573975 + }, + { + "auxiliary_loss_clip": 0.01101228, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.03835928, + "balance_loss_mlp": 1.01964378, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 2.1139875659736713, + "language_loss": 0.77170706, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79304439, + "num_input_tokens_seen": 212385300, + "step": 9859, + "time_per_iteration": 2.548907518386841 + }, + { + "auxiliary_loss_clip": 0.01076926, + "auxiliary_loss_mlp": 0.00780313, + "balance_loss_clip": 1.04319966, + "balance_loss_mlp": 1.00069392, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 2.1955575965689396, + "language_loss": 0.7549991, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.77357149, + "num_input_tokens_seen": 212402140, + "step": 9860, + "time_per_iteration": 2.6123430728912354 + }, + { + "auxiliary_loss_clip": 0.01080531, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.04193425, + "balance_loss_mlp": 1.02249563, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 1.6431552354436236, + "language_loss": 0.76089513, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78204262, + "num_input_tokens_seen": 212421790, + "step": 9861, + "time_per_iteration": 2.6427576541900635 + }, + { + "auxiliary_loss_clip": 0.01076432, + "auxiliary_loss_mlp": 0.01026598, + "balance_loss_clip": 1.0361954, + "balance_loss_mlp": 1.01535106, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 5.863344065287092, + "language_loss": 0.70767164, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72870195, + "num_input_tokens_seen": 212442115, + "step": 9862, + "time_per_iteration": 2.592170238494873 + }, + { + "auxiliary_loss_clip": 0.01064072, + "auxiliary_loss_mlp": 0.01038234, + "balance_loss_clip": 1.037498, + "balance_loss_mlp": 1.02586043, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 1.7328353290229361, + "language_loss": 0.77728784, + "learning_rate": 1.500032899685832e-06, + "loss": 0.79831088, + "num_input_tokens_seen": 212459535, + "step": 9863, + "time_per_iteration": 2.6221508979797363 + }, + { + "auxiliary_loss_clip": 0.0108976, + "auxiliary_loss_mlp": 0.01039406, + "balance_loss_clip": 1.03922164, + "balance_loss_mlp": 1.0260303, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 1.7486609844378758, + "language_loss": 0.70849526, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72978693, + "num_input_tokens_seen": 212479385, + "step": 9864, + "time_per_iteration": 2.559136390686035 + }, + { + "auxiliary_loss_clip": 0.01088727, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.03706551, + "balance_loss_mlp": 1.02080929, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 1.4098265508518646, + "language_loss": 0.67389178, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69512367, + "num_input_tokens_seen": 212500060, + "step": 9865, + "time_per_iteration": 4.002001762390137 + }, + { + "auxiliary_loss_clip": 0.01096934, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.03925836, + "balance_loss_mlp": 1.02065825, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 2.2009837736184332, + "language_loss": 0.77927983, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.80058897, + "num_input_tokens_seen": 212518590, + "step": 9866, + "time_per_iteration": 2.5045647621154785 + }, + { + "auxiliary_loss_clip": 0.01086846, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.03845215, + "balance_loss_mlp": 1.01426613, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 2.2160608753489304, + "language_loss": 0.72504652, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.74617469, + "num_input_tokens_seen": 212538190, + "step": 9867, + "time_per_iteration": 2.5950381755828857 + }, + { + "auxiliary_loss_clip": 0.01093013, + "auxiliary_loss_mlp": 0.01032896, + "balance_loss_clip": 1.0405252, + "balance_loss_mlp": 1.01863265, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.5107213913007143, + "language_loss": 0.66541964, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68667871, + "num_input_tokens_seen": 212557820, + "step": 9868, + "time_per_iteration": 2.517080068588257 + }, + { + "auxiliary_loss_clip": 0.01061979, + "auxiliary_loss_mlp": 0.00779625, + "balance_loss_clip": 1.03667247, + "balance_loss_mlp": 1.00056982, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 1.5871396595403833, + "language_loss": 0.75527787, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77369392, + "num_input_tokens_seen": 212577645, + "step": 9869, + "time_per_iteration": 2.6664135456085205 + }, + { + "auxiliary_loss_clip": 0.01064286, + "auxiliary_loss_mlp": 0.01038965, + "balance_loss_clip": 1.0388813, + "balance_loss_mlp": 1.02514827, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.600835842303194, + "language_loss": 0.74481899, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.7658515, + "num_input_tokens_seen": 212603430, + "step": 9870, + "time_per_iteration": 2.904512405395508 + }, + { + "auxiliary_loss_clip": 0.01071854, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.03577518, + "balance_loss_mlp": 1.01606202, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 2.5393444465312545, + "language_loss": 0.72058833, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.74159545, + "num_input_tokens_seen": 212620730, + "step": 9871, + "time_per_iteration": 2.599949359893799 + }, + { + "auxiliary_loss_clip": 0.01084584, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.03938913, + "balance_loss_mlp": 1.01957572, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 2.2088323764678566, + "language_loss": 0.74520886, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76638854, + "num_input_tokens_seen": 212639745, + "step": 9872, + "time_per_iteration": 2.615682601928711 + }, + { + "auxiliary_loss_clip": 0.01111437, + "auxiliary_loss_mlp": 0.01041712, + "balance_loss_clip": 1.04201889, + "balance_loss_mlp": 1.02753234, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.2913482879750737, + "language_loss": 0.79430544, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.81583691, + "num_input_tokens_seen": 212655915, + "step": 9873, + "time_per_iteration": 2.469308853149414 + }, + { + "auxiliary_loss_clip": 0.01103543, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.03847396, + "balance_loss_mlp": 1.018273, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.672723826687818, + "language_loss": 0.8510344, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87238801, + "num_input_tokens_seen": 212676115, + "step": 9874, + "time_per_iteration": 2.5192129611968994 + }, + { + "auxiliary_loss_clip": 0.01024022, + "auxiliary_loss_mlp": 0.0100624, + "balance_loss_clip": 1.01767898, + "balance_loss_mlp": 1.00494671, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.712145276190502, + "language_loss": 0.6000371, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62033963, + "num_input_tokens_seen": 212737560, + "step": 9875, + "time_per_iteration": 3.155482769012451 + }, + { + "auxiliary_loss_clip": 0.01092643, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.03420019, + "balance_loss_mlp": 1.01953387, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 2.178134688144686, + "language_loss": 0.77362013, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.79489219, + "num_input_tokens_seen": 212755365, + "step": 9876, + "time_per_iteration": 4.063056945800781 + }, + { + "auxiliary_loss_clip": 0.01096442, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.03434789, + "balance_loss_mlp": 1.01800132, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.4780607525663207, + "language_loss": 0.75821829, + "learning_rate": 1.494755415907243e-06, + "loss": 0.7794863, + "num_input_tokens_seen": 212773875, + "step": 9877, + "time_per_iteration": 2.4916141033172607 + }, + { + "auxiliary_loss_clip": 0.01102653, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.03660333, + "balance_loss_mlp": 1.0197407, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 2.387418493258168, + "language_loss": 0.81433743, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83569789, + "num_input_tokens_seen": 212790590, + "step": 9878, + "time_per_iteration": 3.6639420986175537 + }, + { + "auxiliary_loss_clip": 0.01089203, + "auxiliary_loss_mlp": 0.00781137, + "balance_loss_clip": 1.03657758, + "balance_loss_mlp": 1.00053847, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.7021822628838252, + "language_loss": 0.71118468, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.72988808, + "num_input_tokens_seen": 212812265, + "step": 9879, + "time_per_iteration": 2.715885639190674 + }, + { + "auxiliary_loss_clip": 0.01102809, + "auxiliary_loss_mlp": 0.01036516, + "balance_loss_clip": 1.03957427, + "balance_loss_mlp": 1.02377224, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.43306676211538, + "language_loss": 0.57606411, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59745729, + "num_input_tokens_seen": 212831915, + "step": 9880, + "time_per_iteration": 2.5218236446380615 + }, + { + "auxiliary_loss_clip": 0.0110614, + "auxiliary_loss_mlp": 0.01036905, + "balance_loss_clip": 1.03901684, + "balance_loss_mlp": 1.02308273, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 1.8798204560562732, + "language_loss": 0.7739467, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79537714, + "num_input_tokens_seen": 212851350, + "step": 9881, + "time_per_iteration": 2.598783493041992 + }, + { + "auxiliary_loss_clip": 0.01102052, + "auxiliary_loss_mlp": 0.01025767, + "balance_loss_clip": 1.03700376, + "balance_loss_mlp": 1.01314878, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 3.0620268336174004, + "language_loss": 0.82653141, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84780955, + "num_input_tokens_seen": 212867995, + "step": 9882, + "time_per_iteration": 2.4362354278564453 + }, + { + "auxiliary_loss_clip": 0.01103373, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.03977799, + "balance_loss_mlp": 1.02152264, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.016695607948057, + "language_loss": 0.7935822, + "learning_rate": 1.492494784393667e-06, + "loss": 0.8149581, + "num_input_tokens_seen": 212885220, + "step": 9883, + "time_per_iteration": 2.469006061553955 + }, + { + "auxiliary_loss_clip": 0.01083847, + "auxiliary_loss_mlp": 0.00778267, + "balance_loss_clip": 1.04036784, + "balance_loss_mlp": 1.00066996, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 1.8995368733724762, + "language_loss": 0.74677646, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.76539755, + "num_input_tokens_seen": 212903195, + "step": 9884, + "time_per_iteration": 2.575944185256958 + }, + { + "auxiliary_loss_clip": 0.01116173, + "auxiliary_loss_mlp": 0.01031217, + "balance_loss_clip": 1.04162633, + "balance_loss_mlp": 1.01803255, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 2.043760096085215, + "language_loss": 0.66344225, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68491614, + "num_input_tokens_seen": 212923340, + "step": 9885, + "time_per_iteration": 2.4743576049804688 + }, + { + "auxiliary_loss_clip": 0.01092348, + "auxiliary_loss_mlp": 0.01036579, + "balance_loss_clip": 1.04034376, + "balance_loss_mlp": 1.02363873, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 2.924510370382648, + "language_loss": 0.78191859, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.80320787, + "num_input_tokens_seen": 212942755, + "step": 9886, + "time_per_iteration": 2.555358409881592 + }, + { + "auxiliary_loss_clip": 0.01024291, + "auxiliary_loss_mlp": 0.01001436, + "balance_loss_clip": 1.01241946, + "balance_loss_mlp": 1.00008917, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8434948145690865, + "language_loss": 0.64505744, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66531467, + "num_input_tokens_seen": 212999355, + "step": 9887, + "time_per_iteration": 2.9248406887054443 + }, + { + "auxiliary_loss_clip": 0.0110122, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.04046702, + "balance_loss_mlp": 1.02050805, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 2.769964401350903, + "language_loss": 0.69547653, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71682084, + "num_input_tokens_seen": 213018570, + "step": 9888, + "time_per_iteration": 2.514310598373413 + }, + { + "auxiliary_loss_clip": 0.01089735, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.03737319, + "balance_loss_mlp": 1.01853812, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.8084728330121602, + "language_loss": 0.79618782, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81740862, + "num_input_tokens_seen": 213037735, + "step": 9889, + "time_per_iteration": 2.6128296852111816 + }, + { + "auxiliary_loss_clip": 0.01078964, + "auxiliary_loss_mlp": 0.01026549, + "balance_loss_clip": 1.03664601, + "balance_loss_mlp": 1.01455009, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 1.5819454659541135, + "language_loss": 0.70821339, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72926849, + "num_input_tokens_seen": 213057160, + "step": 9890, + "time_per_iteration": 4.0887861251831055 + }, + { + "auxiliary_loss_clip": 0.01083623, + "auxiliary_loss_mlp": 0.01035661, + "balance_loss_clip": 1.03978968, + "balance_loss_mlp": 1.02220273, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 1.8413807325528921, + "language_loss": 0.68882024, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71001309, + "num_input_tokens_seen": 213073630, + "step": 9891, + "time_per_iteration": 2.6022140979766846 + }, + { + "auxiliary_loss_clip": 0.01099785, + "auxiliary_loss_mlp": 0.01041172, + "balance_loss_clip": 1.03647673, + "balance_loss_mlp": 1.02864957, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 1.9606942530344416, + "language_loss": 0.53710747, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.5585171, + "num_input_tokens_seen": 213092450, + "step": 9892, + "time_per_iteration": 2.6567044258117676 + }, + { + "auxiliary_loss_clip": 0.01008028, + "auxiliary_loss_mlp": 0.01005495, + "balance_loss_clip": 1.01458764, + "balance_loss_mlp": 1.00417221, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6645487790368511, + "language_loss": 0.54571462, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.5658499, + "num_input_tokens_seen": 213155465, + "step": 9893, + "time_per_iteration": 3.2403509616851807 + }, + { + "auxiliary_loss_clip": 0.01082206, + "auxiliary_loss_mlp": 0.01035366, + "balance_loss_clip": 1.04453731, + "balance_loss_mlp": 1.02284908, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.8097649808765148, + "language_loss": 0.74783194, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.76900768, + "num_input_tokens_seen": 213174875, + "step": 9894, + "time_per_iteration": 2.607335090637207 + }, + { + "auxiliary_loss_clip": 0.01083058, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.0388509, + "balance_loss_mlp": 1.02005577, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 3.088513633335943, + "language_loss": 0.77719975, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79836035, + "num_input_tokens_seen": 213192695, + "step": 9895, + "time_per_iteration": 2.5359926223754883 + }, + { + "auxiliary_loss_clip": 0.01067039, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.03542018, + "balance_loss_mlp": 1.02164268, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.6996357368991633, + "language_loss": 0.79025906, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.8112756, + "num_input_tokens_seen": 213211195, + "step": 9896, + "time_per_iteration": 2.594571590423584 + }, + { + "auxiliary_loss_clip": 0.01104042, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.03879881, + "balance_loss_mlp": 1.02092206, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 1.5705837831415845, + "language_loss": 0.8333168, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.85469377, + "num_input_tokens_seen": 213231975, + "step": 9897, + "time_per_iteration": 2.5363779067993164 + }, + { + "auxiliary_loss_clip": 0.01088693, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.0379504, + "balance_loss_mlp": 1.02269232, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 2.0694933073967485, + "language_loss": 0.70814127, + "learning_rate": 1.486846243389939e-06, + "loss": 0.72937858, + "num_input_tokens_seen": 213249760, + "step": 9898, + "time_per_iteration": 2.5726442337036133 + }, + { + "auxiliary_loss_clip": 0.01103645, + "auxiliary_loss_mlp": 0.01050859, + "balance_loss_clip": 1.03619969, + "balance_loss_mlp": 1.03416967, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.355787243512528, + "language_loss": 0.63918275, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.6607278, + "num_input_tokens_seen": 213269890, + "step": 9899, + "time_per_iteration": 2.5688400268554688 + }, + { + "auxiliary_loss_clip": 0.01115275, + "auxiliary_loss_mlp": 0.0102833, + "balance_loss_clip": 1.04128504, + "balance_loss_mlp": 1.01702857, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.6388132284402146, + "language_loss": 0.71690953, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.73834562, + "num_input_tokens_seen": 213289400, + "step": 9900, + "time_per_iteration": 2.503460645675659 + }, + { + "auxiliary_loss_clip": 0.01112648, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.03964448, + "balance_loss_mlp": 1.02022433, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 1.6702646718893173, + "language_loss": 0.84409291, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86555386, + "num_input_tokens_seen": 213308040, + "step": 9901, + "time_per_iteration": 2.5040359497070312 + }, + { + "auxiliary_loss_clip": 0.00989344, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.01282859, + "balance_loss_mlp": 1.00167644, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.7983052597871964, + "language_loss": 0.58194953, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60187286, + "num_input_tokens_seen": 213358585, + "step": 9902, + "time_per_iteration": 3.0132510662078857 + }, + { + "auxiliary_loss_clip": 0.01056635, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.03592134, + "balance_loss_mlp": 1.01911271, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 2.0261281935150546, + "language_loss": 0.77215433, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79304004, + "num_input_tokens_seen": 213379585, + "step": 9903, + "time_per_iteration": 2.750284433364868 + }, + { + "auxiliary_loss_clip": 0.01079491, + "auxiliary_loss_mlp": 0.01031826, + "balance_loss_clip": 1.04140818, + "balance_loss_mlp": 1.01934481, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 1.635072467300952, + "language_loss": 0.77686727, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.79798043, + "num_input_tokens_seen": 213401465, + "step": 9904, + "time_per_iteration": 4.253260850906372 + }, + { + "auxiliary_loss_clip": 0.01100103, + "auxiliary_loss_mlp": 0.01037092, + "balance_loss_clip": 1.03843284, + "balance_loss_mlp": 1.02425313, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 1.9167423258235974, + "language_loss": 0.731493, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.75286496, + "num_input_tokens_seen": 213422720, + "step": 9905, + "time_per_iteration": 2.565556526184082 + }, + { + "auxiliary_loss_clip": 0.01106706, + "auxiliary_loss_mlp": 0.01030633, + "balance_loss_clip": 1.0388447, + "balance_loss_mlp": 1.017061, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.7107109263679532, + "language_loss": 0.69692254, + "learning_rate": 1.483835475336295e-06, + "loss": 0.71829593, + "num_input_tokens_seen": 213439480, + "step": 9906, + "time_per_iteration": 2.449674367904663 + }, + { + "auxiliary_loss_clip": 0.01103805, + "auxiliary_loss_mlp": 0.01032978, + "balance_loss_clip": 1.03944564, + "balance_loss_mlp": 1.02017486, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 2.0725903666143073, + "language_loss": 0.75627756, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77764541, + "num_input_tokens_seen": 213458895, + "step": 9907, + "time_per_iteration": 2.4863674640655518 + }, + { + "auxiliary_loss_clip": 0.01089089, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.03688443, + "balance_loss_mlp": 1.01862502, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.6080415627208449, + "language_loss": 0.67057961, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69178629, + "num_input_tokens_seen": 213481730, + "step": 9908, + "time_per_iteration": 2.6431260108947754 + }, + { + "auxiliary_loss_clip": 0.01044459, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.03371322, + "balance_loss_mlp": 1.01837611, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 1.9075553764513824, + "language_loss": 0.76302564, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78378326, + "num_input_tokens_seen": 213497225, + "step": 9909, + "time_per_iteration": 2.6274380683898926 + }, + { + "auxiliary_loss_clip": 0.01033546, + "auxiliary_loss_mlp": 0.01008179, + "balance_loss_clip": 1.00848651, + "balance_loss_mlp": 1.00712383, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9399483094545045, + "language_loss": 0.73488295, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75530016, + "num_input_tokens_seen": 213556890, + "step": 9910, + "time_per_iteration": 3.0769736766815186 + }, + { + "auxiliary_loss_clip": 0.01089011, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.03762007, + "balance_loss_mlp": 1.01984894, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 3.5184516242757518, + "language_loss": 0.69687486, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71809727, + "num_input_tokens_seen": 213575800, + "step": 9911, + "time_per_iteration": 2.517610788345337 + }, + { + "auxiliary_loss_clip": 0.01113887, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.0424788, + "balance_loss_mlp": 1.02347755, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 1.8320916600330912, + "language_loss": 0.65816319, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.67967486, + "num_input_tokens_seen": 213592740, + "step": 9912, + "time_per_iteration": 2.469778060913086 + }, + { + "auxiliary_loss_clip": 0.01085218, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.037287, + "balance_loss_mlp": 1.02197695, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 1.82654479063153, + "language_loss": 0.73406821, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75528753, + "num_input_tokens_seen": 213611970, + "step": 9913, + "time_per_iteration": 2.590485095977783 + }, + { + "auxiliary_loss_clip": 0.01084029, + "auxiliary_loss_mlp": 0.00778487, + "balance_loss_clip": 1.03761029, + "balance_loss_mlp": 1.0005846, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 1.9934573886895215, + "language_loss": 0.80068803, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.81931317, + "num_input_tokens_seen": 213632230, + "step": 9914, + "time_per_iteration": 2.6062510013580322 + }, + { + "auxiliary_loss_clip": 0.01077873, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.03507411, + "balance_loss_mlp": 1.01962507, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.7164885002294181, + "language_loss": 0.68214744, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.70324749, + "num_input_tokens_seen": 213649645, + "step": 9915, + "time_per_iteration": 4.05867338180542 + }, + { + "auxiliary_loss_clip": 0.0108761, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.03808427, + "balance_loss_mlp": 1.02087617, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.6240787421087357, + "language_loss": 0.78554463, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.80675328, + "num_input_tokens_seen": 213668850, + "step": 9916, + "time_per_iteration": 2.531947135925293 + }, + { + "auxiliary_loss_clip": 0.01092956, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.03652871, + "balance_loss_mlp": 1.01812601, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 1.8466899151145704, + "language_loss": 0.82705188, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.84829652, + "num_input_tokens_seen": 213685695, + "step": 9917, + "time_per_iteration": 2.523484706878662 + }, + { + "auxiliary_loss_clip": 0.01089472, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.03867388, + "balance_loss_mlp": 1.02131391, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 1.7914635514420845, + "language_loss": 0.77198499, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79321039, + "num_input_tokens_seen": 213703515, + "step": 9918, + "time_per_iteration": 3.900507926940918 + }, + { + "auxiliary_loss_clip": 0.01107823, + "auxiliary_loss_mlp": 0.0103633, + "balance_loss_clip": 1.04242551, + "balance_loss_mlp": 1.02344918, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.5638063173038863, + "language_loss": 0.78940105, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.81084263, + "num_input_tokens_seen": 213724170, + "step": 9919, + "time_per_iteration": 2.5487890243530273 + }, + { + "auxiliary_loss_clip": 0.01092389, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.04254293, + "balance_loss_mlp": 1.01580966, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 1.8610853874036193, + "language_loss": 0.775267, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.79648596, + "num_input_tokens_seen": 213740620, + "step": 9920, + "time_per_iteration": 2.5351603031158447 + }, + { + "auxiliary_loss_clip": 0.011032, + "auxiliary_loss_mlp": 0.01032396, + "balance_loss_clip": 1.03906047, + "balance_loss_mlp": 1.01816285, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.1833385565395615, + "language_loss": 0.82667458, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84803057, + "num_input_tokens_seen": 213755390, + "step": 9921, + "time_per_iteration": 2.4993412494659424 + }, + { + "auxiliary_loss_clip": 0.01100639, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.03672731, + "balance_loss_mlp": 1.01753819, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 2.1009544161504223, + "language_loss": 0.80790985, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82922822, + "num_input_tokens_seen": 213773225, + "step": 9922, + "time_per_iteration": 2.461998701095581 + }, + { + "auxiliary_loss_clip": 0.01106215, + "auxiliary_loss_mlp": 0.00778003, + "balance_loss_clip": 1.03970766, + "balance_loss_mlp": 1.00051665, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 1.8429669293486692, + "language_loss": 0.76597226, + "learning_rate": 1.477441761580111e-06, + "loss": 0.78481442, + "num_input_tokens_seen": 213791860, + "step": 9923, + "time_per_iteration": 2.524681329727173 + }, + { + "auxiliary_loss_clip": 0.01107245, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.04463983, + "balance_loss_mlp": 1.01671839, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.9760871470439503, + "language_loss": 0.76054591, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.78193229, + "num_input_tokens_seen": 213809455, + "step": 9924, + "time_per_iteration": 2.514723777770996 + }, + { + "auxiliary_loss_clip": 0.01096338, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.03607357, + "balance_loss_mlp": 1.02217638, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 1.806946356318002, + "language_loss": 0.6662541, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68758148, + "num_input_tokens_seen": 213826615, + "step": 9925, + "time_per_iteration": 2.4482526779174805 + }, + { + "auxiliary_loss_clip": 0.01086895, + "auxiliary_loss_mlp": 0.01039867, + "balance_loss_clip": 1.03991807, + "balance_loss_mlp": 1.02633071, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 1.9853041543080765, + "language_loss": 0.71616775, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.73743534, + "num_input_tokens_seen": 213844495, + "step": 9926, + "time_per_iteration": 2.4960901737213135 + }, + { + "auxiliary_loss_clip": 0.01074084, + "auxiliary_loss_mlp": 0.00780831, + "balance_loss_clip": 1.03679085, + "balance_loss_mlp": 1.00069332, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 2.874945151861427, + "language_loss": 0.70241612, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.72096527, + "num_input_tokens_seen": 213869125, + "step": 9927, + "time_per_iteration": 2.7972726821899414 + }, + { + "auxiliary_loss_clip": 0.01073101, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.03862774, + "balance_loss_mlp": 1.01974869, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.789907847134882, + "language_loss": 0.64132512, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.66239572, + "num_input_tokens_seen": 213891115, + "step": 9928, + "time_per_iteration": 2.86930251121521 + }, + { + "auxiliary_loss_clip": 0.01112463, + "auxiliary_loss_mlp": 0.01033869, + "balance_loss_clip": 1.0385704, + "balance_loss_mlp": 1.02136374, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.6670231595446874, + "language_loss": 0.69186085, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71332413, + "num_input_tokens_seen": 213911925, + "step": 9929, + "time_per_iteration": 4.17698860168457 + }, + { + "auxiliary_loss_clip": 0.01068953, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.03932071, + "balance_loss_mlp": 1.02213001, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.8935527622421184, + "language_loss": 0.76702756, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78805792, + "num_input_tokens_seen": 213930715, + "step": 9930, + "time_per_iteration": 2.654078483581543 + }, + { + "auxiliary_loss_clip": 0.01093912, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.04286742, + "balance_loss_mlp": 1.01772094, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 4.789971944751481, + "language_loss": 0.6859777, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.70723665, + "num_input_tokens_seen": 213950015, + "step": 9931, + "time_per_iteration": 2.7016499042510986 + }, + { + "auxiliary_loss_clip": 0.01031668, + "auxiliary_loss_mlp": 0.01001514, + "balance_loss_clip": 1.02072001, + "balance_loss_mlp": 1.0001725, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 1.0836710264439071, + "language_loss": 0.64192837, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66226017, + "num_input_tokens_seen": 214003330, + "step": 9932, + "time_per_iteration": 2.9864323139190674 + }, + { + "auxiliary_loss_clip": 0.01085911, + "auxiliary_loss_mlp": 0.01030738, + "balance_loss_clip": 1.0388217, + "balance_loss_mlp": 1.01685619, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 1.9110550636251071, + "language_loss": 0.74426484, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76543134, + "num_input_tokens_seen": 214021680, + "step": 9933, + "time_per_iteration": 2.7313013076782227 + }, + { + "auxiliary_loss_clip": 0.0103362, + "auxiliary_loss_mlp": 0.00999992, + "balance_loss_clip": 1.0243721, + "balance_loss_mlp": 0.99864525, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.666132154937889, + "language_loss": 0.51998216, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54031831, + "num_input_tokens_seen": 214090265, + "step": 9934, + "time_per_iteration": 3.175187110900879 + }, + { + "auxiliary_loss_clip": 0.01034639, + "auxiliary_loss_mlp": 0.01001847, + "balance_loss_clip": 1.00965202, + "balance_loss_mlp": 1.00060737, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 1.0357918931008983, + "language_loss": 0.54206431, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56242919, + "num_input_tokens_seen": 214146375, + "step": 9935, + "time_per_iteration": 2.976844549179077 + }, + { + "auxiliary_loss_clip": 0.01094962, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.03925061, + "balance_loss_mlp": 1.01817918, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 2.0014837105858168, + "language_loss": 0.65506876, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.67633539, + "num_input_tokens_seen": 214165340, + "step": 9936, + "time_per_iteration": 2.5749869346618652 + }, + { + "auxiliary_loss_clip": 0.0106667, + "auxiliary_loss_mlp": 0.010377, + "balance_loss_clip": 1.04117548, + "balance_loss_mlp": 1.0246942, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 2.1001612674304955, + "language_loss": 0.6745894, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.69563305, + "num_input_tokens_seen": 214181360, + "step": 9937, + "time_per_iteration": 2.581204414367676 + }, + { + "auxiliary_loss_clip": 0.01107835, + "auxiliary_loss_mlp": 0.0103368, + "balance_loss_clip": 1.03971171, + "balance_loss_mlp": 1.01990533, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 2.012678325727225, + "language_loss": 0.77444422, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79585934, + "num_input_tokens_seen": 214198525, + "step": 9938, + "time_per_iteration": 2.4932315349578857 + }, + { + "auxiliary_loss_clip": 0.01105047, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.03757739, + "balance_loss_mlp": 1.0189898, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.5230744661531228, + "language_loss": 0.75801587, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.77939045, + "num_input_tokens_seen": 214218710, + "step": 9939, + "time_per_iteration": 2.5305676460266113 + }, + { + "auxiliary_loss_clip": 0.01072522, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.0386517, + "balance_loss_mlp": 1.01466954, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 2.2467492869296466, + "language_loss": 0.69042337, + "learning_rate": 1.471053774486878e-06, + "loss": 0.71144676, + "num_input_tokens_seen": 214237800, + "step": 9940, + "time_per_iteration": 2.6135313510894775 + }, + { + "auxiliary_loss_clip": 0.01090646, + "auxiliary_loss_mlp": 0.01033343, + "balance_loss_clip": 1.04307675, + "balance_loss_mlp": 1.02116013, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.3126892732275393, + "language_loss": 0.70412874, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72536862, + "num_input_tokens_seen": 214260355, + "step": 9941, + "time_per_iteration": 2.6578242778778076 + }, + { + "auxiliary_loss_clip": 0.01091458, + "auxiliary_loss_mlp": 0.01033181, + "balance_loss_clip": 1.03766918, + "balance_loss_mlp": 1.02039552, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 1.894284254960242, + "language_loss": 0.77527601, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79652238, + "num_input_tokens_seen": 214277120, + "step": 9942, + "time_per_iteration": 2.476304054260254 + }, + { + "auxiliary_loss_clip": 0.01065408, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.03542435, + "balance_loss_mlp": 1.02663732, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 1.8612785466320738, + "language_loss": 0.75600648, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.77705538, + "num_input_tokens_seen": 214295300, + "step": 9943, + "time_per_iteration": 4.064570188522339 + }, + { + "auxiliary_loss_clip": 0.01050275, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.03506804, + "balance_loss_mlp": 1.02024531, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.9747668746870046, + "language_loss": 0.62170422, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64252925, + "num_input_tokens_seen": 214317050, + "step": 9944, + "time_per_iteration": 2.716618537902832 + }, + { + "auxiliary_loss_clip": 0.01095971, + "auxiliary_loss_mlp": 0.010346, + "balance_loss_clip": 1.04121614, + "balance_loss_mlp": 1.02203584, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 1.7666955141536047, + "language_loss": 0.72798967, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74929541, + "num_input_tokens_seen": 214337470, + "step": 9945, + "time_per_iteration": 2.6652045249938965 + }, + { + "auxiliary_loss_clip": 0.0106391, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.03750801, + "balance_loss_mlp": 1.01873028, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 1.9795086484282993, + "language_loss": 0.66975057, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69070971, + "num_input_tokens_seen": 214357975, + "step": 9946, + "time_per_iteration": 2.6139698028564453 + }, + { + "auxiliary_loss_clip": 0.01103144, + "auxiliary_loss_mlp": 0.0104473, + "balance_loss_clip": 1.03743434, + "balance_loss_mlp": 1.03011513, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 2.8198008914628683, + "language_loss": 0.88540256, + "learning_rate": 1.468425107717461e-06, + "loss": 0.90688133, + "num_input_tokens_seen": 214374125, + "step": 9947, + "time_per_iteration": 2.4632084369659424 + }, + { + "auxiliary_loss_clip": 0.01109206, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.03757143, + "balance_loss_mlp": 1.02521718, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.9369019305463793, + "language_loss": 0.7222271, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74368834, + "num_input_tokens_seen": 214393395, + "step": 9948, + "time_per_iteration": 2.537607431411743 + }, + { + "auxiliary_loss_clip": 0.01091515, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.03760779, + "balance_loss_mlp": 1.01905906, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 2.079698445371143, + "language_loss": 0.8941052, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91535097, + "num_input_tokens_seen": 214411550, + "step": 9949, + "time_per_iteration": 2.497436761856079 + }, + { + "auxiliary_loss_clip": 0.01102293, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.03914046, + "balance_loss_mlp": 1.01885784, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 2.120928670743494, + "language_loss": 0.70592344, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72725403, + "num_input_tokens_seen": 214429780, + "step": 9950, + "time_per_iteration": 2.4396779537200928 + }, + { + "auxiliary_loss_clip": 0.01102928, + "auxiliary_loss_mlp": 0.01032933, + "balance_loss_clip": 1.03853726, + "balance_loss_mlp": 1.01977277, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.6888897465421466, + "language_loss": 0.78503793, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.80639648, + "num_input_tokens_seen": 214447775, + "step": 9951, + "time_per_iteration": 2.5001323223114014 + }, + { + "auxiliary_loss_clip": 0.01094586, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.03936088, + "balance_loss_mlp": 1.02561545, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 1.4367561544980625, + "language_loss": 0.74306089, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.76440573, + "num_input_tokens_seen": 214467245, + "step": 9952, + "time_per_iteration": 2.5039820671081543 + }, + { + "auxiliary_loss_clip": 0.01095068, + "auxiliary_loss_mlp": 0.00780142, + "balance_loss_clip": 1.03875923, + "balance_loss_mlp": 1.0005734, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 2.036426408297864, + "language_loss": 0.78808486, + "learning_rate": 1.466172750724613e-06, + "loss": 0.8068369, + "num_input_tokens_seen": 214484385, + "step": 9953, + "time_per_iteration": 2.5030527114868164 + }, + { + "auxiliary_loss_clip": 0.01085747, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.04149246, + "balance_loss_mlp": 1.01758575, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.5128809867813897, + "language_loss": 0.69685292, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.718009, + "num_input_tokens_seen": 214503465, + "step": 9954, + "time_per_iteration": 4.1255106925964355 + }, + { + "auxiliary_loss_clip": 0.01092368, + "auxiliary_loss_mlp": 0.01034504, + "balance_loss_clip": 1.03902483, + "balance_loss_mlp": 1.0218972, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 1.897793566370011, + "language_loss": 0.73358214, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.7548508, + "num_input_tokens_seen": 214520725, + "step": 9955, + "time_per_iteration": 2.6134655475616455 + }, + { + "auxiliary_loss_clip": 0.01116438, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.04002452, + "balance_loss_mlp": 1.01873171, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.5395311161140848, + "language_loss": 0.68529999, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70678151, + "num_input_tokens_seen": 214540675, + "step": 9956, + "time_per_iteration": 2.5428481101989746 + }, + { + "auxiliary_loss_clip": 0.01119113, + "auxiliary_loss_mlp": 0.01032143, + "balance_loss_clip": 1.04259825, + "balance_loss_mlp": 1.01879787, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.132528109631733, + "language_loss": 0.73486549, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.75637805, + "num_input_tokens_seen": 214559910, + "step": 9957, + "time_per_iteration": 3.7259066104888916 + }, + { + "auxiliary_loss_clip": 0.01077527, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.03602552, + "balance_loss_mlp": 1.01831353, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 2.4993598877527106, + "language_loss": 0.84558952, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86667478, + "num_input_tokens_seen": 214575960, + "step": 9958, + "time_per_iteration": 2.5479137897491455 + }, + { + "auxiliary_loss_clip": 0.0108405, + "auxiliary_loss_mlp": 0.00778983, + "balance_loss_clip": 1.03665972, + "balance_loss_mlp": 1.00065279, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 2.0473954287159355, + "language_loss": 0.66306299, + "learning_rate": 1.463921122471864e-06, + "loss": 0.68169332, + "num_input_tokens_seen": 214594230, + "step": 9959, + "time_per_iteration": 2.5781149864196777 + }, + { + "auxiliary_loss_clip": 0.0110646, + "auxiliary_loss_mlp": 0.01031476, + "balance_loss_clip": 1.03992283, + "balance_loss_mlp": 1.01886964, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.6876874749146804, + "language_loss": 0.83530676, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85668612, + "num_input_tokens_seen": 214613130, + "step": 9960, + "time_per_iteration": 2.5000481605529785 + }, + { + "auxiliary_loss_clip": 0.01097936, + "auxiliary_loss_mlp": 0.01030666, + "balance_loss_clip": 1.04030132, + "balance_loss_mlp": 1.0176369, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.58805810270077, + "language_loss": 0.79201341, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81329942, + "num_input_tokens_seen": 214634470, + "step": 9961, + "time_per_iteration": 2.5548558235168457 + }, + { + "auxiliary_loss_clip": 0.01114193, + "auxiliary_loss_mlp": 0.01034758, + "balance_loss_clip": 1.03946233, + "balance_loss_mlp": 1.0215379, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.7500257521978362, + "language_loss": 0.67313826, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69462776, + "num_input_tokens_seen": 214654030, + "step": 9962, + "time_per_iteration": 2.4664156436920166 + }, + { + "auxiliary_loss_clip": 0.01102533, + "auxiliary_loss_mlp": 0.01043838, + "balance_loss_clip": 1.03864801, + "balance_loss_mlp": 1.02910972, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.4104423425311405, + "language_loss": 0.74355495, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76501864, + "num_input_tokens_seen": 214676985, + "step": 9963, + "time_per_iteration": 2.5177695751190186 + }, + { + "auxiliary_loss_clip": 0.01103994, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.03989255, + "balance_loss_mlp": 1.01719713, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 1.717087691729776, + "language_loss": 0.67824942, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.69958651, + "num_input_tokens_seen": 214700105, + "step": 9964, + "time_per_iteration": 2.600463390350342 + }, + { + "auxiliary_loss_clip": 0.010813, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.04037094, + "balance_loss_mlp": 1.01721048, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 3.556324254428272, + "language_loss": 0.76957434, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79068947, + "num_input_tokens_seen": 214717885, + "step": 9965, + "time_per_iteration": 2.561433792114258 + }, + { + "auxiliary_loss_clip": 0.01102314, + "auxiliary_loss_mlp": 0.01029214, + "balance_loss_clip": 1.04210258, + "balance_loss_mlp": 1.01626778, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 1.8638286333385143, + "language_loss": 0.77449143, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79580671, + "num_input_tokens_seen": 214733680, + "step": 9966, + "time_per_iteration": 2.4801692962646484 + }, + { + "auxiliary_loss_clip": 0.0107978, + "auxiliary_loss_mlp": 0.01028978, + "balance_loss_clip": 1.03981614, + "balance_loss_mlp": 1.01706314, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.5134373110038146, + "language_loss": 0.73178959, + "learning_rate": 1.460920090376422e-06, + "loss": 0.75287724, + "num_input_tokens_seen": 214753285, + "step": 9967, + "time_per_iteration": 2.578968048095703 + }, + { + "auxiliary_loss_clip": 0.01110803, + "auxiliary_loss_mlp": 0.01039863, + "balance_loss_clip": 1.04222274, + "balance_loss_mlp": 1.0258379, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 2.4569818003719393, + "language_loss": 0.6849075, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70641416, + "num_input_tokens_seen": 214767810, + "step": 9968, + "time_per_iteration": 3.9946506023406982 + }, + { + "auxiliary_loss_clip": 0.01103382, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.03660059, + "balance_loss_mlp": 1.02163315, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.497542462825045, + "language_loss": 0.79202616, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.81342137, + "num_input_tokens_seen": 214786040, + "step": 9969, + "time_per_iteration": 2.4522178173065186 + }, + { + "auxiliary_loss_clip": 0.01101164, + "auxiliary_loss_mlp": 0.01029687, + "balance_loss_clip": 1.03914523, + "balance_loss_mlp": 1.01634121, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 1.775151159609949, + "language_loss": 0.81046355, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83177209, + "num_input_tokens_seen": 214803110, + "step": 9970, + "time_per_iteration": 2.4582886695861816 + }, + { + "auxiliary_loss_clip": 0.01065847, + "auxiliary_loss_mlp": 0.01043352, + "balance_loss_clip": 1.03746605, + "balance_loss_mlp": 1.02725303, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 1.9841905955864325, + "language_loss": 0.62084889, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64194089, + "num_input_tokens_seen": 214819945, + "step": 9971, + "time_per_iteration": 2.5521788597106934 + }, + { + "auxiliary_loss_clip": 0.01112939, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.04120398, + "balance_loss_mlp": 1.01825738, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.8342479671843135, + "language_loss": 0.79294121, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81437409, + "num_input_tokens_seen": 214838810, + "step": 9972, + "time_per_iteration": 2.481839418411255 + }, + { + "auxiliary_loss_clip": 0.01078318, + "auxiliary_loss_mlp": 0.01035532, + "balance_loss_clip": 1.04047513, + "balance_loss_mlp": 1.02176368, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.1822005250228336, + "language_loss": 0.76299185, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78413033, + "num_input_tokens_seen": 214857040, + "step": 9973, + "time_per_iteration": 2.6910088062286377 + }, + { + "auxiliary_loss_clip": 0.01081746, + "auxiliary_loss_mlp": 0.01030656, + "balance_loss_clip": 1.04064488, + "balance_loss_mlp": 1.01760817, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 2.167740001702536, + "language_loss": 0.65254635, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67367041, + "num_input_tokens_seen": 214873375, + "step": 9974, + "time_per_iteration": 2.601360559463501 + }, + { + "auxiliary_loss_clip": 0.01103263, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.03684783, + "balance_loss_mlp": 1.01949573, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.4843831463521149, + "language_loss": 0.74831736, + "learning_rate": 1.457920366566428e-06, + "loss": 0.7696749, + "num_input_tokens_seen": 214893900, + "step": 9975, + "time_per_iteration": 2.514115571975708 + }, + { + "auxiliary_loss_clip": 0.01117195, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.04133809, + "balance_loss_mlp": 1.0179404, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.9006721815833638, + "language_loss": 0.77050591, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79199481, + "num_input_tokens_seen": 214912110, + "step": 9976, + "time_per_iteration": 2.455925941467285 + }, + { + "auxiliary_loss_clip": 0.01097832, + "auxiliary_loss_mlp": 0.01037271, + "balance_loss_clip": 1.03867495, + "balance_loss_mlp": 1.02302575, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.365530092203679, + "language_loss": 0.74853253, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.76988351, + "num_input_tokens_seen": 214930140, + "step": 9977, + "time_per_iteration": 2.563321352005005 + }, + { + "auxiliary_loss_clip": 0.01078245, + "auxiliary_loss_mlp": 0.01034321, + "balance_loss_clip": 1.03463149, + "balance_loss_mlp": 1.02120793, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 1.9601130267914963, + "language_loss": 0.68932617, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.71045184, + "num_input_tokens_seen": 214949200, + "step": 9978, + "time_per_iteration": 2.5624680519104004 + }, + { + "auxiliary_loss_clip": 0.0112621, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.04669452, + "balance_loss_mlp": 1.01984715, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 2.6822840082148747, + "language_loss": 0.81210768, + "learning_rate": 1.456420997543594e-06, + "loss": 0.83370954, + "num_input_tokens_seen": 214965775, + "step": 9979, + "time_per_iteration": 2.454289674758911 + }, + { + "auxiliary_loss_clip": 0.01113004, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.04109693, + "balance_loss_mlp": 1.0206008, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 1.9428451432755385, + "language_loss": 0.70014775, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.72161055, + "num_input_tokens_seen": 214982480, + "step": 9980, + "time_per_iteration": 2.425053119659424 + }, + { + "auxiliary_loss_clip": 0.0110492, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.03813636, + "balance_loss_mlp": 1.01889265, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 3.448235680823807, + "language_loss": 0.68509269, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70647877, + "num_input_tokens_seen": 214998110, + "step": 9981, + "time_per_iteration": 2.439251661300659 + }, + { + "auxiliary_loss_clip": 0.01105124, + "auxiliary_loss_mlp": 0.01033947, + "balance_loss_clip": 1.04138064, + "balance_loss_mlp": 1.02162695, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 3.665124509623431, + "language_loss": 0.78461713, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.80600786, + "num_input_tokens_seen": 215017995, + "step": 9982, + "time_per_iteration": 3.994614839553833 + }, + { + "auxiliary_loss_clip": 0.01068898, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.03953266, + "balance_loss_mlp": 1.02297735, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.5399389341777545, + "language_loss": 0.72977877, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.75085402, + "num_input_tokens_seen": 215038285, + "step": 9983, + "time_per_iteration": 2.5993809700012207 + }, + { + "auxiliary_loss_clip": 0.01078171, + "auxiliary_loss_mlp": 0.01033703, + "balance_loss_clip": 1.03853118, + "balance_loss_mlp": 1.02023268, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 2.211853621224669, + "language_loss": 0.77785617, + "learning_rate": 1.454547250154447e-06, + "loss": 0.79897487, + "num_input_tokens_seen": 215057825, + "step": 9984, + "time_per_iteration": 2.537522792816162 + }, + { + "auxiliary_loss_clip": 0.011059, + "auxiliary_loss_mlp": 0.01032379, + "balance_loss_clip": 1.04120541, + "balance_loss_mlp": 1.01982045, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.9104663921505227, + "language_loss": 0.83309811, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85448086, + "num_input_tokens_seen": 215077790, + "step": 9985, + "time_per_iteration": 2.6145033836364746 + }, + { + "auxiliary_loss_clip": 0.01108056, + "auxiliary_loss_mlp": 0.01038108, + "balance_loss_clip": 1.04348111, + "balance_loss_mlp": 1.02543044, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 1.7553829747338814, + "language_loss": 0.71301138, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.73447305, + "num_input_tokens_seen": 215097650, + "step": 9986, + "time_per_iteration": 2.675229549407959 + }, + { + "auxiliary_loss_clip": 0.01121762, + "auxiliary_loss_mlp": 0.00779838, + "balance_loss_clip": 1.04588199, + "balance_loss_mlp": 1.00071836, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 1.3826252928407794, + "language_loss": 0.71637863, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73539466, + "num_input_tokens_seen": 215118235, + "step": 9987, + "time_per_iteration": 2.4888510704040527 + }, + { + "auxiliary_loss_clip": 0.01095527, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.04167747, + "balance_loss_mlp": 1.01983905, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.748185597719897, + "language_loss": 0.84516937, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.86645162, + "num_input_tokens_seen": 215136755, + "step": 9988, + "time_per_iteration": 2.5362749099731445 + }, + { + "auxiliary_loss_clip": 0.01106387, + "auxiliary_loss_mlp": 0.01036288, + "balance_loss_clip": 1.04082608, + "balance_loss_mlp": 1.02277601, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 1.780721134874317, + "language_loss": 0.65469623, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.67612302, + "num_input_tokens_seen": 215155225, + "step": 9989, + "time_per_iteration": 2.479189395904541 + }, + { + "auxiliary_loss_clip": 0.01105586, + "auxiliary_loss_mlp": 0.01034324, + "balance_loss_clip": 1.04152513, + "balance_loss_mlp": 1.02213478, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.4729140793153477, + "language_loss": 0.80835348, + "learning_rate": 1.452299436003257e-06, + "loss": 0.82975256, + "num_input_tokens_seen": 215174815, + "step": 9990, + "time_per_iteration": 2.5160629749298096 + }, + { + "auxiliary_loss_clip": 0.01080848, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.04456437, + "balance_loss_mlp": 1.02162969, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 1.9052216027169229, + "language_loss": 0.82525623, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.84641284, + "num_input_tokens_seen": 215192045, + "step": 9991, + "time_per_iteration": 2.612273693084717 + }, + { + "auxiliary_loss_clip": 0.01067338, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.03993487, + "balance_loss_mlp": 1.02692342, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 1.8548032304441116, + "language_loss": 0.82351398, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.84459662, + "num_input_tokens_seen": 215209885, + "step": 9992, + "time_per_iteration": 2.61976957321167 + }, + { + "auxiliary_loss_clip": 0.01096569, + "auxiliary_loss_mlp": 0.00778849, + "balance_loss_clip": 1.04321229, + "balance_loss_mlp": 1.00065255, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 2.0995557568164926, + "language_loss": 0.66759324, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.68634748, + "num_input_tokens_seen": 215228150, + "step": 9993, + "time_per_iteration": 4.001123905181885 + }, + { + "auxiliary_loss_clip": 0.01077812, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.03681731, + "balance_loss_mlp": 1.02037084, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 2.352252654903999, + "language_loss": 0.80866206, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.82977986, + "num_input_tokens_seen": 215243755, + "step": 9994, + "time_per_iteration": 2.59120774269104 + }, + { + "auxiliary_loss_clip": 0.01073114, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.03565073, + "balance_loss_mlp": 1.01751697, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 1.8994449475249893, + "language_loss": 0.72351718, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.74455166, + "num_input_tokens_seen": 215262130, + "step": 9995, + "time_per_iteration": 3.985762596130371 + }, + { + "auxiliary_loss_clip": 0.01095546, + "auxiliary_loss_mlp": 0.01033784, + "balance_loss_clip": 1.03806233, + "balance_loss_mlp": 1.02037323, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 1.6690893395888153, + "language_loss": 0.80734015, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.82863343, + "num_input_tokens_seen": 215281785, + "step": 9996, + "time_per_iteration": 2.6916310787200928 + }, + { + "auxiliary_loss_clip": 0.01054111, + "auxiliary_loss_mlp": 0.0104394, + "balance_loss_clip": 1.03180039, + "balance_loss_mlp": 1.02869296, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 2.136207631429527, + "language_loss": 0.78417379, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80515432, + "num_input_tokens_seen": 215297550, + "step": 9997, + "time_per_iteration": 2.646874189376831 + }, + { + "auxiliary_loss_clip": 0.01109274, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.0420661, + "balance_loss_mlp": 1.01730144, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 2.3325567138369547, + "language_loss": 0.73008049, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75148511, + "num_input_tokens_seen": 215316360, + "step": 9998, + "time_per_iteration": 2.5133090019226074 + }, + { + "auxiliary_loss_clip": 0.01086022, + "auxiliary_loss_mlp": 0.01035723, + "balance_loss_clip": 1.03933203, + "balance_loss_mlp": 1.0227356, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.528959091649039, + "language_loss": 0.72476399, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74598145, + "num_input_tokens_seen": 215336405, + "step": 9999, + "time_per_iteration": 2.5675814151763916 + }, + { + "auxiliary_loss_clip": 0.01069542, + "auxiliary_loss_mlp": 0.01036642, + "balance_loss_clip": 1.03783512, + "balance_loss_mlp": 1.02341628, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 1.5445239901034529, + "language_loss": 0.78467983, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80574167, + "num_input_tokens_seen": 215356590, + "step": 10000, + "time_per_iteration": 2.6177163124084473 + }, + { + "auxiliary_loss_clip": 0.01123025, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.04451895, + "balance_loss_mlp": 1.02458262, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 2.3718335447582626, + "language_loss": 0.777794, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79941875, + "num_input_tokens_seen": 215374295, + "step": 10001, + "time_per_iteration": 2.4687466621398926 + }, + { + "auxiliary_loss_clip": 0.01110922, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.04375613, + "balance_loss_mlp": 1.01876771, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 1.775255474712195, + "language_loss": 0.58842427, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.60985553, + "num_input_tokens_seen": 215394535, + "step": 10002, + "time_per_iteration": 2.622805118560791 + }, + { + "auxiliary_loss_clip": 0.01099947, + "auxiliary_loss_mlp": 0.01037128, + "balance_loss_clip": 1.04219174, + "balance_loss_mlp": 1.02195311, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 1.6312785772531235, + "language_loss": 0.77792764, + "learning_rate": 1.447431741055314e-06, + "loss": 0.79929841, + "num_input_tokens_seen": 215414355, + "step": 10003, + "time_per_iteration": 2.5401253700256348 + }, + { + "auxiliary_loss_clip": 0.01123513, + "auxiliary_loss_mlp": 0.01039779, + "balance_loss_clip": 1.04614854, + "balance_loss_mlp": 1.0260818, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 2.319118457939631, + "language_loss": 0.77457702, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.79620993, + "num_input_tokens_seen": 215428280, + "step": 10004, + "time_per_iteration": 2.472774028778076 + }, + { + "auxiliary_loss_clip": 0.01106658, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.04193997, + "balance_loss_mlp": 1.01603687, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.508052332136731, + "language_loss": 0.72595698, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74732035, + "num_input_tokens_seen": 215448970, + "step": 10005, + "time_per_iteration": 2.4990477561950684 + }, + { + "auxiliary_loss_clip": 0.01116415, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.04389954, + "balance_loss_mlp": 1.01603365, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 2.361024202813238, + "language_loss": 0.74793291, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.76938647, + "num_input_tokens_seen": 215465260, + "step": 10006, + "time_per_iteration": 2.4435904026031494 + }, + { + "auxiliary_loss_clip": 0.01091753, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.03991413, + "balance_loss_mlp": 1.01869965, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 1.9910366190762838, + "language_loss": 0.73756915, + "learning_rate": 1.445934699732685e-06, + "loss": 0.75881326, + "num_input_tokens_seen": 215482725, + "step": 10007, + "time_per_iteration": 4.007794618606567 + }, + { + "auxiliary_loss_clip": 0.01096578, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.04077888, + "balance_loss_mlp": 1.01628995, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 1.828369236301986, + "language_loss": 0.70064533, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.72190082, + "num_input_tokens_seen": 215500420, + "step": 10008, + "time_per_iteration": 2.512561082839966 + }, + { + "auxiliary_loss_clip": 0.01105501, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.03958941, + "balance_loss_mlp": 1.01625061, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.7026150727270846, + "language_loss": 0.76221323, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78355956, + "num_input_tokens_seen": 215522260, + "step": 10009, + "time_per_iteration": 2.5446035861968994 + }, + { + "auxiliary_loss_clip": 0.01092567, + "auxiliary_loss_mlp": 0.00779215, + "balance_loss_clip": 1.0398885, + "balance_loss_mlp": 1.00066209, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 2.0545113048385977, + "language_loss": 0.74740303, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.76612085, + "num_input_tokens_seen": 215541715, + "step": 10010, + "time_per_iteration": 2.584473133087158 + }, + { + "auxiliary_loss_clip": 0.0102674, + "auxiliary_loss_mlp": 0.01009475, + "balance_loss_clip": 1.01032662, + "balance_loss_mlp": 1.00825262, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.8037971881651054, + "language_loss": 0.55071104, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57107317, + "num_input_tokens_seen": 215603020, + "step": 10011, + "time_per_iteration": 3.1114158630371094 + }, + { + "auxiliary_loss_clip": 0.01105855, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.03945673, + "balance_loss_mlp": 1.02415884, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.4152545185424765, + "language_loss": 0.62491387, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64633912, + "num_input_tokens_seen": 215625115, + "step": 10012, + "time_per_iteration": 2.688741445541382 + }, + { + "auxiliary_loss_clip": 0.01079263, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.0422585, + "balance_loss_mlp": 1.01515949, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 1.7384959633138382, + "language_loss": 0.7497977, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.77086866, + "num_input_tokens_seen": 215643730, + "step": 10013, + "time_per_iteration": 2.627938747406006 + }, + { + "auxiliary_loss_clip": 0.01112341, + "auxiliary_loss_mlp": 0.01032105, + "balance_loss_clip": 1.0419116, + "balance_loss_mlp": 1.02025008, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.9580810503771533, + "language_loss": 0.81531918, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83676368, + "num_input_tokens_seen": 215664425, + "step": 10014, + "time_per_iteration": 2.5813705921173096 + }, + { + "auxiliary_loss_clip": 0.01089071, + "auxiliary_loss_mlp": 0.01027341, + "balance_loss_clip": 1.03779137, + "balance_loss_mlp": 1.014961, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.6662853486351847, + "language_loss": 0.72657841, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74774253, + "num_input_tokens_seen": 215684280, + "step": 10015, + "time_per_iteration": 2.6450655460357666 + }, + { + "auxiliary_loss_clip": 0.01021189, + "auxiliary_loss_mlp": 0.01004194, + "balance_loss_clip": 1.01566958, + "balance_loss_mlp": 1.00305581, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8228647863209219, + "language_loss": 0.54802859, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.56828243, + "num_input_tokens_seen": 215739780, + "step": 10016, + "time_per_iteration": 2.9984750747680664 + }, + { + "auxiliary_loss_clip": 0.01094962, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.04261208, + "balance_loss_mlp": 1.01560068, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.5500526706249633, + "language_loss": 0.82729983, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.84853512, + "num_input_tokens_seen": 215757885, + "step": 10017, + "time_per_iteration": 2.544119119644165 + }, + { + "auxiliary_loss_clip": 0.01090576, + "auxiliary_loss_mlp": 0.01027407, + "balance_loss_clip": 1.04153848, + "balance_loss_mlp": 1.01506329, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 1.8593095173403131, + "language_loss": 0.83782697, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.85900676, + "num_input_tokens_seen": 215776415, + "step": 10018, + "time_per_iteration": 2.5869815349578857 + }, + { + "auxiliary_loss_clip": 0.01093965, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.03814149, + "balance_loss_mlp": 1.02332282, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 1.5415738036448305, + "language_loss": 0.78579021, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80710089, + "num_input_tokens_seen": 215794865, + "step": 10019, + "time_per_iteration": 2.615473747253418 + }, + { + "auxiliary_loss_clip": 0.0107532, + "auxiliary_loss_mlp": 0.00778935, + "balance_loss_clip": 1.03983545, + "balance_loss_mlp": 1.00066829, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1.5550593820561944, + "language_loss": 0.74001658, + "learning_rate": 1.441071641765681e-06, + "loss": 0.75855911, + "num_input_tokens_seen": 215816840, + "step": 10020, + "time_per_iteration": 2.635451555252075 + }, + { + "auxiliary_loss_clip": 0.01096181, + "auxiliary_loss_mlp": 0.01034033, + "balance_loss_clip": 1.04083896, + "balance_loss_mlp": 1.02160501, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 3.913698970845982, + "language_loss": 0.64131451, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66261667, + "num_input_tokens_seen": 215836100, + "step": 10021, + "time_per_iteration": 3.947335720062256 + }, + { + "auxiliary_loss_clip": 0.01104837, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.04127645, + "balance_loss_mlp": 1.01864552, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.5937727966854949, + "language_loss": 0.80493414, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.82629842, + "num_input_tokens_seen": 215858480, + "step": 10022, + "time_per_iteration": 2.5150146484375 + }, + { + "auxiliary_loss_clip": 0.01103756, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.04180408, + "balance_loss_mlp": 1.01730418, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 1.7238893569599472, + "language_loss": 0.66577387, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68710816, + "num_input_tokens_seen": 215879950, + "step": 10023, + "time_per_iteration": 2.5641043186187744 + }, + { + "auxiliary_loss_clip": 0.0110607, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.03851557, + "balance_loss_mlp": 1.01930285, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 1.9551058774001675, + "language_loss": 0.74252319, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76390696, + "num_input_tokens_seen": 215899830, + "step": 10024, + "time_per_iteration": 2.54419207572937 + }, + { + "auxiliary_loss_clip": 0.01104903, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.04207563, + "balance_loss_mlp": 1.0195241, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.6402341180534867, + "language_loss": 0.7291404, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.75051939, + "num_input_tokens_seen": 215920440, + "step": 10025, + "time_per_iteration": 2.4832191467285156 + }, + { + "auxiliary_loss_clip": 0.0112072, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.04155433, + "balance_loss_mlp": 1.01870942, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 5.808441929607528, + "language_loss": 0.67524868, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.69677973, + "num_input_tokens_seen": 215940535, + "step": 10026, + "time_per_iteration": 2.4615371227264404 + }, + { + "auxiliary_loss_clip": 0.01109861, + "auxiliary_loss_mlp": 0.01032832, + "balance_loss_clip": 1.03901482, + "balance_loss_mlp": 1.02082169, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 1.6840546969192904, + "language_loss": 0.80025733, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82168418, + "num_input_tokens_seen": 215958045, + "step": 10027, + "time_per_iteration": 2.4391748905181885 + }, + { + "auxiliary_loss_clip": 0.01082635, + "auxiliary_loss_mlp": 0.0103192, + "balance_loss_clip": 1.03758335, + "balance_loss_mlp": 1.0188669, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 3.1647065925248303, + "language_loss": 0.71220922, + "learning_rate": 1.438080769071171e-06, + "loss": 0.73335481, + "num_input_tokens_seen": 215977330, + "step": 10028, + "time_per_iteration": 2.598787307739258 + }, + { + "auxiliary_loss_clip": 0.0108365, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.04470301, + "balance_loss_mlp": 1.01773584, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 1.762351479793433, + "language_loss": 0.84152484, + "learning_rate": 1.437707005721669e-06, + "loss": 0.86266601, + "num_input_tokens_seen": 215997865, + "step": 10029, + "time_per_iteration": 2.661395311355591 + }, + { + "auxiliary_loss_clip": 0.01093634, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.03990245, + "balance_loss_mlp": 1.02402306, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 1.87682193181912, + "language_loss": 0.80004013, + "learning_rate": 1.437333263694373e-06, + "loss": 0.82134187, + "num_input_tokens_seen": 216016230, + "step": 10030, + "time_per_iteration": 2.5238280296325684 + }, + { + "auxiliary_loss_clip": 0.01049629, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.03327751, + "balance_loss_mlp": 1.02408338, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 1.562082724283919, + "language_loss": 0.71062577, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73150337, + "num_input_tokens_seen": 216035785, + "step": 10031, + "time_per_iteration": 2.676677942276001 + }, + { + "auxiliary_loss_clip": 0.01073027, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.03597903, + "balance_loss_mlp": 1.01647317, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 1.5224603471874198, + "language_loss": 0.73372138, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75475955, + "num_input_tokens_seen": 216059555, + "step": 10032, + "time_per_iteration": 2.707073926925659 + }, + { + "auxiliary_loss_clip": 0.01098448, + "auxiliary_loss_mlp": 0.0103212, + "balance_loss_clip": 1.04164314, + "balance_loss_mlp": 1.01881588, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 10.158202062691199, + "language_loss": 0.68282133, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70412701, + "num_input_tokens_seen": 216077235, + "step": 10033, + "time_per_iteration": 4.115811347961426 + }, + { + "auxiliary_loss_clip": 0.01090297, + "auxiliary_loss_mlp": 0.01035134, + "balance_loss_clip": 1.0404228, + "balance_loss_mlp": 1.02167487, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 2.32038493935788, + "language_loss": 0.75405115, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.77530545, + "num_input_tokens_seen": 216094985, + "step": 10034, + "time_per_iteration": 2.5171470642089844 + }, + { + "auxiliary_loss_clip": 0.01094491, + "auxiliary_loss_mlp": 0.01030055, + "balance_loss_clip": 1.04071152, + "balance_loss_mlp": 1.01616669, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 1.991526310808377, + "language_loss": 0.74273044, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.76397586, + "num_input_tokens_seen": 216115905, + "step": 10035, + "time_per_iteration": 4.0093092918396 + }, + { + "auxiliary_loss_clip": 0.01087899, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.04082966, + "balance_loss_mlp": 1.01739073, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.5737857357187606, + "language_loss": 0.86372018, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88489592, + "num_input_tokens_seen": 216132420, + "step": 10036, + "time_per_iteration": 2.5095949172973633 + }, + { + "auxiliary_loss_clip": 0.010779, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.03950799, + "balance_loss_mlp": 1.01773548, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 1.8720117724859633, + "language_loss": 0.70803308, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72911823, + "num_input_tokens_seen": 216149800, + "step": 10037, + "time_per_iteration": 2.5742247104644775 + }, + { + "auxiliary_loss_clip": 0.0110147, + "auxiliary_loss_mlp": 0.01038262, + "balance_loss_clip": 1.04049671, + "balance_loss_mlp": 1.023862, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 1.764075862150679, + "language_loss": 0.85206026, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87345755, + "num_input_tokens_seen": 216168200, + "step": 10038, + "time_per_iteration": 2.493393659591675 + }, + { + "auxiliary_loss_clip": 0.0109816, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.03931952, + "balance_loss_mlp": 1.02084923, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 1.994045702169929, + "language_loss": 0.76666534, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78799033, + "num_input_tokens_seen": 216187105, + "step": 10039, + "time_per_iteration": 2.530668258666992 + }, + { + "auxiliary_loss_clip": 0.01102994, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.03881991, + "balance_loss_mlp": 1.0163722, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 1.8077414714390803, + "language_loss": 0.70929229, + "learning_rate": 1.433597019260301e-06, + "loss": 0.73061156, + "num_input_tokens_seen": 216205440, + "step": 10040, + "time_per_iteration": 2.5129215717315674 + }, + { + "auxiliary_loss_clip": 0.01109002, + "auxiliary_loss_mlp": 0.01033615, + "balance_loss_clip": 1.04209125, + "balance_loss_mlp": 1.01889873, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 1.911372264673662, + "language_loss": 0.78112113, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80254734, + "num_input_tokens_seen": 216223130, + "step": 10041, + "time_per_iteration": 2.5137529373168945 + }, + { + "auxiliary_loss_clip": 0.01096493, + "auxiliary_loss_mlp": 0.01028353, + "balance_loss_clip": 1.04274225, + "balance_loss_mlp": 1.01626527, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.8244582638887716, + "language_loss": 0.75860012, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.77984858, + "num_input_tokens_seen": 216240260, + "step": 10042, + "time_per_iteration": 2.586606502532959 + }, + { + "auxiliary_loss_clip": 0.01068805, + "auxiliary_loss_mlp": 0.01029439, + "balance_loss_clip": 1.03592646, + "balance_loss_mlp": 1.01702976, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 2.072207826570242, + "language_loss": 0.84417462, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.86515707, + "num_input_tokens_seen": 216258510, + "step": 10043, + "time_per_iteration": 2.564415693283081 + }, + { + "auxiliary_loss_clip": 0.01080329, + "auxiliary_loss_mlp": 0.01040284, + "balance_loss_clip": 1.03554416, + "balance_loss_mlp": 1.02538311, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.7997221912565853, + "language_loss": 0.69503844, + "learning_rate": 1.432103122078974e-06, + "loss": 0.71624452, + "num_input_tokens_seen": 216277550, + "step": 10044, + "time_per_iteration": 2.588914632797241 + }, + { + "auxiliary_loss_clip": 0.01106771, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.04317665, + "balance_loss_mlp": 1.01859188, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 3.2881452839001026, + "language_loss": 0.77797115, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.79936528, + "num_input_tokens_seen": 216296690, + "step": 10045, + "time_per_iteration": 2.5140621662139893 + }, + { + "auxiliary_loss_clip": 0.01071866, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.04470992, + "balance_loss_mlp": 1.02348399, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.7235146517559816, + "language_loss": 0.76854014, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.7896238, + "num_input_tokens_seen": 216316110, + "step": 10046, + "time_per_iteration": 4.1448822021484375 + }, + { + "auxiliary_loss_clip": 0.01062185, + "auxiliary_loss_mlp": 0.01045155, + "balance_loss_clip": 1.03287792, + "balance_loss_mlp": 1.03163671, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 1.5799131662321289, + "language_loss": 0.86815608, + "learning_rate": 1.430982925257827e-06, + "loss": 0.88922942, + "num_input_tokens_seen": 216333855, + "step": 10047, + "time_per_iteration": 2.596956253051758 + }, + { + "auxiliary_loss_clip": 0.01108028, + "auxiliary_loss_mlp": 0.01030215, + "balance_loss_clip": 1.04463387, + "balance_loss_mlp": 1.01825213, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.6044241375568304, + "language_loss": 0.75583446, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.77721691, + "num_input_tokens_seen": 216354890, + "step": 10048, + "time_per_iteration": 2.560117244720459 + }, + { + "auxiliary_loss_clip": 0.01108115, + "auxiliary_loss_mlp": 0.01041578, + "balance_loss_clip": 1.04169154, + "balance_loss_mlp": 1.02549684, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 1.8289284864942337, + "language_loss": 0.66161048, + "learning_rate": 1.430236235239386e-06, + "loss": 0.6831075, + "num_input_tokens_seen": 216376055, + "step": 10049, + "time_per_iteration": 2.550516366958618 + }, + { + "auxiliary_loss_clip": 0.01090175, + "auxiliary_loss_mlp": 0.0103841, + "balance_loss_clip": 1.03706074, + "balance_loss_mlp": 1.02429593, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.7044382191597036, + "language_loss": 0.66559529, + "learning_rate": 1.429862922631336e-06, + "loss": 0.68688118, + "num_input_tokens_seen": 216396295, + "step": 10050, + "time_per_iteration": 2.5229227542877197 + }, + { + "auxiliary_loss_clip": 0.0108615, + "auxiliary_loss_mlp": 0.0103381, + "balance_loss_clip": 1.0407716, + "balance_loss_mlp": 1.02059591, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 2.1323600055545096, + "language_loss": 0.6970917, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.71829128, + "num_input_tokens_seen": 216416605, + "step": 10051, + "time_per_iteration": 2.636451482772827 + }, + { + "auxiliary_loss_clip": 0.0110389, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.03943253, + "balance_loss_mlp": 1.01944709, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 1.9966521394624133, + "language_loss": 0.64788389, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.66924661, + "num_input_tokens_seen": 216435130, + "step": 10052, + "time_per_iteration": 2.4589874744415283 + }, + { + "auxiliary_loss_clip": 0.01095616, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.04132342, + "balance_loss_mlp": 1.01934481, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 2.101805781413079, + "language_loss": 0.68963742, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.71092618, + "num_input_tokens_seen": 216455640, + "step": 10053, + "time_per_iteration": 2.5833280086517334 + }, + { + "auxiliary_loss_clip": 0.01016033, + "auxiliary_loss_mlp": 0.0100149, + "balance_loss_clip": 1.00979829, + "balance_loss_mlp": 1.00029838, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.7288117184987162, + "language_loss": 0.60356998, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.6237452, + "num_input_tokens_seen": 216518130, + "step": 10054, + "time_per_iteration": 3.187382698059082 + }, + { + "auxiliary_loss_clip": 0.01059116, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.03945649, + "balance_loss_mlp": 1.01724958, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 1.723258486390409, + "language_loss": 0.85634267, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87723351, + "num_input_tokens_seen": 216536845, + "step": 10055, + "time_per_iteration": 2.6582255363464355 + }, + { + "auxiliary_loss_clip": 0.0109774, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.04266846, + "balance_loss_mlp": 1.03084707, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.2328215050449187, + "language_loss": 0.73678493, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75821728, + "num_input_tokens_seen": 216551860, + "step": 10056, + "time_per_iteration": 2.507962226867676 + }, + { + "auxiliary_loss_clip": 0.0107835, + "auxiliary_loss_mlp": 0.01038207, + "balance_loss_clip": 1.04102921, + "balance_loss_mlp": 1.02493894, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 1.5544484619228622, + "language_loss": 0.80209452, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82326007, + "num_input_tokens_seen": 216574775, + "step": 10057, + "time_per_iteration": 2.6419005393981934 + }, + { + "auxiliary_loss_clip": 0.01115812, + "auxiliary_loss_mlp": 0.00778269, + "balance_loss_clip": 1.04198766, + "balance_loss_mlp": 1.00068402, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.305263693261175, + "language_loss": 0.75328439, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.7722252, + "num_input_tokens_seen": 216590100, + "step": 10058, + "time_per_iteration": 2.6138646602630615 + }, + { + "auxiliary_loss_clip": 0.0110277, + "auxiliary_loss_mlp": 0.01031607, + "balance_loss_clip": 1.04027081, + "balance_loss_mlp": 1.01878583, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 2.033084630836235, + "language_loss": 0.7127713, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.734115, + "num_input_tokens_seen": 216610145, + "step": 10059, + "time_per_iteration": 2.5144307613372803 + }, + { + "auxiliary_loss_clip": 0.01093689, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.0402633, + "balance_loss_mlp": 1.01811433, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 1.4700054175933281, + "language_loss": 0.75983119, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78108096, + "num_input_tokens_seen": 216630625, + "step": 10060, + "time_per_iteration": 4.052111864089966 + }, + { + "auxiliary_loss_clip": 0.01103848, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.03848302, + "balance_loss_mlp": 1.01970148, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 2.186270654645804, + "language_loss": 0.73453325, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75589937, + "num_input_tokens_seen": 216649255, + "step": 10061, + "time_per_iteration": 2.492220401763916 + }, + { + "auxiliary_loss_clip": 0.01079583, + "auxiliary_loss_mlp": 0.00778878, + "balance_loss_clip": 1.04171252, + "balance_loss_mlp": 1.00073493, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 1.7465318040921722, + "language_loss": 0.67503071, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69361532, + "num_input_tokens_seen": 216668100, + "step": 10062, + "time_per_iteration": 2.6105880737304688 + }, + { + "auxiliary_loss_clip": 0.0109973, + "auxiliary_loss_mlp": 0.01044446, + "balance_loss_clip": 1.03869724, + "balance_loss_mlp": 1.03055799, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.0833732276795516, + "language_loss": 0.71465981, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73610157, + "num_input_tokens_seen": 216686125, + "step": 10063, + "time_per_iteration": 2.469554901123047 + }, + { + "auxiliary_loss_clip": 0.0111205, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.04027355, + "balance_loss_mlp": 1.02666497, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 1.7692878568024488, + "language_loss": 0.84706914, + "learning_rate": 1.424638822621926e-06, + "loss": 0.86858141, + "num_input_tokens_seen": 216704265, + "step": 10064, + "time_per_iteration": 2.4368669986724854 + }, + { + "auxiliary_loss_clip": 0.01107529, + "auxiliary_loss_mlp": 0.01041294, + "balance_loss_clip": 1.04273593, + "balance_loss_mlp": 1.02794838, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.368731987377649, + "language_loss": 0.79556131, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.8170495, + "num_input_tokens_seen": 216721765, + "step": 10065, + "time_per_iteration": 2.4556703567504883 + }, + { + "auxiliary_loss_clip": 0.01073785, + "auxiliary_loss_mlp": 0.01035126, + "balance_loss_clip": 1.04374599, + "balance_loss_mlp": 1.02026033, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 2.180006937119355, + "language_loss": 0.78562498, + "learning_rate": 1.423892870799226e-06, + "loss": 0.80671406, + "num_input_tokens_seen": 216738295, + "step": 10066, + "time_per_iteration": 2.6167500019073486 + }, + { + "auxiliary_loss_clip": 0.01061452, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.03955698, + "balance_loss_mlp": 1.02205658, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 2.3260944366208314, + "language_loss": 0.72821921, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.74918711, + "num_input_tokens_seen": 216759875, + "step": 10067, + "time_per_iteration": 2.6606147289276123 + }, + { + "auxiliary_loss_clip": 0.01096658, + "auxiliary_loss_mlp": 0.00779151, + "balance_loss_clip": 1.04324841, + "balance_loss_mlp": 1.00075507, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.5931229492411951, + "language_loss": 0.6896503, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.70840836, + "num_input_tokens_seen": 216780705, + "step": 10068, + "time_per_iteration": 2.5494754314422607 + }, + { + "auxiliary_loss_clip": 0.0110218, + "auxiliary_loss_mlp": 0.01037776, + "balance_loss_clip": 1.04141545, + "balance_loss_mlp": 1.02456188, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 2.09959673301633, + "language_loss": 0.87512314, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.89652276, + "num_input_tokens_seen": 216797625, + "step": 10069, + "time_per_iteration": 2.468278408050537 + }, + { + "auxiliary_loss_clip": 0.01083337, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.03971314, + "balance_loss_mlp": 1.01671433, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 1.5455002900912105, + "language_loss": 0.83433831, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85546052, + "num_input_tokens_seen": 216817610, + "step": 10070, + "time_per_iteration": 2.5908164978027344 + }, + { + "auxiliary_loss_clip": 0.011041, + "auxiliary_loss_mlp": 0.01038468, + "balance_loss_clip": 1.0445478, + "balance_loss_mlp": 1.02515852, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.5868069549245405, + "language_loss": 0.86248565, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88391137, + "num_input_tokens_seen": 216836835, + "step": 10071, + "time_per_iteration": 2.5332541465759277 + }, + { + "auxiliary_loss_clip": 0.01111371, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.04428053, + "balance_loss_mlp": 1.02309012, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 1.6495830410383643, + "language_loss": 0.76885331, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79033995, + "num_input_tokens_seen": 216856760, + "step": 10072, + "time_per_iteration": 4.056907653808594 + }, + { + "auxiliary_loss_clip": 0.0109476, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.03756809, + "balance_loss_mlp": 1.01732302, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 1.7392521528154443, + "language_loss": 0.74406421, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76532716, + "num_input_tokens_seen": 216878795, + "step": 10073, + "time_per_iteration": 2.591932535171509 + }, + { + "auxiliary_loss_clip": 0.01003317, + "auxiliary_loss_mlp": 0.0100174, + "balance_loss_clip": 1.01438856, + "balance_loss_mlp": 1.00034523, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7576192030625525, + "language_loss": 0.55176991, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.5718205, + "num_input_tokens_seen": 216937800, + "step": 10074, + "time_per_iteration": 4.69268536567688 + }, + { + "auxiliary_loss_clip": 0.01076172, + "auxiliary_loss_mlp": 0.01037981, + "balance_loss_clip": 1.03852117, + "balance_loss_mlp": 1.02372372, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 1.708072665858004, + "language_loss": 0.81669807, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.8378396, + "num_input_tokens_seen": 216955280, + "step": 10075, + "time_per_iteration": 2.5797152519226074 + }, + { + "auxiliary_loss_clip": 0.01107148, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.04051185, + "balance_loss_mlp": 1.01583529, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 1.7921790676237173, + "language_loss": 0.78078461, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80214894, + "num_input_tokens_seen": 216976950, + "step": 10076, + "time_per_iteration": 2.5454981327056885 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01040668, + "balance_loss_clip": 1.03986597, + "balance_loss_mlp": 1.02695918, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 1.887829787340223, + "language_loss": 0.72345161, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74493825, + "num_input_tokens_seen": 216996945, + "step": 10077, + "time_per_iteration": 2.504777431488037 + }, + { + "auxiliary_loss_clip": 0.01117698, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.04194164, + "balance_loss_mlp": 1.01891518, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 1.5873209158783363, + "language_loss": 0.55366439, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.57516503, + "num_input_tokens_seen": 217016580, + "step": 10078, + "time_per_iteration": 2.4624624252319336 + }, + { + "auxiliary_loss_clip": 0.01072207, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.03637695, + "balance_loss_mlp": 1.01764596, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 1.9987578094590388, + "language_loss": 0.70079911, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.7218318, + "num_input_tokens_seen": 217037300, + "step": 10079, + "time_per_iteration": 2.6347975730895996 + }, + { + "auxiliary_loss_clip": 0.01094362, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.04106712, + "balance_loss_mlp": 1.02297688, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.9141605199670018, + "language_loss": 0.62646699, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64776206, + "num_input_tokens_seen": 217055805, + "step": 10080, + "time_per_iteration": 2.517158031463623 + }, + { + "auxiliary_loss_clip": 0.01095058, + "auxiliary_loss_mlp": 0.01029312, + "balance_loss_clip": 1.03905702, + "balance_loss_mlp": 1.01563907, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 1.7314916729379983, + "language_loss": 0.71015441, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73139811, + "num_input_tokens_seen": 217074175, + "step": 10081, + "time_per_iteration": 2.515901565551758 + }, + { + "auxiliary_loss_clip": 0.01090932, + "auxiliary_loss_mlp": 0.0103037, + "balance_loss_clip": 1.03953874, + "balance_loss_mlp": 1.01676857, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 1.6971381176294769, + "language_loss": 0.69378895, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.71500194, + "num_input_tokens_seen": 217095695, + "step": 10082, + "time_per_iteration": 2.5844600200653076 + }, + { + "auxiliary_loss_clip": 0.01116444, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.04272854, + "balance_loss_mlp": 1.01668787, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 1.4144099197469502, + "language_loss": 0.65819657, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.67965221, + "num_input_tokens_seen": 217116260, + "step": 10083, + "time_per_iteration": 2.477806329727173 + }, + { + "auxiliary_loss_clip": 0.01103373, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.03940153, + "balance_loss_mlp": 1.01799202, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 2.2630993711085763, + "language_loss": 0.73672038, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.75806475, + "num_input_tokens_seen": 217134465, + "step": 10084, + "time_per_iteration": 2.4633066654205322 + }, + { + "auxiliary_loss_clip": 0.0109347, + "auxiliary_loss_mlp": 0.01040054, + "balance_loss_clip": 1.04165089, + "balance_loss_mlp": 1.02682209, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 2.554760319087895, + "language_loss": 0.72583508, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.74717033, + "num_input_tokens_seen": 217149920, + "step": 10085, + "time_per_iteration": 2.4878273010253906 + }, + { + "auxiliary_loss_clip": 0.011148, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.04169226, + "balance_loss_mlp": 1.02343965, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 2.011429705569478, + "language_loss": 0.76288867, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.78440028, + "num_input_tokens_seen": 217168165, + "step": 10086, + "time_per_iteration": 3.9808623790740967 + }, + { + "auxiliary_loss_clip": 0.01087182, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.04296255, + "balance_loss_mlp": 1.02222574, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.4503033168340118, + "language_loss": 0.72788996, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.74911666, + "num_input_tokens_seen": 217190070, + "step": 10087, + "time_per_iteration": 2.5716116428375244 + }, + { + "auxiliary_loss_clip": 0.01101523, + "auxiliary_loss_mlp": 0.01031214, + "balance_loss_clip": 1.0406642, + "balance_loss_mlp": 1.02010393, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.6191042573343368, + "language_loss": 0.83934951, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.86067688, + "num_input_tokens_seen": 217209370, + "step": 10088, + "time_per_iteration": 2.513929843902588 + }, + { + "auxiliary_loss_clip": 0.01064086, + "auxiliary_loss_mlp": 0.007788, + "balance_loss_clip": 1.0381515, + "balance_loss_mlp": 1.0006485, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 2.488722301984712, + "language_loss": 0.71306354, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.7314924, + "num_input_tokens_seen": 217226990, + "step": 10089, + "time_per_iteration": 2.6168479919433594 + }, + { + "auxiliary_loss_clip": 0.01104866, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.04135609, + "balance_loss_mlp": 1.01953983, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 1.9958010469130456, + "language_loss": 0.82941908, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.85077864, + "num_input_tokens_seen": 217244585, + "step": 10090, + "time_per_iteration": 2.4676883220672607 + }, + { + "auxiliary_loss_clip": 0.01079611, + "auxiliary_loss_mlp": 0.01038263, + "balance_loss_clip": 1.03787887, + "balance_loss_mlp": 1.02445304, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.6903671695529567, + "language_loss": 0.75666291, + "learning_rate": 1.4145758826341e-06, + "loss": 0.77784169, + "num_input_tokens_seen": 217263435, + "step": 10091, + "time_per_iteration": 2.549182653427124 + }, + { + "auxiliary_loss_clip": 0.0110933, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.03783584, + "balance_loss_mlp": 1.02115893, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 1.5220544913101737, + "language_loss": 0.79321963, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81465334, + "num_input_tokens_seen": 217283725, + "step": 10092, + "time_per_iteration": 2.4628117084503174 + }, + { + "auxiliary_loss_clip": 0.01093385, + "auxiliary_loss_mlp": 0.01034641, + "balance_loss_clip": 1.03865457, + "balance_loss_mlp": 1.02118182, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 1.7878829285893942, + "language_loss": 0.75998116, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78126144, + "num_input_tokens_seen": 217301120, + "step": 10093, + "time_per_iteration": 2.4828760623931885 + }, + { + "auxiliary_loss_clip": 0.01088568, + "auxiliary_loss_mlp": 0.01038008, + "balance_loss_clip": 1.03880382, + "balance_loss_mlp": 1.02479959, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 1.8043214718210394, + "language_loss": 0.8712455, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89251131, + "num_input_tokens_seen": 217319585, + "step": 10094, + "time_per_iteration": 2.540215253829956 + }, + { + "auxiliary_loss_clip": 0.01105503, + "auxiliary_loss_mlp": 0.01030769, + "balance_loss_clip": 1.04092658, + "balance_loss_mlp": 1.01782274, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 1.7295881607291101, + "language_loss": 0.71897519, + "learning_rate": 1.413086446353919e-06, + "loss": 0.74033791, + "num_input_tokens_seen": 217338880, + "step": 10095, + "time_per_iteration": 2.4937644004821777 + }, + { + "auxiliary_loss_clip": 0.01087491, + "auxiliary_loss_mlp": 0.01029177, + "balance_loss_clip": 1.03517628, + "balance_loss_mlp": 1.01711297, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 1.7533818908906667, + "language_loss": 0.76627445, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.78744113, + "num_input_tokens_seen": 217357480, + "step": 10096, + "time_per_iteration": 2.5265095233917236 + }, + { + "auxiliary_loss_clip": 0.01114547, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.03967869, + "balance_loss_mlp": 1.02473032, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 2.2988969248059137, + "language_loss": 0.79797107, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.81948698, + "num_input_tokens_seen": 217374575, + "step": 10097, + "time_per_iteration": 2.4150521755218506 + }, + { + "auxiliary_loss_clip": 0.01089173, + "auxiliary_loss_mlp": 0.01029581, + "balance_loss_clip": 1.04187596, + "balance_loss_mlp": 1.01735616, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.4035136020171888, + "language_loss": 0.67049813, + "learning_rate": 1.411969602780478e-06, + "loss": 0.69168568, + "num_input_tokens_seen": 217392950, + "step": 10098, + "time_per_iteration": 2.5235793590545654 + }, + { + "auxiliary_loss_clip": 0.011135, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.04026759, + "balance_loss_mlp": 1.01938367, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 1.8826163978911692, + "language_loss": 0.80209529, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.82354546, + "num_input_tokens_seen": 217412145, + "step": 10099, + "time_per_iteration": 2.4477124214172363 + }, + { + "auxiliary_loss_clip": 0.01082054, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.03605795, + "balance_loss_mlp": 1.02173626, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 1.7471389101369172, + "language_loss": 0.70926285, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.73043633, + "num_input_tokens_seen": 217432080, + "step": 10100, + "time_per_iteration": 3.9933931827545166 + }, + { + "auxiliary_loss_clip": 0.01082694, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.04317904, + "balance_loss_mlp": 1.01804352, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 2.268160026250224, + "language_loss": 0.70722604, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.72837204, + "num_input_tokens_seen": 217450945, + "step": 10101, + "time_per_iteration": 2.540499687194824 + }, + { + "auxiliary_loss_clip": 0.01089779, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.04168117, + "balance_loss_mlp": 1.01925492, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 1.801750822504247, + "language_loss": 0.70031011, + "learning_rate": 1.410480790256154e-06, + "loss": 0.72152102, + "num_input_tokens_seen": 217473105, + "step": 10102, + "time_per_iteration": 2.5825576782226562 + }, + { + "auxiliary_loss_clip": 0.01115654, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.04127526, + "balance_loss_mlp": 1.02172589, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 1.8371390437333905, + "language_loss": 0.73653328, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.75803196, + "num_input_tokens_seen": 217491780, + "step": 10103, + "time_per_iteration": 2.4759132862091064 + }, + { + "auxiliary_loss_clip": 0.0107658, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.04083729, + "balance_loss_mlp": 1.02141809, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 1.7590004957829923, + "language_loss": 0.765715, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.78682864, + "num_input_tokens_seen": 217510605, + "step": 10104, + "time_per_iteration": 2.5376334190368652 + }, + { + "auxiliary_loss_clip": 0.01017431, + "auxiliary_loss_mlp": 0.01000955, + "balance_loss_clip": 1.02140212, + "balance_loss_mlp": 0.99968511, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7094460927527633, + "language_loss": 0.56047821, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58066201, + "num_input_tokens_seen": 217574815, + "step": 10105, + "time_per_iteration": 3.13543701171875 + }, + { + "auxiliary_loss_clip": 0.01027962, + "auxiliary_loss_mlp": 0.01005932, + "balance_loss_clip": 1.01349044, + "balance_loss_mlp": 1.00485277, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7585305104137571, + "language_loss": 0.56823224, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58857113, + "num_input_tokens_seen": 217632375, + "step": 10106, + "time_per_iteration": 3.0034639835357666 + }, + { + "auxiliary_loss_clip": 0.01063215, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.03437066, + "balance_loss_mlp": 1.02418542, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.4812953392999209, + "language_loss": 0.68830639, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.70930266, + "num_input_tokens_seen": 217653055, + "step": 10107, + "time_per_iteration": 2.662529945373535 + }, + { + "auxiliary_loss_clip": 0.0110371, + "auxiliary_loss_mlp": 0.01032709, + "balance_loss_clip": 1.03760576, + "balance_loss_mlp": 1.02010274, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 1.843980340227573, + "language_loss": 0.80775118, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.82911533, + "num_input_tokens_seen": 217671520, + "step": 10108, + "time_per_iteration": 2.463216543197632 + }, + { + "auxiliary_loss_clip": 0.01091963, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.03889108, + "balance_loss_mlp": 1.01844382, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 1.8851296295753908, + "language_loss": 0.71446508, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.73571002, + "num_input_tokens_seen": 217691880, + "step": 10109, + "time_per_iteration": 2.6277735233306885 + }, + { + "auxiliary_loss_clip": 0.01089179, + "auxiliary_loss_mlp": 0.01028149, + "balance_loss_clip": 1.03898203, + "balance_loss_mlp": 1.01669288, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.7883200572551237, + "language_loss": 0.80462778, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82580113, + "num_input_tokens_seen": 217710530, + "step": 10110, + "time_per_iteration": 2.5468101501464844 + }, + { + "auxiliary_loss_clip": 0.0108965, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.03788006, + "balance_loss_mlp": 1.01890373, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.2617068140199863, + "language_loss": 0.70262039, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.72384137, + "num_input_tokens_seen": 217728650, + "step": 10111, + "time_per_iteration": 4.133704423904419 + }, + { + "auxiliary_loss_clip": 0.01083731, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.03989863, + "balance_loss_mlp": 1.01612997, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 2.0610969429947255, + "language_loss": 0.65449321, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67562664, + "num_input_tokens_seen": 217747135, + "step": 10112, + "time_per_iteration": 2.578563690185547 + }, + { + "auxiliary_loss_clip": 0.01025623, + "auxiliary_loss_mlp": 0.01002084, + "balance_loss_clip": 1.00976992, + "balance_loss_mlp": 1.00097489, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.6305015527387827, + "language_loss": 0.49599326, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51627028, + "num_input_tokens_seen": 217811860, + "step": 10113, + "time_per_iteration": 4.5944578647613525 + }, + { + "auxiliary_loss_clip": 0.01024813, + "auxiliary_loss_mlp": 0.01003242, + "balance_loss_clip": 1.00907123, + "balance_loss_mlp": 1.00208592, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8295455022049019, + "language_loss": 0.56951022, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.58979076, + "num_input_tokens_seen": 217866510, + "step": 10114, + "time_per_iteration": 2.9618163108825684 + }, + { + "auxiliary_loss_clip": 0.01118453, + "auxiliary_loss_mlp": 0.01028478, + "balance_loss_clip": 1.04230905, + "balance_loss_mlp": 1.014799, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 1.6667896256949755, + "language_loss": 0.69861078, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.72008002, + "num_input_tokens_seen": 217885650, + "step": 10115, + "time_per_iteration": 2.446734666824341 + }, + { + "auxiliary_loss_clip": 0.01077153, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.03526998, + "balance_loss_mlp": 1.01873732, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 3.2053400443181057, + "language_loss": 0.72705138, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74813604, + "num_input_tokens_seen": 217905300, + "step": 10116, + "time_per_iteration": 2.618483781814575 + }, + { + "auxiliary_loss_clip": 0.01091329, + "auxiliary_loss_mlp": 0.01039069, + "balance_loss_clip": 1.03988624, + "balance_loss_mlp": 1.02488899, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.6365323758065313, + "language_loss": 0.53785944, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.55916345, + "num_input_tokens_seen": 217927845, + "step": 10117, + "time_per_iteration": 2.664086103439331 + }, + { + "auxiliary_loss_clip": 0.01096405, + "auxiliary_loss_mlp": 0.01028816, + "balance_loss_clip": 1.04242444, + "balance_loss_mlp": 1.0164125, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.9943140664244403, + "language_loss": 0.69918311, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.72043526, + "num_input_tokens_seen": 217946145, + "step": 10118, + "time_per_iteration": 2.500755548477173 + }, + { + "auxiliary_loss_clip": 0.01054499, + "auxiliary_loss_mlp": 0.01029519, + "balance_loss_clip": 1.03554809, + "balance_loss_mlp": 1.01719236, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.5188284529113505, + "language_loss": 0.74592686, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.76676702, + "num_input_tokens_seen": 217965190, + "step": 10119, + "time_per_iteration": 2.608630418777466 + }, + { + "auxiliary_loss_clip": 0.01101461, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.03968453, + "balance_loss_mlp": 1.0193882, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 1.961829139162069, + "language_loss": 0.67461902, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69595075, + "num_input_tokens_seen": 217983625, + "step": 10120, + "time_per_iteration": 2.4975216388702393 + }, + { + "auxiliary_loss_clip": 0.01107384, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.04218709, + "balance_loss_mlp": 1.01999581, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 1.6397591389999153, + "language_loss": 0.74083304, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76223266, + "num_input_tokens_seen": 218006005, + "step": 10121, + "time_per_iteration": 2.5053703784942627 + }, + { + "auxiliary_loss_clip": 0.01106536, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.0424844, + "balance_loss_mlp": 1.01660514, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 1.876800500762092, + "language_loss": 0.80402154, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82537258, + "num_input_tokens_seen": 218024195, + "step": 10122, + "time_per_iteration": 2.4569222927093506 + }, + { + "auxiliary_loss_clip": 0.01100564, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.03981531, + "balance_loss_mlp": 1.02314758, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 1.4453667667915528, + "language_loss": 0.55790746, + "learning_rate": 1.402670413578284e-06, + "loss": 0.57927108, + "num_input_tokens_seen": 218047190, + "step": 10123, + "time_per_iteration": 2.6028809547424316 + }, + { + "auxiliary_loss_clip": 0.01105551, + "auxiliary_loss_mlp": 0.01032911, + "balance_loss_clip": 1.04297066, + "balance_loss_mlp": 1.02085257, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 1.9806112635010902, + "language_loss": 0.74034011, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.76172471, + "num_input_tokens_seen": 218065945, + "step": 10124, + "time_per_iteration": 2.483673572540283 + }, + { + "auxiliary_loss_clip": 0.01090546, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.03629446, + "balance_loss_mlp": 1.01914394, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 2.4107281233085343, + "language_loss": 0.65376985, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.67499506, + "num_input_tokens_seen": 218085285, + "step": 10125, + "time_per_iteration": 4.098893880844116 + }, + { + "auxiliary_loss_clip": 0.01114765, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.04162765, + "balance_loss_mlp": 1.01778579, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 1.9743148225834564, + "language_loss": 0.76762241, + "learning_rate": 1.40155545786479e-06, + "loss": 0.78907084, + "num_input_tokens_seen": 218104735, + "step": 10126, + "time_per_iteration": 2.4627158641815186 + }, + { + "auxiliary_loss_clip": 0.01079187, + "auxiliary_loss_mlp": 0.010284, + "balance_loss_clip": 1.03971982, + "balance_loss_mlp": 1.01507878, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 7.3565164483525525, + "language_loss": 0.70929486, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.7303707, + "num_input_tokens_seen": 218121855, + "step": 10127, + "time_per_iteration": 2.494539499282837 + }, + { + "auxiliary_loss_clip": 0.01118652, + "auxiliary_loss_mlp": 0.01029989, + "balance_loss_clip": 1.04181957, + "balance_loss_mlp": 1.01651835, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 3.1938169008685366, + "language_loss": 0.72788537, + "learning_rate": 1.400812267497691e-06, + "loss": 0.74937177, + "num_input_tokens_seen": 218137325, + "step": 10128, + "time_per_iteration": 2.4370338916778564 + }, + { + "auxiliary_loss_clip": 0.01068386, + "auxiliary_loss_mlp": 0.0102995, + "balance_loss_clip": 1.04085994, + "balance_loss_mlp": 1.01796317, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 2.3744883086154753, + "language_loss": 0.73830628, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75928962, + "num_input_tokens_seen": 218155530, + "step": 10129, + "time_per_iteration": 2.5509088039398193 + }, + { + "auxiliary_loss_clip": 0.01113253, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.04023826, + "balance_loss_mlp": 1.02107453, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 1.4222115804564113, + "language_loss": 0.65459311, + "learning_rate": 1.400069168015626e-06, + "loss": 0.67606074, + "num_input_tokens_seen": 218182535, + "step": 10130, + "time_per_iteration": 2.6391854286193848 + }, + { + "auxiliary_loss_clip": 0.01083039, + "auxiliary_loss_mlp": 0.01026693, + "balance_loss_clip": 1.03658271, + "balance_loss_mlp": 1.01557684, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 2.570599871430732, + "language_loss": 0.7686581, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.7897554, + "num_input_tokens_seen": 218201740, + "step": 10131, + "time_per_iteration": 2.483455181121826 + }, + { + "auxiliary_loss_clip": 0.0108222, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.03989482, + "balance_loss_mlp": 1.02207494, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.8452935318598591, + "language_loss": 0.76970363, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.79085588, + "num_input_tokens_seen": 218219800, + "step": 10132, + "time_per_iteration": 2.591688394546509 + }, + { + "auxiliary_loss_clip": 0.01111588, + "auxiliary_loss_mlp": 0.01032796, + "balance_loss_clip": 1.04231262, + "balance_loss_mlp": 1.02166772, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.7502367854485614, + "language_loss": 0.75979656, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.7812404, + "num_input_tokens_seen": 218237585, + "step": 10133, + "time_per_iteration": 2.4707326889038086 + }, + { + "auxiliary_loss_clip": 0.0110108, + "auxiliary_loss_mlp": 0.01033905, + "balance_loss_clip": 1.03846812, + "balance_loss_mlp": 1.02128112, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 1.8075770644335512, + "language_loss": 0.63956833, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.66091812, + "num_input_tokens_seen": 218258700, + "step": 10134, + "time_per_iteration": 2.5487871170043945 + }, + { + "auxiliary_loss_clip": 0.01089571, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.03876758, + "balance_loss_mlp": 1.02216196, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 1.7656826999514021, + "language_loss": 0.78805077, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.8092885, + "num_input_tokens_seen": 218275655, + "step": 10135, + "time_per_iteration": 2.5190091133117676 + }, + { + "auxiliary_loss_clip": 0.01088831, + "auxiliary_loss_mlp": 0.01027977, + "balance_loss_clip": 1.03784966, + "balance_loss_mlp": 1.01672387, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 1.736432678607068, + "language_loss": 0.71890044, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.74006855, + "num_input_tokens_seen": 218295720, + "step": 10136, + "time_per_iteration": 2.552130699157715 + }, + { + "auxiliary_loss_clip": 0.01117082, + "auxiliary_loss_mlp": 0.01032554, + "balance_loss_clip": 1.04278219, + "balance_loss_mlp": 1.02000737, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 2.2536776196477515, + "language_loss": 0.74565464, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.767151, + "num_input_tokens_seen": 218316745, + "step": 10137, + "time_per_iteration": 2.617732048034668 + }, + { + "auxiliary_loss_clip": 0.01101344, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.03811979, + "balance_loss_mlp": 1.02906275, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 2.197748866440277, + "language_loss": 0.80276072, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82420564, + "num_input_tokens_seen": 218335385, + "step": 10138, + "time_per_iteration": 4.075772523880005 + }, + { + "auxiliary_loss_clip": 0.01082791, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.03759718, + "balance_loss_mlp": 1.02444017, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 1.5815259598323899, + "language_loss": 0.81168306, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.83287162, + "num_input_tokens_seen": 218353320, + "step": 10139, + "time_per_iteration": 2.4690139293670654 + }, + { + "auxiliary_loss_clip": 0.0107839, + "auxiliary_loss_mlp": 0.01036684, + "balance_loss_clip": 1.038234, + "balance_loss_mlp": 1.02337432, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 2.124542142729894, + "language_loss": 0.83610046, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85725117, + "num_input_tokens_seen": 218365620, + "step": 10140, + "time_per_iteration": 2.46871280670166 + }, + { + "auxiliary_loss_clip": 0.01105744, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.04124248, + "balance_loss_mlp": 1.02213705, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 2.9583371551524253, + "language_loss": 0.75996697, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.78136611, + "num_input_tokens_seen": 218383785, + "step": 10141, + "time_per_iteration": 2.5008459091186523 + }, + { + "auxiliary_loss_clip": 0.0108469, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.03567994, + "balance_loss_mlp": 1.02329111, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 2.130245030678552, + "language_loss": 0.76774818, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.78896296, + "num_input_tokens_seen": 218399055, + "step": 10142, + "time_per_iteration": 2.4724206924438477 + }, + { + "auxiliary_loss_clip": 0.01114965, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.0415504, + "balance_loss_mlp": 1.02005231, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.8029462119662172, + "language_loss": 0.76390183, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.78538048, + "num_input_tokens_seen": 218419120, + "step": 10143, + "time_per_iteration": 2.511436939239502 + }, + { + "auxiliary_loss_clip": 0.011008, + "auxiliary_loss_mlp": 0.01044876, + "balance_loss_clip": 1.03844881, + "balance_loss_mlp": 1.0304811, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.8201398783116973, + "language_loss": 0.75344795, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77490473, + "num_input_tokens_seen": 218435290, + "step": 10144, + "time_per_iteration": 2.4464399814605713 + }, + { + "auxiliary_loss_clip": 0.01086228, + "auxiliary_loss_mlp": 0.01030846, + "balance_loss_clip": 1.03700984, + "balance_loss_mlp": 1.01754785, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 2.20024972638785, + "language_loss": 0.73021686, + "learning_rate": 1.394498830235383e-06, + "loss": 0.75138772, + "num_input_tokens_seen": 218457880, + "step": 10145, + "time_per_iteration": 2.691654920578003 + }, + { + "auxiliary_loss_clip": 0.01092718, + "auxiliary_loss_mlp": 0.01036875, + "balance_loss_clip": 1.03761613, + "balance_loss_mlp": 1.02456641, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 1.6690711935742872, + "language_loss": 0.69168305, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71297896, + "num_input_tokens_seen": 218475930, + "step": 10146, + "time_per_iteration": 2.5089266300201416 + }, + { + "auxiliary_loss_clip": 0.01069801, + "auxiliary_loss_mlp": 0.00776158, + "balance_loss_clip": 1.03999126, + "balance_loss_mlp": 1.0005331, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 1.768446622615727, + "language_loss": 0.76852173, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.78698134, + "num_input_tokens_seen": 218493675, + "step": 10147, + "time_per_iteration": 2.5646865367889404 + }, + { + "auxiliary_loss_clip": 0.01090566, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.03709757, + "balance_loss_mlp": 1.02003741, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 1.9518472061627572, + "language_loss": 0.78194547, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80317241, + "num_input_tokens_seen": 218511780, + "step": 10148, + "time_per_iteration": 2.485177993774414 + }, + { + "auxiliary_loss_clip": 0.01084197, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.0364368, + "balance_loss_mlp": 1.02347076, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 2.1902220276546145, + "language_loss": 0.5447976, + "learning_rate": 1.39301427737093e-06, + "loss": 0.56601942, + "num_input_tokens_seen": 218531850, + "step": 10149, + "time_per_iteration": 2.6163270473480225 + }, + { + "auxiliary_loss_clip": 0.01089367, + "auxiliary_loss_mlp": 0.010325, + "balance_loss_clip": 1.04021454, + "balance_loss_mlp": 1.02069831, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 1.7437181026677302, + "language_loss": 0.80486488, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.8260836, + "num_input_tokens_seen": 218551245, + "step": 10150, + "time_per_iteration": 2.5448813438415527 + }, + { + "auxiliary_loss_clip": 0.01093313, + "auxiliary_loss_mlp": 0.01038538, + "balance_loss_clip": 1.04443514, + "balance_loss_mlp": 1.0255506, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 1.7169564209610146, + "language_loss": 0.69075638, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.71207488, + "num_input_tokens_seen": 218571365, + "step": 10151, + "time_per_iteration": 4.0783467292785645 + }, + { + "auxiliary_loss_clip": 0.01112172, + "auxiliary_loss_mlp": 0.0103622, + "balance_loss_clip": 1.03918719, + "balance_loss_mlp": 1.02451324, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 1.6371681374990488, + "language_loss": 0.70744801, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.7289319, + "num_input_tokens_seen": 218588315, + "step": 10152, + "time_per_iteration": 2.5110580921173096 + }, + { + "auxiliary_loss_clip": 0.01081523, + "auxiliary_loss_mlp": 0.01030407, + "balance_loss_clip": 1.04120743, + "balance_loss_mlp": 1.01795578, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 1.8118616350368868, + "language_loss": 0.78707778, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80819708, + "num_input_tokens_seen": 218605940, + "step": 10153, + "time_per_iteration": 3.9165751934051514 + }, + { + "auxiliary_loss_clip": 0.01088411, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.03604507, + "balance_loss_mlp": 1.02289534, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 1.9429952789632208, + "language_loss": 0.78851104, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.80975008, + "num_input_tokens_seen": 218626100, + "step": 10154, + "time_per_iteration": 2.5775814056396484 + }, + { + "auxiliary_loss_clip": 0.01103872, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.04251981, + "balance_loss_mlp": 1.01937556, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.560172697500532, + "language_loss": 0.69955176, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.72090369, + "num_input_tokens_seen": 218645060, + "step": 10155, + "time_per_iteration": 2.508561849594116 + }, + { + "auxiliary_loss_clip": 0.01105875, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.04205871, + "balance_loss_mlp": 1.02144158, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 1.5931062741714168, + "language_loss": 0.71485305, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73626471, + "num_input_tokens_seen": 218667690, + "step": 10156, + "time_per_iteration": 2.548715353012085 + }, + { + "auxiliary_loss_clip": 0.01089523, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.03989911, + "balance_loss_mlp": 1.01925433, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 1.5798790671802143, + "language_loss": 0.67600429, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69721794, + "num_input_tokens_seen": 218687505, + "step": 10157, + "time_per_iteration": 2.4918251037597656 + }, + { + "auxiliary_loss_clip": 0.01072687, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.03233576, + "balance_loss_mlp": 1.01455593, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 3.4122132087228474, + "language_loss": 0.72291863, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74391639, + "num_input_tokens_seen": 218705315, + "step": 10158, + "time_per_iteration": 2.5250701904296875 + }, + { + "auxiliary_loss_clip": 0.01101699, + "auxiliary_loss_mlp": 0.01037221, + "balance_loss_clip": 1.0421474, + "balance_loss_mlp": 1.02488852, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.5604635621817198, + "language_loss": 0.69282687, + "learning_rate": 1.389304508366635e-06, + "loss": 0.71421611, + "num_input_tokens_seen": 218725735, + "step": 10159, + "time_per_iteration": 2.5380055904388428 + }, + { + "auxiliary_loss_clip": 0.01117641, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.04223454, + "balance_loss_mlp": 1.01669359, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 1.9271697258356197, + "language_loss": 0.78889716, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.81037104, + "num_input_tokens_seen": 218743215, + "step": 10160, + "time_per_iteration": 2.4182968139648438 + }, + { + "auxiliary_loss_clip": 0.01027867, + "auxiliary_loss_mlp": 0.01001788, + "balance_loss_clip": 1.01350451, + "balance_loss_mlp": 1.00053644, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8196042213603423, + "language_loss": 0.61470717, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63500369, + "num_input_tokens_seen": 218806440, + "step": 10161, + "time_per_iteration": 3.18993878364563 + }, + { + "auxiliary_loss_clip": 0.0109704, + "auxiliary_loss_mlp": 0.00778851, + "balance_loss_clip": 1.04280782, + "balance_loss_mlp": 1.00063992, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 1.6022062947422309, + "language_loss": 0.75897282, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.77773178, + "num_input_tokens_seen": 218825720, + "step": 10162, + "time_per_iteration": 2.5172297954559326 + }, + { + "auxiliary_loss_clip": 0.01115397, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.04146874, + "balance_loss_mlp": 1.01644301, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 1.7512675746144313, + "language_loss": 0.71411002, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.73555946, + "num_input_tokens_seen": 218847735, + "step": 10163, + "time_per_iteration": 2.5378284454345703 + }, + { + "auxiliary_loss_clip": 0.01110743, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.0390203, + "balance_loss_mlp": 1.01856422, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 1.9267707008883548, + "language_loss": 0.59913337, + "learning_rate": 1.387450491396625e-06, + "loss": 0.62054658, + "num_input_tokens_seen": 218866585, + "step": 10164, + "time_per_iteration": 4.059423446655273 + }, + { + "auxiliary_loss_clip": 0.01098237, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.04065776, + "balance_loss_mlp": 1.02126527, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 1.7727336964297191, + "language_loss": 0.75606, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.77737558, + "num_input_tokens_seen": 218885560, + "step": 10165, + "time_per_iteration": 2.511754035949707 + }, + { + "auxiliary_loss_clip": 0.01092777, + "auxiliary_loss_mlp": 0.01029514, + "balance_loss_clip": 1.04114771, + "balance_loss_mlp": 1.01672292, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.6454519284067446, + "language_loss": 0.79340553, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81462848, + "num_input_tokens_seen": 218905055, + "step": 10166, + "time_per_iteration": 2.5382964611053467 + }, + { + "auxiliary_loss_clip": 0.01097188, + "auxiliary_loss_mlp": 0.01028935, + "balance_loss_clip": 1.0460279, + "balance_loss_mlp": 1.01580429, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 1.7443866117566025, + "language_loss": 0.6745019, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.69576317, + "num_input_tokens_seen": 218924030, + "step": 10167, + "time_per_iteration": 2.5362470149993896 + }, + { + "auxiliary_loss_clip": 0.0111332, + "auxiliary_loss_mlp": 0.0103606, + "balance_loss_clip": 1.04173899, + "balance_loss_mlp": 1.02450311, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 1.5907586969386702, + "language_loss": 0.79103833, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.81253207, + "num_input_tokens_seen": 218943750, + "step": 10168, + "time_per_iteration": 2.448974370956421 + }, + { + "auxiliary_loss_clip": 0.0112058, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.04041505, + "balance_loss_mlp": 1.0222261, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 5.069529871310034, + "language_loss": 0.85366279, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.87523973, + "num_input_tokens_seen": 218957585, + "step": 10169, + "time_per_iteration": 2.3991048336029053 + }, + { + "auxiliary_loss_clip": 0.01110511, + "auxiliary_loss_mlp": 0.01028263, + "balance_loss_clip": 1.03842187, + "balance_loss_mlp": 1.01656818, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 2.0740664561448683, + "language_loss": 0.78664565, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.80803335, + "num_input_tokens_seen": 218980025, + "step": 10170, + "time_per_iteration": 2.6179754734039307 + }, + { + "auxiliary_loss_clip": 0.01094935, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.03795099, + "balance_loss_mlp": 1.02000415, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 2.0871813640470878, + "language_loss": 0.68642294, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.70771599, + "num_input_tokens_seen": 218998200, + "step": 10171, + "time_per_iteration": 2.498439073562622 + }, + { + "auxiliary_loss_clip": 0.01087632, + "auxiliary_loss_mlp": 0.01037683, + "balance_loss_clip": 1.039276, + "balance_loss_mlp": 1.02322876, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 2.2901631305998547, + "language_loss": 0.79159784, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.81285107, + "num_input_tokens_seen": 219017910, + "step": 10172, + "time_per_iteration": 2.6204171180725098 + }, + { + "auxiliary_loss_clip": 0.01088697, + "auxiliary_loss_mlp": 0.01032253, + "balance_loss_clip": 1.04496706, + "balance_loss_mlp": 1.01949716, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 1.7622282143291153, + "language_loss": 0.67443228, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.69564176, + "num_input_tokens_seen": 219037730, + "step": 10173, + "time_per_iteration": 2.545024871826172 + }, + { + "auxiliary_loss_clip": 0.01091524, + "auxiliary_loss_mlp": 0.01036285, + "balance_loss_clip": 1.03902113, + "balance_loss_mlp": 1.0229094, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 1.9733126957812857, + "language_loss": 0.55446661, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.57574469, + "num_input_tokens_seen": 219056755, + "step": 10174, + "time_per_iteration": 2.478780508041382 + }, + { + "auxiliary_loss_clip": 0.01096269, + "auxiliary_loss_mlp": 0.01033152, + "balance_loss_clip": 1.04049945, + "balance_loss_mlp": 1.01988435, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 1.8850224571852197, + "language_loss": 0.66491014, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68620443, + "num_input_tokens_seen": 219076985, + "step": 10175, + "time_per_iteration": 2.528338670730591 + }, + { + "auxiliary_loss_clip": 0.01098749, + "auxiliary_loss_mlp": 0.00777574, + "balance_loss_clip": 1.036937, + "balance_loss_mlp": 1.00061297, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 2.0441887532682412, + "language_loss": 0.82634699, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.84511024, + "num_input_tokens_seen": 219096050, + "step": 10176, + "time_per_iteration": 2.5279884338378906 + }, + { + "auxiliary_loss_clip": 0.01096312, + "auxiliary_loss_mlp": 0.0103347, + "balance_loss_clip": 1.04177594, + "balance_loss_mlp": 1.02007067, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 2.591747489053651, + "language_loss": 0.76773363, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.78903145, + "num_input_tokens_seen": 219112665, + "step": 10177, + "time_per_iteration": 2.5744776725769043 + }, + { + "auxiliary_loss_clip": 0.01101188, + "auxiliary_loss_mlp": 0.00779648, + "balance_loss_clip": 1.03777432, + "balance_loss_mlp": 1.00063884, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 2.6458649199917694, + "language_loss": 0.75524604, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.77405441, + "num_input_tokens_seen": 219129120, + "step": 10178, + "time_per_iteration": 3.934549331665039 + }, + { + "auxiliary_loss_clip": 0.01090148, + "auxiliary_loss_mlp": 0.01040289, + "balance_loss_clip": 1.03870535, + "balance_loss_mlp": 1.02574563, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 1.5341946071483343, + "language_loss": 0.67196506, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.69326943, + "num_input_tokens_seen": 219148950, + "step": 10179, + "time_per_iteration": 2.4984452724456787 + }, + { + "auxiliary_loss_clip": 0.01093862, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.04054165, + "balance_loss_mlp": 1.01909018, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 2.4151259664397733, + "language_loss": 0.84018373, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.86143124, + "num_input_tokens_seen": 219165585, + "step": 10180, + "time_per_iteration": 2.506460189819336 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.04235172, + "balance_loss_mlp": 1.01552796, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 1.5747378668879501, + "language_loss": 0.77666742, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.79811037, + "num_input_tokens_seen": 219183280, + "step": 10181, + "time_per_iteration": 2.440610647201538 + }, + { + "auxiliary_loss_clip": 0.01115668, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.04133415, + "balance_loss_mlp": 1.02072668, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 6.447711908904239, + "language_loss": 0.80369425, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.82518214, + "num_input_tokens_seen": 219197200, + "step": 10182, + "time_per_iteration": 2.402634859085083 + }, + { + "auxiliary_loss_clip": 0.01081542, + "auxiliary_loss_mlp": 0.01035477, + "balance_loss_clip": 1.04037333, + "balance_loss_mlp": 1.02408659, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 1.5741045501197066, + "language_loss": 0.83085334, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.8520236, + "num_input_tokens_seen": 219216825, + "step": 10183, + "time_per_iteration": 2.573106288909912 + }, + { + "auxiliary_loss_clip": 0.01028104, + "auxiliary_loss_mlp": 0.01001957, + "balance_loss_clip": 1.017048, + "balance_loss_mlp": 1.00100958, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7121706144613544, + "language_loss": 0.62931162, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64961225, + "num_input_tokens_seen": 219283795, + "step": 10184, + "time_per_iteration": 3.1812798976898193 + }, + { + "auxiliary_loss_clip": 0.0110923, + "auxiliary_loss_mlp": 0.01029451, + "balance_loss_clip": 1.04486787, + "balance_loss_mlp": 1.0174346, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 1.699159332827806, + "language_loss": 0.82007992, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84146678, + "num_input_tokens_seen": 219302385, + "step": 10185, + "time_per_iteration": 2.4871163368225098 + }, + { + "auxiliary_loss_clip": 0.01092098, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.04050624, + "balance_loss_mlp": 1.02143574, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 1.9575192581752296, + "language_loss": 0.75023133, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.77149606, + "num_input_tokens_seen": 219319765, + "step": 10186, + "time_per_iteration": 2.536078691482544 + }, + { + "auxiliary_loss_clip": 0.01099536, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.03756487, + "balance_loss_mlp": 1.01956129, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.5665323290752768, + "language_loss": 0.78345263, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.804757, + "num_input_tokens_seen": 219337440, + "step": 10187, + "time_per_iteration": 2.4831888675689697 + }, + { + "auxiliary_loss_clip": 0.01112641, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.03976452, + "balance_loss_mlp": 1.01761091, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 2.0683213081037857, + "language_loss": 0.83208835, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85351539, + "num_input_tokens_seen": 219357525, + "step": 10188, + "time_per_iteration": 2.4594593048095703 + }, + { + "auxiliary_loss_clip": 0.0108744, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.0436151, + "balance_loss_mlp": 1.01848769, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 1.7308742425496555, + "language_loss": 0.75463474, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77581567, + "num_input_tokens_seen": 219374855, + "step": 10189, + "time_per_iteration": 2.5249361991882324 + }, + { + "auxiliary_loss_clip": 0.01100085, + "auxiliary_loss_mlp": 0.01032855, + "balance_loss_clip": 1.0375762, + "balance_loss_mlp": 1.01974761, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.52788998740565, + "language_loss": 0.74051845, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76184785, + "num_input_tokens_seen": 219394740, + "step": 10190, + "time_per_iteration": 2.573141574859619 + }, + { + "auxiliary_loss_clip": 0.01102934, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.0398643, + "balance_loss_mlp": 1.01902652, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 2.1056617456284275, + "language_loss": 0.68579859, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70714748, + "num_input_tokens_seen": 219413755, + "step": 10191, + "time_per_iteration": 4.136383056640625 + }, + { + "auxiliary_loss_clip": 0.01101337, + "auxiliary_loss_mlp": 0.01035529, + "balance_loss_clip": 1.03738153, + "balance_loss_mlp": 1.02232647, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 2.103007215174528, + "language_loss": 0.74112105, + "learning_rate": 1.377078777445467e-06, + "loss": 0.76248968, + "num_input_tokens_seen": 219433560, + "step": 10192, + "time_per_iteration": 3.8772666454315186 + }, + { + "auxiliary_loss_clip": 0.01075779, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.04031634, + "balance_loss_mlp": 1.01963258, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 2.050054738499455, + "language_loss": 0.8326357, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85370892, + "num_input_tokens_seen": 219452640, + "step": 10193, + "time_per_iteration": 2.550534248352051 + }, + { + "auxiliary_loss_clip": 0.01080365, + "auxiliary_loss_mlp": 0.01027547, + "balance_loss_clip": 1.03850949, + "balance_loss_mlp": 1.01545894, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 2.1590873362259377, + "language_loss": 0.69633073, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.71740985, + "num_input_tokens_seen": 219468585, + "step": 10194, + "time_per_iteration": 2.5724005699157715 + }, + { + "auxiliary_loss_clip": 0.01010493, + "auxiliary_loss_mlp": 0.01007188, + "balance_loss_clip": 1.01492858, + "balance_loss_mlp": 1.00597239, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.823566196257012, + "language_loss": 0.58677709, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60695386, + "num_input_tokens_seen": 219523015, + "step": 10195, + "time_per_iteration": 2.9165964126586914 + }, + { + "auxiliary_loss_clip": 0.01094655, + "auxiliary_loss_mlp": 0.01033353, + "balance_loss_clip": 1.04162669, + "balance_loss_mlp": 1.02059197, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 2.157371674119835, + "language_loss": 0.69554073, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.71682078, + "num_input_tokens_seen": 219539980, + "step": 10196, + "time_per_iteration": 2.4950692653656006 + }, + { + "auxiliary_loss_clip": 0.01089798, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.03753638, + "balance_loss_mlp": 1.0237298, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 1.7377640796595708, + "language_loss": 0.71562409, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73688161, + "num_input_tokens_seen": 219556980, + "step": 10197, + "time_per_iteration": 2.5509955883026123 + }, + { + "auxiliary_loss_clip": 0.01101947, + "auxiliary_loss_mlp": 0.01046619, + "balance_loss_clip": 1.03839517, + "balance_loss_mlp": 1.03228426, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 2.0557025467133014, + "language_loss": 0.79256064, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.81404632, + "num_input_tokens_seen": 219576410, + "step": 10198, + "time_per_iteration": 2.616133451461792 + }, + { + "auxiliary_loss_clip": 0.01091363, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.04539287, + "balance_loss_mlp": 1.01705217, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 1.4178778267805756, + "language_loss": 0.74621993, + "learning_rate": 1.374488730519181e-06, + "loss": 0.7674315, + "num_input_tokens_seen": 219597180, + "step": 10199, + "time_per_iteration": 2.572765588760376 + }, + { + "auxiliary_loss_clip": 0.01091087, + "auxiliary_loss_mlp": 0.01036925, + "balance_loss_clip": 1.03938282, + "balance_loss_mlp": 1.02355564, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 1.7808479579335506, + "language_loss": 0.62195104, + "learning_rate": 1.374118818580993e-06, + "loss": 0.64323115, + "num_input_tokens_seen": 219617630, + "step": 10200, + "time_per_iteration": 2.6060335636138916 + }, + { + "auxiliary_loss_clip": 0.01090353, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.03923726, + "balance_loss_mlp": 1.01998568, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 1.8127929708277164, + "language_loss": 0.68615615, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.70738316, + "num_input_tokens_seen": 219637025, + "step": 10201, + "time_per_iteration": 2.5321929454803467 + }, + { + "auxiliary_loss_clip": 0.01093321, + "auxiliary_loss_mlp": 0.01027906, + "balance_loss_clip": 1.03975224, + "balance_loss_mlp": 1.01488209, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 2.0047195238079296, + "language_loss": 0.83752322, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.85873556, + "num_input_tokens_seen": 219656625, + "step": 10202, + "time_per_iteration": 2.554748773574829 + }, + { + "auxiliary_loss_clip": 0.01036688, + "auxiliary_loss_mlp": 0.01000728, + "balance_loss_clip": 1.01228237, + "balance_loss_mlp": 0.99969721, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 0.8886571101683257, + "language_loss": 0.67149758, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69187176, + "num_input_tokens_seen": 219718090, + "step": 10203, + "time_per_iteration": 3.047635316848755 + }, + { + "auxiliary_loss_clip": 0.0110737, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.04260087, + "balance_loss_mlp": 1.01976264, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 1.7604760952423564, + "language_loss": 0.61167622, + "learning_rate": 1.37263940830327e-06, + "loss": 0.6330719, + "num_input_tokens_seen": 219740100, + "step": 10204, + "time_per_iteration": 4.14204740524292 + }, + { + "auxiliary_loss_clip": 0.01077364, + "auxiliary_loss_mlp": 0.01027869, + "balance_loss_clip": 1.03735733, + "balance_loss_mlp": 1.01601338, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 2.3604919613088233, + "language_loss": 0.72305417, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74410653, + "num_input_tokens_seen": 219761225, + "step": 10205, + "time_per_iteration": 2.549896240234375 + }, + { + "auxiliary_loss_clip": 0.01100316, + "auxiliary_loss_mlp": 0.01026843, + "balance_loss_clip": 1.03956437, + "balance_loss_mlp": 1.01401651, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 1.573559672679133, + "language_loss": 0.76342684, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.78469843, + "num_input_tokens_seen": 219780085, + "step": 10206, + "time_per_iteration": 2.486431121826172 + }, + { + "auxiliary_loss_clip": 0.01091507, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.05114961, + "balance_loss_mlp": 1.01693082, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 2.1283832421764215, + "language_loss": 0.75748646, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.77869844, + "num_input_tokens_seen": 219797895, + "step": 10207, + "time_per_iteration": 2.655941963195801 + }, + { + "auxiliary_loss_clip": 0.01102898, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.03988719, + "balance_loss_mlp": 1.02073431, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.182023723600937, + "language_loss": 0.82604688, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.84740508, + "num_input_tokens_seen": 219811295, + "step": 10208, + "time_per_iteration": 2.4429123401641846 + }, + { + "auxiliary_loss_clip": 0.01097031, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.04073775, + "balance_loss_mlp": 1.0202626, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 2.767852812710328, + "language_loss": 0.72456759, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74587733, + "num_input_tokens_seen": 219832735, + "step": 10209, + "time_per_iteration": 2.627060890197754 + }, + { + "auxiliary_loss_clip": 0.01112399, + "auxiliary_loss_mlp": 0.01037171, + "balance_loss_clip": 1.04112148, + "balance_loss_mlp": 1.02501142, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.6331159972003015, + "language_loss": 0.74142802, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76292372, + "num_input_tokens_seen": 219852755, + "step": 10210, + "time_per_iteration": 2.4867594242095947 + }, + { + "auxiliary_loss_clip": 0.01013229, + "auxiliary_loss_mlp": 0.01005672, + "balance_loss_clip": 1.01409876, + "balance_loss_mlp": 1.00451589, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.8570831263153209, + "language_loss": 0.64940131, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.66959035, + "num_input_tokens_seen": 219922785, + "step": 10211, + "time_per_iteration": 3.2518463134765625 + }, + { + "auxiliary_loss_clip": 0.01091475, + "auxiliary_loss_mlp": 0.00778732, + "balance_loss_clip": 1.03889024, + "balance_loss_mlp": 1.0006032, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 1.8441423835826123, + "language_loss": 0.7575655, + "learning_rate": 1.369681730544801e-06, + "loss": 0.77626753, + "num_input_tokens_seen": 219942215, + "step": 10212, + "time_per_iteration": 2.554565191268921 + }, + { + "auxiliary_loss_clip": 0.01089239, + "auxiliary_loss_mlp": 0.01043627, + "balance_loss_clip": 1.03840804, + "balance_loss_mlp": 1.029459, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.820425552173742, + "language_loss": 0.73982614, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76115477, + "num_input_tokens_seen": 219963830, + "step": 10213, + "time_per_iteration": 2.5573034286499023 + }, + { + "auxiliary_loss_clip": 0.01098872, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.04159117, + "balance_loss_mlp": 1.02133679, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 2.9066373550320352, + "language_loss": 0.72892058, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.75025225, + "num_input_tokens_seen": 219983815, + "step": 10214, + "time_per_iteration": 2.5505640506744385 + }, + { + "auxiliary_loss_clip": 0.01116847, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.04127669, + "balance_loss_mlp": 1.01957226, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 1.7476462145756981, + "language_loss": 0.74382842, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.76532626, + "num_input_tokens_seen": 220003165, + "step": 10215, + "time_per_iteration": 2.4652957916259766 + }, + { + "auxiliary_loss_clip": 0.01099673, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.03817558, + "balance_loss_mlp": 1.01851654, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 1.6701338379779629, + "language_loss": 0.78356987, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80487937, + "num_input_tokens_seen": 220021015, + "step": 10216, + "time_per_iteration": 2.4991230964660645 + }, + { + "auxiliary_loss_clip": 0.01113566, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.04083204, + "balance_loss_mlp": 1.02086413, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 2.083469059733567, + "language_loss": 0.79691976, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.81840038, + "num_input_tokens_seen": 220035780, + "step": 10217, + "time_per_iteration": 3.867062568664551 + }, + { + "auxiliary_loss_clip": 0.01093462, + "auxiliary_loss_mlp": 0.01025122, + "balance_loss_clip": 1.03872859, + "balance_loss_mlp": 1.01249731, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.799658282411671, + "language_loss": 0.78326589, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.8044517, + "num_input_tokens_seen": 220054280, + "step": 10218, + "time_per_iteration": 2.5572166442871094 + }, + { + "auxiliary_loss_clip": 0.01104326, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.0404675, + "balance_loss_mlp": 1.02008963, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.515487367341358, + "language_loss": 0.81919396, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84056175, + "num_input_tokens_seen": 220074120, + "step": 10219, + "time_per_iteration": 2.486959457397461 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.04203534, + "balance_loss_mlp": 1.01795113, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 2.073434197164059, + "language_loss": 0.66723716, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.6885826, + "num_input_tokens_seen": 220096320, + "step": 10220, + "time_per_iteration": 2.6491241455078125 + }, + { + "auxiliary_loss_clip": 0.01103468, + "auxiliary_loss_mlp": 0.01029762, + "balance_loss_clip": 1.03965902, + "balance_loss_mlp": 1.01707768, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 1.8986301859195294, + "language_loss": 0.71833414, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.7396664, + "num_input_tokens_seen": 220114850, + "step": 10221, + "time_per_iteration": 2.477186679840088 + }, + { + "auxiliary_loss_clip": 0.01067656, + "auxiliary_loss_mlp": 0.01029955, + "balance_loss_clip": 1.03662562, + "balance_loss_mlp": 1.01796222, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 1.6486424190357416, + "language_loss": 0.79339945, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81437558, + "num_input_tokens_seen": 220133395, + "step": 10222, + "time_per_iteration": 2.5689399242401123 + }, + { + "auxiliary_loss_clip": 0.01090473, + "auxiliary_loss_mlp": 0.01040454, + "balance_loss_clip": 1.03849101, + "balance_loss_mlp": 1.02704322, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 1.9599049097441381, + "language_loss": 0.76064825, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78195751, + "num_input_tokens_seen": 220152790, + "step": 10223, + "time_per_iteration": 2.5234432220458984 + }, + { + "auxiliary_loss_clip": 0.01090108, + "auxiliary_loss_mlp": 0.01035971, + "balance_loss_clip": 1.03898668, + "balance_loss_mlp": 1.02349019, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 2.045194034748251, + "language_loss": 0.78549874, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.8067596, + "num_input_tokens_seen": 220169535, + "step": 10224, + "time_per_iteration": 2.4640660285949707 + }, + { + "auxiliary_loss_clip": 0.01077415, + "auxiliary_loss_mlp": 0.01027098, + "balance_loss_clip": 1.03658521, + "balance_loss_mlp": 1.01570797, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.2485475262791246, + "language_loss": 0.663297, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68434215, + "num_input_tokens_seen": 220195305, + "step": 10225, + "time_per_iteration": 2.8677611351013184 + }, + { + "auxiliary_loss_clip": 0.01101749, + "auxiliary_loss_mlp": 0.00779329, + "balance_loss_clip": 1.04121459, + "balance_loss_mlp": 1.00065696, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 2.2816296208850164, + "language_loss": 0.63519281, + "learning_rate": 1.364509479649357e-06, + "loss": 0.65400362, + "num_input_tokens_seen": 220215040, + "step": 10226, + "time_per_iteration": 2.570570945739746 + }, + { + "auxiliary_loss_clip": 0.010892, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.03670049, + "balance_loss_mlp": 1.02157879, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 1.84768841570757, + "language_loss": 0.75557518, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.77681804, + "num_input_tokens_seen": 220234205, + "step": 10227, + "time_per_iteration": 2.510512113571167 + }, + { + "auxiliary_loss_clip": 0.01056714, + "auxiliary_loss_mlp": 0.01042268, + "balance_loss_clip": 1.03380167, + "balance_loss_mlp": 1.02715206, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 2.091507253461798, + "language_loss": 0.62343633, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.64442611, + "num_input_tokens_seen": 220252730, + "step": 10228, + "time_per_iteration": 2.5963988304138184 + }, + { + "auxiliary_loss_clip": 0.01091352, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.03783131, + "balance_loss_mlp": 1.01950145, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 1.3740286956094276, + "language_loss": 0.74102908, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76226473, + "num_input_tokens_seen": 220273345, + "step": 10229, + "time_per_iteration": 4.125056266784668 + }, + { + "auxiliary_loss_clip": 0.01115747, + "auxiliary_loss_mlp": 0.01039944, + "balance_loss_clip": 1.04139948, + "balance_loss_mlp": 1.02670014, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 1.7126868032885798, + "language_loss": 0.77712524, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.79868209, + "num_input_tokens_seen": 220293845, + "step": 10230, + "time_per_iteration": 2.5084152221679688 + }, + { + "auxiliary_loss_clip": 0.01085396, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.0363158, + "balance_loss_mlp": 1.02165043, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.697878583408346, + "language_loss": 0.73310816, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75430381, + "num_input_tokens_seen": 220316070, + "step": 10231, + "time_per_iteration": 4.005016088485718 + }, + { + "auxiliary_loss_clip": 0.01093309, + "auxiliary_loss_mlp": 0.01032592, + "balance_loss_clip": 1.03875482, + "balance_loss_mlp": 1.02025962, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.569143535137614, + "language_loss": 0.69605124, + "learning_rate": 1.362294244324858e-06, + "loss": 0.71731019, + "num_input_tokens_seen": 220335695, + "step": 10232, + "time_per_iteration": 2.6143293380737305 + }, + { + "auxiliary_loss_clip": 0.01100675, + "auxiliary_loss_mlp": 0.00778245, + "balance_loss_clip": 1.0390166, + "balance_loss_mlp": 1.00067794, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 1.91356546235392, + "language_loss": 0.91774833, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.93653762, + "num_input_tokens_seen": 220353720, + "step": 10233, + "time_per_iteration": 2.461914300918579 + }, + { + "auxiliary_loss_clip": 0.01082981, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.04134476, + "balance_loss_mlp": 1.02213144, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 1.7930105252826734, + "language_loss": 0.71427327, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73544186, + "num_input_tokens_seen": 220372515, + "step": 10234, + "time_per_iteration": 2.5700573921203613 + }, + { + "auxiliary_loss_clip": 0.01104321, + "auxiliary_loss_mlp": 0.00779485, + "balance_loss_clip": 1.03964305, + "balance_loss_mlp": 1.00087762, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 5.537942604522821, + "language_loss": 0.67315507, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.69199312, + "num_input_tokens_seen": 220393490, + "step": 10235, + "time_per_iteration": 2.528989791870117 + }, + { + "auxiliary_loss_clip": 0.0110289, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.04103386, + "balance_loss_mlp": 1.0160979, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 1.9889091088433715, + "language_loss": 0.81185669, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83317423, + "num_input_tokens_seen": 220412855, + "step": 10236, + "time_per_iteration": 2.489624500274658 + }, + { + "auxiliary_loss_clip": 0.01117034, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.0395906, + "balance_loss_mlp": 1.01817966, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 2.058277660579171, + "language_loss": 0.80105269, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82253683, + "num_input_tokens_seen": 220433440, + "step": 10237, + "time_per_iteration": 2.4597182273864746 + }, + { + "auxiliary_loss_clip": 0.01098118, + "auxiliary_loss_mlp": 0.01042111, + "balance_loss_clip": 1.04061079, + "balance_loss_mlp": 1.02937949, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 1.6894980064221483, + "language_loss": 0.76012111, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78152335, + "num_input_tokens_seen": 220453445, + "step": 10238, + "time_per_iteration": 2.5127203464508057 + }, + { + "auxiliary_loss_clip": 0.0099395, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.01586211, + "balance_loss_mlp": 1.03023958, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 1.041409018578694, + "language_loss": 0.57688737, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59714305, + "num_input_tokens_seen": 220509730, + "step": 10239, + "time_per_iteration": 3.2011172771453857 + }, + { + "auxiliary_loss_clip": 0.01096456, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.03816271, + "balance_loss_mlp": 1.02022672, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 1.8769184219232498, + "language_loss": 0.77404207, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79533982, + "num_input_tokens_seen": 220527295, + "step": 10240, + "time_per_iteration": 2.4882547855377197 + }, + { + "auxiliary_loss_clip": 0.01115647, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.04074335, + "balance_loss_mlp": 1.02316666, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 2.524971194213472, + "language_loss": 0.72867727, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.75019938, + "num_input_tokens_seen": 220542730, + "step": 10241, + "time_per_iteration": 2.402900457382202 + }, + { + "auxiliary_loss_clip": 0.01111899, + "auxiliary_loss_mlp": 0.01027477, + "balance_loss_clip": 1.0395968, + "balance_loss_mlp": 1.0151093, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 1.8203638435204619, + "language_loss": 0.71767622, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.73907, + "num_input_tokens_seen": 220562995, + "step": 10242, + "time_per_iteration": 2.450009822845459 + }, + { + "auxiliary_loss_clip": 0.0110516, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.04101539, + "balance_loss_mlp": 1.02128565, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 2.228340899434084, + "language_loss": 0.72619832, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74758172, + "num_input_tokens_seen": 220581775, + "step": 10243, + "time_per_iteration": 3.9750936031341553 + }, + { + "auxiliary_loss_clip": 0.01027989, + "auxiliary_loss_mlp": 0.01003773, + "balance_loss_clip": 1.01311302, + "balance_loss_mlp": 1.00274742, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.7538296738736389, + "language_loss": 0.56848156, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.58879924, + "num_input_tokens_seen": 220646395, + "step": 10244, + "time_per_iteration": 3.080447196960449 + }, + { + "auxiliary_loss_clip": 0.01114554, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.04021108, + "balance_loss_mlp": 1.01995528, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 1.7174923115997713, + "language_loss": 0.63987637, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.66135603, + "num_input_tokens_seen": 220668335, + "step": 10245, + "time_per_iteration": 2.5440332889556885 + }, + { + "auxiliary_loss_clip": 0.0106465, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.03422141, + "balance_loss_mlp": 1.01889062, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 2.0213977444368187, + "language_loss": 0.79089987, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81185979, + "num_input_tokens_seen": 220688915, + "step": 10246, + "time_per_iteration": 2.619124174118042 + }, + { + "auxiliary_loss_clip": 0.01080492, + "auxiliary_loss_mlp": 0.00781143, + "balance_loss_clip": 1.03931069, + "balance_loss_mlp": 1.00068533, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 2.482707281147312, + "language_loss": 0.87250835, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.89112461, + "num_input_tokens_seen": 220703465, + "step": 10247, + "time_per_iteration": 2.4881949424743652 + }, + { + "auxiliary_loss_clip": 0.0105193, + "auxiliary_loss_mlp": 0.01042848, + "balance_loss_clip": 1.04042482, + "balance_loss_mlp": 1.02843583, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 2.0312908574423516, + "language_loss": 0.80141103, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.82235879, + "num_input_tokens_seen": 220722090, + "step": 10248, + "time_per_iteration": 2.6270253658294678 + }, + { + "auxiliary_loss_clip": 0.0107147, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.0374788, + "balance_loss_mlp": 1.02158117, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 3.1203680413657215, + "language_loss": 0.86964965, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.89070463, + "num_input_tokens_seen": 220741075, + "step": 10249, + "time_per_iteration": 2.5817630290985107 + }, + { + "auxiliary_loss_clip": 0.01115656, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.04135752, + "balance_loss_mlp": 1.01641166, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 2.336027169287779, + "language_loss": 0.68613344, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.70758754, + "num_input_tokens_seen": 220763395, + "step": 10250, + "time_per_iteration": 2.6256744861602783 + }, + { + "auxiliary_loss_clip": 0.01084707, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.03629327, + "balance_loss_mlp": 1.01674795, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 1.7450741200945372, + "language_loss": 0.74053115, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.761666, + "num_input_tokens_seen": 220780640, + "step": 10251, + "time_per_iteration": 2.529789447784424 + }, + { + "auxiliary_loss_clip": 0.01098856, + "auxiliary_loss_mlp": 0.01035069, + "balance_loss_clip": 1.03624952, + "balance_loss_mlp": 1.02057338, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.035564576130074, + "language_loss": 0.6767419, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.69808114, + "num_input_tokens_seen": 220797960, + "step": 10252, + "time_per_iteration": 2.4643702507019043 + }, + { + "auxiliary_loss_clip": 0.00981902, + "auxiliary_loss_mlp": 0.01004643, + "balance_loss_clip": 1.01864028, + "balance_loss_mlp": 1.00349259, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.8837434880142367, + "language_loss": 0.57836378, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.59822923, + "num_input_tokens_seen": 220856930, + "step": 10253, + "time_per_iteration": 3.4980673789978027 + }, + { + "auxiliary_loss_clip": 0.01092387, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.03688741, + "balance_loss_mlp": 1.01915562, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.6732352742181251, + "language_loss": 0.79745859, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81870472, + "num_input_tokens_seen": 220877595, + "step": 10254, + "time_per_iteration": 2.8588178157806396 + }, + { + "auxiliary_loss_clip": 0.01093401, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.04223371, + "balance_loss_mlp": 1.01860273, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 2.3912739272517882, + "language_loss": 0.80409539, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82534122, + "num_input_tokens_seen": 220896880, + "step": 10255, + "time_per_iteration": 2.6264967918395996 + }, + { + "auxiliary_loss_clip": 0.01096483, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.0397923, + "balance_loss_mlp": 1.02199268, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 2.8059332654504128, + "language_loss": 0.65470529, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.67603076, + "num_input_tokens_seen": 220916425, + "step": 10256, + "time_per_iteration": 2.651777505874634 + }, + { + "auxiliary_loss_clip": 0.01104271, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.04236305, + "balance_loss_mlp": 1.02162445, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 1.837105746866024, + "language_loss": 0.71763766, + "learning_rate": 1.353073501949825e-06, + "loss": 0.73902035, + "num_input_tokens_seen": 220935050, + "step": 10257, + "time_per_iteration": 4.363720893859863 + }, + { + "auxiliary_loss_clip": 0.01096855, + "auxiliary_loss_mlp": 0.01034891, + "balance_loss_clip": 1.04042614, + "balance_loss_mlp": 1.02184939, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 1.6803316405108109, + "language_loss": 0.72020757, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74152505, + "num_input_tokens_seen": 220953085, + "step": 10258, + "time_per_iteration": 2.5478310585021973 + }, + { + "auxiliary_loss_clip": 0.01088402, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.03477359, + "balance_loss_mlp": 1.02765477, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 2.6630085749618564, + "language_loss": 0.64376026, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.66506976, + "num_input_tokens_seen": 220969050, + "step": 10259, + "time_per_iteration": 2.53001070022583 + }, + { + "auxiliary_loss_clip": 0.01077301, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.03812897, + "balance_loss_mlp": 1.019243, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 2.0412411218260185, + "language_loss": 0.71426278, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73535651, + "num_input_tokens_seen": 220985825, + "step": 10260, + "time_per_iteration": 2.5096025466918945 + }, + { + "auxiliary_loss_clip": 0.01110941, + "auxiliary_loss_mlp": 0.01033475, + "balance_loss_clip": 1.04416537, + "balance_loss_mlp": 1.0191344, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 2.058678321932721, + "language_loss": 0.68603319, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.70747733, + "num_input_tokens_seen": 221004465, + "step": 10261, + "time_per_iteration": 2.5152182579040527 + }, + { + "auxiliary_loss_clip": 0.01078085, + "auxiliary_loss_mlp": 0.01038925, + "balance_loss_clip": 1.03786147, + "balance_loss_mlp": 1.02673006, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 1.7521982997468628, + "language_loss": 0.71039128, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.73156142, + "num_input_tokens_seen": 221023260, + "step": 10262, + "time_per_iteration": 2.5470728874206543 + }, + { + "auxiliary_loss_clip": 0.01096543, + "auxiliary_loss_mlp": 0.01036643, + "balance_loss_clip": 1.03929353, + "balance_loss_mlp": 1.02301764, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.7647393130783797, + "language_loss": 0.70277297, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72410488, + "num_input_tokens_seen": 221043090, + "step": 10263, + "time_per_iteration": 2.5395119190216064 + }, + { + "auxiliary_loss_clip": 0.01050219, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.03275394, + "balance_loss_mlp": 1.02009737, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 2.5314171709610904, + "language_loss": 0.76285899, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78369164, + "num_input_tokens_seen": 221061435, + "step": 10264, + "time_per_iteration": 2.6128904819488525 + }, + { + "auxiliary_loss_clip": 0.01116541, + "auxiliary_loss_mlp": 0.0103471, + "balance_loss_clip": 1.04189575, + "balance_loss_mlp": 1.02171612, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 2.746953880808474, + "language_loss": 0.85356629, + "learning_rate": 1.350126092092247e-06, + "loss": 0.8750788, + "num_input_tokens_seen": 221078705, + "step": 10265, + "time_per_iteration": 2.4588849544525146 + }, + { + "auxiliary_loss_clip": 0.01065362, + "auxiliary_loss_mlp": 0.0103862, + "balance_loss_clip": 1.03948843, + "balance_loss_mlp": 1.02545977, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 1.95390863568234, + "language_loss": 0.64748871, + "learning_rate": 1.349757776608153e-06, + "loss": 0.66852844, + "num_input_tokens_seen": 221099245, + "step": 10266, + "time_per_iteration": 2.6870808601379395 + }, + { + "auxiliary_loss_clip": 0.01079181, + "auxiliary_loss_mlp": 0.01035552, + "balance_loss_clip": 1.03691673, + "balance_loss_mlp": 1.02336264, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.8396272908989308, + "language_loss": 0.75740826, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.77855557, + "num_input_tokens_seen": 221116930, + "step": 10267, + "time_per_iteration": 2.5638108253479004 + }, + { + "auxiliary_loss_clip": 0.01086179, + "auxiliary_loss_mlp": 0.0102632, + "balance_loss_clip": 1.03827882, + "balance_loss_mlp": 1.0130105, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 2.0394341544344274, + "language_loss": 0.75164175, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.77276671, + "num_input_tokens_seen": 221137660, + "step": 10268, + "time_per_iteration": 2.6429200172424316 + }, + { + "auxiliary_loss_clip": 0.0109514, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.03983378, + "balance_loss_mlp": 1.01924586, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 2.3918719650183062, + "language_loss": 0.75700033, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.7782737, + "num_input_tokens_seen": 221156225, + "step": 10269, + "time_per_iteration": 4.043700456619263 + }, + { + "auxiliary_loss_clip": 0.01112162, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.03827262, + "balance_loss_mlp": 1.01754522, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 2.1520084484073507, + "language_loss": 0.76711953, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78853965, + "num_input_tokens_seen": 221173820, + "step": 10270, + "time_per_iteration": 2.4234442710876465 + }, + { + "auxiliary_loss_clip": 0.01094208, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.03845823, + "balance_loss_mlp": 1.01571047, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.9900864708914467, + "language_loss": 0.82371032, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84493685, + "num_input_tokens_seen": 221191815, + "step": 10271, + "time_per_iteration": 4.254828929901123 + }, + { + "auxiliary_loss_clip": 0.01115689, + "auxiliary_loss_mlp": 0.00778044, + "balance_loss_clip": 1.04060555, + "balance_loss_mlp": 1.00076056, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 1.635170734703948, + "language_loss": 0.77031684, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.78925413, + "num_input_tokens_seen": 221211205, + "step": 10272, + "time_per_iteration": 2.5091629028320312 + }, + { + "auxiliary_loss_clip": 0.01010346, + "auxiliary_loss_mlp": 0.01003713, + "balance_loss_clip": 1.01324093, + "balance_loss_mlp": 1.00246131, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8085140781218395, + "language_loss": 0.59103251, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61117309, + "num_input_tokens_seen": 221268430, + "step": 10273, + "time_per_iteration": 3.0397753715515137 + }, + { + "auxiliary_loss_clip": 0.01086355, + "auxiliary_loss_mlp": 0.01036033, + "balance_loss_clip": 1.03524482, + "balance_loss_mlp": 1.02157259, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 2.2356270861271894, + "language_loss": 0.72945946, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.75068331, + "num_input_tokens_seen": 221281930, + "step": 10274, + "time_per_iteration": 2.4902682304382324 + }, + { + "auxiliary_loss_clip": 0.0110287, + "auxiliary_loss_mlp": 0.00777582, + "balance_loss_clip": 1.03828132, + "balance_loss_mlp": 1.0007242, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 1.8051330354281356, + "language_loss": 0.77341318, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79221767, + "num_input_tokens_seen": 221301605, + "step": 10275, + "time_per_iteration": 2.487055778503418 + }, + { + "auxiliary_loss_clip": 0.01076684, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.03988361, + "balance_loss_mlp": 1.01504004, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 1.6179821745982648, + "language_loss": 0.79525089, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81629014, + "num_input_tokens_seen": 221320105, + "step": 10276, + "time_per_iteration": 2.582219362258911 + }, + { + "auxiliary_loss_clip": 0.01056116, + "auxiliary_loss_mlp": 0.01041887, + "balance_loss_clip": 1.0394671, + "balance_loss_mlp": 1.02787983, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 1.8860230423161777, + "language_loss": 0.80682278, + "learning_rate": 1.345707936733612e-06, + "loss": 0.8278029, + "num_input_tokens_seen": 221335915, + "step": 10277, + "time_per_iteration": 2.6256444454193115 + }, + { + "auxiliary_loss_clip": 0.01087641, + "auxiliary_loss_mlp": 0.01032256, + "balance_loss_clip": 1.03967738, + "balance_loss_mlp": 1.01849926, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 1.7022715981865189, + "language_loss": 0.81603134, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.83723032, + "num_input_tokens_seen": 221353965, + "step": 10278, + "time_per_iteration": 2.5928735733032227 + }, + { + "auxiliary_loss_clip": 0.01068359, + "auxiliary_loss_mlp": 0.00777317, + "balance_loss_clip": 1.03525531, + "balance_loss_mlp": 1.00059783, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.6689764170062986, + "language_loss": 0.73853779, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.75699461, + "num_input_tokens_seen": 221374080, + "step": 10279, + "time_per_iteration": 2.7431325912475586 + }, + { + "auxiliary_loss_clip": 0.01096171, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.03456986, + "balance_loss_mlp": 1.01699066, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 1.3961214858595312, + "language_loss": 0.70626569, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.72752041, + "num_input_tokens_seen": 221392910, + "step": 10280, + "time_per_iteration": 2.4864938259124756 + }, + { + "auxiliary_loss_clip": 0.01113612, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.03968751, + "balance_loss_mlp": 1.01845646, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.7733163802714782, + "language_loss": 0.72837424, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.74982417, + "num_input_tokens_seen": 221410990, + "step": 10281, + "time_per_iteration": 2.434046506881714 + }, + { + "auxiliary_loss_clip": 0.01089216, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.0381372, + "balance_loss_mlp": 1.01891577, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.522632656647482, + "language_loss": 0.76628911, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.7874769, + "num_input_tokens_seen": 221431020, + "step": 10282, + "time_per_iteration": 4.2780351638793945 + }, + { + "auxiliary_loss_clip": 0.0109052, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.03707576, + "balance_loss_mlp": 1.02004337, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.724182794067736, + "language_loss": 0.68872923, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71000117, + "num_input_tokens_seen": 221453235, + "step": 10283, + "time_per_iteration": 2.560774564743042 + }, + { + "auxiliary_loss_clip": 0.01107505, + "auxiliary_loss_mlp": 0.01028248, + "balance_loss_clip": 1.03764963, + "balance_loss_mlp": 1.01500428, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.788102206701066, + "language_loss": 0.75003284, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77139044, + "num_input_tokens_seen": 221472560, + "step": 10284, + "time_per_iteration": 2.508678436279297 + }, + { + "auxiliary_loss_clip": 0.01096343, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.04036212, + "balance_loss_mlp": 1.02110517, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.4956105218417999, + "language_loss": 0.75894785, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.78025019, + "num_input_tokens_seen": 221492835, + "step": 10285, + "time_per_iteration": 2.495522975921631 + }, + { + "auxiliary_loss_clip": 0.01079777, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.03614378, + "balance_loss_mlp": 1.0191853, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.5044692845812984, + "language_loss": 0.7281785, + "learning_rate": 1.342396663517503e-06, + "loss": 0.74929792, + "num_input_tokens_seen": 221511870, + "step": 10286, + "time_per_iteration": 2.5842337608337402 + }, + { + "auxiliary_loss_clip": 0.01111437, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.03885818, + "balance_loss_mlp": 1.01686847, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 1.6762086638804292, + "language_loss": 0.76251173, + "learning_rate": 1.342028868767199e-06, + "loss": 0.78391397, + "num_input_tokens_seen": 221529915, + "step": 10287, + "time_per_iteration": 2.450277805328369 + }, + { + "auxiliary_loss_clip": 0.01074593, + "auxiliary_loss_mlp": 0.01030745, + "balance_loss_clip": 1.0359081, + "balance_loss_mlp": 1.01827013, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 1.6818043255423925, + "language_loss": 0.73157048, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.75262392, + "num_input_tokens_seen": 221549745, + "step": 10288, + "time_per_iteration": 2.5688157081604004 + }, + { + "auxiliary_loss_clip": 0.01099068, + "auxiliary_loss_mlp": 0.01029627, + "balance_loss_clip": 1.03733051, + "balance_loss_mlp": 1.01771176, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 2.4195974860690463, + "language_loss": 0.72888803, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.750175, + "num_input_tokens_seen": 221572455, + "step": 10289, + "time_per_iteration": 2.6889374256134033 + }, + { + "auxiliary_loss_clip": 0.01089597, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.03732681, + "balance_loss_mlp": 1.01580882, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.5188131697461476, + "language_loss": 0.79157454, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81276059, + "num_input_tokens_seen": 221591325, + "step": 10290, + "time_per_iteration": 2.606511116027832 + }, + { + "auxiliary_loss_clip": 0.0110384, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.03904998, + "balance_loss_mlp": 1.01902592, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 2.074331961762235, + "language_loss": 0.81926876, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.84062636, + "num_input_tokens_seen": 221611640, + "step": 10291, + "time_per_iteration": 2.544647693634033 + }, + { + "auxiliary_loss_clip": 0.01113493, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.03919601, + "balance_loss_mlp": 1.01931882, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 1.762693299954103, + "language_loss": 0.77496457, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.79641354, + "num_input_tokens_seen": 221631225, + "step": 10292, + "time_per_iteration": 2.588317394256592 + }, + { + "auxiliary_loss_clip": 0.01089919, + "auxiliary_loss_mlp": 0.01040965, + "balance_loss_clip": 1.03723514, + "balance_loss_mlp": 1.02549732, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 1.722835527064765, + "language_loss": 0.73443907, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75574791, + "num_input_tokens_seen": 221651035, + "step": 10293, + "time_per_iteration": 2.6051411628723145 + }, + { + "auxiliary_loss_clip": 0.01084111, + "auxiliary_loss_mlp": 0.00777429, + "balance_loss_clip": 1.03964579, + "balance_loss_mlp": 1.00065255, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 1.77929734647579, + "language_loss": 0.82922518, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.84784055, + "num_input_tokens_seen": 221671300, + "step": 10294, + "time_per_iteration": 2.5676462650299072 + }, + { + "auxiliary_loss_clip": 0.01095807, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.03983605, + "balance_loss_mlp": 1.01815534, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 2.37063228867684, + "language_loss": 0.70412636, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.72539115, + "num_input_tokens_seen": 221687320, + "step": 10295, + "time_per_iteration": 2.4831981658935547 + }, + { + "auxiliary_loss_clip": 0.01114634, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.04148936, + "balance_loss_mlp": 1.02006495, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.808698639622042, + "language_loss": 0.70591354, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.72738987, + "num_input_tokens_seen": 221710175, + "step": 10296, + "time_per_iteration": 3.995150089263916 + }, + { + "auxiliary_loss_clip": 0.01081172, + "auxiliary_loss_mlp": 0.01033493, + "balance_loss_clip": 1.03848147, + "balance_loss_mlp": 1.01943254, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 2.028856615145462, + "language_loss": 0.71725488, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73840153, + "num_input_tokens_seen": 221728145, + "step": 10297, + "time_per_iteration": 2.5550973415374756 + }, + { + "auxiliary_loss_clip": 0.0103543, + "auxiliary_loss_mlp": 0.00999991, + "balance_loss_clip": 1.01097918, + "balance_loss_mlp": 0.99881071, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8721072970538443, + "language_loss": 0.64126873, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66162294, + "num_input_tokens_seen": 221786100, + "step": 10298, + "time_per_iteration": 2.952510118484497 + }, + { + "auxiliary_loss_clip": 0.0111769, + "auxiliary_loss_mlp": 0.01037641, + "balance_loss_clip": 1.04176784, + "balance_loss_mlp": 1.02496338, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 1.6386814644308254, + "language_loss": 0.74313092, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.7646842, + "num_input_tokens_seen": 221806450, + "step": 10299, + "time_per_iteration": 2.477277994155884 + }, + { + "auxiliary_loss_clip": 0.01109799, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.04175234, + "balance_loss_mlp": 1.01976907, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 1.7076406192872235, + "language_loss": 0.68076873, + "learning_rate": 1.337249812568732e-06, + "loss": 0.7021929, + "num_input_tokens_seen": 221823330, + "step": 10300, + "time_per_iteration": 2.462733030319214 + }, + { + "auxiliary_loss_clip": 0.01106319, + "auxiliary_loss_mlp": 0.00779605, + "balance_loss_clip": 1.04484868, + "balance_loss_mlp": 1.00075281, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 1.9694650937150682, + "language_loss": 0.66711748, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.68597674, + "num_input_tokens_seen": 221839360, + "step": 10301, + "time_per_iteration": 2.463857412338257 + }, + { + "auxiliary_loss_clip": 0.01072265, + "auxiliary_loss_mlp": 0.01033497, + "balance_loss_clip": 1.03632832, + "balance_loss_mlp": 1.02102149, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 1.5745369862422824, + "language_loss": 0.73221827, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.75327587, + "num_input_tokens_seen": 221859465, + "step": 10302, + "time_per_iteration": 2.629002332687378 + }, + { + "auxiliary_loss_clip": 0.01091286, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.04294097, + "balance_loss_mlp": 1.01848197, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 1.762372110079701, + "language_loss": 0.80353534, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82476628, + "num_input_tokens_seen": 221878555, + "step": 10303, + "time_per_iteration": 2.5614941120147705 + }, + { + "auxiliary_loss_clip": 0.01117077, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.04033518, + "balance_loss_mlp": 1.01654363, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 1.6650190720247406, + "language_loss": 0.76571465, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.78719032, + "num_input_tokens_seen": 221898790, + "step": 10304, + "time_per_iteration": 2.4448063373565674 + }, + { + "auxiliary_loss_clip": 0.01086245, + "auxiliary_loss_mlp": 0.01035555, + "balance_loss_clip": 1.04118204, + "balance_loss_mlp": 1.02216816, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 1.9014763503174705, + "language_loss": 0.7715798, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.7927978, + "num_input_tokens_seen": 221918875, + "step": 10305, + "time_per_iteration": 2.5623624324798584 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01035775, + "balance_loss_clip": 1.04117441, + "balance_loss_mlp": 1.02117801, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.6179475072763267, + "language_loss": 0.78682113, + "learning_rate": 1.335045524968045e-06, + "loss": 0.80828381, + "num_input_tokens_seen": 221937895, + "step": 10306, + "time_per_iteration": 2.4759786128997803 + }, + { + "auxiliary_loss_clip": 0.01055175, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.03774238, + "balance_loss_mlp": 1.01650667, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.700466051597162, + "language_loss": 0.79856837, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.81939662, + "num_input_tokens_seen": 221955920, + "step": 10307, + "time_per_iteration": 2.6226022243499756 + }, + { + "auxiliary_loss_clip": 0.00999417, + "auxiliary_loss_mlp": 0.01005381, + "balance_loss_clip": 1.0107398, + "balance_loss_mlp": 1.00424898, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8080736282871352, + "language_loss": 0.59397906, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61402702, + "num_input_tokens_seen": 222011405, + "step": 10308, + "time_per_iteration": 4.682998895645142 + }, + { + "auxiliary_loss_clip": 0.01085949, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.03833723, + "balance_loss_mlp": 1.01752675, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 1.757481182527679, + "language_loss": 0.67585284, + "learning_rate": 1.333943721384037e-06, + "loss": 0.69699705, + "num_input_tokens_seen": 222034545, + "step": 10309, + "time_per_iteration": 2.6073498725891113 + }, + { + "auxiliary_loss_clip": 0.01085999, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.03651083, + "balance_loss_mlp": 1.02672172, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 4.752435146725957, + "language_loss": 0.72027004, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74152899, + "num_input_tokens_seen": 222052690, + "step": 10310, + "time_per_iteration": 3.920121431350708 + }, + { + "auxiliary_loss_clip": 0.01098569, + "auxiliary_loss_mlp": 0.01035479, + "balance_loss_clip": 1.04316199, + "balance_loss_mlp": 1.02202034, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 1.9746177967504812, + "language_loss": 0.79025835, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81159884, + "num_input_tokens_seen": 222069095, + "step": 10311, + "time_per_iteration": 2.563472270965576 + }, + { + "auxiliary_loss_clip": 0.01085186, + "auxiliary_loss_mlp": 0.01038068, + "balance_loss_clip": 1.0420804, + "balance_loss_mlp": 1.02527714, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.830547134767544, + "language_loss": 0.72764933, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.74888194, + "num_input_tokens_seen": 222087360, + "step": 10312, + "time_per_iteration": 2.6051814556121826 + }, + { + "auxiliary_loss_clip": 0.01072632, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.0436089, + "balance_loss_mlp": 1.01717257, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 4.609145035002588, + "language_loss": 0.72018361, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.74120796, + "num_input_tokens_seen": 222106130, + "step": 10313, + "time_per_iteration": 2.6081011295318604 + }, + { + "auxiliary_loss_clip": 0.01109938, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.04145193, + "balance_loss_mlp": 1.02038288, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 1.9670074907533213, + "language_loss": 0.78314549, + "learning_rate": 1.332107887401416e-06, + "loss": 0.8045857, + "num_input_tokens_seen": 222123125, + "step": 10314, + "time_per_iteration": 2.4850523471832275 + }, + { + "auxiliary_loss_clip": 0.01102622, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.03785896, + "balance_loss_mlp": 1.0213927, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 1.6978670733270627, + "language_loss": 0.78159571, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80296338, + "num_input_tokens_seen": 222140655, + "step": 10315, + "time_per_iteration": 2.483949661254883 + }, + { + "auxiliary_loss_clip": 0.01080428, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.04114544, + "balance_loss_mlp": 1.02453864, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 1.7936352832702398, + "language_loss": 0.76063794, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.78181505, + "num_input_tokens_seen": 222160450, + "step": 10316, + "time_per_iteration": 2.5498390197753906 + }, + { + "auxiliary_loss_clip": 0.01115453, + "auxiliary_loss_mlp": 0.010315, + "balance_loss_clip": 1.03813159, + "balance_loss_mlp": 1.0184412, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 1.8367169880122516, + "language_loss": 0.77282816, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.7942977, + "num_input_tokens_seen": 222179170, + "step": 10317, + "time_per_iteration": 2.501232385635376 + }, + { + "auxiliary_loss_clip": 0.01019289, + "auxiliary_loss_mlp": 0.01005054, + "balance_loss_clip": 1.01291704, + "balance_loss_mlp": 1.00399876, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6881535463946677, + "language_loss": 0.59028929, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.6105327, + "num_input_tokens_seen": 222242660, + "step": 10318, + "time_per_iteration": 3.1452417373657227 + }, + { + "auxiliary_loss_clip": 0.01088292, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.03993535, + "balance_loss_mlp": 1.02183986, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 1.5149022157391403, + "language_loss": 0.78067529, + "learning_rate": 1.330272686582143e-06, + "loss": 0.80191219, + "num_input_tokens_seen": 222262170, + "step": 10319, + "time_per_iteration": 2.55157208442688 + }, + { + "auxiliary_loss_clip": 0.01095098, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.04100847, + "balance_loss_mlp": 1.01923156, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 1.7749518653379661, + "language_loss": 0.6650728, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68633479, + "num_input_tokens_seen": 222280375, + "step": 10320, + "time_per_iteration": 2.523822546005249 + }, + { + "auxiliary_loss_clip": 0.01072934, + "auxiliary_loss_mlp": 0.01033166, + "balance_loss_clip": 1.03518343, + "balance_loss_mlp": 1.02069712, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 1.6557477213207168, + "language_loss": 0.76202726, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78308827, + "num_input_tokens_seen": 222297325, + "step": 10321, + "time_per_iteration": 3.942223310470581 + }, + { + "auxiliary_loss_clip": 0.01085708, + "auxiliary_loss_mlp": 0.01028958, + "balance_loss_clip": 1.03733218, + "balance_loss_mlp": 1.01691198, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1.7457888895552887, + "language_loss": 0.73823649, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75938308, + "num_input_tokens_seen": 222317095, + "step": 10322, + "time_per_iteration": 2.5149548053741455 + }, + { + "auxiliary_loss_clip": 0.01074193, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.03393686, + "balance_loss_mlp": 1.01770818, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 1.9436575714837183, + "language_loss": 0.72768557, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.74872792, + "num_input_tokens_seen": 222337055, + "step": 10323, + "time_per_iteration": 2.5705206394195557 + }, + { + "auxiliary_loss_clip": 0.01113962, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.04267287, + "balance_loss_mlp": 1.01659536, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 3.4807033784714836, + "language_loss": 0.58653766, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.60797471, + "num_input_tokens_seen": 222354515, + "step": 10324, + "time_per_iteration": 2.4790356159210205 + }, + { + "auxiliary_loss_clip": 0.01075463, + "auxiliary_loss_mlp": 0.01042371, + "balance_loss_clip": 1.03745461, + "balance_loss_mlp": 1.02715969, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 1.9475770006347324, + "language_loss": 0.76337945, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.78455782, + "num_input_tokens_seen": 222372755, + "step": 10325, + "time_per_iteration": 2.5215864181518555 + }, + { + "auxiliary_loss_clip": 0.01109342, + "auxiliary_loss_mlp": 0.01028327, + "balance_loss_clip": 1.04152739, + "balance_loss_mlp": 1.0146594, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 4.974464552507867, + "language_loss": 0.72186077, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74323744, + "num_input_tokens_seen": 222391380, + "step": 10326, + "time_per_iteration": 2.4919543266296387 + }, + { + "auxiliary_loss_clip": 0.01107254, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.04073453, + "balance_loss_mlp": 1.02302837, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 2.5663750524424054, + "language_loss": 0.74171078, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.7631458, + "num_input_tokens_seen": 222411165, + "step": 10327, + "time_per_iteration": 2.505375385284424 + }, + { + "auxiliary_loss_clip": 0.01095776, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.04349232, + "balance_loss_mlp": 1.02012038, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 2.07919567636389, + "language_loss": 0.79886931, + "learning_rate": 1.326970926232066e-06, + "loss": 0.82016706, + "num_input_tokens_seen": 222428110, + "step": 10328, + "time_per_iteration": 2.5117123126983643 + }, + { + "auxiliary_loss_clip": 0.01082054, + "auxiliary_loss_mlp": 0.0103948, + "balance_loss_clip": 1.03605759, + "balance_loss_mlp": 1.02633739, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 1.742867412040248, + "language_loss": 0.77897865, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.80019403, + "num_input_tokens_seen": 222446385, + "step": 10329, + "time_per_iteration": 2.544677972793579 + }, + { + "auxiliary_loss_clip": 0.01025221, + "auxiliary_loss_mlp": 0.00999952, + "balance_loss_clip": 1.01135659, + "balance_loss_mlp": 0.99878973, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.823529418301511, + "language_loss": 0.62162691, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64187866, + "num_input_tokens_seen": 222502150, + "step": 10330, + "time_per_iteration": 3.0032079219818115 + }, + { + "auxiliary_loss_clip": 0.0111002, + "auxiliary_loss_mlp": 0.01039258, + "balance_loss_clip": 1.03985226, + "balance_loss_mlp": 1.02544737, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 1.8400279028926938, + "language_loss": 0.7754553, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.79694808, + "num_input_tokens_seen": 222519880, + "step": 10331, + "time_per_iteration": 2.519623041152954 + }, + { + "auxiliary_loss_clip": 0.01119686, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.04186416, + "balance_loss_mlp": 1.02204061, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 3.959644471913708, + "language_loss": 0.67995125, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.70150197, + "num_input_tokens_seen": 222538545, + "step": 10332, + "time_per_iteration": 2.446089506149292 + }, + { + "auxiliary_loss_clip": 0.01081932, + "auxiliary_loss_mlp": 0.0103203, + "balance_loss_clip": 1.03670025, + "balance_loss_mlp": 1.01962674, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 1.419406273559708, + "language_loss": 0.76507461, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78621423, + "num_input_tokens_seen": 222556935, + "step": 10333, + "time_per_iteration": 2.508790969848633 + }, + { + "auxiliary_loss_clip": 0.01092432, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.04350889, + "balance_loss_mlp": 1.01981997, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 1.9994510350149208, + "language_loss": 0.69942337, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.72067028, + "num_input_tokens_seen": 222574035, + "step": 10334, + "time_per_iteration": 2.49902606010437 + }, + { + "auxiliary_loss_clip": 0.01091761, + "auxiliary_loss_mlp": 0.00778367, + "balance_loss_clip": 1.03986192, + "balance_loss_mlp": 1.0006156, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 1.6932504462131233, + "language_loss": 0.70110655, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.7198078, + "num_input_tokens_seen": 222592290, + "step": 10335, + "time_per_iteration": 2.515855550765991 + }, + { + "auxiliary_loss_clip": 0.01067663, + "auxiliary_loss_mlp": 0.01033606, + "balance_loss_clip": 1.03771293, + "balance_loss_mlp": 1.0213275, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.470971340615664, + "language_loss": 0.80367303, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82468569, + "num_input_tokens_seen": 222612805, + "step": 10336, + "time_per_iteration": 4.078368425369263 + }, + { + "auxiliary_loss_clip": 0.01112254, + "auxiliary_loss_mlp": 0.0103032, + "balance_loss_clip": 1.04034543, + "balance_loss_mlp": 1.01822019, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 1.76513260863993, + "language_loss": 0.73178864, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75321436, + "num_input_tokens_seen": 222632260, + "step": 10337, + "time_per_iteration": 2.4765777587890625 + }, + { + "auxiliary_loss_clip": 0.01118798, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.04090917, + "balance_loss_mlp": 1.02287281, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 2.0887298130427356, + "language_loss": 0.63461292, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65616375, + "num_input_tokens_seen": 222653570, + "step": 10338, + "time_per_iteration": 2.5267364978790283 + }, + { + "auxiliary_loss_clip": 0.01102788, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.03994834, + "balance_loss_mlp": 1.02088666, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 2.713094690393577, + "language_loss": 0.71528, + "learning_rate": 1.322938249724991e-06, + "loss": 0.73664939, + "num_input_tokens_seen": 222672480, + "step": 10339, + "time_per_iteration": 2.5120086669921875 + }, + { + "auxiliary_loss_clip": 0.01062851, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.0344336, + "balance_loss_mlp": 1.02249956, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 1.6226659152226948, + "language_loss": 0.69361299, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.71460325, + "num_input_tokens_seen": 222691200, + "step": 10340, + "time_per_iteration": 2.579570770263672 + }, + { + "auxiliary_loss_clip": 0.01072738, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.03832722, + "balance_loss_mlp": 1.02049303, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 2.689276215181959, + "language_loss": 0.68926108, + "learning_rate": 1.322205369037788e-06, + "loss": 0.71031737, + "num_input_tokens_seen": 222709975, + "step": 10341, + "time_per_iteration": 2.585881471633911 + }, + { + "auxiliary_loss_clip": 0.01104687, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.04082513, + "balance_loss_mlp": 1.01759863, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 1.9214466184217267, + "language_loss": 0.80558783, + "learning_rate": 1.321838967240299e-06, + "loss": 0.82694745, + "num_input_tokens_seen": 222729005, + "step": 10342, + "time_per_iteration": 2.4349560737609863 + }, + { + "auxiliary_loss_clip": 0.01017862, + "auxiliary_loss_mlp": 0.01003595, + "balance_loss_clip": 1.01073909, + "balance_loss_mlp": 1.0023911, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.856711391432992, + "language_loss": 0.57368094, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59389549, + "num_input_tokens_seen": 222786090, + "step": 10343, + "time_per_iteration": 3.0058770179748535 + }, + { + "auxiliary_loss_clip": 0.01074228, + "auxiliary_loss_mlp": 0.01027785, + "balance_loss_clip": 1.0342629, + "balance_loss_mlp": 1.01612639, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 1.9583933339276103, + "language_loss": 0.73035371, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.75137389, + "num_input_tokens_seen": 222806100, + "step": 10344, + "time_per_iteration": 2.579216480255127 + }, + { + "auxiliary_loss_clip": 0.0110411, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.04009771, + "balance_loss_mlp": 1.0252912, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 2.474635064733662, + "language_loss": 0.60290074, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62431234, + "num_input_tokens_seen": 222826575, + "step": 10345, + "time_per_iteration": 2.5148704051971436 + }, + { + "auxiliary_loss_clip": 0.01056659, + "auxiliary_loss_mlp": 0.01038288, + "balance_loss_clip": 1.03722763, + "balance_loss_mlp": 1.02517509, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 2.6091312717334487, + "language_loss": 0.77927071, + "learning_rate": 1.320373617348614e-06, + "loss": 0.80022019, + "num_input_tokens_seen": 222845285, + "step": 10346, + "time_per_iteration": 2.641065835952759 + }, + { + "auxiliary_loss_clip": 0.01087779, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.04397821, + "balance_loss_mlp": 1.01960862, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 1.6914052110111746, + "language_loss": 0.71309674, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73430741, + "num_input_tokens_seen": 222864575, + "step": 10347, + "time_per_iteration": 2.628589630126953 + }, + { + "auxiliary_loss_clip": 0.01098434, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.03647375, + "balance_loss_mlp": 1.0190376, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 1.672187293497324, + "language_loss": 0.72109658, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.74239272, + "num_input_tokens_seen": 222884420, + "step": 10348, + "time_per_iteration": 4.068922996520996 + }, + { + "auxiliary_loss_clip": 0.01013583, + "auxiliary_loss_mlp": 0.01001746, + "balance_loss_clip": 1.02223754, + "balance_loss_mlp": 1.00059021, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.8040606348634373, + "language_loss": 0.54096395, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56111723, + "num_input_tokens_seen": 222944690, + "step": 10349, + "time_per_iteration": 4.540219068527222 + }, + { + "auxiliary_loss_clip": 0.01078792, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.03655386, + "balance_loss_mlp": 1.01637042, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 1.847733133813218, + "language_loss": 0.69648093, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.71755606, + "num_input_tokens_seen": 222962990, + "step": 10350, + "time_per_iteration": 2.594027280807495 + }, + { + "auxiliary_loss_clip": 0.01117204, + "auxiliary_loss_mlp": 0.01035895, + "balance_loss_clip": 1.04143453, + "balance_loss_mlp": 1.02280617, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 2.4921527270939823, + "language_loss": 0.57170331, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.59323436, + "num_input_tokens_seen": 222980715, + "step": 10351, + "time_per_iteration": 2.492915153503418 + }, + { + "auxiliary_loss_clip": 0.01025421, + "auxiliary_loss_mlp": 0.01000567, + "balance_loss_clip": 1.01424885, + "balance_loss_mlp": 0.99936849, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 1.0153654808753265, + "language_loss": 0.6108579, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63111776, + "num_input_tokens_seen": 223040685, + "step": 10352, + "time_per_iteration": 3.0240328311920166 + }, + { + "auxiliary_loss_clip": 0.01109532, + "auxiliary_loss_mlp": 0.01034471, + "balance_loss_clip": 1.037521, + "balance_loss_mlp": 1.02208543, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 1.9548216242283514, + "language_loss": 0.81488436, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.83632439, + "num_input_tokens_seen": 223059000, + "step": 10353, + "time_per_iteration": 2.457348585128784 + }, + { + "auxiliary_loss_clip": 0.01095639, + "auxiliary_loss_mlp": 0.01029953, + "balance_loss_clip": 1.03655136, + "balance_loss_mlp": 1.01874089, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 1.7087513355509503, + "language_loss": 0.75677323, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.77802914, + "num_input_tokens_seen": 223079345, + "step": 10354, + "time_per_iteration": 2.5067086219787598 + }, + { + "auxiliary_loss_clip": 0.0107263, + "auxiliary_loss_mlp": 0.01032848, + "balance_loss_clip": 1.03547597, + "balance_loss_mlp": 1.01954997, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 1.489000555300258, + "language_loss": 0.78908885, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.81014365, + "num_input_tokens_seen": 223097880, + "step": 10355, + "time_per_iteration": 2.538595676422119 + }, + { + "auxiliary_loss_clip": 0.01104941, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.0425148, + "balance_loss_mlp": 1.02079952, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 1.9064611251191985, + "language_loss": 0.77897888, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.80035526, + "num_input_tokens_seen": 223118185, + "step": 10356, + "time_per_iteration": 2.5345571041107178 + }, + { + "auxiliary_loss_clip": 0.01096155, + "auxiliary_loss_mlp": 0.00778928, + "balance_loss_clip": 1.03761995, + "balance_loss_mlp": 1.00078392, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 2.6925940821721444, + "language_loss": 0.67842436, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.6971752, + "num_input_tokens_seen": 223137600, + "step": 10357, + "time_per_iteration": 2.5266923904418945 + }, + { + "auxiliary_loss_clip": 0.01095894, + "auxiliary_loss_mlp": 0.01030897, + "balance_loss_clip": 1.03830314, + "balance_loss_mlp": 1.01672888, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 2.94211431866787, + "language_loss": 0.76091582, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.78218365, + "num_input_tokens_seen": 223154360, + "step": 10358, + "time_per_iteration": 2.5829999446868896 + }, + { + "auxiliary_loss_clip": 0.01089438, + "auxiliary_loss_mlp": 0.01030253, + "balance_loss_clip": 1.03655636, + "balance_loss_mlp": 1.0179863, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 2.0994659777780478, + "language_loss": 0.8229866, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.84418344, + "num_input_tokens_seen": 223172255, + "step": 10359, + "time_per_iteration": 2.511582851409912 + }, + { + "auxiliary_loss_clip": 0.01084201, + "auxiliary_loss_mlp": 0.01050434, + "balance_loss_clip": 1.0344162, + "balance_loss_mlp": 1.0358963, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 5.869306392630472, + "language_loss": 0.73310852, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75445485, + "num_input_tokens_seen": 223186965, + "step": 10360, + "time_per_iteration": 3.8925297260284424 + }, + { + "auxiliary_loss_clip": 0.01101177, + "auxiliary_loss_mlp": 0.01037739, + "balance_loss_clip": 1.03680897, + "balance_loss_mlp": 1.02513862, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 3.229837896386294, + "language_loss": 0.77263105, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.79402018, + "num_input_tokens_seen": 223206045, + "step": 10361, + "time_per_iteration": 2.462921380996704 + }, + { + "auxiliary_loss_clip": 0.01076306, + "auxiliary_loss_mlp": 0.01030199, + "balance_loss_clip": 1.04018188, + "balance_loss_mlp": 1.01823056, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 1.6111946723932062, + "language_loss": 0.67706919, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69813424, + "num_input_tokens_seen": 223224820, + "step": 10362, + "time_per_iteration": 2.536207675933838 + }, + { + "auxiliary_loss_clip": 0.01094541, + "auxiliary_loss_mlp": 0.01031764, + "balance_loss_clip": 1.03766465, + "balance_loss_mlp": 1.01829314, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 2.0051816775010525, + "language_loss": 0.67502528, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.69628829, + "num_input_tokens_seen": 223243205, + "step": 10363, + "time_per_iteration": 2.5708162784576416 + }, + { + "auxiliary_loss_clip": 0.01071233, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.03852153, + "balance_loss_mlp": 1.01768601, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 1.906492834381978, + "language_loss": 0.86632311, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88734591, + "num_input_tokens_seen": 223261370, + "step": 10364, + "time_per_iteration": 2.5784807205200195 + }, + { + "auxiliary_loss_clip": 0.01013997, + "auxiliary_loss_mlp": 0.0100676, + "balance_loss_clip": 1.00902367, + "balance_loss_mlp": 1.0054127, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.8877402286228581, + "language_loss": 0.60829449, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62850201, + "num_input_tokens_seen": 223315050, + "step": 10365, + "time_per_iteration": 3.1095962524414062 + }, + { + "auxiliary_loss_clip": 0.01084368, + "auxiliary_loss_mlp": 0.00780001, + "balance_loss_clip": 1.04319477, + "balance_loss_mlp": 1.0006969, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 2.3313210042310137, + "language_loss": 0.75333685, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.77198052, + "num_input_tokens_seen": 223332130, + "step": 10366, + "time_per_iteration": 2.6069207191467285 + }, + { + "auxiliary_loss_clip": 0.01103051, + "auxiliary_loss_mlp": 0.0104008, + "balance_loss_clip": 1.03836834, + "balance_loss_mlp": 1.02694297, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 4.034268094082705, + "language_loss": 0.76180929, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78324062, + "num_input_tokens_seen": 223351605, + "step": 10367, + "time_per_iteration": 2.5050196647644043 + }, + { + "auxiliary_loss_clip": 0.01102133, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.03964567, + "balance_loss_mlp": 1.02272558, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 1.4655869005495283, + "language_loss": 0.78493571, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80631089, + "num_input_tokens_seen": 223372090, + "step": 10368, + "time_per_iteration": 2.5340967178344727 + }, + { + "auxiliary_loss_clip": 0.01051175, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.03634834, + "balance_loss_mlp": 1.02083135, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 1.7998779671288458, + "language_loss": 0.68370819, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70456171, + "num_input_tokens_seen": 223390110, + "step": 10369, + "time_per_iteration": 2.694286346435547 + }, + { + "auxiliary_loss_clip": 0.01115773, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.04106772, + "balance_loss_mlp": 1.02226925, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.296177407166117, + "language_loss": 0.88162577, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.90313888, + "num_input_tokens_seen": 223404205, + "step": 10370, + "time_per_iteration": 2.4531285762786865 + }, + { + "auxiliary_loss_clip": 0.01112214, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.03987479, + "balance_loss_mlp": 1.01725912, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.4851519218943687, + "language_loss": 0.66135621, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68277311, + "num_input_tokens_seen": 223424855, + "step": 10371, + "time_per_iteration": 2.5076744556427 + }, + { + "auxiliary_loss_clip": 0.01095689, + "auxiliary_loss_mlp": 0.01030419, + "balance_loss_clip": 1.03702092, + "balance_loss_mlp": 1.01995254, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.321820466124161, + "language_loss": 0.77671713, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79797816, + "num_input_tokens_seen": 223447225, + "step": 10372, + "time_per_iteration": 2.58813738822937 + }, + { + "auxiliary_loss_clip": 0.01103209, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.03830576, + "balance_loss_mlp": 1.01957464, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 2.124473311632811, + "language_loss": 0.77588129, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79724008, + "num_input_tokens_seen": 223467520, + "step": 10373, + "time_per_iteration": 2.522311210632324 + }, + { + "auxiliary_loss_clip": 0.01097679, + "auxiliary_loss_mlp": 0.01026892, + "balance_loss_clip": 1.037642, + "balance_loss_mlp": 1.01516747, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 1.5680508946865201, + "language_loss": 0.69593972, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71718538, + "num_input_tokens_seen": 223488130, + "step": 10374, + "time_per_iteration": 2.522155284881592 + }, + { + "auxiliary_loss_clip": 0.01100408, + "auxiliary_loss_mlp": 0.01028989, + "balance_loss_clip": 1.04229867, + "balance_loss_mlp": 1.01722252, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 1.6976503325889205, + "language_loss": 0.76909351, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79038751, + "num_input_tokens_seen": 223505105, + "step": 10375, + "time_per_iteration": 4.010646820068359 + }, + { + "auxiliary_loss_clip": 0.01089123, + "auxiliary_loss_mlp": 0.01032617, + "balance_loss_clip": 1.04227245, + "balance_loss_mlp": 1.02092862, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.3784738267982195, + "language_loss": 0.70251834, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72373581, + "num_input_tokens_seen": 223528065, + "step": 10376, + "time_per_iteration": 2.659794569015503 + }, + { + "auxiliary_loss_clip": 0.01086216, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.03964674, + "balance_loss_mlp": 1.01974308, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 2.3633605842091057, + "language_loss": 0.76312596, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78432012, + "num_input_tokens_seen": 223547305, + "step": 10377, + "time_per_iteration": 2.5775485038757324 + }, + { + "auxiliary_loss_clip": 0.01091935, + "auxiliary_loss_mlp": 0.0103146, + "balance_loss_clip": 1.03893721, + "balance_loss_mlp": 1.02043319, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 2.2182068832161415, + "language_loss": 0.68398893, + "learning_rate": 1.308665737227052e-06, + "loss": 0.7052229, + "num_input_tokens_seen": 223567205, + "step": 10378, + "time_per_iteration": 2.5806658267974854 + }, + { + "auxiliary_loss_clip": 0.01094487, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.04344654, + "balance_loss_mlp": 1.02178514, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 1.8526059418497507, + "language_loss": 0.75643498, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.77772152, + "num_input_tokens_seen": 223586560, + "step": 10379, + "time_per_iteration": 2.5843546390533447 + }, + { + "auxiliary_loss_clip": 0.01087389, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.03799987, + "balance_loss_mlp": 1.01728725, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.5159310767162735, + "language_loss": 0.79472792, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.8158958, + "num_input_tokens_seen": 223610595, + "step": 10380, + "time_per_iteration": 2.6137046813964844 + }, + { + "auxiliary_loss_clip": 0.01099535, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.04316199, + "balance_loss_mlp": 1.02183819, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 1.4877571092534874, + "language_loss": 0.7993083, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82063651, + "num_input_tokens_seen": 223630230, + "step": 10381, + "time_per_iteration": 2.5032200813293457 + }, + { + "auxiliary_loss_clip": 0.01088904, + "auxiliary_loss_mlp": 0.01038246, + "balance_loss_clip": 1.03669047, + "balance_loss_mlp": 1.02447128, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 2.131618713428779, + "language_loss": 0.74783814, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.76910961, + "num_input_tokens_seen": 223648360, + "step": 10382, + "time_per_iteration": 2.4819846153259277 + }, + { + "auxiliary_loss_clip": 0.01099645, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.03805685, + "balance_loss_mlp": 1.01788688, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 1.4016429805802788, + "language_loss": 0.78303474, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80432677, + "num_input_tokens_seen": 223671255, + "step": 10383, + "time_per_iteration": 2.559370517730713 + }, + { + "auxiliary_loss_clip": 0.0108044, + "auxiliary_loss_mlp": 0.01026653, + "balance_loss_clip": 1.03536534, + "balance_loss_mlp": 1.01486325, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 1.8283575467784725, + "language_loss": 0.75573313, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.77680409, + "num_input_tokens_seen": 223689860, + "step": 10384, + "time_per_iteration": 2.561084508895874 + }, + { + "auxiliary_loss_clip": 0.01091029, + "auxiliary_loss_mlp": 0.01039935, + "balance_loss_clip": 1.03670406, + "balance_loss_mlp": 1.02611852, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 1.6941637730133265, + "language_loss": 0.66465676, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68596631, + "num_input_tokens_seen": 223707835, + "step": 10385, + "time_per_iteration": 2.5354037284851074 + }, + { + "auxiliary_loss_clip": 0.01019047, + "auxiliary_loss_mlp": 0.01000406, + "balance_loss_clip": 1.01232123, + "balance_loss_mlp": 0.99919063, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.757624908609267, + "language_loss": 0.62060881, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64080334, + "num_input_tokens_seen": 223771875, + "step": 10386, + "time_per_iteration": 3.1328608989715576 + }, + { + "auxiliary_loss_clip": 0.01100461, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.03647184, + "balance_loss_mlp": 1.01718354, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 2.8459282099254892, + "language_loss": 0.71606457, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.7373693, + "num_input_tokens_seen": 223788895, + "step": 10387, + "time_per_iteration": 4.050747394561768 + }, + { + "auxiliary_loss_clip": 0.01109788, + "auxiliary_loss_mlp": 0.01038495, + "balance_loss_clip": 1.04124069, + "balance_loss_mlp": 1.02448153, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 2.5611552667580657, + "language_loss": 0.65476394, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67624682, + "num_input_tokens_seen": 223810385, + "step": 10388, + "time_per_iteration": 2.5952954292297363 + }, + { + "auxiliary_loss_clip": 0.01073262, + "auxiliary_loss_mlp": 0.01025464, + "balance_loss_clip": 1.03740251, + "balance_loss_mlp": 1.01413882, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 1.879197179158325, + "language_loss": 0.79297829, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.8139655, + "num_input_tokens_seen": 223826040, + "step": 10389, + "time_per_iteration": 3.9318020343780518 + }, + { + "auxiliary_loss_clip": 0.01088489, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.03628576, + "balance_loss_mlp": 1.0216608, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 3.6468950488257175, + "language_loss": 0.60522747, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62646002, + "num_input_tokens_seen": 223842300, + "step": 10390, + "time_per_iteration": 2.4938583374023438 + }, + { + "auxiliary_loss_clip": 0.01093587, + "auxiliary_loss_mlp": 0.01032304, + "balance_loss_clip": 1.03756404, + "balance_loss_mlp": 1.02051461, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 1.9116232883185404, + "language_loss": 0.77044582, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79170477, + "num_input_tokens_seen": 223858320, + "step": 10391, + "time_per_iteration": 2.576611042022705 + }, + { + "auxiliary_loss_clip": 0.01093991, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.03946245, + "balance_loss_mlp": 1.02119958, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 1.569918588771795, + "language_loss": 0.64497465, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.66625369, + "num_input_tokens_seen": 223883545, + "step": 10392, + "time_per_iteration": 2.7228245735168457 + }, + { + "auxiliary_loss_clip": 0.01095778, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.04006612, + "balance_loss_mlp": 1.02247357, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.7890515698228096, + "language_loss": 0.7683059, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.78962088, + "num_input_tokens_seen": 223901445, + "step": 10393, + "time_per_iteration": 2.5094902515411377 + }, + { + "auxiliary_loss_clip": 0.01078061, + "auxiliary_loss_mlp": 0.00780671, + "balance_loss_clip": 1.03666842, + "balance_loss_mlp": 1.00058496, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.7141722483225923, + "language_loss": 0.82783526, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84642255, + "num_input_tokens_seen": 223920170, + "step": 10394, + "time_per_iteration": 2.5853970050811768 + }, + { + "auxiliary_loss_clip": 0.0109634, + "auxiliary_loss_mlp": 0.01038984, + "balance_loss_clip": 1.03945553, + "balance_loss_mlp": 1.02571571, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 1.9304438887145918, + "language_loss": 0.75109673, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77244991, + "num_input_tokens_seen": 223936495, + "step": 10395, + "time_per_iteration": 2.5529425144195557 + }, + { + "auxiliary_loss_clip": 0.01096059, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.03641748, + "balance_loss_mlp": 1.01783085, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 2.6058502957961176, + "language_loss": 0.72574711, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74701786, + "num_input_tokens_seen": 223950070, + "step": 10396, + "time_per_iteration": 2.5252633094787598 + }, + { + "auxiliary_loss_clip": 0.0107297, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.03851211, + "balance_loss_mlp": 1.02500319, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 1.6874549388432802, + "language_loss": 0.7582249, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.77932739, + "num_input_tokens_seen": 223970065, + "step": 10397, + "time_per_iteration": 2.5834429264068604 + }, + { + "auxiliary_loss_clip": 0.01088874, + "auxiliary_loss_mlp": 0.01038342, + "balance_loss_clip": 1.03703976, + "balance_loss_mlp": 1.0253005, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 2.0864058114331727, + "language_loss": 0.75269294, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.77396512, + "num_input_tokens_seen": 223990315, + "step": 10398, + "time_per_iteration": 2.603302478790283 + }, + { + "auxiliary_loss_clip": 0.01115815, + "auxiliary_loss_mlp": 0.01034831, + "balance_loss_clip": 1.03863966, + "balance_loss_mlp": 1.02076471, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 2.0217930646232793, + "language_loss": 0.7430774, + "learning_rate": 1.300997001489483e-06, + "loss": 0.76458383, + "num_input_tokens_seen": 224009960, + "step": 10399, + "time_per_iteration": 2.5259017944335938 + }, + { + "auxiliary_loss_clip": 0.01081074, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.0395925, + "balance_loss_mlp": 1.02496052, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 1.6157848323754775, + "language_loss": 0.7434532, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76464581, + "num_input_tokens_seen": 224028870, + "step": 10400, + "time_per_iteration": 4.295098543167114 + }, + { + "auxiliary_loss_clip": 0.01009913, + "auxiliary_loss_mlp": 0.01003853, + "balance_loss_clip": 1.01049101, + "balance_loss_mlp": 1.00269628, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.844920578363907, + "language_loss": 0.56488842, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58502609, + "num_input_tokens_seen": 224094140, + "step": 10401, + "time_per_iteration": 3.2056198120117188 + }, + { + "auxiliary_loss_clip": 0.01106187, + "auxiliary_loss_mlp": 0.01036566, + "balance_loss_clip": 1.03912234, + "balance_loss_mlp": 1.02314353, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.1556723768763315, + "language_loss": 0.83056933, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.8519969, + "num_input_tokens_seen": 224113235, + "step": 10402, + "time_per_iteration": 2.534987211227417 + }, + { + "auxiliary_loss_clip": 0.01036496, + "auxiliary_loss_mlp": 0.01034047, + "balance_loss_clip": 1.03531492, + "balance_loss_mlp": 1.02214444, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 1.8748046755698589, + "language_loss": 0.69223678, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71294224, + "num_input_tokens_seen": 224134530, + "step": 10403, + "time_per_iteration": 2.8054065704345703 + }, + { + "auxiliary_loss_clip": 0.01082379, + "auxiliary_loss_mlp": 0.01030916, + "balance_loss_clip": 1.03584814, + "balance_loss_mlp": 1.01643157, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.816191303755706, + "language_loss": 0.72134972, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.74248266, + "num_input_tokens_seen": 224154170, + "step": 10404, + "time_per_iteration": 3.138298273086548 + }, + { + "auxiliary_loss_clip": 0.01071546, + "auxiliary_loss_mlp": 0.01039192, + "balance_loss_clip": 1.03552485, + "balance_loss_mlp": 1.0258944, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 3.1092888754385064, + "language_loss": 0.70056421, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.72167158, + "num_input_tokens_seen": 224172730, + "step": 10405, + "time_per_iteration": 2.617367744445801 + }, + { + "auxiliary_loss_clip": 0.01088218, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.03801751, + "balance_loss_mlp": 1.02348161, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.6457928936561872, + "language_loss": 0.79052722, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.8117789, + "num_input_tokens_seen": 224192620, + "step": 10406, + "time_per_iteration": 2.558506965637207 + }, + { + "auxiliary_loss_clip": 0.01076971, + "auxiliary_loss_mlp": 0.01037031, + "balance_loss_clip": 1.03664839, + "balance_loss_mlp": 1.02455544, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 2.9593315853734277, + "language_loss": 0.69044405, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.71158403, + "num_input_tokens_seen": 224214660, + "step": 10407, + "time_per_iteration": 2.6299400329589844 + }, + { + "auxiliary_loss_clip": 0.01100156, + "auxiliary_loss_mlp": 0.00777722, + "balance_loss_clip": 1.03716922, + "balance_loss_mlp": 1.0005641, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 1.6795077241019474, + "language_loss": 0.85512483, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.87390363, + "num_input_tokens_seen": 224234170, + "step": 10408, + "time_per_iteration": 2.5145556926727295 + }, + { + "auxiliary_loss_clip": 0.01090849, + "auxiliary_loss_mlp": 0.00776538, + "balance_loss_clip": 1.03694725, + "balance_loss_mlp": 1.0005374, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.7068791748475516, + "language_loss": 0.80093229, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.81960618, + "num_input_tokens_seen": 224253115, + "step": 10409, + "time_per_iteration": 2.5699727535247803 + }, + { + "auxiliary_loss_clip": 0.01090112, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.03600323, + "balance_loss_mlp": 1.01941323, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 2.108217622081491, + "language_loss": 0.69562429, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.71684659, + "num_input_tokens_seen": 224271375, + "step": 10410, + "time_per_iteration": 2.5842761993408203 + }, + { + "auxiliary_loss_clip": 0.01067809, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.0355804, + "balance_loss_mlp": 1.01457906, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 1.9630720865413442, + "language_loss": 0.67554212, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.6964854, + "num_input_tokens_seen": 224290315, + "step": 10411, + "time_per_iteration": 2.670793294906616 + }, + { + "auxiliary_loss_clip": 0.0107071, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.04008567, + "balance_loss_mlp": 1.02269673, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 1.6416422694055335, + "language_loss": 0.69507498, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.71613157, + "num_input_tokens_seen": 224310545, + "step": 10412, + "time_per_iteration": 2.741215229034424 + }, + { + "auxiliary_loss_clip": 0.0108495, + "auxiliary_loss_mlp": 0.01040423, + "balance_loss_clip": 1.0382303, + "balance_loss_mlp": 1.02729189, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.4492971672613766, + "language_loss": 0.69303143, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.71428514, + "num_input_tokens_seen": 224331115, + "step": 10413, + "time_per_iteration": 2.587451696395874 + }, + { + "auxiliary_loss_clip": 0.0108252, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.03507268, + "balance_loss_mlp": 1.02037334, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 2.3416455066570423, + "language_loss": 0.80488062, + "learning_rate": 1.295526482316796e-06, + "loss": 0.82605726, + "num_input_tokens_seen": 224347525, + "step": 10414, + "time_per_iteration": 3.9720349311828613 + }, + { + "auxiliary_loss_clip": 0.01104887, + "auxiliary_loss_mlp": 0.01042178, + "balance_loss_clip": 1.04183304, + "balance_loss_mlp": 1.02998269, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 1.8097991947691818, + "language_loss": 0.75084186, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.77231252, + "num_input_tokens_seen": 224367045, + "step": 10415, + "time_per_iteration": 2.531747341156006 + }, + { + "auxiliary_loss_clip": 0.01064772, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.03963947, + "balance_loss_mlp": 1.01618457, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.5895158141738097, + "language_loss": 0.74501675, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.76595169, + "num_input_tokens_seen": 224388860, + "step": 10416, + "time_per_iteration": 2.648073196411133 + }, + { + "auxiliary_loss_clip": 0.01088779, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.04253983, + "balance_loss_mlp": 1.01802146, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 1.6812996424670914, + "language_loss": 0.84233135, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86352277, + "num_input_tokens_seen": 224409645, + "step": 10417, + "time_per_iteration": 2.6438961029052734 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.03967905, + "balance_loss_mlp": 1.01917517, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 2.36150433414709, + "language_loss": 0.56709182, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.58847296, + "num_input_tokens_seen": 224428530, + "step": 10418, + "time_per_iteration": 2.539966344833374 + }, + { + "auxiliary_loss_clip": 0.01109157, + "auxiliary_loss_mlp": 0.01040689, + "balance_loss_clip": 1.03822064, + "balance_loss_mlp": 1.02684247, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 1.9797698445128775, + "language_loss": 0.8444798, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.8659783, + "num_input_tokens_seen": 224447175, + "step": 10419, + "time_per_iteration": 2.5472779273986816 + }, + { + "auxiliary_loss_clip": 0.0111679, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.0410738, + "balance_loss_mlp": 1.0218122, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 1.640710510452837, + "language_loss": 0.64900565, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.67051613, + "num_input_tokens_seen": 224469445, + "step": 10420, + "time_per_iteration": 2.5588061809539795 + }, + { + "auxiliary_loss_clip": 0.01076301, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.03788459, + "balance_loss_mlp": 1.02019572, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 1.7728434523223184, + "language_loss": 0.86383742, + "learning_rate": 1.292975627485741e-06, + "loss": 0.88493979, + "num_input_tokens_seen": 224486590, + "step": 10421, + "time_per_iteration": 2.5602214336395264 + }, + { + "auxiliary_loss_clip": 0.01077741, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.03631377, + "balance_loss_mlp": 1.02223122, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 2.6340504264886877, + "language_loss": 0.79610753, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81723022, + "num_input_tokens_seen": 224502795, + "step": 10422, + "time_per_iteration": 2.566519021987915 + }, + { + "auxiliary_loss_clip": 0.01103124, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.0383476, + "balance_loss_mlp": 1.01902103, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 1.8036206289226087, + "language_loss": 0.74642718, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76777917, + "num_input_tokens_seen": 224522300, + "step": 10423, + "time_per_iteration": 2.5113916397094727 + }, + { + "auxiliary_loss_clip": 0.01111837, + "auxiliary_loss_mlp": 0.01030971, + "balance_loss_clip": 1.03798532, + "balance_loss_mlp": 1.01877022, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 1.8863671152475459, + "language_loss": 0.77967995, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.801108, + "num_input_tokens_seen": 224538260, + "step": 10424, + "time_per_iteration": 2.4194581508636475 + }, + { + "auxiliary_loss_clip": 0.01112682, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.03896141, + "balance_loss_mlp": 1.02261519, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 1.841286113952911, + "language_loss": 0.68945932, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.71094912, + "num_input_tokens_seen": 224559155, + "step": 10425, + "time_per_iteration": 2.5099120140075684 + }, + { + "auxiliary_loss_clip": 0.01089147, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.03835177, + "balance_loss_mlp": 1.01902211, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.5014341285154853, + "language_loss": 0.74493247, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76612806, + "num_input_tokens_seen": 224578660, + "step": 10426, + "time_per_iteration": 4.061337232589722 + }, + { + "auxiliary_loss_clip": 0.01104623, + "auxiliary_loss_mlp": 0.00778997, + "balance_loss_clip": 1.03960323, + "balance_loss_mlp": 1.00057244, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 1.523700573279404, + "language_loss": 0.80359024, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82242644, + "num_input_tokens_seen": 224599080, + "step": 10427, + "time_per_iteration": 2.5913987159729004 + }, + { + "auxiliary_loss_clip": 0.01079776, + "auxiliary_loss_mlp": 0.01036413, + "balance_loss_clip": 1.03479993, + "balance_loss_mlp": 1.02346134, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 1.8788379375692579, + "language_loss": 0.67867631, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.69983822, + "num_input_tokens_seen": 224614225, + "step": 10428, + "time_per_iteration": 3.9779622554779053 + }, + { + "auxiliary_loss_clip": 0.01072164, + "auxiliary_loss_mlp": 0.01047483, + "balance_loss_clip": 1.03475618, + "balance_loss_mlp": 1.03314853, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.6435625456800098, + "language_loss": 0.71602762, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.73722404, + "num_input_tokens_seen": 224632365, + "step": 10429, + "time_per_iteration": 2.522435426712036 + }, + { + "auxiliary_loss_clip": 0.01107189, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.03900456, + "balance_loss_mlp": 1.0205791, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.4548545927158671, + "language_loss": 0.79780281, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.81921917, + "num_input_tokens_seen": 224651125, + "step": 10430, + "time_per_iteration": 2.5103209018707275 + }, + { + "auxiliary_loss_clip": 0.01033493, + "auxiliary_loss_mlp": 0.01002097, + "balance_loss_clip": 1.00923073, + "balance_loss_mlp": 1.00115514, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.774350222515652, + "language_loss": 0.59158707, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61194301, + "num_input_tokens_seen": 224716115, + "step": 10431, + "time_per_iteration": 3.1340301036834717 + }, + { + "auxiliary_loss_clip": 0.01015729, + "auxiliary_loss_mlp": 0.00999295, + "balance_loss_clip": 1.01120901, + "balance_loss_mlp": 0.99811518, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.866290703816797, + "language_loss": 0.63795471, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.6581049, + "num_input_tokens_seen": 224782930, + "step": 10432, + "time_per_iteration": 3.182325839996338 + }, + { + "auxiliary_loss_clip": 0.01087282, + "auxiliary_loss_mlp": 0.01032898, + "balance_loss_clip": 1.03608, + "balance_loss_mlp": 1.02153695, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 1.6834536952480934, + "language_loss": 0.64958465, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.67078644, + "num_input_tokens_seen": 224802010, + "step": 10433, + "time_per_iteration": 2.565335750579834 + }, + { + "auxiliary_loss_clip": 0.01106491, + "auxiliary_loss_mlp": 0.01034382, + "balance_loss_clip": 1.0393014, + "balance_loss_mlp": 1.02035117, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 2.680980636306023, + "language_loss": 0.61598128, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.63739002, + "num_input_tokens_seen": 224818875, + "step": 10434, + "time_per_iteration": 2.4713242053985596 + }, + { + "auxiliary_loss_clip": 0.01076362, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.03482902, + "balance_loss_mlp": 1.01672459, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 1.6943372365005844, + "language_loss": 0.84672153, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86777663, + "num_input_tokens_seen": 224837790, + "step": 10435, + "time_per_iteration": 2.547553062438965 + }, + { + "auxiliary_loss_clip": 0.01032532, + "auxiliary_loss_mlp": 0.0100334, + "balance_loss_clip": 1.00814998, + "balance_loss_mlp": 1.0022254, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.7272813660598967, + "language_loss": 0.61557698, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63593572, + "num_input_tokens_seen": 224899685, + "step": 10436, + "time_per_iteration": 3.0353641510009766 + }, + { + "auxiliary_loss_clip": 0.01094249, + "auxiliary_loss_mlp": 0.0104054, + "balance_loss_clip": 1.04079676, + "balance_loss_mlp": 1.02677727, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.533832470675063, + "language_loss": 0.7740885, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79543632, + "num_input_tokens_seen": 224918650, + "step": 10437, + "time_per_iteration": 2.585875988006592 + }, + { + "auxiliary_loss_clip": 0.01025652, + "auxiliary_loss_mlp": 0.01005376, + "balance_loss_clip": 1.01109231, + "balance_loss_mlp": 1.00435638, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7292716479225492, + "language_loss": 0.5435555, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56386578, + "num_input_tokens_seen": 224981575, + "step": 10438, + "time_per_iteration": 2.9908699989318848 + }, + { + "auxiliary_loss_clip": 0.01064607, + "auxiliary_loss_mlp": 0.01046034, + "balance_loss_clip": 1.03520572, + "balance_loss_mlp": 1.03247392, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 2.167892622068683, + "language_loss": 0.83961022, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86071664, + "num_input_tokens_seen": 225000820, + "step": 10439, + "time_per_iteration": 4.113048553466797 + }, + { + "auxiliary_loss_clip": 0.01076704, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.04035258, + "balance_loss_mlp": 1.03036952, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.026389921001926, + "language_loss": 0.79845142, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.81965744, + "num_input_tokens_seen": 225017585, + "step": 10440, + "time_per_iteration": 2.607767105102539 + }, + { + "auxiliary_loss_clip": 0.01057766, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.0331887, + "balance_loss_mlp": 1.02033448, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 2.142252098730805, + "language_loss": 0.74431252, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76520818, + "num_input_tokens_seen": 225039085, + "step": 10441, + "time_per_iteration": 2.618762731552124 + }, + { + "auxiliary_loss_clip": 0.01096312, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.03592527, + "balance_loss_mlp": 1.02246666, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 2.0620327261728213, + "language_loss": 0.72817826, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74950266, + "num_input_tokens_seen": 225058105, + "step": 10442, + "time_per_iteration": 2.4596292972564697 + }, + { + "auxiliary_loss_clip": 0.0107974, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.03505325, + "balance_loss_mlp": 1.022192, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.53592725151457, + "language_loss": 0.71452749, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73567355, + "num_input_tokens_seen": 225077605, + "step": 10443, + "time_per_iteration": 2.5639138221740723 + }, + { + "auxiliary_loss_clip": 0.01114826, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.04029572, + "balance_loss_mlp": 1.01927936, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 1.9944650741474492, + "language_loss": 0.73500335, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.7564702, + "num_input_tokens_seen": 225097775, + "step": 10444, + "time_per_iteration": 2.46683669090271 + }, + { + "auxiliary_loss_clip": 0.01082439, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_clip": 1.04469609, + "balance_loss_mlp": 1.01800287, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 1.910639810621634, + "language_loss": 0.7157836, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.73691541, + "num_input_tokens_seen": 225115585, + "step": 10445, + "time_per_iteration": 2.6110894680023193 + }, + { + "auxiliary_loss_clip": 0.01097418, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.03657436, + "balance_loss_mlp": 1.01920092, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 1.6540911912652327, + "language_loss": 0.69168639, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71298426, + "num_input_tokens_seen": 225135575, + "step": 10446, + "time_per_iteration": 2.4866583347320557 + }, + { + "auxiliary_loss_clip": 0.01076143, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.03763318, + "balance_loss_mlp": 1.02050281, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 2.050115938581114, + "language_loss": 0.73233509, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.75344521, + "num_input_tokens_seen": 225154230, + "step": 10447, + "time_per_iteration": 2.6261308193206787 + }, + { + "auxiliary_loss_clip": 0.01025324, + "auxiliary_loss_mlp": 0.01001665, + "balance_loss_clip": 1.0104425, + "balance_loss_mlp": 1.00040686, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6788503423119512, + "language_loss": 0.52381611, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54408598, + "num_input_tokens_seen": 225213650, + "step": 10448, + "time_per_iteration": 2.9130430221557617 + }, + { + "auxiliary_loss_clip": 0.01091051, + "auxiliary_loss_mlp": 0.0105387, + "balance_loss_clip": 1.03768408, + "balance_loss_mlp": 1.03880239, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 2.215956878966031, + "language_loss": 0.91479701, + "learning_rate": 1.282785392633079e-06, + "loss": 0.93624628, + "num_input_tokens_seen": 225230135, + "step": 10449, + "time_per_iteration": 2.4659435749053955 + }, + { + "auxiliary_loss_clip": 0.01110851, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.03691053, + "balance_loss_mlp": 1.02132714, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 1.7393072392358535, + "language_loss": 0.59988987, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62133014, + "num_input_tokens_seen": 225253520, + "step": 10450, + "time_per_iteration": 2.6197783946990967 + }, + { + "auxiliary_loss_clip": 0.01092286, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.04382253, + "balance_loss_mlp": 1.02036357, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.5093494803947882, + "language_loss": 0.76892751, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.79018015, + "num_input_tokens_seen": 225272460, + "step": 10451, + "time_per_iteration": 2.5047757625579834 + }, + { + "auxiliary_loss_clip": 0.01089536, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.03651476, + "balance_loss_mlp": 1.02137852, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 1.7611382085440912, + "language_loss": 0.77618206, + "learning_rate": 1.281694841064566e-06, + "loss": 0.79742336, + "num_input_tokens_seen": 225291700, + "step": 10452, + "time_per_iteration": 2.496640205383301 + }, + { + "auxiliary_loss_clip": 0.0108325, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.03814864, + "balance_loss_mlp": 1.02001238, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 1.6656642171054108, + "language_loss": 0.7288394, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74999678, + "num_input_tokens_seen": 225311470, + "step": 10453, + "time_per_iteration": 2.5791380405426025 + }, + { + "auxiliary_loss_clip": 0.01059616, + "auxiliary_loss_mlp": 0.01042567, + "balance_loss_clip": 1.03485787, + "balance_loss_mlp": 1.0271771, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.707518224491423, + "language_loss": 0.8057127, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.82673454, + "num_input_tokens_seen": 225328385, + "step": 10454, + "time_per_iteration": 3.9941518306732178 + }, + { + "auxiliary_loss_clip": 0.01082721, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.04003692, + "balance_loss_mlp": 1.0223484, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 2.287841870404226, + "language_loss": 0.81949067, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84066284, + "num_input_tokens_seen": 225348415, + "step": 10455, + "time_per_iteration": 2.56752347946167 + }, + { + "auxiliary_loss_clip": 0.01065912, + "auxiliary_loss_mlp": 0.00779659, + "balance_loss_clip": 1.03303373, + "balance_loss_mlp": 1.00061393, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 1.5227312579774661, + "language_loss": 0.8174811, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83593678, + "num_input_tokens_seen": 225367740, + "step": 10456, + "time_per_iteration": 2.651563882827759 + }, + { + "auxiliary_loss_clip": 0.01088474, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.04052448, + "balance_loss_mlp": 1.01668024, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.613278484535643, + "language_loss": 0.7182712, + "learning_rate": 1.27987780006486e-06, + "loss": 0.73945713, + "num_input_tokens_seen": 225388405, + "step": 10457, + "time_per_iteration": 2.527066707611084 + }, + { + "auxiliary_loss_clip": 0.01107241, + "auxiliary_loss_mlp": 0.01036231, + "balance_loss_clip": 1.03666615, + "balance_loss_mlp": 1.02286148, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 1.9394817460874347, + "language_loss": 0.79932332, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.8207581, + "num_input_tokens_seen": 225408360, + "step": 10458, + "time_per_iteration": 2.5140182971954346 + }, + { + "auxiliary_loss_clip": 0.01106938, + "auxiliary_loss_mlp": 0.01034329, + "balance_loss_clip": 1.04023814, + "balance_loss_mlp": 1.02118671, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 1.5029301832793809, + "language_loss": 0.61155057, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.6329633, + "num_input_tokens_seen": 225431310, + "step": 10459, + "time_per_iteration": 2.598479986190796 + }, + { + "auxiliary_loss_clip": 0.01090253, + "auxiliary_loss_mlp": 0.01036224, + "balance_loss_clip": 1.03798342, + "balance_loss_mlp": 1.02330744, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 6.310410402637988, + "language_loss": 0.78402579, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.80529058, + "num_input_tokens_seen": 225450385, + "step": 10460, + "time_per_iteration": 2.526898145675659 + }, + { + "auxiliary_loss_clip": 0.01076039, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.03734255, + "balance_loss_mlp": 1.01898837, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 1.760179739185048, + "language_loss": 0.7408796, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.76196659, + "num_input_tokens_seen": 225467325, + "step": 10461, + "time_per_iteration": 2.5073554515838623 + }, + { + "auxiliary_loss_clip": 0.01091929, + "auxiliary_loss_mlp": 0.01036389, + "balance_loss_clip": 1.03804302, + "balance_loss_mlp": 1.02355599, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 1.5939729225074066, + "language_loss": 0.70197928, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72326249, + "num_input_tokens_seen": 225487370, + "step": 10462, + "time_per_iteration": 2.530430793762207 + }, + { + "auxiliary_loss_clip": 0.01107669, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.03850138, + "balance_loss_mlp": 1.02293575, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 2.156639515388199, + "language_loss": 0.71767187, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.73909175, + "num_input_tokens_seen": 225506915, + "step": 10463, + "time_per_iteration": 2.5031797885894775 + }, + { + "auxiliary_loss_clip": 0.01093174, + "auxiliary_loss_mlp": 0.01035358, + "balance_loss_clip": 1.04203653, + "balance_loss_mlp": 1.0228653, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 1.7407399580850913, + "language_loss": 0.72482663, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.74611199, + "num_input_tokens_seen": 225525670, + "step": 10464, + "time_per_iteration": 4.1799750328063965 + }, + { + "auxiliary_loss_clip": 0.01084012, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.03784764, + "balance_loss_mlp": 1.02267826, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.62526192549285, + "language_loss": 0.69572544, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.716914, + "num_input_tokens_seen": 225542235, + "step": 10465, + "time_per_iteration": 2.576416015625 + }, + { + "auxiliary_loss_clip": 0.01025409, + "auxiliary_loss_mlp": 0.01000754, + "balance_loss_clip": 1.01084304, + "balance_loss_mlp": 0.99958545, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.6774268360145214, + "language_loss": 0.59788191, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.61814356, + "num_input_tokens_seen": 225607185, + "step": 10466, + "time_per_iteration": 3.1555721759796143 + }, + { + "auxiliary_loss_clip": 0.01075766, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.03792644, + "balance_loss_mlp": 1.01771069, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 1.941937866728727, + "language_loss": 0.65018463, + "learning_rate": 1.276245767820154e-06, + "loss": 0.67123497, + "num_input_tokens_seen": 225628785, + "step": 10467, + "time_per_iteration": 2.7133901119232178 + }, + { + "auxiliary_loss_clip": 0.01014394, + "auxiliary_loss_mlp": 0.01004915, + "balance_loss_clip": 1.00961268, + "balance_loss_mlp": 1.00374722, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.815832897846789, + "language_loss": 0.56872123, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.58891433, + "num_input_tokens_seen": 225678980, + "step": 10468, + "time_per_iteration": 4.210232257843018 + }, + { + "auxiliary_loss_clip": 0.00997969, + "auxiliary_loss_mlp": 0.01000958, + "balance_loss_clip": 1.01350594, + "balance_loss_mlp": 0.99987966, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7287124175423352, + "language_loss": 0.57934368, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.59933293, + "num_input_tokens_seen": 225740295, + "step": 10469, + "time_per_iteration": 3.084803581237793 + }, + { + "auxiliary_loss_clip": 0.01032833, + "auxiliary_loss_mlp": 0.01001813, + "balance_loss_clip": 1.02441621, + "balance_loss_mlp": 1.00045991, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6757826602587098, + "language_loss": 0.52101213, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54135859, + "num_input_tokens_seen": 225805615, + "step": 10470, + "time_per_iteration": 3.1038718223571777 + }, + { + "auxiliary_loss_clip": 0.0109658, + "auxiliary_loss_mlp": 0.01032581, + "balance_loss_clip": 1.03775859, + "balance_loss_mlp": 1.02045774, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.700580065929958, + "language_loss": 0.74075347, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76204503, + "num_input_tokens_seen": 225826585, + "step": 10471, + "time_per_iteration": 2.6672487258911133 + }, + { + "auxiliary_loss_clip": 0.01085474, + "auxiliary_loss_mlp": 0.01028043, + "balance_loss_clip": 1.04034519, + "balance_loss_mlp": 1.01596141, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 2.0751677533858377, + "language_loss": 0.63072979, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65186489, + "num_input_tokens_seen": 225844095, + "step": 10472, + "time_per_iteration": 2.4894325733184814 + }, + { + "auxiliary_loss_clip": 0.0111807, + "auxiliary_loss_mlp": 0.01035985, + "balance_loss_clip": 1.04133558, + "balance_loss_mlp": 1.02305079, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 2.765577682447087, + "language_loss": 0.69456923, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71610975, + "num_input_tokens_seen": 225864310, + "step": 10473, + "time_per_iteration": 2.492311954498291 + }, + { + "auxiliary_loss_clip": 0.01087696, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.03443885, + "balance_loss_mlp": 1.01853156, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.596804480643628, + "language_loss": 0.74801487, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.76919711, + "num_input_tokens_seen": 225883830, + "step": 10474, + "time_per_iteration": 2.506622076034546 + }, + { + "auxiliary_loss_clip": 0.01089825, + "auxiliary_loss_mlp": 0.00778177, + "balance_loss_clip": 1.03549671, + "balance_loss_mlp": 1.00073564, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 1.4253123579670348, + "language_loss": 0.66282403, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.68150401, + "num_input_tokens_seen": 225905755, + "step": 10475, + "time_per_iteration": 2.580115795135498 + }, + { + "auxiliary_loss_clip": 0.01067604, + "auxiliary_loss_mlp": 0.01028804, + "balance_loss_clip": 1.03992391, + "balance_loss_mlp": 1.01697218, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 1.7978822517315216, + "language_loss": 0.89911914, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92008317, + "num_input_tokens_seen": 225922155, + "step": 10476, + "time_per_iteration": 2.536740779876709 + }, + { + "auxiliary_loss_clip": 0.01111874, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.03906751, + "balance_loss_mlp": 1.02252638, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 2.063505257365411, + "language_loss": 0.75044435, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.77190733, + "num_input_tokens_seen": 225941060, + "step": 10477, + "time_per_iteration": 2.459193468093872 + }, + { + "auxiliary_loss_clip": 0.01101271, + "auxiliary_loss_mlp": 0.01032114, + "balance_loss_clip": 1.04098487, + "balance_loss_mlp": 1.01889324, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 1.740997374267277, + "language_loss": 0.70364559, + "learning_rate": 1.272253702758138e-06, + "loss": 0.72497952, + "num_input_tokens_seen": 225960870, + "step": 10478, + "time_per_iteration": 4.097209930419922 + }, + { + "auxiliary_loss_clip": 0.01108841, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.03967023, + "balance_loss_mlp": 1.0177176, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.7055418938247042, + "language_loss": 0.67569894, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.69710648, + "num_input_tokens_seen": 225977895, + "step": 10479, + "time_per_iteration": 2.5078976154327393 + }, + { + "auxiliary_loss_clip": 0.01087758, + "auxiliary_loss_mlp": 0.00779026, + "balance_loss_clip": 1.03746712, + "balance_loss_mlp": 1.00068641, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 1.825680814292951, + "language_loss": 0.73926568, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.7579335, + "num_input_tokens_seen": 225997835, + "step": 10480, + "time_per_iteration": 2.536539077758789 + }, + { + "auxiliary_loss_clip": 0.01102083, + "auxiliary_loss_mlp": 0.01035308, + "balance_loss_clip": 1.03744745, + "balance_loss_mlp": 1.02159309, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 2.06542985211995, + "language_loss": 0.79065204, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.8120259, + "num_input_tokens_seen": 226017620, + "step": 10481, + "time_per_iteration": 2.4624645709991455 + }, + { + "auxiliary_loss_clip": 0.01018386, + "auxiliary_loss_mlp": 0.01007347, + "balance_loss_clip": 1.01178849, + "balance_loss_mlp": 1.00612473, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.8935501301358709, + "language_loss": 0.61797035, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63822764, + "num_input_tokens_seen": 226068755, + "step": 10482, + "time_per_iteration": 2.8390986919403076 + }, + { + "auxiliary_loss_clip": 0.01108371, + "auxiliary_loss_mlp": 0.01035756, + "balance_loss_clip": 1.03928685, + "balance_loss_mlp": 1.02176106, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 5.502231590611366, + "language_loss": 0.82757783, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.84901911, + "num_input_tokens_seen": 226084395, + "step": 10483, + "time_per_iteration": 2.43489670753479 + }, + { + "auxiliary_loss_clip": 0.01094049, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.04033518, + "balance_loss_mlp": 1.01993608, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.7133064683162416, + "language_loss": 0.72302783, + "learning_rate": 1.270077618961487e-06, + "loss": 0.74428642, + "num_input_tokens_seen": 226105890, + "step": 10484, + "time_per_iteration": 2.518012762069702 + }, + { + "auxiliary_loss_clip": 0.01086031, + "auxiliary_loss_mlp": 0.01027116, + "balance_loss_clip": 1.04010439, + "balance_loss_mlp": 1.01440191, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 1.7674944102782457, + "language_loss": 0.74652505, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.76765645, + "num_input_tokens_seen": 226126760, + "step": 10485, + "time_per_iteration": 2.687774896621704 + }, + { + "auxiliary_loss_clip": 0.01091424, + "auxiliary_loss_mlp": 0.00779726, + "balance_loss_clip": 1.04028738, + "balance_loss_mlp": 1.00072265, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.6271022780165023, + "language_loss": 0.8171221, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83583361, + "num_input_tokens_seen": 226147315, + "step": 10486, + "time_per_iteration": 2.618626356124878 + }, + { + "auxiliary_loss_clip": 0.01089429, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.03806007, + "balance_loss_mlp": 1.02403498, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 1.6243394676973708, + "language_loss": 0.63406634, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.655321, + "num_input_tokens_seen": 226165935, + "step": 10487, + "time_per_iteration": 2.4880926609039307 + }, + { + "auxiliary_loss_clip": 0.01113594, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.0393033, + "balance_loss_mlp": 1.02462125, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.6885571376701458, + "language_loss": 0.67039084, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69189435, + "num_input_tokens_seen": 226186890, + "step": 10488, + "time_per_iteration": 2.4747726917266846 + }, + { + "auxiliary_loss_clip": 0.0109339, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.03809798, + "balance_loss_mlp": 1.01841378, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 1.9624553845507893, + "language_loss": 0.67249864, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69373822, + "num_input_tokens_seen": 226206710, + "step": 10489, + "time_per_iteration": 2.556131601333618 + }, + { + "auxiliary_loss_clip": 0.0107919, + "auxiliary_loss_mlp": 0.0104041, + "balance_loss_clip": 1.03835964, + "balance_loss_mlp": 1.02530587, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.839888065935479, + "language_loss": 0.69111967, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.71231568, + "num_input_tokens_seen": 226225565, + "step": 10490, + "time_per_iteration": 2.5413689613342285 + }, + { + "auxiliary_loss_clip": 0.01094023, + "auxiliary_loss_mlp": 0.01037075, + "balance_loss_clip": 1.03829384, + "balance_loss_mlp": 1.02379465, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 1.8691907833923458, + "language_loss": 0.78492981, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.8062408, + "num_input_tokens_seen": 226243680, + "step": 10491, + "time_per_iteration": 2.5776634216308594 + }, + { + "auxiliary_loss_clip": 0.01088858, + "auxiliary_loss_mlp": 0.01035334, + "balance_loss_clip": 1.0373528, + "balance_loss_mlp": 1.02308512, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 1.9791081203731344, + "language_loss": 0.56103683, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.58227879, + "num_input_tokens_seen": 226264345, + "step": 10492, + "time_per_iteration": 4.068300008773804 + }, + { + "auxiliary_loss_clip": 0.01115415, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.03928375, + "balance_loss_mlp": 1.02405953, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 1.831469330160584, + "language_loss": 0.64631248, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.66784251, + "num_input_tokens_seen": 226283165, + "step": 10493, + "time_per_iteration": 2.4578771591186523 + }, + { + "auxiliary_loss_clip": 0.01082083, + "auxiliary_loss_mlp": 0.0103177, + "balance_loss_clip": 1.04257345, + "balance_loss_mlp": 1.01894879, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.5444166746844028, + "language_loss": 0.82930672, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.85044527, + "num_input_tokens_seen": 226304080, + "step": 10494, + "time_per_iteration": 2.6001765727996826 + }, + { + "auxiliary_loss_clip": 0.01099707, + "auxiliary_loss_mlp": 0.01035457, + "balance_loss_clip": 1.04214072, + "balance_loss_mlp": 1.02264166, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 1.8546540264687474, + "language_loss": 0.79482913, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.81618083, + "num_input_tokens_seen": 226325925, + "step": 10495, + "time_per_iteration": 2.7006075382232666 + }, + { + "auxiliary_loss_clip": 0.01086605, + "auxiliary_loss_mlp": 0.0103631, + "balance_loss_clip": 1.03576374, + "balance_loss_mlp": 1.02233899, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 1.8934459410171105, + "language_loss": 0.70135903, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72258818, + "num_input_tokens_seen": 226344190, + "step": 10496, + "time_per_iteration": 2.4851737022399902 + }, + { + "auxiliary_loss_clip": 0.01096289, + "auxiliary_loss_mlp": 0.01038787, + "balance_loss_clip": 1.03944552, + "balance_loss_mlp": 1.02546549, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 2.022363027125812, + "language_loss": 0.80541384, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.82676458, + "num_input_tokens_seen": 226361520, + "step": 10497, + "time_per_iteration": 2.5442090034484863 + }, + { + "auxiliary_loss_clip": 0.01085666, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.04043591, + "balance_loss_mlp": 1.01923633, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 1.8394288103551641, + "language_loss": 0.73786545, + "learning_rate": 1.265003970256247e-06, + "loss": 0.75903106, + "num_input_tokens_seen": 226381920, + "step": 10498, + "time_per_iteration": 2.5357167720794678 + }, + { + "auxiliary_loss_clip": 0.01104203, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.03826284, + "balance_loss_mlp": 1.02118862, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 1.9107121165093481, + "language_loss": 0.69465196, + "learning_rate": 1.264641775364217e-06, + "loss": 0.71603346, + "num_input_tokens_seen": 226400035, + "step": 10499, + "time_per_iteration": 2.482039213180542 + }, + { + "auxiliary_loss_clip": 0.01104292, + "auxiliary_loss_mlp": 0.01041971, + "balance_loss_clip": 1.04237437, + "balance_loss_mlp": 1.02965045, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 1.7266350796326155, + "language_loss": 0.69917727, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.72063982, + "num_input_tokens_seen": 226418280, + "step": 10500, + "time_per_iteration": 2.5151619911193848 + }, + { + "auxiliary_loss_clip": 0.01114436, + "auxiliary_loss_mlp": 0.01039196, + "balance_loss_clip": 1.04009938, + "balance_loss_mlp": 1.02683961, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 1.848095798592318, + "language_loss": 0.74496949, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76650584, + "num_input_tokens_seen": 226436650, + "step": 10501, + "time_per_iteration": 2.424511432647705 + }, + { + "auxiliary_loss_clip": 0.01101844, + "auxiliary_loss_mlp": 0.00777428, + "balance_loss_clip": 1.03902137, + "balance_loss_mlp": 1.00056362, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 2.1096904643278744, + "language_loss": 0.75676531, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77555799, + "num_input_tokens_seen": 226456275, + "step": 10502, + "time_per_iteration": 2.5056097507476807 + }, + { + "auxiliary_loss_clip": 0.01107118, + "auxiliary_loss_mlp": 0.01048648, + "balance_loss_clip": 1.04042339, + "balance_loss_mlp": 1.03529024, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 1.888571317782368, + "language_loss": 0.85622883, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87778652, + "num_input_tokens_seen": 226473610, + "step": 10503, + "time_per_iteration": 2.48848557472229 + }, + { + "auxiliary_loss_clip": 0.01088784, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.04278719, + "balance_loss_mlp": 1.01882803, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 2.262108568289452, + "language_loss": 0.86703622, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88824111, + "num_input_tokens_seen": 226493665, + "step": 10504, + "time_per_iteration": 4.065830230712891 + }, + { + "auxiliary_loss_clip": 0.01082464, + "auxiliary_loss_mlp": 0.0103721, + "balance_loss_clip": 1.03980422, + "balance_loss_mlp": 1.02298284, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.702327214869143, + "language_loss": 0.76085097, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78204769, + "num_input_tokens_seen": 226511625, + "step": 10505, + "time_per_iteration": 2.572291851043701 + }, + { + "auxiliary_loss_clip": 0.0107344, + "auxiliary_loss_mlp": 0.01035723, + "balance_loss_clip": 1.03746545, + "balance_loss_mlp": 1.02211487, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 1.8216685532276453, + "language_loss": 0.82048714, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.84157872, + "num_input_tokens_seen": 226530085, + "step": 10506, + "time_per_iteration": 2.6104867458343506 + }, + { + "auxiliary_loss_clip": 0.01115626, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.04003, + "balance_loss_mlp": 1.02447438, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 1.7975448291653826, + "language_loss": 0.74495083, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76648849, + "num_input_tokens_seen": 226548115, + "step": 10507, + "time_per_iteration": 3.8238682746887207 + }, + { + "auxiliary_loss_clip": 0.01097942, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.04556537, + "balance_loss_mlp": 1.02345622, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 5.4428469352748845, + "language_loss": 0.67638111, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.69772577, + "num_input_tokens_seen": 226567955, + "step": 10508, + "time_per_iteration": 2.546513319015503 + }, + { + "auxiliary_loss_clip": 0.01083639, + "auxiliary_loss_mlp": 0.0103525, + "balance_loss_clip": 1.03717971, + "balance_loss_mlp": 1.02205992, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 1.7964191267083276, + "language_loss": 0.70783234, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.72902119, + "num_input_tokens_seen": 226588205, + "step": 10509, + "time_per_iteration": 2.592442750930786 + }, + { + "auxiliary_loss_clip": 0.01100148, + "auxiliary_loss_mlp": 0.01028866, + "balance_loss_clip": 1.04219151, + "balance_loss_mlp": 1.01668262, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 1.6022794346544704, + "language_loss": 0.79727978, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81856996, + "num_input_tokens_seen": 226606965, + "step": 10510, + "time_per_iteration": 2.5423882007598877 + }, + { + "auxiliary_loss_clip": 0.01073805, + "auxiliary_loss_mlp": 0.00779362, + "balance_loss_clip": 1.03971326, + "balance_loss_mlp": 1.00063026, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 1.5420018878285777, + "language_loss": 0.70566064, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.72419232, + "num_input_tokens_seen": 226627845, + "step": 10511, + "time_per_iteration": 2.608504056930542 + }, + { + "auxiliary_loss_clip": 0.01112147, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.03952026, + "balance_loss_mlp": 1.02243054, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 1.828530267376034, + "language_loss": 0.80259001, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82406044, + "num_input_tokens_seen": 226645855, + "step": 10512, + "time_per_iteration": 2.4450795650482178 + }, + { + "auxiliary_loss_clip": 0.01105753, + "auxiliary_loss_mlp": 0.01034623, + "balance_loss_clip": 1.04123497, + "balance_loss_mlp": 1.02118194, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 1.7200335722121671, + "language_loss": 0.70519388, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.72659767, + "num_input_tokens_seen": 226665375, + "step": 10513, + "time_per_iteration": 2.528411865234375 + }, + { + "auxiliary_loss_clip": 0.01108135, + "auxiliary_loss_mlp": 0.01037314, + "balance_loss_clip": 1.03960347, + "balance_loss_mlp": 1.02356875, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 7.738523145415243, + "language_loss": 0.66512537, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68657982, + "num_input_tokens_seen": 226685270, + "step": 10514, + "time_per_iteration": 2.560140609741211 + }, + { + "auxiliary_loss_clip": 0.01083391, + "auxiliary_loss_mlp": 0.01033253, + "balance_loss_clip": 1.0360961, + "balance_loss_mlp": 1.02064097, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 1.760330487080527, + "language_loss": 0.74188566, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76305211, + "num_input_tokens_seen": 226705325, + "step": 10515, + "time_per_iteration": 2.6515378952026367 + }, + { + "auxiliary_loss_clip": 0.01085899, + "auxiliary_loss_mlp": 0.01030024, + "balance_loss_clip": 1.03892446, + "balance_loss_mlp": 1.01784658, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 1.8243646421624937, + "language_loss": 0.89648235, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.91764158, + "num_input_tokens_seen": 226723815, + "step": 10516, + "time_per_iteration": 2.6114304065704346 + }, + { + "auxiliary_loss_clip": 0.01125188, + "auxiliary_loss_mlp": 0.01033757, + "balance_loss_clip": 1.04569459, + "balance_loss_mlp": 1.01839685, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 1.6573493664278158, + "language_loss": 0.82186717, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84345663, + "num_input_tokens_seen": 226741550, + "step": 10517, + "time_per_iteration": 2.5256450176239014 + }, + { + "auxiliary_loss_clip": 0.01059249, + "auxiliary_loss_mlp": 0.01035428, + "balance_loss_clip": 1.03795338, + "balance_loss_mlp": 1.0227083, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 1.6258438895560883, + "language_loss": 0.77527791, + "learning_rate": 1.257765386189541e-06, + "loss": 0.79622471, + "num_input_tokens_seen": 226761115, + "step": 10518, + "time_per_iteration": 4.1410253047943115 + }, + { + "auxiliary_loss_clip": 0.01101517, + "auxiliary_loss_mlp": 0.01031836, + "balance_loss_clip": 1.0431217, + "balance_loss_mlp": 1.01919985, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.520144949289862, + "language_loss": 0.85205787, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87339145, + "num_input_tokens_seen": 226782225, + "step": 10519, + "time_per_iteration": 2.5175013542175293 + }, + { + "auxiliary_loss_clip": 0.0108916, + "auxiliary_loss_mlp": 0.01033408, + "balance_loss_clip": 1.03803921, + "balance_loss_mlp": 1.02109981, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 1.4944036068369964, + "language_loss": 0.71657032, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.73779601, + "num_input_tokens_seen": 226802375, + "step": 10520, + "time_per_iteration": 2.521469831466675 + }, + { + "auxiliary_loss_clip": 0.01101739, + "auxiliary_loss_mlp": 0.01032425, + "balance_loss_clip": 1.03810227, + "balance_loss_mlp": 1.01986015, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 1.7323866579110108, + "language_loss": 0.71681821, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.73815989, + "num_input_tokens_seen": 226822165, + "step": 10521, + "time_per_iteration": 2.440316915512085 + }, + { + "auxiliary_loss_clip": 0.01080673, + "auxiliary_loss_mlp": 0.01037159, + "balance_loss_clip": 1.04162288, + "balance_loss_mlp": 1.02231741, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 1.7423967089763077, + "language_loss": 0.71817076, + "learning_rate": 1.256319016853377e-06, + "loss": 0.73934907, + "num_input_tokens_seen": 226841645, + "step": 10522, + "time_per_iteration": 2.543135166168213 + }, + { + "auxiliary_loss_clip": 0.010742, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.04365182, + "balance_loss_mlp": 1.02264595, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 1.9880984267295625, + "language_loss": 0.81510913, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.83619952, + "num_input_tokens_seen": 226860355, + "step": 10523, + "time_per_iteration": 2.5723941326141357 + }, + { + "auxiliary_loss_clip": 0.01103083, + "auxiliary_loss_mlp": 0.01028869, + "balance_loss_clip": 1.03948808, + "balance_loss_mlp": 1.01630473, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 2.0798016860950295, + "language_loss": 0.74186099, + "learning_rate": 1.255596001333195e-06, + "loss": 0.76318049, + "num_input_tokens_seen": 226878390, + "step": 10524, + "time_per_iteration": 2.458411455154419 + }, + { + "auxiliary_loss_clip": 0.01101172, + "auxiliary_loss_mlp": 0.01040906, + "balance_loss_clip": 1.04103553, + "balance_loss_mlp": 1.02691042, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 2.915648225344674, + "language_loss": 0.84860015, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.87002087, + "num_input_tokens_seen": 226898420, + "step": 10525, + "time_per_iteration": 2.568281650543213 + }, + { + "auxiliary_loss_clip": 0.01084705, + "auxiliary_loss_mlp": 0.01029646, + "balance_loss_clip": 1.03366721, + "balance_loss_mlp": 1.01597285, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 1.5950064730008875, + "language_loss": 0.66894239, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.69008589, + "num_input_tokens_seen": 226916305, + "step": 10526, + "time_per_iteration": 2.534367084503174 + }, + { + "auxiliary_loss_clip": 0.0110799, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.04143727, + "balance_loss_mlp": 1.02300048, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 1.6871232579990632, + "language_loss": 0.7361182, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75756741, + "num_input_tokens_seen": 226937705, + "step": 10527, + "time_per_iteration": 2.4981393814086914 + }, + { + "auxiliary_loss_clip": 0.01101476, + "auxiliary_loss_mlp": 0.01033233, + "balance_loss_clip": 1.03992081, + "balance_loss_mlp": 1.02109742, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 1.9643460763650848, + "language_loss": 0.7201851, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.74153215, + "num_input_tokens_seen": 226954880, + "step": 10528, + "time_per_iteration": 2.4521820545196533 + }, + { + "auxiliary_loss_clip": 0.01099292, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.03713596, + "balance_loss_mlp": 1.01721311, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 1.9881909076593753, + "language_loss": 0.66980159, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.69110072, + "num_input_tokens_seen": 226972595, + "step": 10529, + "time_per_iteration": 2.424220323562622 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.04128647, + "balance_loss_mlp": 1.01912761, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 1.8626669701095502, + "language_loss": 0.75192368, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.77335036, + "num_input_tokens_seen": 226991910, + "step": 10530, + "time_per_iteration": 2.495701313018799 + }, + { + "auxiliary_loss_clip": 0.01103905, + "auxiliary_loss_mlp": 0.00778026, + "balance_loss_clip": 1.03996897, + "balance_loss_mlp": 1.00066733, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 2.4505684503380696, + "language_loss": 0.7384094, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.75722867, + "num_input_tokens_seen": 227010175, + "step": 10531, + "time_per_iteration": 2.486778736114502 + }, + { + "auxiliary_loss_clip": 0.01079017, + "auxiliary_loss_mlp": 0.01030537, + "balance_loss_clip": 1.0404284, + "balance_loss_mlp": 1.01777565, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 2.1479289946812234, + "language_loss": 0.7972542, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.81834972, + "num_input_tokens_seen": 227025540, + "step": 10532, + "time_per_iteration": 3.9826316833496094 + }, + { + "auxiliary_loss_clip": 0.01100705, + "auxiliary_loss_mlp": 0.01030215, + "balance_loss_clip": 1.0391674, + "balance_loss_mlp": 1.01896155, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 1.5828141284763535, + "language_loss": 0.75013047, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.77143967, + "num_input_tokens_seen": 227045520, + "step": 10533, + "time_per_iteration": 2.482729911804199 + }, + { + "auxiliary_loss_clip": 0.01096635, + "auxiliary_loss_mlp": 0.01038586, + "balance_loss_clip": 1.04318321, + "balance_loss_mlp": 1.02423358, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 4.880303822543284, + "language_loss": 0.77107906, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.79243129, + "num_input_tokens_seen": 227059420, + "step": 10534, + "time_per_iteration": 2.4593005180358887 + }, + { + "auxiliary_loss_clip": 0.01082551, + "auxiliary_loss_mlp": 0.01038002, + "balance_loss_clip": 1.0390929, + "balance_loss_mlp": 1.0244714, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.5152278646235358, + "language_loss": 0.85622835, + "learning_rate": 1.251621437204777e-06, + "loss": 0.8774339, + "num_input_tokens_seen": 227081310, + "step": 10535, + "time_per_iteration": 2.583383798599243 + }, + { + "auxiliary_loss_clip": 0.01108389, + "auxiliary_loss_mlp": 0.01032885, + "balance_loss_clip": 1.04157829, + "balance_loss_mlp": 1.01956952, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 1.76499991631959, + "language_loss": 0.76791263, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78932536, + "num_input_tokens_seen": 227100365, + "step": 10536, + "time_per_iteration": 2.543300151824951 + }, + { + "auxiliary_loss_clip": 0.01101258, + "auxiliary_loss_mlp": 0.01034209, + "balance_loss_clip": 1.04066503, + "balance_loss_mlp": 1.02052379, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 1.7921622437661928, + "language_loss": 0.60152656, + "learning_rate": 1.250899157568855e-06, + "loss": 0.62288117, + "num_input_tokens_seen": 227119680, + "step": 10537, + "time_per_iteration": 2.61923885345459 + }, + { + "auxiliary_loss_clip": 0.0101, + "auxiliary_loss_mlp": 0.01007546, + "balance_loss_clip": 1.01484203, + "balance_loss_mlp": 1.00630665, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.7768640580596895, + "language_loss": 0.52442443, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54459989, + "num_input_tokens_seen": 227184465, + "step": 10538, + "time_per_iteration": 3.249096393585205 + }, + { + "auxiliary_loss_clip": 0.01097601, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.04243588, + "balance_loss_mlp": 1.0198946, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 1.8669530380658683, + "language_loss": 0.83504385, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85635585, + "num_input_tokens_seen": 227202185, + "step": 10539, + "time_per_iteration": 2.5188536643981934 + }, + { + "auxiliary_loss_clip": 0.01087676, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.03676653, + "balance_loss_mlp": 1.02085268, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.4860625366482512, + "language_loss": 0.86654687, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.88777065, + "num_input_tokens_seen": 227222020, + "step": 10540, + "time_per_iteration": 2.5383265018463135 + }, + { + "auxiliary_loss_clip": 0.01088121, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.03786564, + "balance_loss_mlp": 1.02280653, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 1.588406471919647, + "language_loss": 0.72123849, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74245656, + "num_input_tokens_seen": 227240885, + "step": 10541, + "time_per_iteration": 2.54836368560791 + }, + { + "auxiliary_loss_clip": 0.01108446, + "auxiliary_loss_mlp": 0.01035581, + "balance_loss_clip": 1.04032266, + "balance_loss_mlp": 1.02146697, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 2.7594309975855724, + "language_loss": 0.84720373, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.868644, + "num_input_tokens_seen": 227257880, + "step": 10542, + "time_per_iteration": 2.5573043823242188 + }, + { + "auxiliary_loss_clip": 0.0110372, + "auxiliary_loss_mlp": 0.01033678, + "balance_loss_clip": 1.03978777, + "balance_loss_mlp": 1.01970053, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 1.6963728167909113, + "language_loss": 0.77750278, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.79887682, + "num_input_tokens_seen": 227274840, + "step": 10543, + "time_per_iteration": 3.9922285079956055 + }, + { + "auxiliary_loss_clip": 0.01061044, + "auxiliary_loss_mlp": 0.01037486, + "balance_loss_clip": 1.03856802, + "balance_loss_mlp": 1.02551746, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.549553387436605, + "language_loss": 0.73983878, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.76082402, + "num_input_tokens_seen": 227294835, + "step": 10544, + "time_per_iteration": 2.590332269668579 + }, + { + "auxiliary_loss_clip": 0.0109098, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.04067552, + "balance_loss_mlp": 1.02227354, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 1.9824177184334812, + "language_loss": 0.68648791, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70775384, + "num_input_tokens_seen": 227314935, + "step": 10545, + "time_per_iteration": 2.530226707458496 + }, + { + "auxiliary_loss_clip": 0.01091155, + "auxiliary_loss_mlp": 0.01037731, + "balance_loss_clip": 1.03753078, + "balance_loss_mlp": 1.02473116, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 2.0423932797892372, + "language_loss": 0.71596551, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.73725438, + "num_input_tokens_seen": 227332905, + "step": 10546, + "time_per_iteration": 3.877960681915283 + }, + { + "auxiliary_loss_clip": 0.01096507, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.03833783, + "balance_loss_mlp": 1.02072012, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.2916313869548643, + "language_loss": 0.77708447, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.79837704, + "num_input_tokens_seen": 227354915, + "step": 10547, + "time_per_iteration": 2.5126702785491943 + }, + { + "auxiliary_loss_clip": 0.01075532, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.03679144, + "balance_loss_mlp": 1.02289343, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.7714961649423981, + "language_loss": 0.63186866, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.65298128, + "num_input_tokens_seen": 227372990, + "step": 10548, + "time_per_iteration": 2.5594983100891113 + }, + { + "auxiliary_loss_clip": 0.01090306, + "auxiliary_loss_mlp": 0.01035635, + "balance_loss_clip": 1.03702044, + "balance_loss_mlp": 1.02268934, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 1.5318687722723017, + "language_loss": 0.62010443, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.64136386, + "num_input_tokens_seen": 227393270, + "step": 10549, + "time_per_iteration": 2.5562586784362793 + }, + { + "auxiliary_loss_clip": 0.01061673, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.03573537, + "balance_loss_mlp": 1.02117443, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 1.872367306402421, + "language_loss": 0.73738182, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.75832617, + "num_input_tokens_seen": 227413630, + "step": 10550, + "time_per_iteration": 2.6306025981903076 + }, + { + "auxiliary_loss_clip": 0.01005523, + "auxiliary_loss_mlp": 0.01000832, + "balance_loss_clip": 1.01354599, + "balance_loss_mlp": 0.99964589, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6978155676001726, + "language_loss": 0.57727486, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.5973385, + "num_input_tokens_seen": 227476630, + "step": 10551, + "time_per_iteration": 3.12237811088562 + }, + { + "auxiliary_loss_clip": 0.01079312, + "auxiliary_loss_mlp": 0.01029074, + "balance_loss_clip": 1.04149795, + "balance_loss_mlp": 1.01753426, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 1.8542886250630204, + "language_loss": 0.67137408, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69245791, + "num_input_tokens_seen": 227496060, + "step": 10552, + "time_per_iteration": 2.553361654281616 + }, + { + "auxiliary_loss_clip": 0.01079853, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.03908896, + "balance_loss_mlp": 1.01718915, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 1.7179978612752553, + "language_loss": 0.81886971, + "learning_rate": 1.24512502014147e-06, + "loss": 0.83997071, + "num_input_tokens_seen": 227513440, + "step": 10553, + "time_per_iteration": 2.5681755542755127 + }, + { + "auxiliary_loss_clip": 0.01102406, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.03771734, + "balance_loss_mlp": 1.02129138, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 2.040224542331178, + "language_loss": 0.54715037, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.56851375, + "num_input_tokens_seen": 227535395, + "step": 10554, + "time_per_iteration": 2.6423909664154053 + }, + { + "auxiliary_loss_clip": 0.01095549, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.04046285, + "balance_loss_mlp": 1.02101946, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 2.771503152993721, + "language_loss": 0.70840508, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.72969592, + "num_input_tokens_seen": 227554545, + "step": 10555, + "time_per_iteration": 2.5395267009735107 + }, + { + "auxiliary_loss_clip": 0.01018228, + "auxiliary_loss_mlp": 0.01003905, + "balance_loss_clip": 1.01224256, + "balance_loss_mlp": 1.00291014, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.7846765271350994, + "language_loss": 0.55423546, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57445681, + "num_input_tokens_seen": 227608575, + "step": 10556, + "time_per_iteration": 3.0061728954315186 + }, + { + "auxiliary_loss_clip": 0.01092244, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.03612041, + "balance_loss_mlp": 1.02121258, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 1.6818806450472858, + "language_loss": 0.68229258, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70356315, + "num_input_tokens_seen": 227628175, + "step": 10557, + "time_per_iteration": 2.5569007396698 + }, + { + "auxiliary_loss_clip": 0.01083702, + "auxiliary_loss_mlp": 0.01036832, + "balance_loss_clip": 1.03594816, + "balance_loss_mlp": 1.02405286, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.6856031082600136, + "language_loss": 0.70684355, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.72804892, + "num_input_tokens_seen": 227645330, + "step": 10558, + "time_per_iteration": 4.065519332885742 + }, + { + "auxiliary_loss_clip": 0.01086203, + "auxiliary_loss_mlp": 0.01033176, + "balance_loss_clip": 1.0419836, + "balance_loss_mlp": 1.02043247, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.6262622952991455, + "language_loss": 0.77845919, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.79965299, + "num_input_tokens_seen": 227665250, + "step": 10559, + "time_per_iteration": 2.5248899459838867 + }, + { + "auxiliary_loss_clip": 0.01089882, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.03590417, + "balance_loss_mlp": 1.02586079, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 1.8269706451317926, + "language_loss": 0.68312019, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70441002, + "num_input_tokens_seen": 227685070, + "step": 10560, + "time_per_iteration": 2.6017801761627197 + }, + { + "auxiliary_loss_clip": 0.01085411, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_clip": 1.03495777, + "balance_loss_mlp": 1.0318383, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 1.6327610412562348, + "language_loss": 0.77122194, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.79253197, + "num_input_tokens_seen": 227704430, + "step": 10561, + "time_per_iteration": 2.5389368534088135 + }, + { + "auxiliary_loss_clip": 0.01091256, + "auxiliary_loss_mlp": 0.01031081, + "balance_loss_clip": 1.0376817, + "balance_loss_mlp": 1.01861823, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 1.7774969421713183, + "language_loss": 0.71640342, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.73762679, + "num_input_tokens_seen": 227724920, + "step": 10562, + "time_per_iteration": 2.5508272647857666 + }, + { + "auxiliary_loss_clip": 0.01100796, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.04135728, + "balance_loss_mlp": 1.01934981, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 1.8684532349388403, + "language_loss": 0.80390441, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.82524002, + "num_input_tokens_seen": 227743400, + "step": 10563, + "time_per_iteration": 2.4471628665924072 + }, + { + "auxiliary_loss_clip": 0.0108906, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.04318881, + "balance_loss_mlp": 1.01919484, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 11.896133921293009, + "language_loss": 0.81501484, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.83622456, + "num_input_tokens_seen": 227759990, + "step": 10564, + "time_per_iteration": 2.5970051288604736 + }, + { + "auxiliary_loss_clip": 0.01087987, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.04196596, + "balance_loss_mlp": 1.0241729, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.555387860650649, + "language_loss": 0.72720063, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74846637, + "num_input_tokens_seen": 227780835, + "step": 10565, + "time_per_iteration": 2.6082310676574707 + }, + { + "auxiliary_loss_clip": 0.0109901, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.03986681, + "balance_loss_mlp": 1.02030718, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 2.189594547613507, + "language_loss": 0.68868178, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71001446, + "num_input_tokens_seen": 227798580, + "step": 10566, + "time_per_iteration": 2.4966137409210205 + }, + { + "auxiliary_loss_clip": 0.01100321, + "auxiliary_loss_mlp": 0.01032141, + "balance_loss_clip": 1.03929806, + "balance_loss_mlp": 1.02014887, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.7004417838504986, + "language_loss": 0.698259, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.71958351, + "num_input_tokens_seen": 227819210, + "step": 10567, + "time_per_iteration": 2.5154082775115967 + }, + { + "auxiliary_loss_clip": 0.01097642, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.04087222, + "balance_loss_mlp": 1.01850116, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 1.6540297667268244, + "language_loss": 0.84667683, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86795682, + "num_input_tokens_seen": 227838340, + "step": 10568, + "time_per_iteration": 2.5031661987304688 + }, + { + "auxiliary_loss_clip": 0.01055804, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.03824902, + "balance_loss_mlp": 1.02637589, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 1.6909191670517814, + "language_loss": 0.84425306, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.86521101, + "num_input_tokens_seen": 227859170, + "step": 10569, + "time_per_iteration": 2.671990394592285 + }, + { + "auxiliary_loss_clip": 0.01099651, + "auxiliary_loss_mlp": 0.01028724, + "balance_loss_clip": 1.03801715, + "balance_loss_mlp": 1.01605749, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.9505940809071527, + "language_loss": 0.69468927, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71597302, + "num_input_tokens_seen": 227878545, + "step": 10570, + "time_per_iteration": 2.4660253524780273 + }, + { + "auxiliary_loss_clip": 0.01103159, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.03766644, + "balance_loss_mlp": 1.01888955, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 1.9356470546923896, + "language_loss": 0.65511918, + "learning_rate": 1.2386378775476e-06, + "loss": 0.67646706, + "num_input_tokens_seen": 227898875, + "step": 10571, + "time_per_iteration": 4.012876749038696 + }, + { + "auxiliary_loss_clip": 0.01109499, + "auxiliary_loss_mlp": 0.01026882, + "balance_loss_clip": 1.04216135, + "balance_loss_mlp": 1.01430511, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.8326073498631592, + "language_loss": 0.70985806, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.7312218, + "num_input_tokens_seen": 227917130, + "step": 10572, + "time_per_iteration": 2.4504175186157227 + }, + { + "auxiliary_loss_clip": 0.01086201, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.04205716, + "balance_loss_mlp": 1.02050757, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 1.657287664942543, + "language_loss": 0.81441522, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83559954, + "num_input_tokens_seen": 227939550, + "step": 10573, + "time_per_iteration": 2.619842529296875 + }, + { + "auxiliary_loss_clip": 0.01097167, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.04154038, + "balance_loss_mlp": 1.01740551, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 2.23673545922841, + "language_loss": 0.68996775, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71123618, + "num_input_tokens_seen": 227962200, + "step": 10574, + "time_per_iteration": 2.728400468826294 + }, + { + "auxiliary_loss_clip": 0.01113159, + "auxiliary_loss_mlp": 0.01030831, + "balance_loss_clip": 1.04074514, + "balance_loss_mlp": 1.01835585, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.1726988464390407, + "language_loss": 0.86657727, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.88801712, + "num_input_tokens_seen": 227979270, + "step": 10575, + "time_per_iteration": 2.4006783962249756 + }, + { + "auxiliary_loss_clip": 0.01112654, + "auxiliary_loss_mlp": 0.0103439, + "balance_loss_clip": 1.03991961, + "balance_loss_mlp": 1.02233815, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 1.6354087753848399, + "language_loss": 0.71916294, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74063337, + "num_input_tokens_seen": 228000550, + "step": 10576, + "time_per_iteration": 2.4887070655822754 + }, + { + "auxiliary_loss_clip": 0.01090394, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.0373975, + "balance_loss_mlp": 1.01614332, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 1.6329010728832996, + "language_loss": 0.69492638, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71611887, + "num_input_tokens_seen": 228022005, + "step": 10577, + "time_per_iteration": 2.54413104057312 + }, + { + "auxiliary_loss_clip": 0.01076899, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.04189813, + "balance_loss_mlp": 1.01978803, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.694780198881628, + "language_loss": 0.72214848, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.74323279, + "num_input_tokens_seen": 228043770, + "step": 10578, + "time_per_iteration": 2.759279727935791 + }, + { + "auxiliary_loss_clip": 0.01015705, + "auxiliary_loss_mlp": 0.00755176, + "balance_loss_clip": 1.01794362, + "balance_loss_mlp": 1.00037754, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.6988339938569593, + "language_loss": 0.54426521, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56197405, + "num_input_tokens_seen": 228104985, + "step": 10579, + "time_per_iteration": 3.201059341430664 + }, + { + "auxiliary_loss_clip": 0.01089854, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.03676891, + "balance_loss_mlp": 1.01930571, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 2.1660237604148715, + "language_loss": 0.77616268, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.79738533, + "num_input_tokens_seen": 228125620, + "step": 10580, + "time_per_iteration": 2.539364814758301 + }, + { + "auxiliary_loss_clip": 0.01084357, + "auxiliary_loss_mlp": 0.00777863, + "balance_loss_clip": 1.03913665, + "balance_loss_mlp": 1.00068474, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 2.0115956397293773, + "language_loss": 0.66402328, + "learning_rate": 1.235037946268301e-06, + "loss": 0.68264544, + "num_input_tokens_seen": 228143495, + "step": 10581, + "time_per_iteration": 2.6043951511383057 + }, + { + "auxiliary_loss_clip": 0.01098596, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.0387361, + "balance_loss_mlp": 1.0184505, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.4516390016436604, + "language_loss": 0.6819768, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.7032665, + "num_input_tokens_seen": 228166500, + "step": 10582, + "time_per_iteration": 2.5440142154693604 + }, + { + "auxiliary_loss_clip": 0.0108563, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.03904152, + "balance_loss_mlp": 1.02545106, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 1.9568258101425957, + "language_loss": 0.84934258, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.87057441, + "num_input_tokens_seen": 228185325, + "step": 10583, + "time_per_iteration": 4.03338623046875 + }, + { + "auxiliary_loss_clip": 0.01092237, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.04321074, + "balance_loss_mlp": 1.01740098, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.9540776391460608, + "language_loss": 0.7512666, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77248859, + "num_input_tokens_seen": 228204050, + "step": 10584, + "time_per_iteration": 3.9922940731048584 + }, + { + "auxiliary_loss_clip": 0.01091082, + "auxiliary_loss_mlp": 0.01036642, + "balance_loss_clip": 1.04070377, + "balance_loss_mlp": 1.02319479, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 1.931179148586776, + "language_loss": 0.72800064, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.74927789, + "num_input_tokens_seen": 228222430, + "step": 10585, + "time_per_iteration": 2.4889214038848877 + }, + { + "auxiliary_loss_clip": 0.010811, + "auxiliary_loss_mlp": 0.0102842, + "balance_loss_clip": 1.04231548, + "balance_loss_mlp": 1.01690435, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 3.7884051201494, + "language_loss": 0.82747912, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.84857428, + "num_input_tokens_seen": 228241925, + "step": 10586, + "time_per_iteration": 2.5466537475585938 + }, + { + "auxiliary_loss_clip": 0.01099846, + "auxiliary_loss_mlp": 0.01025212, + "balance_loss_clip": 1.03808558, + "balance_loss_mlp": 1.01339817, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 1.5022036693300096, + "language_loss": 0.72358602, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74483657, + "num_input_tokens_seen": 228262535, + "step": 10587, + "time_per_iteration": 2.494086980819702 + }, + { + "auxiliary_loss_clip": 0.01089317, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.03797221, + "balance_loss_mlp": 1.0157187, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 2.868247251757304, + "language_loss": 0.76976669, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.79093683, + "num_input_tokens_seen": 228281340, + "step": 10588, + "time_per_iteration": 2.5019145011901855 + }, + { + "auxiliary_loss_clip": 0.01063654, + "auxiliary_loss_mlp": 0.01029776, + "balance_loss_clip": 1.03485918, + "balance_loss_mlp": 1.01667464, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.4939631815113665, + "language_loss": 0.800448, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82138228, + "num_input_tokens_seen": 228300865, + "step": 10589, + "time_per_iteration": 2.562948226928711 + }, + { + "auxiliary_loss_clip": 0.01091625, + "auxiliary_loss_mlp": 0.01035787, + "balance_loss_clip": 1.03726006, + "balance_loss_mlp": 1.02332354, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 1.8886266036966228, + "language_loss": 0.67095244, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69222659, + "num_input_tokens_seen": 228320815, + "step": 10590, + "time_per_iteration": 2.5404670238494873 + }, + { + "auxiliary_loss_clip": 0.0110956, + "auxiliary_loss_mlp": 0.01035182, + "balance_loss_clip": 1.0408566, + "balance_loss_mlp": 1.02234364, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 1.7011411829163257, + "language_loss": 0.78561938, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.80706692, + "num_input_tokens_seen": 228339065, + "step": 10591, + "time_per_iteration": 2.4380557537078857 + }, + { + "auxiliary_loss_clip": 0.01090625, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.03857028, + "balance_loss_mlp": 1.01948309, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.4393816716562742, + "language_loss": 0.88950902, + "learning_rate": 1.231081372744317e-06, + "loss": 0.9107303, + "num_input_tokens_seen": 228359210, + "step": 10592, + "time_per_iteration": 2.546762704849243 + }, + { + "auxiliary_loss_clip": 0.01098875, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.03718889, + "balance_loss_mlp": 1.01900125, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.3406153754496375, + "language_loss": 0.68111062, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70240366, + "num_input_tokens_seen": 228379630, + "step": 10593, + "time_per_iteration": 2.522444725036621 + }, + { + "auxiliary_loss_clip": 0.01061575, + "auxiliary_loss_mlp": 0.01034105, + "balance_loss_clip": 1.03048754, + "balance_loss_mlp": 1.02127182, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 2.2318598884739966, + "language_loss": 0.63858116, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.65953797, + "num_input_tokens_seen": 228401410, + "step": 10594, + "time_per_iteration": 2.7684390544891357 + }, + { + "auxiliary_loss_clip": 0.01030282, + "auxiliary_loss_mlp": 0.0100383, + "balance_loss_clip": 1.01932538, + "balance_loss_mlp": 1.00257826, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7598314226182129, + "language_loss": 0.54611135, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56645244, + "num_input_tokens_seen": 228470335, + "step": 10595, + "time_per_iteration": 3.185525894165039 + }, + { + "auxiliary_loss_clip": 0.01117169, + "auxiliary_loss_mlp": 0.01039359, + "balance_loss_clip": 1.04135346, + "balance_loss_mlp": 1.02609122, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 1.9273939897150316, + "language_loss": 0.66936004, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.69092536, + "num_input_tokens_seen": 228490765, + "step": 10596, + "time_per_iteration": 3.997115135192871 + }, + { + "auxiliary_loss_clip": 0.01099437, + "auxiliary_loss_mlp": 0.01034285, + "balance_loss_clip": 1.03898585, + "balance_loss_mlp": 1.02175617, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 2.11024648869731, + "language_loss": 0.78831208, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.80964929, + "num_input_tokens_seen": 228509700, + "step": 10597, + "time_per_iteration": 2.468578577041626 + }, + { + "auxiliary_loss_clip": 0.01104465, + "auxiliary_loss_mlp": 0.01031176, + "balance_loss_clip": 1.04124463, + "balance_loss_mlp": 1.0194813, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 1.7746244954054313, + "language_loss": 0.74578583, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.76714224, + "num_input_tokens_seen": 228529050, + "step": 10598, + "time_per_iteration": 2.4816031455993652 + }, + { + "auxiliary_loss_clip": 0.01083548, + "auxiliary_loss_mlp": 0.00778512, + "balance_loss_clip": 1.03989959, + "balance_loss_mlp": 1.00062895, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 1.847408124772469, + "language_loss": 0.68704593, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70566648, + "num_input_tokens_seen": 228544665, + "step": 10599, + "time_per_iteration": 2.5076351165771484 + }, + { + "auxiliary_loss_clip": 0.01076245, + "auxiliary_loss_mlp": 0.01034044, + "balance_loss_clip": 1.0352571, + "balance_loss_mlp": 1.02101421, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 2.3512655180578834, + "language_loss": 0.80539781, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.82650071, + "num_input_tokens_seen": 228562060, + "step": 10600, + "time_per_iteration": 2.4958503246307373 + }, + { + "auxiliary_loss_clip": 0.01100854, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.03708494, + "balance_loss_mlp": 1.02194595, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.4606301644635535, + "language_loss": 0.79931539, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82066494, + "num_input_tokens_seen": 228582550, + "step": 10601, + "time_per_iteration": 2.4878640174865723 + }, + { + "auxiliary_loss_clip": 0.01082722, + "auxiliary_loss_mlp": 0.01026358, + "balance_loss_clip": 1.04044414, + "balance_loss_mlp": 1.01424658, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 4.361447097540244, + "language_loss": 0.67525876, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.69634956, + "num_input_tokens_seen": 228604960, + "step": 10602, + "time_per_iteration": 2.594005584716797 + }, + { + "auxiliary_loss_clip": 0.010421, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.03423142, + "balance_loss_mlp": 1.01710057, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 1.6464545556574537, + "language_loss": 0.79596817, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81668431, + "num_input_tokens_seen": 228622195, + "step": 10603, + "time_per_iteration": 2.6274497509002686 + }, + { + "auxiliary_loss_clip": 0.01071779, + "auxiliary_loss_mlp": 0.00776924, + "balance_loss_clip": 1.03928828, + "balance_loss_mlp": 1.00066376, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 2.0600291006949787, + "language_loss": 0.76560938, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.78409636, + "num_input_tokens_seen": 228639735, + "step": 10604, + "time_per_iteration": 2.5671794414520264 + }, + { + "auxiliary_loss_clip": 0.01093951, + "auxiliary_loss_mlp": 0.01028867, + "balance_loss_clip": 1.03665185, + "balance_loss_mlp": 1.01611185, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.643757750186765, + "language_loss": 0.76871032, + "learning_rate": 1.226409972197281e-06, + "loss": 0.78993851, + "num_input_tokens_seen": 228658195, + "step": 10605, + "time_per_iteration": 2.5064120292663574 + }, + { + "auxiliary_loss_clip": 0.01056135, + "auxiliary_loss_mlp": 0.01034744, + "balance_loss_clip": 1.03600359, + "balance_loss_mlp": 1.0190382, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 1.6934346428600724, + "language_loss": 0.65563083, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.67653966, + "num_input_tokens_seen": 228677415, + "step": 10606, + "time_per_iteration": 2.601003646850586 + }, + { + "auxiliary_loss_clip": 0.01087051, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.03794873, + "balance_loss_mlp": 1.02266335, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 1.62089826243417, + "language_loss": 0.75321317, + "learning_rate": 1.225691734459971e-06, + "loss": 0.774427, + "num_input_tokens_seen": 228696450, + "step": 10607, + "time_per_iteration": 2.496033191680908 + }, + { + "auxiliary_loss_clip": 0.010897, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.03982615, + "balance_loss_mlp": 1.02269471, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 1.7857171822670799, + "language_loss": 0.65968966, + "learning_rate": 1.225332659627278e-06, + "loss": 0.68093514, + "num_input_tokens_seen": 228721600, + "step": 10608, + "time_per_iteration": 2.7900185585021973 + }, + { + "auxiliary_loss_clip": 0.00984351, + "auxiliary_loss_mlp": 0.01015021, + "balance_loss_clip": 1.0142827, + "balance_loss_mlp": 1.01386452, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7306305108130886, + "language_loss": 0.51890838, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.53890216, + "num_input_tokens_seen": 228784535, + "step": 10609, + "time_per_iteration": 3.30395245552063 + }, + { + "auxiliary_loss_clip": 0.01096799, + "auxiliary_loss_mlp": 0.01025397, + "balance_loss_clip": 1.03532517, + "balance_loss_mlp": 1.01441813, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.6493946274976543, + "language_loss": 0.74728662, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.76850855, + "num_input_tokens_seen": 228804110, + "step": 10610, + "time_per_iteration": 4.963692903518677 + }, + { + "auxiliary_loss_clip": 0.01019166, + "auxiliary_loss_mlp": 0.01003078, + "balance_loss_clip": 1.01502693, + "balance_loss_mlp": 1.00194001, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8590858777023015, + "language_loss": 0.6318509, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65207338, + "num_input_tokens_seen": 228867705, + "step": 10611, + "time_per_iteration": 3.2217884063720703 + }, + { + "auxiliary_loss_clip": 0.01099938, + "auxiliary_loss_mlp": 0.01031723, + "balance_loss_clip": 1.04027033, + "balance_loss_mlp": 1.01928914, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 1.9847562585786986, + "language_loss": 0.72123992, + "learning_rate": 1.223896654187282e-06, + "loss": 0.74255657, + "num_input_tokens_seen": 228889215, + "step": 10612, + "time_per_iteration": 2.5702805519104004 + }, + { + "auxiliary_loss_clip": 0.01021109, + "auxiliary_loss_mlp": 0.01003072, + "balance_loss_clip": 1.01554298, + "balance_loss_mlp": 1.00161159, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7154855819455982, + "language_loss": 0.57892835, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.59917021, + "num_input_tokens_seen": 228948465, + "step": 10613, + "time_per_iteration": 2.9935731887817383 + }, + { + "auxiliary_loss_clip": 0.01071797, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.03550553, + "balance_loss_mlp": 1.01645195, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 1.7191154840952059, + "language_loss": 0.75705516, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77807951, + "num_input_tokens_seen": 228967955, + "step": 10614, + "time_per_iteration": 2.586970567703247 + }, + { + "auxiliary_loss_clip": 0.01093018, + "auxiliary_loss_mlp": 0.00779165, + "balance_loss_clip": 1.03908324, + "balance_loss_mlp": 1.00069809, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 1.8903297772646144, + "language_loss": 0.79436648, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.8130883, + "num_input_tokens_seen": 228985495, + "step": 10615, + "time_per_iteration": 2.5227200984954834 + }, + { + "auxiliary_loss_clip": 0.01021134, + "auxiliary_loss_mlp": 0.01002438, + "balance_loss_clip": 1.01982343, + "balance_loss_mlp": 1.001019, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6607381054420666, + "language_loss": 0.55591416, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57614982, + "num_input_tokens_seen": 229052995, + "step": 10616, + "time_per_iteration": 3.1971139907836914 + }, + { + "auxiliary_loss_clip": 0.01085694, + "auxiliary_loss_mlp": 0.01034954, + "balance_loss_clip": 1.0349232, + "balance_loss_mlp": 1.02185845, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 1.890721103675421, + "language_loss": 0.84454089, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.86574733, + "num_input_tokens_seen": 229071030, + "step": 10617, + "time_per_iteration": 2.48879337310791 + }, + { + "auxiliary_loss_clip": 0.01103089, + "auxiliary_loss_mlp": 0.01039212, + "balance_loss_clip": 1.03898776, + "balance_loss_mlp": 1.02625406, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 1.8157534085086589, + "language_loss": 0.86820233, + "learning_rate": 1.221743529196936e-06, + "loss": 0.88962525, + "num_input_tokens_seen": 229088275, + "step": 10618, + "time_per_iteration": 2.457463026046753 + }, + { + "auxiliary_loss_clip": 0.0106867, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.04232264, + "balance_loss_mlp": 1.02097666, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 3.7783136317180483, + "language_loss": 0.73426998, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75528061, + "num_input_tokens_seen": 229105190, + "step": 10619, + "time_per_iteration": 2.6086320877075195 + }, + { + "auxiliary_loss_clip": 0.01093591, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.03739023, + "balance_loss_mlp": 1.02619624, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 2.177576964219757, + "language_loss": 0.76442218, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78575969, + "num_input_tokens_seen": 229122290, + "step": 10620, + "time_per_iteration": 2.4946248531341553 + }, + { + "auxiliary_loss_clip": 0.0108941, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.03833961, + "balance_loss_mlp": 1.01838017, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 6.499686017028646, + "language_loss": 0.70834553, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.72954679, + "num_input_tokens_seen": 229141620, + "step": 10621, + "time_per_iteration": 2.526634931564331 + }, + { + "auxiliary_loss_clip": 0.01084037, + "auxiliary_loss_mlp": 0.01027941, + "balance_loss_clip": 1.03466845, + "balance_loss_mlp": 1.01675308, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 1.5099188052411792, + "language_loss": 0.77476627, + "learning_rate": 1.220308702586529e-06, + "loss": 0.79588604, + "num_input_tokens_seen": 229161570, + "step": 10622, + "time_per_iteration": 4.175559043884277 + }, + { + "auxiliary_loss_clip": 0.01075329, + "auxiliary_loss_mlp": 0.01034013, + "balance_loss_clip": 1.0382688, + "balance_loss_mlp": 1.02188945, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 2.1573851209142214, + "language_loss": 0.7488749, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.76996833, + "num_input_tokens_seen": 229178465, + "step": 10623, + "time_per_iteration": 2.5329248905181885 + }, + { + "auxiliary_loss_clip": 0.010887, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.03700197, + "balance_loss_mlp": 1.0211488, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 1.7250119594327393, + "language_loss": 0.76819265, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78940892, + "num_input_tokens_seen": 229198975, + "step": 10624, + "time_per_iteration": 3.9966042041778564 + }, + { + "auxiliary_loss_clip": 0.01054843, + "auxiliary_loss_mlp": 0.01036623, + "balance_loss_clip": 1.04000926, + "balance_loss_mlp": 1.0235517, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.771105972317828, + "language_loss": 0.80413854, + "learning_rate": 1.21923289302382e-06, + "loss": 0.82505316, + "num_input_tokens_seen": 229218825, + "step": 10625, + "time_per_iteration": 2.6615426540374756 + }, + { + "auxiliary_loss_clip": 0.0109071, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.03924108, + "balance_loss_mlp": 1.01880419, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 1.771114887609315, + "language_loss": 0.73125821, + "learning_rate": 1.218874349031654e-06, + "loss": 0.75248307, + "num_input_tokens_seen": 229236060, + "step": 10626, + "time_per_iteration": 2.5051944255828857 + }, + { + "auxiliary_loss_clip": 0.010916, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.03643394, + "balance_loss_mlp": 1.01660323, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 1.6987212950224415, + "language_loss": 0.72633207, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74754572, + "num_input_tokens_seen": 229255160, + "step": 10627, + "time_per_iteration": 2.485548734664917 + }, + { + "auxiliary_loss_clip": 0.01096281, + "auxiliary_loss_mlp": 0.01030824, + "balance_loss_clip": 1.04486465, + "balance_loss_mlp": 1.01674509, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 1.8796213648657574, + "language_loss": 0.67187423, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.69314528, + "num_input_tokens_seen": 229278705, + "step": 10628, + "time_per_iteration": 2.6211233139038086 + }, + { + "auxiliary_loss_clip": 0.01108194, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.03820682, + "balance_loss_mlp": 1.02039003, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 1.8101558664752952, + "language_loss": 0.68026626, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.70166981, + "num_input_tokens_seen": 229299990, + "step": 10629, + "time_per_iteration": 2.473759412765503 + }, + { + "auxiliary_loss_clip": 0.01077817, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.03763199, + "balance_loss_mlp": 1.03052592, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 1.4744660730488868, + "language_loss": 0.75769854, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77893448, + "num_input_tokens_seen": 229319230, + "step": 10630, + "time_per_iteration": 2.54988694190979 + }, + { + "auxiliary_loss_clip": 0.01087645, + "auxiliary_loss_mlp": 0.01034199, + "balance_loss_clip": 1.03538108, + "balance_loss_mlp": 1.02260041, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.7639888547497502, + "language_loss": 0.70465446, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.72587299, + "num_input_tokens_seen": 229338600, + "step": 10631, + "time_per_iteration": 2.52699613571167 + }, + { + "auxiliary_loss_clip": 0.01026998, + "auxiliary_loss_mlp": 0.01010177, + "balance_loss_clip": 1.02585506, + "balance_loss_mlp": 1.00887728, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7767990270747394, + "language_loss": 0.6295017, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.6498735, + "num_input_tokens_seen": 229402420, + "step": 10632, + "time_per_iteration": 3.1234607696533203 + }, + { + "auxiliary_loss_clip": 0.01087519, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.03649426, + "balance_loss_mlp": 1.0220325, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 1.8651002683098754, + "language_loss": 0.67025089, + "learning_rate": 1.216365371217893e-06, + "loss": 0.69147193, + "num_input_tokens_seen": 229419185, + "step": 10633, + "time_per_iteration": 2.5206315517425537 + }, + { + "auxiliary_loss_clip": 0.0104905, + "auxiliary_loss_mlp": 0.01027533, + "balance_loss_clip": 1.03944373, + "balance_loss_mlp": 1.01586235, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 1.8979114126397132, + "language_loss": 0.81655049, + "learning_rate": 1.216007064569225e-06, + "loss": 0.83731627, + "num_input_tokens_seen": 229436735, + "step": 10634, + "time_per_iteration": 2.6293258666992188 + }, + { + "auxiliary_loss_clip": 0.01092619, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.04395998, + "balance_loss_mlp": 1.02388787, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 1.5651022192640558, + "language_loss": 0.74829376, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.76959687, + "num_input_tokens_seen": 229455595, + "step": 10635, + "time_per_iteration": 4.041502952575684 + }, + { + "auxiliary_loss_clip": 0.01099104, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.04116631, + "balance_loss_mlp": 1.0198667, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 1.8526915529058585, + "language_loss": 0.71470356, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.7360208, + "num_input_tokens_seen": 229476230, + "step": 10636, + "time_per_iteration": 2.5266616344451904 + }, + { + "auxiliary_loss_clip": 0.01095132, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.0400033, + "balance_loss_mlp": 1.02197278, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 2.0451384296555624, + "language_loss": 0.73363876, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.75494051, + "num_input_tokens_seen": 229494300, + "step": 10637, + "time_per_iteration": 2.502253532409668 + }, + { + "auxiliary_loss_clip": 0.01099645, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.04003239, + "balance_loss_mlp": 1.01630759, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 1.8184175837020817, + "language_loss": 0.77696919, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.79826128, + "num_input_tokens_seen": 229512985, + "step": 10638, + "time_per_iteration": 2.5201282501220703 + }, + { + "auxiliary_loss_clip": 0.01090977, + "auxiliary_loss_mlp": 0.01030804, + "balance_loss_clip": 1.03957391, + "balance_loss_mlp": 1.01833463, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 1.653791130332191, + "language_loss": 0.81761682, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.83883464, + "num_input_tokens_seen": 229534270, + "step": 10639, + "time_per_iteration": 2.5525150299072266 + }, + { + "auxiliary_loss_clip": 0.01021133, + "auxiliary_loss_mlp": 0.01004793, + "balance_loss_clip": 1.01627266, + "balance_loss_mlp": 1.00355339, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8169354706737242, + "language_loss": 0.5904696, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61072886, + "num_input_tokens_seen": 229596455, + "step": 10640, + "time_per_iteration": 3.048579216003418 + }, + { + "auxiliary_loss_clip": 0.01082714, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.03466201, + "balance_loss_mlp": 1.02180195, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 1.9822723980131933, + "language_loss": 0.78475457, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80591464, + "num_input_tokens_seen": 229612860, + "step": 10641, + "time_per_iteration": 2.467555522918701 + }, + { + "auxiliary_loss_clip": 0.01073212, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.04129612, + "balance_loss_mlp": 1.01949298, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 1.5376042272124493, + "language_loss": 0.6331867, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65424669, + "num_input_tokens_seen": 229633960, + "step": 10642, + "time_per_iteration": 2.62450909614563 + }, + { + "auxiliary_loss_clip": 0.01010081, + "auxiliary_loss_mlp": 0.01005393, + "balance_loss_clip": 1.01215291, + "balance_loss_mlp": 1.00396276, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 1.0822899169181353, + "language_loss": 0.55963951, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.57979423, + "num_input_tokens_seen": 229686730, + "step": 10643, + "time_per_iteration": 3.0041632652282715 + }, + { + "auxiliary_loss_clip": 0.0108203, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.03616381, + "balance_loss_mlp": 1.01852679, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 1.8513199645427865, + "language_loss": 0.76864368, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.78977811, + "num_input_tokens_seen": 229704800, + "step": 10644, + "time_per_iteration": 2.5571961402893066 + }, + { + "auxiliary_loss_clip": 0.0108186, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.0411582, + "balance_loss_mlp": 1.02180755, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.570349430034264, + "language_loss": 0.82521045, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84638083, + "num_input_tokens_seen": 229725265, + "step": 10645, + "time_per_iteration": 2.560450315475464 + }, + { + "auxiliary_loss_clip": 0.01108831, + "auxiliary_loss_mlp": 0.01047152, + "balance_loss_clip": 1.03975737, + "balance_loss_mlp": 1.03246534, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 3.0228819725408043, + "language_loss": 0.73926353, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.76082337, + "num_input_tokens_seen": 229744840, + "step": 10646, + "time_per_iteration": 2.5355751514434814 + }, + { + "auxiliary_loss_clip": 0.01083062, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.04180956, + "balance_loss_mlp": 1.01967883, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 2.22793825877303, + "language_loss": 0.80263627, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.82380021, + "num_input_tokens_seen": 229759095, + "step": 10647, + "time_per_iteration": 2.5202198028564453 + }, + { + "auxiliary_loss_clip": 0.01071187, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.03764021, + "balance_loss_mlp": 1.02086484, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 1.5690367578732116, + "language_loss": 0.75767481, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.7787202, + "num_input_tokens_seen": 229777750, + "step": 10648, + "time_per_iteration": 2.6269171237945557 + }, + { + "auxiliary_loss_clip": 0.01089016, + "auxiliary_loss_mlp": 0.01030287, + "balance_loss_clip": 1.03832126, + "balance_loss_mlp": 1.01755595, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 2.7975257399056295, + "language_loss": 0.78410238, + "learning_rate": 1.210636039936138e-06, + "loss": 0.80529547, + "num_input_tokens_seen": 229796785, + "step": 10649, + "time_per_iteration": 4.007266044616699 + }, + { + "auxiliary_loss_clip": 0.01063066, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.04335022, + "balance_loss_mlp": 1.02013838, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 4.9687724888788845, + "language_loss": 0.76053762, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.78149766, + "num_input_tokens_seen": 229815425, + "step": 10650, + "time_per_iteration": 2.5944905281066895 + }, + { + "auxiliary_loss_clip": 0.01116665, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.04170489, + "balance_loss_mlp": 1.01950669, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 1.5529250061871847, + "language_loss": 0.71017563, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.73167324, + "num_input_tokens_seen": 229834545, + "step": 10651, + "time_per_iteration": 2.4566004276275635 + }, + { + "auxiliary_loss_clip": 0.01082156, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.03792822, + "balance_loss_mlp": 1.02510202, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 2.2096904055882423, + "language_loss": 0.63584125, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.65704846, + "num_input_tokens_seen": 229849175, + "step": 10652, + "time_per_iteration": 2.5392279624938965 + }, + { + "auxiliary_loss_clip": 0.01093073, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.03781843, + "balance_loss_mlp": 1.01681232, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 2.024086391846451, + "language_loss": 0.79232848, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81355345, + "num_input_tokens_seen": 229865400, + "step": 10653, + "time_per_iteration": 2.4892635345458984 + }, + { + "auxiliary_loss_clip": 0.01093381, + "auxiliary_loss_mlp": 0.01054438, + "balance_loss_clip": 1.03644395, + "balance_loss_mlp": 1.03764129, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 2.4520075578563416, + "language_loss": 0.70569944, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72717762, + "num_input_tokens_seen": 229882945, + "step": 10654, + "time_per_iteration": 2.519700288772583 + }, + { + "auxiliary_loss_clip": 0.01108677, + "auxiliary_loss_mlp": 0.01043769, + "balance_loss_clip": 1.04130435, + "balance_loss_mlp": 1.02983963, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 2.2120618326927834, + "language_loss": 0.7262181, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.74774253, + "num_input_tokens_seen": 229901590, + "step": 10655, + "time_per_iteration": 2.4931647777557373 + }, + { + "auxiliary_loss_clip": 0.01084991, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.04491162, + "balance_loss_mlp": 1.02394176, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 1.7959555345641212, + "language_loss": 0.8258127, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.8470329, + "num_input_tokens_seen": 229922535, + "step": 10656, + "time_per_iteration": 2.6241037845611572 + }, + { + "auxiliary_loss_clip": 0.0106035, + "auxiliary_loss_mlp": 0.01037082, + "balance_loss_clip": 1.03622389, + "balance_loss_mlp": 1.02474999, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.243965820580936, + "language_loss": 0.722013, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74298733, + "num_input_tokens_seen": 229939575, + "step": 10657, + "time_per_iteration": 2.5595955848693848 + }, + { + "auxiliary_loss_clip": 0.01080541, + "auxiliary_loss_mlp": 0.01039008, + "balance_loss_clip": 1.03633606, + "balance_loss_mlp": 1.02647328, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 2.056601764036046, + "language_loss": 0.77388227, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.7950778, + "num_input_tokens_seen": 229958840, + "step": 10658, + "time_per_iteration": 2.551072359085083 + }, + { + "auxiliary_loss_clip": 0.01118384, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.0415628, + "balance_loss_mlp": 1.0247035, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 1.6892552123146873, + "language_loss": 0.76169753, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78326362, + "num_input_tokens_seen": 229979680, + "step": 10659, + "time_per_iteration": 2.4756720066070557 + }, + { + "auxiliary_loss_clip": 0.01104354, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.04032147, + "balance_loss_mlp": 1.02056479, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 1.77889162901845, + "language_loss": 0.78299612, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80437887, + "num_input_tokens_seen": 229996830, + "step": 10660, + "time_per_iteration": 2.4434478282928467 + }, + { + "auxiliary_loss_clip": 0.01097614, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.04274225, + "balance_loss_mlp": 1.02319384, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 1.7029063436503857, + "language_loss": 0.68631619, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70766973, + "num_input_tokens_seen": 230015115, + "step": 10661, + "time_per_iteration": 4.243564605712891 + }, + { + "auxiliary_loss_clip": 0.01114596, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.04257798, + "balance_loss_mlp": 1.02579045, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.6125309254157714, + "language_loss": 0.76118773, + "learning_rate": 1.205986598033362e-06, + "loss": 0.78271377, + "num_input_tokens_seen": 230035515, + "step": 10662, + "time_per_iteration": 2.4722697734832764 + }, + { + "auxiliary_loss_clip": 0.01099575, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.03730822, + "balance_loss_mlp": 1.02570224, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 3.474690552011396, + "language_loss": 0.70029557, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.72168857, + "num_input_tokens_seen": 230054355, + "step": 10663, + "time_per_iteration": 3.798680067062378 + }, + { + "auxiliary_loss_clip": 0.01083632, + "auxiliary_loss_mlp": 0.01043624, + "balance_loss_clip": 1.04075372, + "balance_loss_mlp": 1.02825189, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 1.9749001795581589, + "language_loss": 0.68210328, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70337588, + "num_input_tokens_seen": 230074605, + "step": 10664, + "time_per_iteration": 2.5647122859954834 + }, + { + "auxiliary_loss_clip": 0.0108958, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.03615785, + "balance_loss_mlp": 1.0210284, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.995838503144128, + "language_loss": 0.66424459, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68547666, + "num_input_tokens_seen": 230093820, + "step": 10665, + "time_per_iteration": 2.5126736164093018 + }, + { + "auxiliary_loss_clip": 0.01104107, + "auxiliary_loss_mlp": 0.0103064, + "balance_loss_clip": 1.04029441, + "balance_loss_mlp": 1.01783061, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.6412627871496808, + "language_loss": 0.64478379, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66613126, + "num_input_tokens_seen": 230114285, + "step": 10666, + "time_per_iteration": 2.490802764892578 + }, + { + "auxiliary_loss_clip": 0.01106084, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.04033065, + "balance_loss_mlp": 1.02095771, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 1.7386311062290285, + "language_loss": 0.71100247, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.73240578, + "num_input_tokens_seen": 230132760, + "step": 10667, + "time_per_iteration": 2.4665629863739014 + }, + { + "auxiliary_loss_clip": 0.01067487, + "auxiliary_loss_mlp": 0.00783806, + "balance_loss_clip": 1.03375208, + "balance_loss_mlp": 1.0007925, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 2.1356742685142485, + "language_loss": 0.77608907, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79460198, + "num_input_tokens_seen": 230149690, + "step": 10668, + "time_per_iteration": 2.7765491008758545 + }, + { + "auxiliary_loss_clip": 0.01108423, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.04335606, + "balance_loss_mlp": 1.02415693, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.5527874138862692, + "language_loss": 0.67475963, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69621426, + "num_input_tokens_seen": 230166950, + "step": 10669, + "time_per_iteration": 2.4952616691589355 + }, + { + "auxiliary_loss_clip": 0.01116521, + "auxiliary_loss_mlp": 0.01041935, + "balance_loss_clip": 1.04597938, + "balance_loss_mlp": 1.02819061, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 2.490509261263805, + "language_loss": 0.78439391, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80597848, + "num_input_tokens_seen": 230184785, + "step": 10670, + "time_per_iteration": 2.4847044944763184 + }, + { + "auxiliary_loss_clip": 0.01082295, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.03969538, + "balance_loss_mlp": 1.01944458, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 2.582971307442506, + "language_loss": 0.88835758, + "learning_rate": 1.20277073264638e-06, + "loss": 0.90951347, + "num_input_tokens_seen": 230201385, + "step": 10671, + "time_per_iteration": 2.50581431388855 + }, + { + "auxiliary_loss_clip": 0.01105445, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.04190636, + "balance_loss_mlp": 1.01726234, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.4848634758699575, + "language_loss": 0.69440562, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71575236, + "num_input_tokens_seen": 230220380, + "step": 10672, + "time_per_iteration": 2.483025074005127 + }, + { + "auxiliary_loss_clip": 0.01111492, + "auxiliary_loss_mlp": 0.01033608, + "balance_loss_clip": 1.04213524, + "balance_loss_mlp": 1.01863551, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 1.8984654613934346, + "language_loss": 0.74482942, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76628041, + "num_input_tokens_seen": 230239845, + "step": 10673, + "time_per_iteration": 2.489311933517456 + }, + { + "auxiliary_loss_clip": 0.0107639, + "auxiliary_loss_mlp": 0.01038234, + "balance_loss_clip": 1.03581142, + "balance_loss_mlp": 1.02372622, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 1.6313331108108915, + "language_loss": 0.69596773, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71711397, + "num_input_tokens_seen": 230262420, + "step": 10674, + "time_per_iteration": 4.082748889923096 + }, + { + "auxiliary_loss_clip": 0.01120657, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.04156578, + "balance_loss_mlp": 1.01840544, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 4.522476449344954, + "language_loss": 0.67080498, + "learning_rate": 1.201342244560338e-06, + "loss": 0.69233346, + "num_input_tokens_seen": 230279950, + "step": 10675, + "time_per_iteration": 2.424525260925293 + }, + { + "auxiliary_loss_clip": 0.01117488, + "auxiliary_loss_mlp": 0.01037162, + "balance_loss_clip": 1.04263043, + "balance_loss_mlp": 1.02434111, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 1.7431797200107761, + "language_loss": 0.66282392, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.6843704, + "num_input_tokens_seen": 230299705, + "step": 10676, + "time_per_iteration": 2.4383177757263184 + }, + { + "auxiliary_loss_clip": 0.01121006, + "auxiliary_loss_mlp": 0.01035436, + "balance_loss_clip": 1.04471457, + "balance_loss_mlp": 1.02114308, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 3.561126683482044, + "language_loss": 0.75986451, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.78142893, + "num_input_tokens_seen": 230320030, + "step": 10677, + "time_per_iteration": 2.4728806018829346 + }, + { + "auxiliary_loss_clip": 0.01019093, + "auxiliary_loss_mlp": 0.01007021, + "balance_loss_clip": 1.01471984, + "balance_loss_mlp": 1.00573921, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.7748291244038608, + "language_loss": 0.60715991, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62742102, + "num_input_tokens_seen": 230381495, + "step": 10678, + "time_per_iteration": 3.152754306793213 + }, + { + "auxiliary_loss_clip": 0.01099544, + "auxiliary_loss_mlp": 0.01036728, + "balance_loss_clip": 1.03726399, + "balance_loss_mlp": 1.02391922, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 1.7401332392774982, + "language_loss": 0.67336857, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69473135, + "num_input_tokens_seen": 230401385, + "step": 10679, + "time_per_iteration": 2.4575889110565186 + }, + { + "auxiliary_loss_clip": 0.01103527, + "auxiliary_loss_mlp": 0.0103905, + "balance_loss_clip": 1.0392741, + "balance_loss_mlp": 1.02399373, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 2.0107740536996372, + "language_loss": 0.72935879, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75078452, + "num_input_tokens_seen": 230421340, + "step": 10680, + "time_per_iteration": 2.483199119567871 + }, + { + "auxiliary_loss_clip": 0.01078438, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.03706622, + "balance_loss_mlp": 1.01818776, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 1.8565287068757301, + "language_loss": 0.6835385, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.7046212, + "num_input_tokens_seen": 230441270, + "step": 10681, + "time_per_iteration": 2.556877851486206 + }, + { + "auxiliary_loss_clip": 0.01113643, + "auxiliary_loss_mlp": 0.01030843, + "balance_loss_clip": 1.04032695, + "balance_loss_mlp": 1.0183022, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 1.7689915328022183, + "language_loss": 0.74585617, + "learning_rate": 1.198843556910427e-06, + "loss": 0.76730096, + "num_input_tokens_seen": 230457455, + "step": 10682, + "time_per_iteration": 2.416062116622925 + }, + { + "auxiliary_loss_clip": 0.01052703, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.03754497, + "balance_loss_mlp": 1.02180505, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.8481511885620656, + "language_loss": 0.79280353, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81366682, + "num_input_tokens_seen": 230478955, + "step": 10683, + "time_per_iteration": 2.6464221477508545 + }, + { + "auxiliary_loss_clip": 0.01115377, + "auxiliary_loss_mlp": 0.0103717, + "balance_loss_clip": 1.04011536, + "balance_loss_mlp": 1.0232408, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.878870001277522, + "language_loss": 0.6737749, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69530034, + "num_input_tokens_seen": 230496425, + "step": 10684, + "time_per_iteration": 2.428420066833496 + }, + { + "auxiliary_loss_clip": 0.01104001, + "auxiliary_loss_mlp": 0.0103349, + "balance_loss_clip": 1.03889954, + "balance_loss_mlp": 1.02024567, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 2.070241106931634, + "language_loss": 0.71378535, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73516023, + "num_input_tokens_seen": 230516245, + "step": 10685, + "time_per_iteration": 2.4960787296295166 + }, + { + "auxiliary_loss_clip": 0.01076791, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.03545451, + "balance_loss_mlp": 1.02066231, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 1.6535963954299076, + "language_loss": 0.75736117, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77845246, + "num_input_tokens_seen": 230534745, + "step": 10686, + "time_per_iteration": 2.5516631603240967 + }, + { + "auxiliary_loss_clip": 0.01081179, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.03987265, + "balance_loss_mlp": 1.01831412, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 2.009970372117467, + "language_loss": 0.68508148, + "learning_rate": 1.197059691144867e-06, + "loss": 0.70621336, + "num_input_tokens_seen": 230555895, + "step": 10687, + "time_per_iteration": 2.5834035873413086 + }, + { + "auxiliary_loss_clip": 0.01090004, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.0368762, + "balance_loss_mlp": 1.02170372, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 2.2698354906690588, + "language_loss": 0.66415966, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.68540817, + "num_input_tokens_seen": 230577460, + "step": 10688, + "time_per_iteration": 2.5476832389831543 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01033313, + "balance_loss_clip": 1.03951097, + "balance_loss_mlp": 1.0200032, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 1.667090511875191, + "language_loss": 0.73007584, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.7515471, + "num_input_tokens_seen": 230595030, + "step": 10689, + "time_per_iteration": 3.9403493404388428 + }, + { + "auxiliary_loss_clip": 0.01098882, + "auxiliary_loss_mlp": 0.01028683, + "balance_loss_clip": 1.04063439, + "balance_loss_mlp": 1.01688766, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 1.8903844506409755, + "language_loss": 0.72056049, + "learning_rate": 1.195989736948226e-06, + "loss": 0.74183619, + "num_input_tokens_seen": 230615135, + "step": 10690, + "time_per_iteration": 2.485578775405884 + }, + { + "auxiliary_loss_clip": 0.01086622, + "auxiliary_loss_mlp": 0.01030319, + "balance_loss_clip": 1.03774261, + "balance_loss_mlp": 1.0175879, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 1.6255069299980027, + "language_loss": 0.77687907, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.7980485, + "num_input_tokens_seen": 230631965, + "step": 10691, + "time_per_iteration": 2.477691650390625 + }, + { + "auxiliary_loss_clip": 0.01094057, + "auxiliary_loss_mlp": 0.01032884, + "balance_loss_clip": 1.03918433, + "balance_loss_mlp": 1.02030134, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 1.5902724085226143, + "language_loss": 0.74083304, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76210248, + "num_input_tokens_seen": 230649565, + "step": 10692, + "time_per_iteration": 2.48107647895813 + }, + { + "auxiliary_loss_clip": 0.0110422, + "auxiliary_loss_mlp": 0.01035989, + "balance_loss_clip": 1.04010534, + "balance_loss_mlp": 1.0235014, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 1.8776599896165787, + "language_loss": 0.61313093, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63453293, + "num_input_tokens_seen": 230669265, + "step": 10693, + "time_per_iteration": 2.48049259185791 + }, + { + "auxiliary_loss_clip": 0.01078601, + "auxiliary_loss_mlp": 0.01026217, + "balance_loss_clip": 1.03958559, + "balance_loss_mlp": 1.01333594, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 1.6023613372377352, + "language_loss": 0.5940994, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.61514759, + "num_input_tokens_seen": 230690575, + "step": 10694, + "time_per_iteration": 2.6307902336120605 + }, + { + "auxiliary_loss_clip": 0.01090058, + "auxiliary_loss_mlp": 0.01031646, + "balance_loss_clip": 1.03748953, + "balance_loss_mlp": 1.01873553, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.377843943345458, + "language_loss": 0.80075562, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.82197261, + "num_input_tokens_seen": 230709420, + "step": 10695, + "time_per_iteration": 2.5193114280700684 + }, + { + "auxiliary_loss_clip": 0.01113627, + "auxiliary_loss_mlp": 0.01041898, + "balance_loss_clip": 1.03863096, + "balance_loss_mlp": 1.0281713, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 1.749165293166427, + "language_loss": 0.73704743, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.75860262, + "num_input_tokens_seen": 230729350, + "step": 10696, + "time_per_iteration": 2.50475811958313 + }, + { + "auxiliary_loss_clip": 0.01077672, + "auxiliary_loss_mlp": 0.01027141, + "balance_loss_clip": 1.03502214, + "balance_loss_mlp": 1.01486206, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 2.054019553920488, + "language_loss": 0.75377572, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.77482378, + "num_input_tokens_seen": 230749220, + "step": 10697, + "time_per_iteration": 2.552515983581543 + }, + { + "auxiliary_loss_clip": 0.01089352, + "auxiliary_loss_mlp": 0.01033262, + "balance_loss_clip": 1.03697395, + "balance_loss_mlp": 1.02072716, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 1.4306984729350614, + "language_loss": 0.65961444, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68084055, + "num_input_tokens_seen": 230770245, + "step": 10698, + "time_per_iteration": 2.642364501953125 + }, + { + "auxiliary_loss_clip": 0.01034305, + "auxiliary_loss_mlp": 0.01003285, + "balance_loss_clip": 1.0099318, + "balance_loss_mlp": 1.00214636, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.8296682823962455, + "language_loss": 0.63430995, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65468585, + "num_input_tokens_seen": 230837030, + "step": 10699, + "time_per_iteration": 2.9974863529205322 + }, + { + "auxiliary_loss_clip": 0.01102688, + "auxiliary_loss_mlp": 0.01026187, + "balance_loss_clip": 1.04046082, + "balance_loss_mlp": 1.01437294, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 1.751500605875288, + "language_loss": 0.6924848, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71377349, + "num_input_tokens_seen": 230856845, + "step": 10700, + "time_per_iteration": 3.9760236740112305 + }, + { + "auxiliary_loss_clip": 0.0111486, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.04017711, + "balance_loss_mlp": 1.01454282, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 1.6591431736733908, + "language_loss": 0.73816812, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.75959229, + "num_input_tokens_seen": 230878785, + "step": 10701, + "time_per_iteration": 2.4691545963287354 + }, + { + "auxiliary_loss_clip": 0.01105239, + "auxiliary_loss_mlp": 0.01030355, + "balance_loss_clip": 1.03847194, + "balance_loss_mlp": 1.01600266, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 2.304245848411283, + "language_loss": 0.82105601, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84241199, + "num_input_tokens_seen": 230895445, + "step": 10702, + "time_per_iteration": 2.4432873725891113 + }, + { + "auxiliary_loss_clip": 0.01084917, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.03651178, + "balance_loss_mlp": 1.02245295, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 3.098726456440082, + "language_loss": 0.7485432, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76974505, + "num_input_tokens_seen": 230911375, + "step": 10703, + "time_per_iteration": 3.8699328899383545 + }, + { + "auxiliary_loss_clip": 0.00994168, + "auxiliary_loss_mlp": 0.01006043, + "balance_loss_clip": 1.02220416, + "balance_loss_mlp": 1.00483263, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.6645458886732105, + "language_loss": 0.54565775, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56565988, + "num_input_tokens_seen": 230975990, + "step": 10704, + "time_per_iteration": 3.172346353530884 + }, + { + "auxiliary_loss_clip": 0.01070221, + "auxiliary_loss_mlp": 0.01024197, + "balance_loss_clip": 1.03773308, + "balance_loss_mlp": 1.01263988, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.6428385140994957, + "language_loss": 0.77043223, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79137641, + "num_input_tokens_seen": 230997110, + "step": 10705, + "time_per_iteration": 2.5488409996032715 + }, + { + "auxiliary_loss_clip": 0.01079086, + "auxiliary_loss_mlp": 0.01036178, + "balance_loss_clip": 1.0360527, + "balance_loss_mlp": 1.02396512, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 1.9407932986547793, + "language_loss": 0.79146481, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81261742, + "num_input_tokens_seen": 231015590, + "step": 10706, + "time_per_iteration": 2.5736565589904785 + }, + { + "auxiliary_loss_clip": 0.01073628, + "auxiliary_loss_mlp": 0.01036704, + "balance_loss_clip": 1.03285003, + "balance_loss_mlp": 1.0221839, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 1.9677451120474356, + "language_loss": 0.80110943, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.82221282, + "num_input_tokens_seen": 231033800, + "step": 10707, + "time_per_iteration": 2.561469316482544 + }, + { + "auxiliary_loss_clip": 0.01102752, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.03886676, + "balance_loss_mlp": 1.0173862, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 1.7712947855971644, + "language_loss": 0.8580904, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.87941784, + "num_input_tokens_seen": 231053160, + "step": 10708, + "time_per_iteration": 2.4999232292175293 + }, + { + "auxiliary_loss_clip": 0.01074069, + "auxiliary_loss_mlp": 0.01040516, + "balance_loss_clip": 1.0430336, + "balance_loss_mlp": 1.02608585, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 5.286725664920955, + "language_loss": 0.65809882, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67924464, + "num_input_tokens_seen": 231069470, + "step": 10709, + "time_per_iteration": 2.5591492652893066 + }, + { + "auxiliary_loss_clip": 0.01112909, + "auxiliary_loss_mlp": 0.01030319, + "balance_loss_clip": 1.03869867, + "balance_loss_mlp": 1.01835012, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 2.266939649307029, + "language_loss": 0.80254257, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.82397491, + "num_input_tokens_seen": 231088205, + "step": 10710, + "time_per_iteration": 2.47337007522583 + }, + { + "auxiliary_loss_clip": 0.01101477, + "auxiliary_loss_mlp": 0.0102938, + "balance_loss_clip": 1.03712702, + "balance_loss_mlp": 1.01694667, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 1.642405170947947, + "language_loss": 0.66203904, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68334764, + "num_input_tokens_seen": 231107850, + "step": 10711, + "time_per_iteration": 2.548973321914673 + }, + { + "auxiliary_loss_clip": 0.01076058, + "auxiliary_loss_mlp": 0.01032389, + "balance_loss_clip": 1.04096031, + "balance_loss_mlp": 1.01944292, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 1.7249211745069528, + "language_loss": 0.78930211, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.81038654, + "num_input_tokens_seen": 231127200, + "step": 10712, + "time_per_iteration": 2.5769803524017334 + }, + { + "auxiliary_loss_clip": 0.01103067, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.03701437, + "balance_loss_mlp": 1.02382076, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 1.6466622997634863, + "language_loss": 0.82696104, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.84836376, + "num_input_tokens_seen": 231146360, + "step": 10713, + "time_per_iteration": 2.4700491428375244 + }, + { + "auxiliary_loss_clip": 0.01112095, + "auxiliary_loss_mlp": 0.01037265, + "balance_loss_clip": 1.04048634, + "balance_loss_mlp": 1.02458692, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.5360254050802842, + "language_loss": 0.78291428, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80440784, + "num_input_tokens_seen": 231168350, + "step": 10714, + "time_per_iteration": 3.981034755706787 + }, + { + "auxiliary_loss_clip": 0.01078852, + "auxiliary_loss_mlp": 0.01028672, + "balance_loss_clip": 1.03584754, + "balance_loss_mlp": 1.01707876, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.4491995993593525, + "language_loss": 0.81709456, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83816981, + "num_input_tokens_seen": 231188385, + "step": 10715, + "time_per_iteration": 2.585132122039795 + }, + { + "auxiliary_loss_clip": 0.01089242, + "auxiliary_loss_mlp": 0.01033117, + "balance_loss_clip": 1.03926516, + "balance_loss_mlp": 1.01977789, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 8.545049579401915, + "language_loss": 0.81271946, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83394307, + "num_input_tokens_seen": 231209880, + "step": 10716, + "time_per_iteration": 2.5693392753601074 + }, + { + "auxiliary_loss_clip": 0.01082656, + "auxiliary_loss_mlp": 0.01035817, + "balance_loss_clip": 1.03855491, + "balance_loss_mlp": 1.02133262, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 5.735630162181252, + "language_loss": 0.7802518, + "learning_rate": 1.186372540666424e-06, + "loss": 0.80143654, + "num_input_tokens_seen": 231230765, + "step": 10717, + "time_per_iteration": 2.6018972396850586 + }, + { + "auxiliary_loss_clip": 0.01111964, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.04029918, + "balance_loss_mlp": 1.02032948, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.6260403236537682, + "language_loss": 0.68191439, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70335978, + "num_input_tokens_seen": 231252350, + "step": 10718, + "time_per_iteration": 2.518249750137329 + }, + { + "auxiliary_loss_clip": 0.01027985, + "auxiliary_loss_mlp": 0.01007356, + "balance_loss_clip": 1.01354194, + "balance_loss_mlp": 1.00614631, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.7498186975108131, + "language_loss": 0.49553737, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51589078, + "num_input_tokens_seen": 231313865, + "step": 10719, + "time_per_iteration": 3.197908639907837 + }, + { + "auxiliary_loss_clip": 0.01117898, + "auxiliary_loss_mlp": 0.01038268, + "balance_loss_clip": 1.0419507, + "balance_loss_mlp": 1.024315, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 1.7277020563286156, + "language_loss": 0.77979904, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80136067, + "num_input_tokens_seen": 231331710, + "step": 10720, + "time_per_iteration": 2.4465620517730713 + }, + { + "auxiliary_loss_clip": 0.0109269, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.03983188, + "balance_loss_mlp": 1.02052951, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 2.287322600574947, + "language_loss": 0.77090621, + "learning_rate": 1.18494967730604e-06, + "loss": 0.79217166, + "num_input_tokens_seen": 231350705, + "step": 10721, + "time_per_iteration": 2.511709451675415 + }, + { + "auxiliary_loss_clip": 0.01075374, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.03646195, + "balance_loss_mlp": 1.01820219, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 3.1954594061717545, + "language_loss": 0.73117977, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.75224507, + "num_input_tokens_seen": 231369550, + "step": 10722, + "time_per_iteration": 2.5476112365722656 + }, + { + "auxiliary_loss_clip": 0.0111106, + "auxiliary_loss_mlp": 0.01029816, + "balance_loss_clip": 1.03897297, + "balance_loss_mlp": 1.01778162, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 1.428027012677476, + "language_loss": 0.78231966, + "learning_rate": 1.184238431012635e-06, + "loss": 0.80372846, + "num_input_tokens_seen": 231389285, + "step": 10723, + "time_per_iteration": 2.4688634872436523 + }, + { + "auxiliary_loss_clip": 0.01104243, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.037251, + "balance_loss_mlp": 1.0241363, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 1.6598537117576242, + "language_loss": 0.58399397, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60541356, + "num_input_tokens_seen": 231408820, + "step": 10724, + "time_per_iteration": 2.511802911758423 + }, + { + "auxiliary_loss_clip": 0.01100821, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.03993583, + "balance_loss_mlp": 1.0211345, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 1.6215200933543426, + "language_loss": 0.83766943, + "learning_rate": 1.183527308454271e-06, + "loss": 0.85900319, + "num_input_tokens_seen": 231428100, + "step": 10725, + "time_per_iteration": 2.4618639945983887 + }, + { + "auxiliary_loss_clip": 0.01088751, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.0350126, + "balance_loss_mlp": 1.02489662, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 4.728646032807686, + "language_loss": 0.82343632, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.84470642, + "num_input_tokens_seen": 231445810, + "step": 10726, + "time_per_iteration": 2.5338079929351807 + }, + { + "auxiliary_loss_clip": 0.01100745, + "auxiliary_loss_mlp": 0.01035687, + "balance_loss_clip": 1.03777778, + "balance_loss_mlp": 1.02225196, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 1.7808028029112002, + "language_loss": 0.81128407, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.8326484, + "num_input_tokens_seen": 231463570, + "step": 10727, + "time_per_iteration": 2.454791784286499 + }, + { + "auxiliary_loss_clip": 0.01113124, + "auxiliary_loss_mlp": 0.01036985, + "balance_loss_clip": 1.04347014, + "balance_loss_mlp": 1.02339554, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 2.4669839886926623, + "language_loss": 0.79092801, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81242907, + "num_input_tokens_seen": 231482155, + "step": 10728, + "time_per_iteration": 4.116517543792725 + }, + { + "auxiliary_loss_clip": 0.01021729, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.03282964, + "balance_loss_mlp": 1.02034926, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 1.6695734964383573, + "language_loss": 0.74054509, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76111102, + "num_input_tokens_seen": 231502465, + "step": 10729, + "time_per_iteration": 2.9870285987854004 + }, + { + "auxiliary_loss_clip": 0.01073081, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.03953755, + "balance_loss_mlp": 1.02164054, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.9752164978246165, + "language_loss": 0.66374546, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68483222, + "num_input_tokens_seen": 231522740, + "step": 10730, + "time_per_iteration": 2.945488929748535 + }, + { + "auxiliary_loss_clip": 0.01053219, + "auxiliary_loss_mlp": 0.01034201, + "balance_loss_clip": 1.03905559, + "balance_loss_mlp": 1.0194906, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 1.5686045511946725, + "language_loss": 0.63829833, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65917253, + "num_input_tokens_seen": 231542050, + "step": 10731, + "time_per_iteration": 2.5991647243499756 + }, + { + "auxiliary_loss_clip": 0.01111501, + "auxiliary_loss_mlp": 0.01033167, + "balance_loss_clip": 1.03831577, + "balance_loss_mlp": 1.02002978, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 1.545271272485205, + "language_loss": 0.67969251, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.70113921, + "num_input_tokens_seen": 231560380, + "step": 10732, + "time_per_iteration": 2.8270151615142822 + }, + { + "auxiliary_loss_clip": 0.01103391, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.04028654, + "balance_loss_mlp": 1.01986158, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 2.694889705191677, + "language_loss": 0.75775397, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.77911103, + "num_input_tokens_seen": 231580810, + "step": 10733, + "time_per_iteration": 2.5847108364105225 + }, + { + "auxiliary_loss_clip": 0.01104752, + "auxiliary_loss_mlp": 0.01039578, + "balance_loss_clip": 1.03880286, + "balance_loss_mlp": 1.02556443, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 3.7625129376966213, + "language_loss": 0.67313123, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.69457453, + "num_input_tokens_seen": 231600585, + "step": 10734, + "time_per_iteration": 2.5155115127563477 + }, + { + "auxiliary_loss_clip": 0.01111551, + "auxiliary_loss_mlp": 0.01043388, + "balance_loss_clip": 1.04160285, + "balance_loss_mlp": 1.03052568, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 1.8560506859108827, + "language_loss": 0.7349149, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.75646424, + "num_input_tokens_seen": 231618765, + "step": 10735, + "time_per_iteration": 2.410480260848999 + }, + { + "auxiliary_loss_clip": 0.01053631, + "auxiliary_loss_mlp": 0.00780786, + "balance_loss_clip": 1.03572047, + "balance_loss_mlp": 1.000911, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 2.101122449611607, + "language_loss": 0.75089681, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.76924098, + "num_input_tokens_seen": 231638525, + "step": 10736, + "time_per_iteration": 2.6519858837127686 + }, + { + "auxiliary_loss_clip": 0.01108587, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.04138184, + "balance_loss_mlp": 1.01715183, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 1.8463048929889074, + "language_loss": 0.70034838, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.72174037, + "num_input_tokens_seen": 231656785, + "step": 10737, + "time_per_iteration": 2.4547958374023438 + }, + { + "auxiliary_loss_clip": 0.01025967, + "auxiliary_loss_mlp": 0.01004189, + "balance_loss_clip": 1.01018524, + "balance_loss_mlp": 1.00303304, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7813171818646975, + "language_loss": 0.58489639, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60519791, + "num_input_tokens_seen": 231719075, + "step": 10738, + "time_per_iteration": 3.1194357872009277 + }, + { + "auxiliary_loss_clip": 0.01078953, + "auxiliary_loss_mlp": 0.01027128, + "balance_loss_clip": 1.03924966, + "balance_loss_mlp": 1.01433682, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 1.9878469106798131, + "language_loss": 0.74700814, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76806891, + "num_input_tokens_seen": 231737810, + "step": 10739, + "time_per_iteration": 2.551400661468506 + }, + { + "auxiliary_loss_clip": 0.01097533, + "auxiliary_loss_mlp": 0.00779264, + "balance_loss_clip": 1.04095542, + "balance_loss_mlp": 1.00099254, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 1.8745484964754378, + "language_loss": 0.72030926, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.73907721, + "num_input_tokens_seen": 231756140, + "step": 10740, + "time_per_iteration": 4.066139459609985 + }, + { + "auxiliary_loss_clip": 0.01019687, + "auxiliary_loss_mlp": 0.01003685, + "balance_loss_clip": 1.01271236, + "balance_loss_mlp": 1.00243318, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 1.7481379362641267, + "language_loss": 0.55293316, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57316685, + "num_input_tokens_seen": 231823665, + "step": 10741, + "time_per_iteration": 3.093096971511841 + }, + { + "auxiliary_loss_clip": 0.01112274, + "auxiliary_loss_mlp": 0.0103568, + "balance_loss_clip": 1.03906024, + "balance_loss_mlp": 1.02323413, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 1.7532605284196798, + "language_loss": 0.80509019, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.82656968, + "num_input_tokens_seen": 231844500, + "step": 10742, + "time_per_iteration": 3.9320333003997803 + }, + { + "auxiliary_loss_clip": 0.01088179, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.03623343, + "balance_loss_mlp": 1.01873183, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 1.5759159255976245, + "language_loss": 0.81535619, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.83655608, + "num_input_tokens_seen": 231864510, + "step": 10743, + "time_per_iteration": 2.528005838394165 + }, + { + "auxiliary_loss_clip": 0.01086657, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.03661382, + "balance_loss_mlp": 1.01673174, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 5.38934361989578, + "language_loss": 0.7167536, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.73790956, + "num_input_tokens_seen": 231881555, + "step": 10744, + "time_per_iteration": 2.4642562866210938 + }, + { + "auxiliary_loss_clip": 0.01113228, + "auxiliary_loss_mlp": 0.01028248, + "balance_loss_clip": 1.03893411, + "balance_loss_mlp": 1.01570153, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 1.6568147508075624, + "language_loss": 0.66023451, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68164921, + "num_input_tokens_seen": 231905945, + "step": 10745, + "time_per_iteration": 2.6408355236053467 + }, + { + "auxiliary_loss_clip": 0.01102267, + "auxiliary_loss_mlp": 0.01036101, + "balance_loss_clip": 1.03805232, + "balance_loss_mlp": 1.0224936, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 2.0466174296629336, + "language_loss": 0.73237622, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.75375986, + "num_input_tokens_seen": 231922535, + "step": 10746, + "time_per_iteration": 2.451552152633667 + }, + { + "auxiliary_loss_clip": 0.01105157, + "auxiliary_loss_mlp": 0.01036867, + "balance_loss_clip": 1.04040873, + "balance_loss_mlp": 1.02464795, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 1.4762781473849094, + "language_loss": 0.6688022, + "learning_rate": 1.175713157660413e-06, + "loss": 0.69022238, + "num_input_tokens_seen": 231944800, + "step": 10747, + "time_per_iteration": 2.5100419521331787 + }, + { + "auxiliary_loss_clip": 0.01087566, + "auxiliary_loss_mlp": 0.01040429, + "balance_loss_clip": 1.04339957, + "balance_loss_mlp": 1.02794158, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 1.616445986180572, + "language_loss": 0.67104256, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69232249, + "num_input_tokens_seen": 231962970, + "step": 10748, + "time_per_iteration": 2.530014753341675 + }, + { + "auxiliary_loss_clip": 0.01118149, + "auxiliary_loss_mlp": 0.01041614, + "balance_loss_clip": 1.04037595, + "balance_loss_mlp": 1.02713025, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 1.7720274361191173, + "language_loss": 0.7594772, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78107476, + "num_input_tokens_seen": 231981195, + "step": 10749, + "time_per_iteration": 2.42842960357666 + }, + { + "auxiliary_loss_clip": 0.01077261, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.03314495, + "balance_loss_mlp": 1.02661467, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 2.368814274667578, + "language_loss": 0.76924574, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79042816, + "num_input_tokens_seen": 232001735, + "step": 10750, + "time_per_iteration": 2.5804054737091064 + }, + { + "auxiliary_loss_clip": 0.01098101, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.03856683, + "balance_loss_mlp": 1.01934922, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 2.28964396717258, + "language_loss": 0.68632269, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70764613, + "num_input_tokens_seen": 232019830, + "step": 10751, + "time_per_iteration": 2.5335612297058105 + }, + { + "auxiliary_loss_clip": 0.01087556, + "auxiliary_loss_mlp": 0.01032879, + "balance_loss_clip": 1.03677416, + "balance_loss_mlp": 1.01892519, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 1.916147399564703, + "language_loss": 0.71261013, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.73381448, + "num_input_tokens_seen": 232039625, + "step": 10752, + "time_per_iteration": 2.5067138671875 + }, + { + "auxiliary_loss_clip": 0.01084848, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.03548241, + "balance_loss_mlp": 1.02633047, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 1.966805637988855, + "language_loss": 0.78604579, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80731535, + "num_input_tokens_seen": 232055855, + "step": 10753, + "time_per_iteration": 3.938720941543579 + }, + { + "auxiliary_loss_clip": 0.01114516, + "auxiliary_loss_mlp": 0.01043227, + "balance_loss_clip": 1.04084206, + "balance_loss_mlp": 1.02991724, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 3.7080454352378043, + "language_loss": 0.84974957, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87132698, + "num_input_tokens_seen": 232073475, + "step": 10754, + "time_per_iteration": 2.4214868545532227 + }, + { + "auxiliary_loss_clip": 0.0109, + "auxiliary_loss_mlp": 0.01035324, + "balance_loss_clip": 1.03615546, + "balance_loss_mlp": 1.02176356, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 2.0122966161112945, + "language_loss": 0.59886712, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.62012029, + "num_input_tokens_seen": 232091090, + "step": 10755, + "time_per_iteration": 2.5022263526916504 + }, + { + "auxiliary_loss_clip": 0.01071266, + "auxiliary_loss_mlp": 0.01037752, + "balance_loss_clip": 1.03642702, + "balance_loss_mlp": 1.02395296, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 2.1795719675417424, + "language_loss": 0.68252152, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.70361173, + "num_input_tokens_seen": 232107320, + "step": 10756, + "time_per_iteration": 2.5310897827148438 + }, + { + "auxiliary_loss_clip": 0.0107885, + "auxiliary_loss_mlp": 0.01036516, + "balance_loss_clip": 1.04070199, + "balance_loss_mlp": 1.02255082, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 2.4024414279189474, + "language_loss": 0.74095118, + "learning_rate": 1.172166263444844e-06, + "loss": 0.76210487, + "num_input_tokens_seen": 232123930, + "step": 10757, + "time_per_iteration": 2.579117774963379 + }, + { + "auxiliary_loss_clip": 0.01061743, + "auxiliary_loss_mlp": 0.01034002, + "balance_loss_clip": 1.03829134, + "balance_loss_mlp": 1.02110982, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 2.1233856115773455, + "language_loss": 0.74700624, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76796371, + "num_input_tokens_seen": 232142905, + "step": 10758, + "time_per_iteration": 2.572716474533081 + }, + { + "auxiliary_loss_clip": 0.01077461, + "auxiliary_loss_mlp": 0.0103412, + "balance_loss_clip": 1.03995156, + "balance_loss_mlp": 1.02005959, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.6405967641359223, + "language_loss": 0.67575955, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.69687533, + "num_input_tokens_seen": 232162230, + "step": 10759, + "time_per_iteration": 2.5659666061401367 + }, + { + "auxiliary_loss_clip": 0.01081199, + "auxiliary_loss_mlp": 0.01041005, + "balance_loss_clip": 1.03391838, + "balance_loss_mlp": 1.02695656, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 1.488131601627332, + "language_loss": 0.7565971, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77781916, + "num_input_tokens_seen": 232182700, + "step": 10760, + "time_per_iteration": 2.562910556793213 + }, + { + "auxiliary_loss_clip": 0.01086764, + "auxiliary_loss_mlp": 0.01036392, + "balance_loss_clip": 1.03462243, + "balance_loss_mlp": 1.02231884, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 1.4890057419530645, + "language_loss": 0.65410835, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.67533988, + "num_input_tokens_seen": 232208235, + "step": 10761, + "time_per_iteration": 2.761685609817505 + }, + { + "auxiliary_loss_clip": 0.01070262, + "auxiliary_loss_mlp": 0.01031624, + "balance_loss_clip": 1.04055727, + "balance_loss_mlp": 1.01842713, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 2.1508282995518067, + "language_loss": 0.69877207, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71979088, + "num_input_tokens_seen": 232228720, + "step": 10762, + "time_per_iteration": 2.5711207389831543 + }, + { + "auxiliary_loss_clip": 0.01118101, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.04060459, + "balance_loss_mlp": 1.01986742, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 9.172995786691054, + "language_loss": 0.82784623, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.84935957, + "num_input_tokens_seen": 232244655, + "step": 10763, + "time_per_iteration": 2.426981210708618 + }, + { + "auxiliary_loss_clip": 0.01034061, + "auxiliary_loss_mlp": 0.01001889, + "balance_loss_clip": 1.00928009, + "balance_loss_mlp": 1.00073266, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.870693967934051, + "language_loss": 0.57782048, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59818, + "num_input_tokens_seen": 232308685, + "step": 10764, + "time_per_iteration": 3.1856236457824707 + }, + { + "auxiliary_loss_clip": 0.01076965, + "auxiliary_loss_mlp": 0.01036559, + "balance_loss_clip": 1.03517866, + "balance_loss_mlp": 1.02334499, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 2.605583073894466, + "language_loss": 0.60974193, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.63087714, + "num_input_tokens_seen": 232327520, + "step": 10765, + "time_per_iteration": 2.618169069290161 + }, + { + "auxiliary_loss_clip": 0.011135, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.03965354, + "balance_loss_mlp": 1.02097213, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 1.656497849814664, + "language_loss": 0.62799835, + "learning_rate": 1.168976742243437e-06, + "loss": 0.64947289, + "num_input_tokens_seen": 232349025, + "step": 10766, + "time_per_iteration": 2.4780972003936768 + }, + { + "auxiliary_loss_clip": 0.0109031, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.03890598, + "balance_loss_mlp": 1.02449751, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 2.157281763580967, + "language_loss": 0.75811148, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.77940536, + "num_input_tokens_seen": 232367835, + "step": 10767, + "time_per_iteration": 2.504102945327759 + }, + { + "auxiliary_loss_clip": 0.01097424, + "auxiliary_loss_mlp": 0.01036797, + "balance_loss_clip": 1.03843951, + "balance_loss_mlp": 1.02382088, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 1.9957807908880858, + "language_loss": 0.77896792, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.80031013, + "num_input_tokens_seen": 232385840, + "step": 10768, + "time_per_iteration": 3.8563921451568604 + }, + { + "auxiliary_loss_clip": 0.01058101, + "auxiliary_loss_mlp": 0.01028835, + "balance_loss_clip": 1.03797567, + "balance_loss_mlp": 1.01562095, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.83328088949917, + "language_loss": 0.71475196, + "learning_rate": 1.167914135250663e-06, + "loss": 0.73562133, + "num_input_tokens_seen": 232406205, + "step": 10769, + "time_per_iteration": 2.582920789718628 + }, + { + "auxiliary_loss_clip": 0.01111347, + "auxiliary_loss_mlp": 0.01034285, + "balance_loss_clip": 1.03986096, + "balance_loss_mlp": 1.02129674, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.916225249845902, + "language_loss": 0.7223019, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.7437582, + "num_input_tokens_seen": 232424995, + "step": 10770, + "time_per_iteration": 2.4104297161102295 + }, + { + "auxiliary_loss_clip": 0.01075478, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.03555083, + "balance_loss_mlp": 1.01797962, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 1.5894101276224293, + "language_loss": 0.73677421, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75785154, + "num_input_tokens_seen": 232445870, + "step": 10771, + "time_per_iteration": 2.665888547897339 + }, + { + "auxiliary_loss_clip": 0.01076244, + "auxiliary_loss_mlp": 0.01038926, + "balance_loss_clip": 1.03388774, + "balance_loss_mlp": 1.02487695, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 1.8902196692040165, + "language_loss": 0.74232668, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.7634784, + "num_input_tokens_seen": 232464285, + "step": 10772, + "time_per_iteration": 2.5494465827941895 + }, + { + "auxiliary_loss_clip": 0.01092435, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.04244173, + "balance_loss_mlp": 1.02286601, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.7142976277235928, + "language_loss": 0.83221108, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85348022, + "num_input_tokens_seen": 232485815, + "step": 10773, + "time_per_iteration": 2.5654263496398926 + }, + { + "auxiliary_loss_clip": 0.0109887, + "auxiliary_loss_mlp": 0.00777876, + "balance_loss_clip": 1.03726292, + "balance_loss_mlp": 1.00101948, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 1.5554032602715606, + "language_loss": 0.78267652, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.80144405, + "num_input_tokens_seen": 232504875, + "step": 10774, + "time_per_iteration": 2.5226638317108154 + }, + { + "auxiliary_loss_clip": 0.01103721, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.03903151, + "balance_loss_mlp": 1.02265596, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 2.1671208942210916, + "language_loss": 0.69189203, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.71328837, + "num_input_tokens_seen": 232521945, + "step": 10775, + "time_per_iteration": 2.4557242393493652 + }, + { + "auxiliary_loss_clip": 0.01076852, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.03481102, + "balance_loss_mlp": 1.02207351, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 1.681243274757437, + "language_loss": 0.65872574, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.67984456, + "num_input_tokens_seen": 232541500, + "step": 10776, + "time_per_iteration": 2.5346009731292725 + }, + { + "auxiliary_loss_clip": 0.01085837, + "auxiliary_loss_mlp": 0.01040023, + "balance_loss_clip": 1.03535008, + "balance_loss_mlp": 1.02569389, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 2.779235663854274, + "language_loss": 0.79124928, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.81250787, + "num_input_tokens_seen": 232559720, + "step": 10777, + "time_per_iteration": 2.4730384349823 + }, + { + "auxiliary_loss_clip": 0.01100128, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.03993177, + "balance_loss_mlp": 1.01880252, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 1.7538607955970111, + "language_loss": 0.73561263, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75693464, + "num_input_tokens_seen": 232579370, + "step": 10778, + "time_per_iteration": 3.968177318572998 + }, + { + "auxiliary_loss_clip": 0.01097494, + "auxiliary_loss_mlp": 0.01029828, + "balance_loss_clip": 1.03678322, + "balance_loss_mlp": 1.01714373, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 1.445241554542636, + "language_loss": 0.78416252, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.80543566, + "num_input_tokens_seen": 232600495, + "step": 10779, + "time_per_iteration": 2.5095651149749756 + }, + { + "auxiliary_loss_clip": 0.01006011, + "auxiliary_loss_mlp": 0.01001686, + "balance_loss_clip": 1.01433444, + "balance_loss_mlp": 1.00039864, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7286305801558073, + "language_loss": 0.59463286, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61470985, + "num_input_tokens_seen": 232663165, + "step": 10780, + "time_per_iteration": 3.0608537197113037 + }, + { + "auxiliary_loss_clip": 0.01041311, + "auxiliary_loss_mlp": 0.01032633, + "balance_loss_clip": 1.04064679, + "balance_loss_mlp": 1.02040243, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 2.3155996261293863, + "language_loss": 0.79758525, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81832469, + "num_input_tokens_seen": 232683385, + "step": 10781, + "time_per_iteration": 2.8145363330841064 + }, + { + "auxiliary_loss_clip": 0.01117741, + "auxiliary_loss_mlp": 0.01034051, + "balance_loss_clip": 1.04116201, + "balance_loss_mlp": 1.01944733, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 2.3991200058444027, + "language_loss": 0.78855848, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.81007648, + "num_input_tokens_seen": 232699095, + "step": 10782, + "time_per_iteration": 4.409960508346558 + }, + { + "auxiliary_loss_clip": 0.01105938, + "auxiliary_loss_mlp": 0.0078006, + "balance_loss_clip": 1.04056633, + "balance_loss_mlp": 1.00091922, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 3.1621700648739006, + "language_loss": 0.64351803, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.66237801, + "num_input_tokens_seen": 232717920, + "step": 10783, + "time_per_iteration": 2.5634055137634277 + }, + { + "auxiliary_loss_clip": 0.01117886, + "auxiliary_loss_mlp": 0.01035995, + "balance_loss_clip": 1.04082489, + "balance_loss_mlp": 1.02159476, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 1.745161043234625, + "language_loss": 0.88390648, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90544534, + "num_input_tokens_seen": 232737605, + "step": 10784, + "time_per_iteration": 2.4661426544189453 + }, + { + "auxiliary_loss_clip": 0.01089303, + "auxiliary_loss_mlp": 0.01031547, + "balance_loss_clip": 1.03909266, + "balance_loss_mlp": 1.01826131, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 2.6264215902736554, + "language_loss": 0.73558176, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75679028, + "num_input_tokens_seen": 232755110, + "step": 10785, + "time_per_iteration": 2.493283271789551 + }, + { + "auxiliary_loss_clip": 0.01077753, + "auxiliary_loss_mlp": 0.01029883, + "balance_loss_clip": 1.03786027, + "balance_loss_mlp": 1.01768219, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.4786942971043333, + "language_loss": 0.69239229, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71346867, + "num_input_tokens_seen": 232779040, + "step": 10786, + "time_per_iteration": 2.6091091632843018 + }, + { + "auxiliary_loss_clip": 0.01077327, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.03942394, + "balance_loss_mlp": 1.02060604, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 1.9971163582481717, + "language_loss": 0.71666247, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73777032, + "num_input_tokens_seen": 232800515, + "step": 10787, + "time_per_iteration": 2.5949833393096924 + }, + { + "auxiliary_loss_clip": 0.01118702, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.04143655, + "balance_loss_mlp": 1.01969731, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 1.8895436259063212, + "language_loss": 0.8428241, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.86434537, + "num_input_tokens_seen": 232818450, + "step": 10788, + "time_per_iteration": 2.4077632427215576 + }, + { + "auxiliary_loss_clip": 0.01078308, + "auxiliary_loss_mlp": 0.01037372, + "balance_loss_clip": 1.0391835, + "balance_loss_mlp": 1.02424097, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 1.8499379742497293, + "language_loss": 0.77118385, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79234058, + "num_input_tokens_seen": 232834785, + "step": 10789, + "time_per_iteration": 2.4967963695526123 + }, + { + "auxiliary_loss_clip": 0.0109753, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.0357548, + "balance_loss_mlp": 1.02015257, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.70860277664947, + "language_loss": 0.75804722, + "learning_rate": 1.160483857897479e-06, + "loss": 0.77934909, + "num_input_tokens_seen": 232856050, + "step": 10790, + "time_per_iteration": 2.627415657043457 + }, + { + "auxiliary_loss_clip": 0.01114348, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.04200423, + "balance_loss_mlp": 1.02208018, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 1.9920086022684003, + "language_loss": 0.60445017, + "learning_rate": 1.160130384362823e-06, + "loss": 0.62593377, + "num_input_tokens_seen": 232873945, + "step": 10791, + "time_per_iteration": 2.413954734802246 + }, + { + "auxiliary_loss_clip": 0.01074395, + "auxiliary_loss_mlp": 0.01032484, + "balance_loss_clip": 1.03702116, + "balance_loss_mlp": 1.01983571, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.9963053253341754, + "language_loss": 0.8602975, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.88136619, + "num_input_tokens_seen": 232892160, + "step": 10792, + "time_per_iteration": 2.5432088375091553 + }, + { + "auxiliary_loss_clip": 0.0109148, + "auxiliary_loss_mlp": 0.01044832, + "balance_loss_clip": 1.03677297, + "balance_loss_mlp": 1.03153968, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 2.071828248197519, + "language_loss": 0.78423917, + "learning_rate": 1.159423532850735e-06, + "loss": 0.80560231, + "num_input_tokens_seen": 232911725, + "step": 10793, + "time_per_iteration": 3.868814706802368 + }, + { + "auxiliary_loss_clip": 0.01081982, + "auxiliary_loss_mlp": 0.01030666, + "balance_loss_clip": 1.03848505, + "balance_loss_mlp": 1.01809573, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 1.941339775127959, + "language_loss": 0.74881399, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.76994044, + "num_input_tokens_seen": 232929085, + "step": 10794, + "time_per_iteration": 2.5531420707702637 + }, + { + "auxiliary_loss_clip": 0.01099043, + "auxiliary_loss_mlp": 0.00778083, + "balance_loss_clip": 1.03616476, + "balance_loss_mlp": 1.00091362, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 1.6588646785914258, + "language_loss": 0.70219028, + "learning_rate": 1.158716808837621e-06, + "loss": 0.72096151, + "num_input_tokens_seen": 232949455, + "step": 10795, + "time_per_iteration": 2.5078423023223877 + }, + { + "auxiliary_loss_clip": 0.01095013, + "auxiliary_loss_mlp": 0.01041526, + "balance_loss_clip": 1.03973055, + "balance_loss_mlp": 1.02712524, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 1.8352637251602928, + "language_loss": 0.53807008, + "learning_rate": 1.158363494676679e-06, + "loss": 0.55943549, + "num_input_tokens_seen": 232969445, + "step": 10796, + "time_per_iteration": 2.5520384311676025 + }, + { + "auxiliary_loss_clip": 0.0110173, + "auxiliary_loss_mlp": 0.01031973, + "balance_loss_clip": 1.03777158, + "balance_loss_mlp": 1.0198617, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 1.6170316625767462, + "language_loss": 0.77679443, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.79813147, + "num_input_tokens_seen": 232988900, + "step": 10797, + "time_per_iteration": 2.5130422115325928 + }, + { + "auxiliary_loss_clip": 0.01063816, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.03776836, + "balance_loss_mlp": 1.01989663, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 1.9847233659916683, + "language_loss": 0.70465386, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.72560644, + "num_input_tokens_seen": 233005060, + "step": 10798, + "time_per_iteration": 2.589268445968628 + }, + { + "auxiliary_loss_clip": 0.0106383, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.03405428, + "balance_loss_mlp": 1.01800513, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.779566827541754, + "language_loss": 0.76895571, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.78989524, + "num_input_tokens_seen": 233023375, + "step": 10799, + "time_per_iteration": 2.5792887210845947 + }, + { + "auxiliary_loss_clip": 0.01104786, + "auxiliary_loss_mlp": 0.01034758, + "balance_loss_clip": 1.04253149, + "balance_loss_mlp": 1.02105522, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 1.8290987477135374, + "language_loss": 0.72080141, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.74219686, + "num_input_tokens_seen": 233043130, + "step": 10800, + "time_per_iteration": 2.511979341506958 + }, + { + "auxiliary_loss_clip": 0.01026266, + "auxiliary_loss_mlp": 0.01017282, + "balance_loss_clip": 1.00999177, + "balance_loss_mlp": 1.01587546, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.81211748049461, + "language_loss": 0.60238945, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62282503, + "num_input_tokens_seen": 233110560, + "step": 10801, + "time_per_iteration": 3.1450533866882324 + }, + { + "auxiliary_loss_clip": 0.01104218, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.03916335, + "balance_loss_mlp": 1.02694559, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 1.7593220558150884, + "language_loss": 0.78246641, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80391622, + "num_input_tokens_seen": 233130080, + "step": 10802, + "time_per_iteration": 2.502530097961426 + }, + { + "auxiliary_loss_clip": 0.01114591, + "auxiliary_loss_mlp": 0.01040506, + "balance_loss_clip": 1.03939724, + "balance_loss_mlp": 1.02688003, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.6109521189995584, + "language_loss": 0.74771827, + "learning_rate": 1.155891189918541e-06, + "loss": 0.76926923, + "num_input_tokens_seen": 233150235, + "step": 10803, + "time_per_iteration": 2.468128204345703 + }, + { + "auxiliary_loss_clip": 0.0105351, + "auxiliary_loss_mlp": 0.01033674, + "balance_loss_clip": 1.03636956, + "balance_loss_mlp": 1.02137744, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 2.2666637039403144, + "language_loss": 0.70452869, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.72540057, + "num_input_tokens_seen": 233166710, + "step": 10804, + "time_per_iteration": 2.681121587753296 + }, + { + "auxiliary_loss_clip": 0.01097536, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.03759098, + "balance_loss_mlp": 1.02115631, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 1.790070392833882, + "language_loss": 0.73010385, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.75142491, + "num_input_tokens_seen": 233185445, + "step": 10805, + "time_per_iteration": 2.4881410598754883 + }, + { + "auxiliary_loss_clip": 0.01085747, + "auxiliary_loss_mlp": 0.01033259, + "balance_loss_clip": 1.03809309, + "balance_loss_mlp": 1.02097487, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 2.330953432190767, + "language_loss": 0.65912759, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.68031764, + "num_input_tokens_seen": 233205805, + "step": 10806, + "time_per_iteration": 2.5716185569763184 + }, + { + "auxiliary_loss_clip": 0.01093799, + "auxiliary_loss_mlp": 0.0077994, + "balance_loss_clip": 1.03869486, + "balance_loss_mlp": 1.00098681, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 2.7640802442112613, + "language_loss": 0.79385853, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.8125959, + "num_input_tokens_seen": 233224215, + "step": 10807, + "time_per_iteration": 4.013745307922363 + }, + { + "auxiliary_loss_clip": 0.0101863, + "auxiliary_loss_mlp": 0.01005472, + "balance_loss_clip": 1.01229548, + "balance_loss_mlp": 1.00417888, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.7812597125090107, + "language_loss": 0.58889282, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.60913384, + "num_input_tokens_seen": 233294440, + "step": 10808, + "time_per_iteration": 3.2454898357391357 + }, + { + "auxiliary_loss_clip": 0.01093351, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.04189575, + "balance_loss_mlp": 1.01872039, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.7694722966328957, + "language_loss": 0.63345432, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65469766, + "num_input_tokens_seen": 233316125, + "step": 10809, + "time_per_iteration": 2.638455629348755 + }, + { + "auxiliary_loss_clip": 0.01101324, + "auxiliary_loss_mlp": 0.00777954, + "balance_loss_clip": 1.04036474, + "balance_loss_mlp": 1.00087655, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.6745526059366178, + "language_loss": 0.81522334, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83401608, + "num_input_tokens_seen": 233336140, + "step": 10810, + "time_per_iteration": 2.5306854248046875 + }, + { + "auxiliary_loss_clip": 0.01074912, + "auxiliary_loss_mlp": 0.01035743, + "balance_loss_clip": 1.03827238, + "balance_loss_mlp": 1.02418613, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.6293200463636408, + "language_loss": 0.71686292, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.73796946, + "num_input_tokens_seen": 233356095, + "step": 10811, + "time_per_iteration": 2.524801015853882 + }, + { + "auxiliary_loss_clip": 0.01054391, + "auxiliary_loss_mlp": 0.01029612, + "balance_loss_clip": 1.04282546, + "balance_loss_mlp": 1.01776862, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 1.4638116064980449, + "language_loss": 0.77598006, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.79682004, + "num_input_tokens_seen": 233376830, + "step": 10812, + "time_per_iteration": 2.6393868923187256 + }, + { + "auxiliary_loss_clip": 0.01099459, + "auxiliary_loss_mlp": 0.0103347, + "balance_loss_clip": 1.0405097, + "balance_loss_mlp": 1.02020788, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 1.7108295131921802, + "language_loss": 0.85563219, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87696147, + "num_input_tokens_seen": 233395275, + "step": 10813, + "time_per_iteration": 2.4991514682769775 + }, + { + "auxiliary_loss_clip": 0.01071474, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.03308678, + "balance_loss_mlp": 1.02226162, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 1.6334460893556875, + "language_loss": 0.8006795, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.82174814, + "num_input_tokens_seen": 233413345, + "step": 10814, + "time_per_iteration": 2.509488105773926 + }, + { + "auxiliary_loss_clip": 0.01068621, + "auxiliary_loss_mlp": 0.00783598, + "balance_loss_clip": 1.04063606, + "balance_loss_mlp": 1.00098872, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 1.6517280530132827, + "language_loss": 0.65592515, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67444736, + "num_input_tokens_seen": 233436105, + "step": 10815, + "time_per_iteration": 2.7600648403167725 + }, + { + "auxiliary_loss_clip": 0.01117249, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.04018986, + "balance_loss_mlp": 1.01967978, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 2.1747803910589707, + "language_loss": 0.75406122, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77558398, + "num_input_tokens_seen": 233452320, + "step": 10816, + "time_per_iteration": 2.415909767150879 + }, + { + "auxiliary_loss_clip": 0.01085723, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.03736997, + "balance_loss_mlp": 1.02438521, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.7097627063480836, + "language_loss": 0.72813261, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.7493608, + "num_input_tokens_seen": 233469920, + "step": 10817, + "time_per_iteration": 2.4995830059051514 + }, + { + "auxiliary_loss_clip": 0.01072083, + "auxiliary_loss_mlp": 0.01051188, + "balance_loss_clip": 1.03354383, + "balance_loss_mlp": 1.035882, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.3828251882185467, + "language_loss": 0.72173405, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74296683, + "num_input_tokens_seen": 233499780, + "step": 10818, + "time_per_iteration": 4.421875715255737 + }, + { + "auxiliary_loss_clip": 0.01087122, + "auxiliary_loss_mlp": 0.01029118, + "balance_loss_clip": 1.03947353, + "balance_loss_mlp": 1.01604629, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 2.3732426816062273, + "language_loss": 0.64775074, + "learning_rate": 1.150246104600249e-06, + "loss": 0.66891313, + "num_input_tokens_seen": 233518235, + "step": 10819, + "time_per_iteration": 2.5834174156188965 + }, + { + "auxiliary_loss_clip": 0.01079067, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.03512216, + "balance_loss_mlp": 1.02090645, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 1.8896054853341284, + "language_loss": 0.83819139, + "learning_rate": 1.14989356009286e-06, + "loss": 0.85932922, + "num_input_tokens_seen": 233535215, + "step": 10820, + "time_per_iteration": 2.576719284057617 + }, + { + "auxiliary_loss_clip": 0.01108612, + "auxiliary_loss_mlp": 0.01032999, + "balance_loss_clip": 1.03997564, + "balance_loss_mlp": 1.01893246, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.0283062664677565, + "language_loss": 0.77576697, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.79718304, + "num_input_tokens_seen": 233552775, + "step": 10821, + "time_per_iteration": 3.860581159591675 + }, + { + "auxiliary_loss_clip": 0.01077579, + "auxiliary_loss_mlp": 0.01028126, + "balance_loss_clip": 1.03750646, + "balance_loss_mlp": 1.01672912, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.8033912435014328, + "language_loss": 0.79976505, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82082212, + "num_input_tokens_seen": 233572080, + "step": 10822, + "time_per_iteration": 2.5561277866363525 + }, + { + "auxiliary_loss_clip": 0.01087766, + "auxiliary_loss_mlp": 0.01029934, + "balance_loss_clip": 1.04384315, + "balance_loss_mlp": 1.01713705, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 1.9999433251592247, + "language_loss": 0.87284517, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89402217, + "num_input_tokens_seen": 233589155, + "step": 10823, + "time_per_iteration": 2.5153636932373047 + }, + { + "auxiliary_loss_clip": 0.01114044, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.03856039, + "balance_loss_mlp": 1.01828957, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 1.6708880369375658, + "language_loss": 0.666219, + "learning_rate": 1.148483704558183e-06, + "loss": 0.68767279, + "num_input_tokens_seen": 233608180, + "step": 10824, + "time_per_iteration": 2.4748353958129883 + }, + { + "auxiliary_loss_clip": 0.01096217, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.0406096, + "balance_loss_mlp": 1.01932669, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 3.0419723696663588, + "language_loss": 0.87745303, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.89873946, + "num_input_tokens_seen": 233625750, + "step": 10825, + "time_per_iteration": 2.4746439456939697 + }, + { + "auxiliary_loss_clip": 0.01092874, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.03558135, + "balance_loss_mlp": 1.0167135, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 2.4393964936625254, + "language_loss": 0.73112833, + "learning_rate": 1.147778970474885e-06, + "loss": 0.75237048, + "num_input_tokens_seen": 233644235, + "step": 10826, + "time_per_iteration": 2.4587109088897705 + }, + { + "auxiliary_loss_clip": 0.01105091, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.0400275, + "balance_loss_mlp": 1.01999497, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 2.1913083346826236, + "language_loss": 0.68987584, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.71124721, + "num_input_tokens_seen": 233662845, + "step": 10827, + "time_per_iteration": 2.4431421756744385 + }, + { + "auxiliary_loss_clip": 0.01090577, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.03612399, + "balance_loss_mlp": 1.01522923, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 1.9185038224346822, + "language_loss": 0.76632607, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.787507, + "num_input_tokens_seen": 233681990, + "step": 10828, + "time_per_iteration": 2.510460138320923 + }, + { + "auxiliary_loss_clip": 0.01100656, + "auxiliary_loss_mlp": 0.01029562, + "balance_loss_clip": 1.03859234, + "balance_loss_mlp": 1.01743829, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 1.6532817511802709, + "language_loss": 0.89319086, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91449308, + "num_input_tokens_seen": 233698930, + "step": 10829, + "time_per_iteration": 2.4660279750823975 + }, + { + "auxiliary_loss_clip": 0.01032709, + "auxiliary_loss_mlp": 0.01003619, + "balance_loss_clip": 1.00860405, + "balance_loss_mlp": 1.00246859, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.6396121976855209, + "language_loss": 0.55374509, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57410836, + "num_input_tokens_seen": 233769825, + "step": 10830, + "time_per_iteration": 3.150019645690918 + }, + { + "auxiliary_loss_clip": 0.01079997, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.03476858, + "balance_loss_mlp": 1.01802206, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 1.9684121793386629, + "language_loss": 0.74304593, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.76415974, + "num_input_tokens_seen": 233787095, + "step": 10831, + "time_per_iteration": 2.5381510257720947 + }, + { + "auxiliary_loss_clip": 0.01017134, + "auxiliary_loss_mlp": 0.01000619, + "balance_loss_clip": 1.01092255, + "balance_loss_mlp": 0.99923629, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.6459552680098413, + "language_loss": 0.51027262, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53045022, + "num_input_tokens_seen": 233853050, + "step": 10832, + "time_per_iteration": 4.698387861251831 + }, + { + "auxiliary_loss_clip": 0.01094818, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.03807282, + "balance_loss_mlp": 1.02247262, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 2.4277302621060555, + "language_loss": 0.83450019, + "learning_rate": 1.145313419848316e-06, + "loss": 0.85580325, + "num_input_tokens_seen": 233871385, + "step": 10833, + "time_per_iteration": 2.4920878410339355 + }, + { + "auxiliary_loss_clip": 0.01095829, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.04253387, + "balance_loss_mlp": 1.02127743, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 2.6656115907809195, + "language_loss": 0.83781302, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.8591156, + "num_input_tokens_seen": 233888175, + "step": 10834, + "time_per_iteration": 2.476785182952881 + }, + { + "auxiliary_loss_clip": 0.01100346, + "auxiliary_loss_mlp": 0.01036448, + "balance_loss_clip": 1.03720129, + "balance_loss_mlp": 1.02378201, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 1.8681188396024948, + "language_loss": 0.76841676, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.78978467, + "num_input_tokens_seen": 233911470, + "step": 10835, + "time_per_iteration": 2.547635078430176 + }, + { + "auxiliary_loss_clip": 0.0109332, + "auxiliary_loss_mlp": 0.01039354, + "balance_loss_clip": 1.04340577, + "balance_loss_mlp": 1.0264082, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.5143725960871217, + "language_loss": 0.77481341, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79614013, + "num_input_tokens_seen": 233932135, + "step": 10836, + "time_per_iteration": 2.5481648445129395 + }, + { + "auxiliary_loss_clip": 0.01074963, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.03608477, + "balance_loss_mlp": 1.020486, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 2.07779102635658, + "language_loss": 0.82610577, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84719223, + "num_input_tokens_seen": 233947880, + "step": 10837, + "time_per_iteration": 2.4953417778015137 + }, + { + "auxiliary_loss_clip": 0.01079326, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.04444838, + "balance_loss_mlp": 1.02050734, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 1.931307983039164, + "language_loss": 0.5875175, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.60865653, + "num_input_tokens_seen": 233971475, + "step": 10838, + "time_per_iteration": 2.8255906105041504 + }, + { + "auxiliary_loss_clip": 0.01032041, + "auxiliary_loss_mlp": 0.0100524, + "balance_loss_clip": 1.00785387, + "balance_loss_mlp": 1.0041666, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.728209407381975, + "language_loss": 0.60871351, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.62908638, + "num_input_tokens_seen": 234030690, + "step": 10839, + "time_per_iteration": 3.056427001953125 + }, + { + "auxiliary_loss_clip": 0.01091827, + "auxiliary_loss_mlp": 0.01029728, + "balance_loss_clip": 1.04056883, + "balance_loss_mlp": 1.0177474, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.7618970250768682, + "language_loss": 0.67783028, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.6990459, + "num_input_tokens_seen": 234052470, + "step": 10840, + "time_per_iteration": 2.641021728515625 + }, + { + "auxiliary_loss_clip": 0.010694, + "auxiliary_loss_mlp": 0.01033813, + "balance_loss_clip": 1.03748608, + "balance_loss_mlp": 1.02199912, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 2.1489130357579236, + "language_loss": 0.73758399, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75861615, + "num_input_tokens_seen": 234071495, + "step": 10841, + "time_per_iteration": 2.590833902359009 + }, + { + "auxiliary_loss_clip": 0.01114019, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.03918934, + "balance_loss_mlp": 1.0256021, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.426481088329601, + "language_loss": 0.62483537, + "learning_rate": 1.142145760331648e-06, + "loss": 0.64636183, + "num_input_tokens_seen": 234092325, + "step": 10842, + "time_per_iteration": 2.4874134063720703 + }, + { + "auxiliary_loss_clip": 0.01025145, + "auxiliary_loss_mlp": 0.01000481, + "balance_loss_clip": 1.01076639, + "balance_loss_mlp": 0.99917555, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8110267545726498, + "language_loss": 0.56163198, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58188826, + "num_input_tokens_seen": 234148005, + "step": 10843, + "time_per_iteration": 2.8512065410614014 + }, + { + "auxiliary_loss_clip": 0.01107777, + "auxiliary_loss_mlp": 0.01040137, + "balance_loss_clip": 1.04048228, + "balance_loss_mlp": 1.02632093, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 1.6895515813013928, + "language_loss": 0.82669103, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.84817016, + "num_input_tokens_seen": 234164280, + "step": 10844, + "time_per_iteration": 2.4343502521514893 + }, + { + "auxiliary_loss_clip": 0.01104024, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_clip": 1.03951716, + "balance_loss_mlp": 1.01594472, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 2.5963822189299757, + "language_loss": 0.59407747, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.61540991, + "num_input_tokens_seen": 234185090, + "step": 10845, + "time_per_iteration": 2.508132219314575 + }, + { + "auxiliary_loss_clip": 0.01103682, + "auxiliary_loss_mlp": 0.01029684, + "balance_loss_clip": 1.03946865, + "balance_loss_mlp": 1.01716721, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 1.7046002241060723, + "language_loss": 0.79344112, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81477481, + "num_input_tokens_seen": 234204050, + "step": 10846, + "time_per_iteration": 2.480121374130249 + }, + { + "auxiliary_loss_clip": 0.01023871, + "auxiliary_loss_mlp": 0.01005981, + "balance_loss_clip": 1.00879431, + "balance_loss_mlp": 1.00487864, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.7098312976528742, + "language_loss": 0.60237056, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.6226691, + "num_input_tokens_seen": 234269790, + "step": 10847, + "time_per_iteration": 4.555239915847778 + }, + { + "auxiliary_loss_clip": 0.01118308, + "auxiliary_loss_mlp": 0.01039771, + "balance_loss_clip": 1.04175556, + "balance_loss_mlp": 1.02668762, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.9672454334545266, + "language_loss": 0.80831522, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.82989597, + "num_input_tokens_seen": 234290135, + "step": 10848, + "time_per_iteration": 2.479994058609009 + }, + { + "auxiliary_loss_clip": 0.0108445, + "auxiliary_loss_mlp": 0.01035335, + "balance_loss_clip": 1.03600502, + "balance_loss_mlp": 1.02311063, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 2.801908726599191, + "language_loss": 0.74490571, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.76610357, + "num_input_tokens_seen": 234309535, + "step": 10849, + "time_per_iteration": 2.5264604091644287 + }, + { + "auxiliary_loss_clip": 0.01065115, + "auxiliary_loss_mlp": 0.0103697, + "balance_loss_clip": 1.03864431, + "balance_loss_mlp": 1.02435184, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 2.158770859290975, + "language_loss": 0.68010592, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70112681, + "num_input_tokens_seen": 234328755, + "step": 10850, + "time_per_iteration": 2.6152548789978027 + }, + { + "auxiliary_loss_clip": 0.01089058, + "auxiliary_loss_mlp": 0.00777574, + "balance_loss_clip": 1.03885484, + "balance_loss_mlp": 1.00083709, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 2.5739023091526305, + "language_loss": 0.66735899, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68602538, + "num_input_tokens_seen": 234348655, + "step": 10851, + "time_per_iteration": 2.5738232135772705 + }, + { + "auxiliary_loss_clip": 0.0109106, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.03888977, + "balance_loss_mlp": 1.0176332, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 3.1698154003742496, + "language_loss": 0.73585027, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.75705987, + "num_input_tokens_seen": 234367445, + "step": 10852, + "time_per_iteration": 2.571235179901123 + }, + { + "auxiliary_loss_clip": 0.01093144, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.04246891, + "balance_loss_mlp": 1.01710021, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 2.0058186504208644, + "language_loss": 0.66497457, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68621296, + "num_input_tokens_seen": 234384825, + "step": 10853, + "time_per_iteration": 2.5069613456726074 + }, + { + "auxiliary_loss_clip": 0.01020787, + "auxiliary_loss_mlp": 0.01003677, + "balance_loss_clip": 1.02772069, + "balance_loss_mlp": 1.00269353, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.7277215151254006, + "language_loss": 0.63056576, + "learning_rate": 1.137926314758634e-06, + "loss": 0.65081036, + "num_input_tokens_seen": 234450630, + "step": 10854, + "time_per_iteration": 3.217904567718506 + }, + { + "auxiliary_loss_clip": 0.01100263, + "auxiliary_loss_mlp": 0.01041547, + "balance_loss_clip": 1.03714168, + "balance_loss_mlp": 1.02599013, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 1.7011820372446609, + "language_loss": 0.77610511, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.7975232, + "num_input_tokens_seen": 234473505, + "step": 10855, + "time_per_iteration": 2.5401108264923096 + }, + { + "auxiliary_loss_clip": 0.01074241, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.0331924, + "balance_loss_mlp": 1.0132885, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 1.9296545613560285, + "language_loss": 0.79699969, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81799924, + "num_input_tokens_seen": 234492485, + "step": 10856, + "time_per_iteration": 2.5701074600219727 + }, + { + "auxiliary_loss_clip": 0.01112777, + "auxiliary_loss_mlp": 0.01034906, + "balance_loss_clip": 1.03803849, + "balance_loss_mlp": 1.02069592, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 1.7995261808394687, + "language_loss": 0.73704892, + "learning_rate": 1.136872187988815e-06, + "loss": 0.75852573, + "num_input_tokens_seen": 234512645, + "step": 10857, + "time_per_iteration": 4.105040073394775 + }, + { + "auxiliary_loss_clip": 0.01092348, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.03727853, + "balance_loss_mlp": 1.01970541, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 2.3388291812482955, + "language_loss": 0.63435066, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.65559077, + "num_input_tokens_seen": 234529310, + "step": 10858, + "time_per_iteration": 2.505161762237549 + }, + { + "auxiliary_loss_clip": 0.01109733, + "auxiliary_loss_mlp": 0.01034871, + "balance_loss_clip": 1.03723955, + "balance_loss_mlp": 1.02240765, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 1.6875261757503628, + "language_loss": 0.78460479, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80605078, + "num_input_tokens_seen": 234546685, + "step": 10859, + "time_per_iteration": 2.398705244064331 + }, + { + "auxiliary_loss_clip": 0.01104106, + "auxiliary_loss_mlp": 0.0102927, + "balance_loss_clip": 1.0379343, + "balance_loss_mlp": 1.01614475, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.5594650214527825, + "language_loss": 0.67776918, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.699103, + "num_input_tokens_seen": 234566255, + "step": 10860, + "time_per_iteration": 2.540013313293457 + }, + { + "auxiliary_loss_clip": 0.01103718, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.04042351, + "balance_loss_mlp": 1.01713157, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 1.8846363238317922, + "language_loss": 0.66506922, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68640554, + "num_input_tokens_seen": 234585405, + "step": 10861, + "time_per_iteration": 3.8018360137939453 + }, + { + "auxiliary_loss_clip": 0.01092899, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.03869426, + "balance_loss_mlp": 1.01818156, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 1.8331132763788378, + "language_loss": 0.65166748, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67291903, + "num_input_tokens_seen": 234608095, + "step": 10862, + "time_per_iteration": 2.604649782180786 + }, + { + "auxiliary_loss_clip": 0.01091153, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.0372144, + "balance_loss_mlp": 1.01925361, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.8129090710843854, + "language_loss": 0.7754547, + "learning_rate": 1.13476481851592e-06, + "loss": 0.7966845, + "num_input_tokens_seen": 234627335, + "step": 10863, + "time_per_iteration": 2.494442939758301 + }, + { + "auxiliary_loss_clip": 0.01086458, + "auxiliary_loss_mlp": 0.01029476, + "balance_loss_clip": 1.03791618, + "balance_loss_mlp": 1.01759696, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 1.6748333363936627, + "language_loss": 0.74810529, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.7692647, + "num_input_tokens_seen": 234646540, + "step": 10864, + "time_per_iteration": 2.4966681003570557 + }, + { + "auxiliary_loss_clip": 0.01099168, + "auxiliary_loss_mlp": 0.01032814, + "balance_loss_clip": 1.03748047, + "balance_loss_mlp": 1.02083921, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 2.162044785300424, + "language_loss": 0.86455262, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88587248, + "num_input_tokens_seen": 234665470, + "step": 10865, + "time_per_iteration": 2.505542039871216 + }, + { + "auxiliary_loss_clip": 0.01085557, + "auxiliary_loss_mlp": 0.0077923, + "balance_loss_clip": 1.03581905, + "balance_loss_mlp": 1.00088334, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 1.5636466010061594, + "language_loss": 0.81559062, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83423853, + "num_input_tokens_seen": 234683955, + "step": 10866, + "time_per_iteration": 2.543828248977661 + }, + { + "auxiliary_loss_clip": 0.01090056, + "auxiliary_loss_mlp": 0.01025871, + "balance_loss_clip": 1.03866005, + "balance_loss_mlp": 1.0140332, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.5062618060402972, + "language_loss": 0.82035857, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.84151781, + "num_input_tokens_seen": 234704595, + "step": 10867, + "time_per_iteration": 2.5322766304016113 + }, + { + "auxiliary_loss_clip": 0.01085105, + "auxiliary_loss_mlp": 0.0102668, + "balance_loss_clip": 1.03607321, + "balance_loss_mlp": 1.01434183, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 1.8062871386888868, + "language_loss": 0.81213695, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.83325481, + "num_input_tokens_seen": 234724090, + "step": 10868, + "time_per_iteration": 2.4882352352142334 + }, + { + "auxiliary_loss_clip": 0.01084443, + "auxiliary_loss_mlp": 0.01025918, + "balance_loss_clip": 1.03924143, + "balance_loss_mlp": 1.01211333, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 1.736013122174079, + "language_loss": 0.79958171, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.82068533, + "num_input_tokens_seen": 234742560, + "step": 10869, + "time_per_iteration": 2.4975204467773438 + }, + { + "auxiliary_loss_clip": 0.01105487, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.04096437, + "balance_loss_mlp": 1.02047682, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 2.056898135189124, + "language_loss": 0.72553396, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.7469213, + "num_input_tokens_seen": 234762315, + "step": 10870, + "time_per_iteration": 2.4806182384490967 + }, + { + "auxiliary_loss_clip": 0.0107471, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.03738391, + "balance_loss_mlp": 1.02190757, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 2.498078976492737, + "language_loss": 0.75024098, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.77133667, + "num_input_tokens_seen": 234781300, + "step": 10871, + "time_per_iteration": 2.5487751960754395 + }, + { + "auxiliary_loss_clip": 0.01093506, + "auxiliary_loss_mlp": 0.00776728, + "balance_loss_clip": 1.03833699, + "balance_loss_mlp": 1.00084257, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.7087106302061945, + "language_loss": 0.55689502, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.57559735, + "num_input_tokens_seen": 234801040, + "step": 10872, + "time_per_iteration": 3.8080108165740967 + }, + { + "auxiliary_loss_clip": 0.01088371, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.03713751, + "balance_loss_mlp": 1.02148581, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 1.5615616828145171, + "language_loss": 0.74896246, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.77018166, + "num_input_tokens_seen": 234821415, + "step": 10873, + "time_per_iteration": 2.5101771354675293 + }, + { + "auxiliary_loss_clip": 0.01102282, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.03905225, + "balance_loss_mlp": 1.01916671, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 1.4934098429640754, + "language_loss": 0.75091326, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77225459, + "num_input_tokens_seen": 234843795, + "step": 10874, + "time_per_iteration": 2.507789373397827 + }, + { + "auxiliary_loss_clip": 0.01069945, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.03645611, + "balance_loss_mlp": 1.02692795, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 1.5968778362277827, + "language_loss": 0.81733578, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.8384341, + "num_input_tokens_seen": 234862350, + "step": 10875, + "time_per_iteration": 2.604891061782837 + }, + { + "auxiliary_loss_clip": 0.01112074, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.03770423, + "balance_loss_mlp": 1.02466071, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.7464013254686848, + "language_loss": 0.69938391, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72087252, + "num_input_tokens_seen": 234881790, + "step": 10876, + "time_per_iteration": 2.4591808319091797 + }, + { + "auxiliary_loss_clip": 0.01018039, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.03324592, + "balance_loss_mlp": 1.02113116, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 2.1418400521196506, + "language_loss": 0.79808915, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.8186059, + "num_input_tokens_seen": 234897775, + "step": 10877, + "time_per_iteration": 2.770754814147949 + }, + { + "auxiliary_loss_clip": 0.01095996, + "auxiliary_loss_mlp": 0.00776996, + "balance_loss_clip": 1.04033661, + "balance_loss_mlp": 1.00090194, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 10.422672788932429, + "language_loss": 0.8006708, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.81940073, + "num_input_tokens_seen": 234918395, + "step": 10878, + "time_per_iteration": 2.8243532180786133 + }, + { + "auxiliary_loss_clip": 0.01089342, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.03538561, + "balance_loss_mlp": 1.01728964, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 2.09837886826278, + "language_loss": 0.84232336, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86351794, + "num_input_tokens_seen": 234936260, + "step": 10879, + "time_per_iteration": 2.4967429637908936 + }, + { + "auxiliary_loss_clip": 0.01091802, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.03754115, + "balance_loss_mlp": 1.01759505, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 2.4341873581025095, + "language_loss": 0.71852005, + "learning_rate": 1.128800362199601e-06, + "loss": 0.73974496, + "num_input_tokens_seen": 234952110, + "step": 10880, + "time_per_iteration": 2.487440586090088 + }, + { + "auxiliary_loss_clip": 0.01071723, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.0340668, + "balance_loss_mlp": 1.02179909, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 1.8023095399471387, + "language_loss": 0.84249461, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86355191, + "num_input_tokens_seen": 234970810, + "step": 10881, + "time_per_iteration": 2.4976115226745605 + }, + { + "auxiliary_loss_clip": 0.01080484, + "auxiliary_loss_mlp": 0.01032812, + "balance_loss_clip": 1.0360837, + "balance_loss_mlp": 1.01825678, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 1.9520747802950749, + "language_loss": 0.77888113, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.80001408, + "num_input_tokens_seen": 234989565, + "step": 10882, + "time_per_iteration": 2.522691249847412 + }, + { + "auxiliary_loss_clip": 0.01116756, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.04068279, + "balance_loss_mlp": 1.01787245, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 1.8401690478134418, + "language_loss": 0.81649584, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.83798039, + "num_input_tokens_seen": 235007955, + "step": 10883, + "time_per_iteration": 2.4150757789611816 + }, + { + "auxiliary_loss_clip": 0.01066311, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.03821468, + "balance_loss_mlp": 1.02158964, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.7196726822481145, + "language_loss": 0.85487419, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87588739, + "num_input_tokens_seen": 235024860, + "step": 10884, + "time_per_iteration": 2.5458874702453613 + }, + { + "auxiliary_loss_clip": 0.01092243, + "auxiliary_loss_mlp": 0.01037609, + "balance_loss_clip": 1.03758097, + "balance_loss_mlp": 1.02457309, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 2.3004352990545422, + "language_loss": 0.80166745, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82296586, + "num_input_tokens_seen": 235043815, + "step": 10885, + "time_per_iteration": 2.499136209487915 + }, + { + "auxiliary_loss_clip": 0.01074598, + "auxiliary_loss_mlp": 0.01027252, + "balance_loss_clip": 1.03487289, + "balance_loss_mlp": 1.01423442, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 1.8661538039936423, + "language_loss": 0.71699774, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.73801625, + "num_input_tokens_seen": 235062985, + "step": 10886, + "time_per_iteration": 2.571751117706299 + }, + { + "auxiliary_loss_clip": 0.01096213, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.03976691, + "balance_loss_mlp": 1.01530266, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 2.4338223092608464, + "language_loss": 0.77887529, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.8001073, + "num_input_tokens_seen": 235081670, + "step": 10887, + "time_per_iteration": 3.8994555473327637 + }, + { + "auxiliary_loss_clip": 0.01087081, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.03735828, + "balance_loss_mlp": 1.0209415, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 1.9848017345307578, + "language_loss": 0.79145741, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81266069, + "num_input_tokens_seen": 235098510, + "step": 10888, + "time_per_iteration": 2.555604934692383 + }, + { + "auxiliary_loss_clip": 0.01099564, + "auxiliary_loss_mlp": 0.01026366, + "balance_loss_clip": 1.03859258, + "balance_loss_mlp": 1.01491642, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 1.7358599317107621, + "language_loss": 0.66279763, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.684057, + "num_input_tokens_seen": 235119990, + "step": 10889, + "time_per_iteration": 2.578528642654419 + }, + { + "auxiliary_loss_clip": 0.01088064, + "auxiliary_loss_mlp": 0.01042105, + "balance_loss_clip": 1.03624177, + "balance_loss_mlp": 1.02663136, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.503406036082305, + "language_loss": 0.79971969, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.82102132, + "num_input_tokens_seen": 235139255, + "step": 10890, + "time_per_iteration": 2.4872796535491943 + }, + { + "auxiliary_loss_clip": 0.01101221, + "auxiliary_loss_mlp": 0.00778758, + "balance_loss_clip": 1.03691149, + "balance_loss_mlp": 1.0009675, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 2.1727425171347394, + "language_loss": 0.65595829, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.67475814, + "num_input_tokens_seen": 235158455, + "step": 10891, + "time_per_iteration": 2.4883804321289062 + }, + { + "auxiliary_loss_clip": 0.01099525, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.03615677, + "balance_loss_mlp": 1.0240835, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 1.825778048728121, + "language_loss": 0.79519904, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81655526, + "num_input_tokens_seen": 235177350, + "step": 10892, + "time_per_iteration": 2.4535529613494873 + }, + { + "auxiliary_loss_clip": 0.01108813, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.04213583, + "balance_loss_mlp": 1.02170944, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 1.8381958115337291, + "language_loss": 0.77758932, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.79901755, + "num_input_tokens_seen": 235196435, + "step": 10893, + "time_per_iteration": 2.497333526611328 + }, + { + "auxiliary_loss_clip": 0.01118255, + "auxiliary_loss_mlp": 0.01034201, + "balance_loss_clip": 1.04134238, + "balance_loss_mlp": 1.02042699, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.6506367514421365, + "language_loss": 0.69912815, + "learning_rate": 1.123895622914766e-06, + "loss": 0.7206527, + "num_input_tokens_seen": 235215430, + "step": 10894, + "time_per_iteration": 2.426708936691284 + }, + { + "auxiliary_loss_clip": 0.01108516, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.04024911, + "balance_loss_mlp": 1.02224743, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 3.674915990066217, + "language_loss": 0.63023204, + "learning_rate": 1.123545533127549e-06, + "loss": 0.65167046, + "num_input_tokens_seen": 235232015, + "step": 10895, + "time_per_iteration": 2.4455697536468506 + }, + { + "auxiliary_loss_clip": 0.01099408, + "auxiliary_loss_mlp": 0.01037086, + "balance_loss_clip": 1.03540385, + "balance_loss_mlp": 1.02445579, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 1.898345885066221, + "language_loss": 0.79287398, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.81423891, + "num_input_tokens_seen": 235248115, + "step": 10896, + "time_per_iteration": 3.9571633338928223 + }, + { + "auxiliary_loss_clip": 0.01088591, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.037292, + "balance_loss_mlp": 1.01785111, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 1.4300831142301744, + "language_loss": 0.70660311, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72778606, + "num_input_tokens_seen": 235270785, + "step": 10897, + "time_per_iteration": 2.539677143096924 + }, + { + "auxiliary_loss_clip": 0.01114641, + "auxiliary_loss_mlp": 0.01032998, + "balance_loss_clip": 1.0388397, + "balance_loss_mlp": 1.0201292, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 1.6161776739446554, + "language_loss": 0.75364721, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.7751236, + "num_input_tokens_seen": 235287905, + "step": 10898, + "time_per_iteration": 2.398287057876587 + }, + { + "auxiliary_loss_clip": 0.01092836, + "auxiliary_loss_mlp": 0.01034273, + "balance_loss_clip": 1.03844607, + "balance_loss_mlp": 1.02184582, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 2.1786294307122778, + "language_loss": 0.74143481, + "learning_rate": 1.122145506463827e-06, + "loss": 0.76270592, + "num_input_tokens_seen": 235305525, + "step": 10899, + "time_per_iteration": 2.518754482269287 + }, + { + "auxiliary_loss_clip": 0.01090612, + "auxiliary_loss_mlp": 0.01027674, + "balance_loss_clip": 1.03792155, + "balance_loss_mlp": 1.01559854, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 2.277001157653924, + "language_loss": 0.55905843, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.58024126, + "num_input_tokens_seen": 235324415, + "step": 10900, + "time_per_iteration": 3.9958698749542236 + }, + { + "auxiliary_loss_clip": 0.01103371, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.04097962, + "balance_loss_mlp": 1.02229357, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 2.6724533859621236, + "language_loss": 0.76724988, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.7886399, + "num_input_tokens_seen": 235341595, + "step": 10901, + "time_per_iteration": 2.4679253101348877 + }, + { + "auxiliary_loss_clip": 0.01112929, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.03946996, + "balance_loss_mlp": 1.01884675, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 1.9987867050052714, + "language_loss": 0.73345554, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.75490397, + "num_input_tokens_seen": 235361700, + "step": 10902, + "time_per_iteration": 2.430616855621338 + }, + { + "auxiliary_loss_clip": 0.01112299, + "auxiliary_loss_mlp": 0.0103179, + "balance_loss_clip": 1.03969288, + "balance_loss_mlp": 1.01912367, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 1.7855420443939178, + "language_loss": 0.68096638, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.70240724, + "num_input_tokens_seen": 235382065, + "step": 10903, + "time_per_iteration": 2.4348020553588867 + }, + { + "auxiliary_loss_clip": 0.01095618, + "auxiliary_loss_mlp": 0.00780324, + "balance_loss_clip": 1.03727841, + "balance_loss_mlp": 1.00095057, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 2.1263168732235767, + "language_loss": 0.66547394, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.68423337, + "num_input_tokens_seen": 235402130, + "step": 10904, + "time_per_iteration": 2.5686733722686768 + }, + { + "auxiliary_loss_clip": 0.01105572, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.03782642, + "balance_loss_mlp": 1.02440357, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 1.8680642301066064, + "language_loss": 0.90443122, + "learning_rate": 1.120046465383464e-06, + "loss": 0.92587578, + "num_input_tokens_seen": 235420435, + "step": 10905, + "time_per_iteration": 2.457129716873169 + }, + { + "auxiliary_loss_clip": 0.01100504, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.03768587, + "balance_loss_mlp": 1.0205822, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 4.831882179398599, + "language_loss": 0.75334591, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77468264, + "num_input_tokens_seen": 235439960, + "step": 10906, + "time_per_iteration": 2.4718759059906006 + }, + { + "auxiliary_loss_clip": 0.01116954, + "auxiliary_loss_mlp": 0.01041106, + "balance_loss_clip": 1.0404923, + "balance_loss_mlp": 1.02737927, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 2.7022016916153544, + "language_loss": 0.74392903, + "learning_rate": 1.119347051825267e-06, + "loss": 0.76550961, + "num_input_tokens_seen": 235457495, + "step": 10907, + "time_per_iteration": 2.3952126502990723 + }, + { + "auxiliary_loss_clip": 0.01070543, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.03530669, + "balance_loss_mlp": 1.01802576, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.4715378751360237, + "language_loss": 0.72074819, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74177176, + "num_input_tokens_seen": 235479525, + "step": 10908, + "time_per_iteration": 2.5813679695129395 + }, + { + "auxiliary_loss_clip": 0.01115683, + "auxiliary_loss_mlp": 0.01039095, + "balance_loss_clip": 1.04144239, + "balance_loss_mlp": 1.02554071, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 2.394696140582895, + "language_loss": 0.80829692, + "learning_rate": 1.118647771844861e-06, + "loss": 0.82984471, + "num_input_tokens_seen": 235496305, + "step": 10909, + "time_per_iteration": 2.4029381275177 + }, + { + "auxiliary_loss_clip": 0.01115345, + "auxiliary_loss_mlp": 0.01037888, + "balance_loss_clip": 1.04011989, + "balance_loss_mlp": 1.02402997, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 2.6228215041933676, + "language_loss": 0.64006209, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.66159451, + "num_input_tokens_seen": 235512545, + "step": 10910, + "time_per_iteration": 2.4184930324554443 + }, + { + "auxiliary_loss_clip": 0.01094418, + "auxiliary_loss_mlp": 0.01038676, + "balance_loss_clip": 1.03755116, + "balance_loss_mlp": 1.02363205, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 2.539903877312843, + "language_loss": 0.75663972, + "learning_rate": 1.117948625548313e-06, + "loss": 0.77797067, + "num_input_tokens_seen": 235526045, + "step": 10911, + "time_per_iteration": 3.8603475093841553 + }, + { + "auxiliary_loss_clip": 0.01106794, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.03668237, + "balance_loss_mlp": 1.0172317, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 1.5539946787715617, + "language_loss": 0.75458264, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77594149, + "num_input_tokens_seen": 235545285, + "step": 10912, + "time_per_iteration": 2.4932925701141357 + }, + { + "auxiliary_loss_clip": 0.01083896, + "auxiliary_loss_mlp": 0.00781451, + "balance_loss_clip": 1.04342127, + "balance_loss_mlp": 1.00093615, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 1.6823088513081736, + "language_loss": 0.77851212, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79716557, + "num_input_tokens_seen": 235563150, + "step": 10913, + "time_per_iteration": 2.5338082313537598 + }, + { + "auxiliary_loss_clip": 0.01081288, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.03418589, + "balance_loss_mlp": 1.01861775, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 2.1252799869062144, + "language_loss": 0.71027112, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.73138839, + "num_input_tokens_seen": 235582535, + "step": 10914, + "time_per_iteration": 2.499659538269043 + }, + { + "auxiliary_loss_clip": 0.01082293, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.03877926, + "balance_loss_mlp": 1.01811123, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 1.9547184031861078, + "language_loss": 0.74132824, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76245928, + "num_input_tokens_seen": 235601490, + "step": 10915, + "time_per_iteration": 2.514828681945801 + }, + { + "auxiliary_loss_clip": 0.01071335, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.0358007, + "balance_loss_mlp": 1.01669419, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 2.2187940063785025, + "language_loss": 0.79698509, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.81799698, + "num_input_tokens_seen": 235619165, + "step": 10916, + "time_per_iteration": 2.5522007942199707 + }, + { + "auxiliary_loss_clip": 0.010858, + "auxiliary_loss_mlp": 0.01035245, + "balance_loss_clip": 1.03552902, + "balance_loss_mlp": 1.02294815, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 1.8594695422186596, + "language_loss": 0.75741386, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.77862436, + "num_input_tokens_seen": 235637115, + "step": 10917, + "time_per_iteration": 2.4694390296936035 + }, + { + "auxiliary_loss_clip": 0.01111928, + "auxiliary_loss_mlp": 0.00778487, + "balance_loss_clip": 1.03903842, + "balance_loss_mlp": 1.00088239, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 9.366125082571216, + "language_loss": 0.69873095, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.7176351, + "num_input_tokens_seen": 235656330, + "step": 10918, + "time_per_iteration": 2.464276075363159 + }, + { + "auxiliary_loss_clip": 0.01077463, + "auxiliary_loss_mlp": 0.01037822, + "balance_loss_clip": 1.03762364, + "balance_loss_mlp": 1.02584767, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.5580225399763723, + "language_loss": 0.76154959, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78270251, + "num_input_tokens_seen": 235674510, + "step": 10919, + "time_per_iteration": 2.526581287384033 + }, + { + "auxiliary_loss_clip": 0.01028471, + "auxiliary_loss_mlp": 0.00753479, + "balance_loss_clip": 1.0138526, + "balance_loss_mlp": 1.00044143, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7173229052610103, + "language_loss": 0.5300557, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.54787523, + "num_input_tokens_seen": 235735050, + "step": 10920, + "time_per_iteration": 3.069998025894165 + }, + { + "auxiliary_loss_clip": 0.0110173, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.0382781, + "balance_loss_mlp": 1.01510334, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 1.5737276051787703, + "language_loss": 0.65828186, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.67958033, + "num_input_tokens_seen": 235757545, + "step": 10921, + "time_per_iteration": 2.5477283000946045 + }, + { + "auxiliary_loss_clip": 0.01086494, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.03527451, + "balance_loss_mlp": 1.02279162, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 1.6955241821514686, + "language_loss": 0.81021583, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83145803, + "num_input_tokens_seen": 235777265, + "step": 10922, + "time_per_iteration": 2.5057461261749268 + }, + { + "auxiliary_loss_clip": 0.01061684, + "auxiliary_loss_mlp": 0.00783224, + "balance_loss_clip": 1.03995514, + "balance_loss_mlp": 1.00092041, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 2.1354298245308843, + "language_loss": 0.7156384, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.73408747, + "num_input_tokens_seen": 235796565, + "step": 10923, + "time_per_iteration": 2.596583366394043 + }, + { + "auxiliary_loss_clip": 0.01079113, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.04137516, + "balance_loss_mlp": 1.01919687, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 2.4519784064170365, + "language_loss": 0.80922127, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.83032668, + "num_input_tokens_seen": 235814805, + "step": 10924, + "time_per_iteration": 2.510952949523926 + }, + { + "auxiliary_loss_clip": 0.01096401, + "auxiliary_loss_mlp": 0.0103131, + "balance_loss_clip": 1.03491378, + "balance_loss_mlp": 1.01908517, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 1.4886616973267208, + "language_loss": 0.7249918, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74626887, + "num_input_tokens_seen": 235833405, + "step": 10925, + "time_per_iteration": 2.4715938568115234 + }, + { + "auxiliary_loss_clip": 0.01100717, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.03647447, + "balance_loss_mlp": 1.01672959, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 2.339809189462145, + "language_loss": 0.72787523, + "learning_rate": 1.112709300197942e-06, + "loss": 0.74917114, + "num_input_tokens_seen": 235848530, + "step": 10926, + "time_per_iteration": 3.8893492221832275 + }, + { + "auxiliary_loss_clip": 0.01072228, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.03904653, + "balance_loss_mlp": 1.01810777, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.61362210762468, + "language_loss": 0.72743541, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74847412, + "num_input_tokens_seen": 235867225, + "step": 10927, + "time_per_iteration": 2.5608749389648438 + }, + { + "auxiliary_loss_clip": 0.01005662, + "auxiliary_loss_mlp": 0.01002184, + "balance_loss_clip": 1.011199, + "balance_loss_mlp": 1.0010159, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7286295788991796, + "language_loss": 0.64453697, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66461539, + "num_input_tokens_seen": 235932925, + "step": 10928, + "time_per_iteration": 3.103752613067627 + }, + { + "auxiliary_loss_clip": 0.01101158, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.03661144, + "balance_loss_mlp": 1.01953983, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 2.077538301854281, + "language_loss": 0.77666056, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.79799604, + "num_input_tokens_seen": 235952680, + "step": 10929, + "time_per_iteration": 2.508364200592041 + }, + { + "auxiliary_loss_clip": 0.01079308, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.03606033, + "balance_loss_mlp": 1.0204339, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 1.9969438206422836, + "language_loss": 0.65656543, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67769051, + "num_input_tokens_seen": 235972075, + "step": 10930, + "time_per_iteration": 2.5744688510894775 + }, + { + "auxiliary_loss_clip": 0.01064872, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.03257644, + "balance_loss_mlp": 1.01809847, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 1.9082087676539008, + "language_loss": 0.70935476, + "learning_rate": 1.110964538515258e-06, + "loss": 0.7303195, + "num_input_tokens_seen": 235990340, + "step": 10931, + "time_per_iteration": 2.544829845428467 + }, + { + "auxiliary_loss_clip": 0.01068044, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_clip": 1.03678346, + "balance_loss_mlp": 1.02503061, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 2.2273711435978902, + "language_loss": 0.687316, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.70837235, + "num_input_tokens_seen": 236007470, + "step": 10932, + "time_per_iteration": 2.5426151752471924 + }, + { + "auxiliary_loss_clip": 0.0108939, + "auxiliary_loss_mlp": 0.00779355, + "balance_loss_clip": 1.03566647, + "balance_loss_mlp": 1.00094175, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 9.263532265641787, + "language_loss": 0.80422461, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.82291204, + "num_input_tokens_seen": 236029030, + "step": 10933, + "time_per_iteration": 2.680211067199707 + }, + { + "auxiliary_loss_clip": 0.01064699, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.03656054, + "balance_loss_mlp": 1.02292156, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 1.5828840672736437, + "language_loss": 0.7363891, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.75739777, + "num_input_tokens_seen": 236047160, + "step": 10934, + "time_per_iteration": 2.5775725841522217 + }, + { + "auxiliary_loss_clip": 0.01097667, + "auxiliary_loss_mlp": 0.01041718, + "balance_loss_clip": 1.03687882, + "balance_loss_mlp": 1.02707314, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.5798912921234434, + "language_loss": 0.76052213, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78191596, + "num_input_tokens_seen": 236069215, + "step": 10935, + "time_per_iteration": 4.150834321975708 + }, + { + "auxiliary_loss_clip": 0.01076541, + "auxiliary_loss_mlp": 0.01041336, + "balance_loss_clip": 1.0374527, + "balance_loss_mlp": 1.02681017, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 1.836822082080506, + "language_loss": 0.78562891, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.8068077, + "num_input_tokens_seen": 236088335, + "step": 10936, + "time_per_iteration": 2.5650880336761475 + }, + { + "auxiliary_loss_clip": 0.01068204, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.03783941, + "balance_loss_mlp": 1.02442956, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 1.7644352650620307, + "language_loss": 0.69272584, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71377468, + "num_input_tokens_seen": 236108540, + "step": 10937, + "time_per_iteration": 2.5638427734375 + }, + { + "auxiliary_loss_clip": 0.01087051, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.03868508, + "balance_loss_mlp": 1.01910973, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 3.188542771813271, + "language_loss": 0.68970418, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.71089625, + "num_input_tokens_seen": 236124495, + "step": 10938, + "time_per_iteration": 2.4814305305480957 + }, + { + "auxiliary_loss_clip": 0.01089273, + "auxiliary_loss_mlp": 0.01038704, + "balance_loss_clip": 1.04041302, + "balance_loss_mlp": 1.02497101, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 2.260681161053516, + "language_loss": 0.71469587, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73597562, + "num_input_tokens_seen": 236142550, + "step": 10939, + "time_per_iteration": 3.821793556213379 + }, + { + "auxiliary_loss_clip": 0.01092485, + "auxiliary_loss_mlp": 0.00779262, + "balance_loss_clip": 1.03822374, + "balance_loss_mlp": 1.00099576, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 2.0694509232505074, + "language_loss": 0.7753883, + "learning_rate": 1.107826092473037e-06, + "loss": 0.79410577, + "num_input_tokens_seen": 236156620, + "step": 10940, + "time_per_iteration": 2.5452916622161865 + }, + { + "auxiliary_loss_clip": 0.0107441, + "auxiliary_loss_mlp": 0.01032764, + "balance_loss_clip": 1.03646517, + "balance_loss_mlp": 1.01928163, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 1.86467957160864, + "language_loss": 0.68406868, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70514047, + "num_input_tokens_seen": 236177095, + "step": 10941, + "time_per_iteration": 2.685143232345581 + }, + { + "auxiliary_loss_clip": 0.01098609, + "auxiliary_loss_mlp": 0.00778772, + "balance_loss_clip": 1.03521657, + "balance_loss_mlp": 1.00083637, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 1.986087148541147, + "language_loss": 0.67902958, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.69780338, + "num_input_tokens_seen": 236194695, + "step": 10942, + "time_per_iteration": 2.4879493713378906 + }, + { + "auxiliary_loss_clip": 0.01085111, + "auxiliary_loss_mlp": 0.01040292, + "balance_loss_clip": 1.03848815, + "balance_loss_mlp": 1.02561116, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 1.7455343577567777, + "language_loss": 0.71340644, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73466045, + "num_input_tokens_seen": 236213885, + "step": 10943, + "time_per_iteration": 2.510939121246338 + }, + { + "auxiliary_loss_clip": 0.01070645, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.03540623, + "balance_loss_mlp": 1.02083409, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 1.6208758721981595, + "language_loss": 0.59641755, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61746466, + "num_input_tokens_seen": 236237315, + "step": 10944, + "time_per_iteration": 2.5993335247039795 + }, + { + "auxiliary_loss_clip": 0.01106555, + "auxiliary_loss_mlp": 0.01039102, + "balance_loss_clip": 1.03877687, + "balance_loss_mlp": 1.02488005, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.676128586280783, + "language_loss": 0.72429192, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74574852, + "num_input_tokens_seen": 236256345, + "step": 10945, + "time_per_iteration": 2.4909865856170654 + }, + { + "auxiliary_loss_clip": 0.01090992, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.039114, + "balance_loss_mlp": 1.01700318, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.5332431187861555, + "language_loss": 0.70711827, + "learning_rate": 1.105735316926046e-06, + "loss": 0.72831976, + "num_input_tokens_seen": 236281890, + "step": 10946, + "time_per_iteration": 2.6998398303985596 + }, + { + "auxiliary_loss_clip": 0.01102134, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.03914809, + "balance_loss_mlp": 1.02015591, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 1.8294592501062388, + "language_loss": 0.82107246, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84242558, + "num_input_tokens_seen": 236298370, + "step": 10947, + "time_per_iteration": 2.46138334274292 + }, + { + "auxiliary_loss_clip": 0.01061036, + "auxiliary_loss_mlp": 0.0077954, + "balance_loss_clip": 1.03564715, + "balance_loss_mlp": 1.00088215, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 1.5623698752043462, + "language_loss": 0.76783431, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.7862401, + "num_input_tokens_seen": 236317380, + "step": 10948, + "time_per_iteration": 2.648832321166992 + }, + { + "auxiliary_loss_clip": 0.0110108, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.03922498, + "balance_loss_mlp": 1.019907, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 1.7612466471353732, + "language_loss": 0.79112864, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81246114, + "num_input_tokens_seen": 236336210, + "step": 10949, + "time_per_iteration": 2.4751579761505127 + }, + { + "auxiliary_loss_clip": 0.01024459, + "auxiliary_loss_mlp": 0.01002316, + "balance_loss_clip": 1.00981212, + "balance_loss_mlp": 1.00112963, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7375984788693628, + "language_loss": 0.61800647, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63827425, + "num_input_tokens_seen": 236403090, + "step": 10950, + "time_per_iteration": 4.598621845245361 + }, + { + "auxiliary_loss_clip": 0.01099612, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.03799856, + "balance_loss_mlp": 1.02193141, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 2.8261464630613324, + "language_loss": 0.67363375, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69495964, + "num_input_tokens_seen": 236420475, + "step": 10951, + "time_per_iteration": 2.4643030166625977 + }, + { + "auxiliary_loss_clip": 0.0109998, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.03702962, + "balance_loss_mlp": 1.02122259, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.8025092608488218, + "language_loss": 0.76593673, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.78727067, + "num_input_tokens_seen": 236441915, + "step": 10952, + "time_per_iteration": 2.5297510623931885 + }, + { + "auxiliary_loss_clip": 0.01112136, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.04058599, + "balance_loss_mlp": 1.0190928, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.905946059645659, + "language_loss": 0.73438066, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75581819, + "num_input_tokens_seen": 236460340, + "step": 10953, + "time_per_iteration": 2.4162495136260986 + }, + { + "auxiliary_loss_clip": 0.01080981, + "auxiliary_loss_mlp": 0.01039711, + "balance_loss_clip": 1.03851867, + "balance_loss_mlp": 1.02706921, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 2.598445984405921, + "language_loss": 0.79054165, + "learning_rate": 1.102949515683546e-06, + "loss": 0.81174862, + "num_input_tokens_seen": 236478280, + "step": 10954, + "time_per_iteration": 2.5780975818634033 + }, + { + "auxiliary_loss_clip": 0.01090988, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.03489184, + "balance_loss_mlp": 1.02341712, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 2.2584192357820436, + "language_loss": 0.69388902, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71516478, + "num_input_tokens_seen": 236493225, + "step": 10955, + "time_per_iteration": 2.491375207901001 + }, + { + "auxiliary_loss_clip": 0.01084578, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.03599167, + "balance_loss_mlp": 1.02216327, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 1.7221963211319151, + "language_loss": 0.8075878, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.82877076, + "num_input_tokens_seen": 236514420, + "step": 10956, + "time_per_iteration": 2.542689085006714 + }, + { + "auxiliary_loss_clip": 0.01100191, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.039029, + "balance_loss_mlp": 1.02321589, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 1.902009806715992, + "language_loss": 0.8131268, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83449566, + "num_input_tokens_seen": 236532785, + "step": 10957, + "time_per_iteration": 2.493976593017578 + }, + { + "auxiliary_loss_clip": 0.0108862, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.03828955, + "balance_loss_mlp": 1.01634216, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.7074458824319612, + "language_loss": 0.7615391, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.78270066, + "num_input_tokens_seen": 236553330, + "step": 10958, + "time_per_iteration": 2.7152297496795654 + }, + { + "auxiliary_loss_clip": 0.01070451, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.03609145, + "balance_loss_mlp": 1.02128768, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.7686590651628724, + "language_loss": 0.75199616, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.77304977, + "num_input_tokens_seen": 236572960, + "step": 10959, + "time_per_iteration": 2.5475075244903564 + }, + { + "auxiliary_loss_clip": 0.01104112, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.04014933, + "balance_loss_mlp": 1.01844728, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 1.5580300172735329, + "language_loss": 0.65254867, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.67389166, + "num_input_tokens_seen": 236594090, + "step": 10960, + "time_per_iteration": 2.502894163131714 + }, + { + "auxiliary_loss_clip": 0.01119373, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.04162085, + "balance_loss_mlp": 1.01932406, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 1.9627093670526845, + "language_loss": 0.81347913, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.8350026, + "num_input_tokens_seen": 236610190, + "step": 10961, + "time_per_iteration": 2.4216299057006836 + }, + { + "auxiliary_loss_clip": 0.01077192, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.03871357, + "balance_loss_mlp": 1.01592243, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 1.6950360688711035, + "language_loss": 0.73653126, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.75758725, + "num_input_tokens_seen": 236631575, + "step": 10962, + "time_per_iteration": 2.5889780521392822 + }, + { + "auxiliary_loss_clip": 0.01098795, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.03770304, + "balance_loss_mlp": 1.02128863, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 1.7890045853516159, + "language_loss": 0.79936874, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.82069457, + "num_input_tokens_seen": 236649815, + "step": 10963, + "time_per_iteration": 2.45037841796875 + }, + { + "auxiliary_loss_clip": 0.01061054, + "auxiliary_loss_mlp": 0.00775937, + "balance_loss_clip": 1.03649354, + "balance_loss_mlp": 1.00082719, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 1.9761469267055383, + "language_loss": 0.78209752, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.80046743, + "num_input_tokens_seen": 236668335, + "step": 10964, + "time_per_iteration": 2.54478120803833 + }, + { + "auxiliary_loss_clip": 0.01074612, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.03261518, + "balance_loss_mlp": 1.02298796, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 1.7584604393034582, + "language_loss": 0.74143445, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.76253772, + "num_input_tokens_seen": 236688945, + "step": 10965, + "time_per_iteration": 4.084998846054077 + }, + { + "auxiliary_loss_clip": 0.01080276, + "auxiliary_loss_mlp": 0.01033446, + "balance_loss_clip": 1.03539217, + "balance_loss_mlp": 1.01968336, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 2.6385338411471624, + "language_loss": 0.74079597, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.76193321, + "num_input_tokens_seen": 236707055, + "step": 10966, + "time_per_iteration": 2.534304141998291 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.03779316, + "balance_loss_mlp": 1.0173595, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.6061505437381325, + "language_loss": 0.76906955, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79040074, + "num_input_tokens_seen": 236725900, + "step": 10967, + "time_per_iteration": 2.4930365085601807 + }, + { + "auxiliary_loss_clip": 0.01025079, + "auxiliary_loss_mlp": 0.01001625, + "balance_loss_clip": 1.01036859, + "balance_loss_mlp": 1.00046313, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 0.6929421086524978, + "language_loss": 0.48446375, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50473082, + "num_input_tokens_seen": 236788415, + "step": 10968, + "time_per_iteration": 3.0278165340423584 + }, + { + "auxiliary_loss_clip": 0.0106944, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.03471136, + "balance_loss_mlp": 1.02331448, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 1.7341229612347522, + "language_loss": 0.78908086, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.81014407, + "num_input_tokens_seen": 236805155, + "step": 10969, + "time_per_iteration": 2.576239585876465 + }, + { + "auxiliary_loss_clip": 0.01103133, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.03881121, + "balance_loss_mlp": 1.01865387, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 2.0726428050156915, + "language_loss": 0.65564054, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.67698324, + "num_input_tokens_seen": 236824360, + "step": 10970, + "time_per_iteration": 2.446653366088867 + }, + { + "auxiliary_loss_clip": 0.01098167, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.03525627, + "balance_loss_mlp": 1.01621282, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.59262898426004, + "language_loss": 0.76361609, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78489089, + "num_input_tokens_seen": 236844640, + "step": 10971, + "time_per_iteration": 2.468273162841797 + }, + { + "auxiliary_loss_clip": 0.01047809, + "auxiliary_loss_mlp": 0.0104316, + "balance_loss_clip": 1.03097677, + "balance_loss_mlp": 1.02834845, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 2.3601637045249344, + "language_loss": 0.69813663, + "learning_rate": 1.096689432978629e-06, + "loss": 0.71904635, + "num_input_tokens_seen": 236861160, + "step": 10972, + "time_per_iteration": 2.5589993000030518 + }, + { + "auxiliary_loss_clip": 0.01100087, + "auxiliary_loss_mlp": 0.01025501, + "balance_loss_clip": 1.03838241, + "balance_loss_mlp": 1.01254272, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 1.8385301108950134, + "language_loss": 0.56059557, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.58185148, + "num_input_tokens_seen": 236880465, + "step": 10973, + "time_per_iteration": 2.5415778160095215 + }, + { + "auxiliary_loss_clip": 0.01098624, + "auxiliary_loss_mlp": 0.01036968, + "balance_loss_clip": 1.04246581, + "balance_loss_mlp": 1.02411151, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 1.984339434118089, + "language_loss": 0.78867722, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.81003308, + "num_input_tokens_seen": 236897730, + "step": 10974, + "time_per_iteration": 2.486203908920288 + }, + { + "auxiliary_loss_clip": 0.01104184, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.04451847, + "balance_loss_mlp": 1.02133381, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.1022021121250347, + "language_loss": 0.68915832, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.71053654, + "num_input_tokens_seen": 236917300, + "step": 10975, + "time_per_iteration": 4.069784879684448 + }, + { + "auxiliary_loss_clip": 0.01095246, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.03740501, + "balance_loss_mlp": 1.01853645, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 1.7124519029712655, + "language_loss": 0.70454615, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.72580492, + "num_input_tokens_seen": 236935590, + "step": 10976, + "time_per_iteration": 2.4473013877868652 + }, + { + "auxiliary_loss_clip": 0.01086367, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.04070854, + "balance_loss_mlp": 1.01798117, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 1.6096996602049076, + "language_loss": 0.67490923, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69608271, + "num_input_tokens_seen": 236952830, + "step": 10977, + "time_per_iteration": 2.5055909156799316 + }, + { + "auxiliary_loss_clip": 0.01077798, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.03705871, + "balance_loss_mlp": 1.02072573, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 2.059863630661622, + "language_loss": 0.81186628, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83298743, + "num_input_tokens_seen": 236971930, + "step": 10978, + "time_per_iteration": 3.6813342571258545 + }, + { + "auxiliary_loss_clip": 0.01083715, + "auxiliary_loss_mlp": 0.01036339, + "balance_loss_clip": 1.03872347, + "balance_loss_mlp": 1.02321994, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 2.1710393966579833, + "language_loss": 0.67784524, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69904578, + "num_input_tokens_seen": 236989920, + "step": 10979, + "time_per_iteration": 2.4972851276397705 + }, + { + "auxiliary_loss_clip": 0.01081553, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.03612113, + "balance_loss_mlp": 1.01772308, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 2.160915650140424, + "language_loss": 0.73590022, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.75702733, + "num_input_tokens_seen": 237006570, + "step": 10980, + "time_per_iteration": 2.4968721866607666 + }, + { + "auxiliary_loss_clip": 0.01072428, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.03458047, + "balance_loss_mlp": 1.01987672, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.8189079275830273, + "language_loss": 0.73054236, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.75158083, + "num_input_tokens_seen": 237028415, + "step": 10981, + "time_per_iteration": 2.5945611000061035 + }, + { + "auxiliary_loss_clip": 0.01061237, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.03617859, + "balance_loss_mlp": 1.01947796, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 2.2119233270772427, + "language_loss": 0.68915927, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.71009159, + "num_input_tokens_seen": 237046595, + "step": 10982, + "time_per_iteration": 2.6206977367401123 + }, + { + "auxiliary_loss_clip": 0.01101014, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.03880858, + "balance_loss_mlp": 1.01538062, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 1.8235714228211877, + "language_loss": 0.69820869, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71949428, + "num_input_tokens_seen": 237066150, + "step": 10983, + "time_per_iteration": 2.441436529159546 + }, + { + "auxiliary_loss_clip": 0.01101176, + "auxiliary_loss_mlp": 0.0102824, + "balance_loss_clip": 1.03621328, + "balance_loss_mlp": 1.01543665, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 1.8070112822571271, + "language_loss": 0.70856494, + "learning_rate": 1.092522205413239e-06, + "loss": 0.72985911, + "num_input_tokens_seen": 237087060, + "step": 10984, + "time_per_iteration": 2.562506675720215 + }, + { + "auxiliary_loss_clip": 0.01077949, + "auxiliary_loss_mlp": 0.01035555, + "balance_loss_clip": 1.03586757, + "balance_loss_mlp": 1.02330065, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.8850369079835443, + "language_loss": 0.83826649, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.85940158, + "num_input_tokens_seen": 237103825, + "step": 10985, + "time_per_iteration": 2.495847702026367 + }, + { + "auxiliary_loss_clip": 0.01103414, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.03977108, + "balance_loss_mlp": 1.01812196, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 2.318051696009068, + "language_loss": 0.74383152, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.76517636, + "num_input_tokens_seen": 237121740, + "step": 10986, + "time_per_iteration": 2.4576570987701416 + }, + { + "auxiliary_loss_clip": 0.01098933, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.0374552, + "balance_loss_mlp": 1.016258, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 2.1454486848999053, + "language_loss": 0.78987896, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.81115699, + "num_input_tokens_seen": 237139565, + "step": 10987, + "time_per_iteration": 2.430408477783203 + }, + { + "auxiliary_loss_clip": 0.01024192, + "auxiliary_loss_mlp": 0.01005118, + "balance_loss_clip": 1.02335227, + "balance_loss_mlp": 1.0037415, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.8136888044935321, + "language_loss": 0.54085571, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56114882, + "num_input_tokens_seen": 237201055, + "step": 10988, + "time_per_iteration": 3.1638481616973877 + }, + { + "auxiliary_loss_clip": 0.01055151, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.03667259, + "balance_loss_mlp": 1.02191424, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 1.644511959996591, + "language_loss": 0.77476192, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79564619, + "num_input_tokens_seen": 237221805, + "step": 10989, + "time_per_iteration": 4.1973841190338135 + }, + { + "auxiliary_loss_clip": 0.01090509, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.03903115, + "balance_loss_mlp": 1.02059042, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 1.9304872509775226, + "language_loss": 0.77591699, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.79715276, + "num_input_tokens_seen": 237238270, + "step": 10990, + "time_per_iteration": 2.550358533859253 + }, + { + "auxiliary_loss_clip": 0.01113588, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.03814435, + "balance_loss_mlp": 1.0189414, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 1.919083886255136, + "language_loss": 0.60825896, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.62971485, + "num_input_tokens_seen": 237255400, + "step": 10991, + "time_per_iteration": 2.413503646850586 + }, + { + "auxiliary_loss_clip": 0.01088693, + "auxiliary_loss_mlp": 0.0103784, + "balance_loss_clip": 1.03618765, + "balance_loss_mlp": 1.0241127, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 4.257195668675079, + "language_loss": 0.68804443, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70930976, + "num_input_tokens_seen": 237273105, + "step": 10992, + "time_per_iteration": 2.499086380004883 + }, + { + "auxiliary_loss_clip": 0.01100413, + "auxiliary_loss_mlp": 0.01031487, + "balance_loss_clip": 1.03805089, + "balance_loss_mlp": 1.01833868, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 2.1569943822443918, + "language_loss": 0.87988013, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.9011991, + "num_input_tokens_seen": 237292650, + "step": 10993, + "time_per_iteration": 2.4639675617218018 + }, + { + "auxiliary_loss_clip": 0.01112902, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.04266965, + "balance_loss_mlp": 1.0181433, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 1.5981269044471866, + "language_loss": 0.66777837, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.68923408, + "num_input_tokens_seen": 237312865, + "step": 10994, + "time_per_iteration": 2.5051426887512207 + }, + { + "auxiliary_loss_clip": 0.01079605, + "auxiliary_loss_mlp": 0.01036271, + "balance_loss_clip": 1.0378716, + "balance_loss_mlp": 1.02227008, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 2.273708310503288, + "language_loss": 0.76751757, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.78867638, + "num_input_tokens_seen": 237331210, + "step": 10995, + "time_per_iteration": 2.5188329219818115 + }, + { + "auxiliary_loss_clip": 0.01094174, + "auxiliary_loss_mlp": 0.0102565, + "balance_loss_clip": 1.03999281, + "balance_loss_mlp": 1.01422954, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 1.8600831001469789, + "language_loss": 0.74844396, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76964217, + "num_input_tokens_seen": 237349455, + "step": 10996, + "time_per_iteration": 2.5050435066223145 + }, + { + "auxiliary_loss_clip": 0.01113365, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.04009104, + "balance_loss_mlp": 1.02091622, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.6219670023180544, + "language_loss": 0.69003892, + "learning_rate": 1.088013301487126e-06, + "loss": 0.71150899, + "num_input_tokens_seen": 237367100, + "step": 10997, + "time_per_iteration": 2.477360248565674 + }, + { + "auxiliary_loss_clip": 0.01095453, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.03875208, + "balance_loss_mlp": 1.01846588, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 2.8772576439543047, + "language_loss": 0.6835956, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.70485789, + "num_input_tokens_seen": 237384840, + "step": 10998, + "time_per_iteration": 2.487952470779419 + }, + { + "auxiliary_loss_clip": 0.01025715, + "auxiliary_loss_mlp": 0.01006976, + "balance_loss_clip": 1.01036692, + "balance_loss_mlp": 1.00571275, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.6523546940357365, + "language_loss": 0.51102513, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53135204, + "num_input_tokens_seen": 237443355, + "step": 10999, + "time_per_iteration": 3.0008773803710938 + }, + { + "auxiliary_loss_clip": 0.01115479, + "auxiliary_loss_mlp": 0.0077874, + "balance_loss_clip": 1.03927863, + "balance_loss_mlp": 1.00073576, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 2.4950486333467707, + "language_loss": 0.71019769, + "learning_rate": 1.086973614127679e-06, + "loss": 0.7291398, + "num_input_tokens_seen": 237459205, + "step": 11000, + "time_per_iteration": 2.4394960403442383 + }, + { + "auxiliary_loss_clip": 0.01082515, + "auxiliary_loss_mlp": 0.01033836, + "balance_loss_clip": 1.03726196, + "balance_loss_mlp": 1.02219558, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 1.4734829238286142, + "language_loss": 0.65360308, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.6747666, + "num_input_tokens_seen": 237483580, + "step": 11001, + "time_per_iteration": 2.619685649871826 + }, + { + "auxiliary_loss_clip": 0.01110554, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.03772509, + "balance_loss_mlp": 1.01867056, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 1.709935493910112, + "language_loss": 0.73178321, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75320005, + "num_input_tokens_seen": 237502860, + "step": 11002, + "time_per_iteration": 2.461899518966675 + }, + { + "auxiliary_loss_clip": 0.01096732, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.03563929, + "balance_loss_mlp": 1.01932693, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 3.4400089592364496, + "language_loss": 0.78649616, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.80778718, + "num_input_tokens_seen": 237521030, + "step": 11003, + "time_per_iteration": 2.440906047821045 + }, + { + "auxiliary_loss_clip": 0.01103823, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.03953075, + "balance_loss_mlp": 1.02055848, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 2.2924832909372017, + "language_loss": 0.68715, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.70853508, + "num_input_tokens_seen": 237539585, + "step": 11004, + "time_per_iteration": 3.8934972286224365 + }, + { + "auxiliary_loss_clip": 0.01103966, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.03835416, + "balance_loss_mlp": 1.02031851, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 1.9919989618546254, + "language_loss": 0.69901121, + "learning_rate": 1.085241494478132e-06, + "loss": 0.72039473, + "num_input_tokens_seen": 237557655, + "step": 11005, + "time_per_iteration": 2.4183812141418457 + }, + { + "auxiliary_loss_clip": 0.0109205, + "auxiliary_loss_mlp": 0.01025635, + "balance_loss_clip": 1.03761995, + "balance_loss_mlp": 1.01343417, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 4.390731328790315, + "language_loss": 0.7838586, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80503547, + "num_input_tokens_seen": 237577000, + "step": 11006, + "time_per_iteration": 2.529981851577759 + }, + { + "auxiliary_loss_clip": 0.01100935, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.03834558, + "balance_loss_mlp": 1.01900578, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 1.6796909610230217, + "language_loss": 0.76026553, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78159404, + "num_input_tokens_seen": 237597960, + "step": 11007, + "time_per_iteration": 2.480257749557495 + }, + { + "auxiliary_loss_clip": 0.01100159, + "auxiliary_loss_mlp": 0.01027505, + "balance_loss_clip": 1.03976488, + "balance_loss_mlp": 1.01554191, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.972354472547723, + "language_loss": 0.78817308, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80944967, + "num_input_tokens_seen": 237616385, + "step": 11008, + "time_per_iteration": 2.459231376647949 + }, + { + "auxiliary_loss_clip": 0.01116675, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.03948951, + "balance_loss_mlp": 1.01708412, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 5.129168066341515, + "language_loss": 0.8207221, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.84219527, + "num_input_tokens_seen": 237634930, + "step": 11009, + "time_per_iteration": 2.393583059310913 + }, + { + "auxiliary_loss_clip": 0.01011306, + "auxiliary_loss_mlp": 0.01000762, + "balance_loss_clip": 1.02146626, + "balance_loss_mlp": 0.99929583, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 0.9858169684443127, + "language_loss": 0.67335236, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69347298, + "num_input_tokens_seen": 237693175, + "step": 11010, + "time_per_iteration": 3.017307758331299 + }, + { + "auxiliary_loss_clip": 0.01102465, + "auxiliary_loss_mlp": 0.01030732, + "balance_loss_clip": 1.03749406, + "balance_loss_mlp": 1.01699889, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.628277938455223, + "language_loss": 0.71313274, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73446476, + "num_input_tokens_seen": 237713160, + "step": 11011, + "time_per_iteration": 2.4634921550750732 + }, + { + "auxiliary_loss_clip": 0.01104538, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.04228747, + "balance_loss_mlp": 1.02153134, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.5727019669319693, + "language_loss": 0.72055382, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74192894, + "num_input_tokens_seen": 237733600, + "step": 11012, + "time_per_iteration": 2.486243724822998 + }, + { + "auxiliary_loss_clip": 0.01096566, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.03749752, + "balance_loss_mlp": 1.02276647, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.8586146981480711, + "language_loss": 0.79281354, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.81411707, + "num_input_tokens_seen": 237752135, + "step": 11013, + "time_per_iteration": 2.4715611934661865 + }, + { + "auxiliary_loss_clip": 0.01091327, + "auxiliary_loss_mlp": 0.01030501, + "balance_loss_clip": 1.03793311, + "balance_loss_mlp": 1.01770949, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 2.3488175410974703, + "language_loss": 0.70410508, + "learning_rate": 1.082125865538971e-06, + "loss": 0.72532338, + "num_input_tokens_seen": 237770735, + "step": 11014, + "time_per_iteration": 3.997149705886841 + }, + { + "auxiliary_loss_clip": 0.01082963, + "auxiliary_loss_mlp": 0.00775364, + "balance_loss_clip": 1.04109144, + "balance_loss_mlp": 1.00062799, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 1.9432058746564302, + "language_loss": 0.77178395, + "learning_rate": 1.081779858400137e-06, + "loss": 0.79036719, + "num_input_tokens_seen": 237789005, + "step": 11015, + "time_per_iteration": 2.5155105590820312 + }, + { + "auxiliary_loss_clip": 0.01102567, + "auxiliary_loss_mlp": 0.00777864, + "balance_loss_clip": 1.03893316, + "balance_loss_mlp": 1.00075507, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 1.6518801979456255, + "language_loss": 0.82479072, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.84359503, + "num_input_tokens_seen": 237807740, + "step": 11016, + "time_per_iteration": 2.4491090774536133 + }, + { + "auxiliary_loss_clip": 0.01097478, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.037956, + "balance_loss_mlp": 1.02090573, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 2.6043580183359185, + "language_loss": 0.69648874, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.71780205, + "num_input_tokens_seen": 237826340, + "step": 11017, + "time_per_iteration": 3.76121187210083 + }, + { + "auxiliary_loss_clip": 0.01084264, + "auxiliary_loss_mlp": 0.0103871, + "balance_loss_clip": 1.03582311, + "balance_loss_mlp": 1.02560854, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.6747918890679963, + "language_loss": 0.77092427, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79215401, + "num_input_tokens_seen": 237848305, + "step": 11018, + "time_per_iteration": 2.725764751434326 + }, + { + "auxiliary_loss_clip": 0.01090619, + "auxiliary_loss_mlp": 0.01038441, + "balance_loss_clip": 1.03703475, + "balance_loss_mlp": 1.02535176, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 1.944500963371467, + "language_loss": 0.83401096, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.85530162, + "num_input_tokens_seen": 237867020, + "step": 11019, + "time_per_iteration": 2.47916841506958 + }, + { + "auxiliary_loss_clip": 0.01096457, + "auxiliary_loss_mlp": 0.00780985, + "balance_loss_clip": 1.03681719, + "balance_loss_mlp": 1.00060821, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 18.107467321729853, + "language_loss": 0.71573102, + "learning_rate": 1.080050345253328e-06, + "loss": 0.73450541, + "num_input_tokens_seen": 237886710, + "step": 11020, + "time_per_iteration": 2.4919610023498535 + }, + { + "auxiliary_loss_clip": 0.01092854, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.03759432, + "balance_loss_mlp": 1.01643693, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 1.6395005459875438, + "language_loss": 0.72306526, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74430126, + "num_input_tokens_seen": 237904795, + "step": 11021, + "time_per_iteration": 2.489802598953247 + }, + { + "auxiliary_loss_clip": 0.01093746, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.04370689, + "balance_loss_mlp": 1.02111077, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 3.2837095536208594, + "language_loss": 0.83166993, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85295159, + "num_input_tokens_seen": 237921320, + "step": 11022, + "time_per_iteration": 2.4753940105438232 + }, + { + "auxiliary_loss_clip": 0.01097826, + "auxiliary_loss_mlp": 0.01034977, + "balance_loss_clip": 1.03901649, + "balance_loss_mlp": 1.02067208, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 3.1043510060610986, + "language_loss": 0.7276206, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.74894857, + "num_input_tokens_seen": 237933525, + "step": 11023, + "time_per_iteration": 2.4398088455200195 + }, + { + "auxiliary_loss_clip": 0.01079933, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.03523302, + "balance_loss_mlp": 1.02095616, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 2.0781595126309935, + "language_loss": 0.74720514, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.76834118, + "num_input_tokens_seen": 237953395, + "step": 11024, + "time_per_iteration": 2.541405439376831 + }, + { + "auxiliary_loss_clip": 0.01082029, + "auxiliary_loss_mlp": 0.01032471, + "balance_loss_clip": 1.03781533, + "balance_loss_mlp": 1.0195787, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.5532846733122594, + "language_loss": 0.6981194, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71926439, + "num_input_tokens_seen": 237971445, + "step": 11025, + "time_per_iteration": 2.4952774047851562 + }, + { + "auxiliary_loss_clip": 0.01115946, + "auxiliary_loss_mlp": 0.01036034, + "balance_loss_clip": 1.04167378, + "balance_loss_mlp": 1.02349949, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.4818957152047574, + "language_loss": 0.7866993, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.80821902, + "num_input_tokens_seen": 237989965, + "step": 11026, + "time_per_iteration": 2.4353580474853516 + }, + { + "auxiliary_loss_clip": 0.01102764, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.03906894, + "balance_loss_mlp": 1.02323008, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.7963505684550947, + "language_loss": 0.76002681, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.78140831, + "num_input_tokens_seen": 238006820, + "step": 11027, + "time_per_iteration": 2.460071325302124 + }, + { + "auxiliary_loss_clip": 0.01086798, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.03689575, + "balance_loss_mlp": 1.02267694, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 2.19797586189494, + "language_loss": 0.70013529, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.7213676, + "num_input_tokens_seen": 238022560, + "step": 11028, + "time_per_iteration": 4.061816692352295 + }, + { + "auxiliary_loss_clip": 0.01101366, + "auxiliary_loss_mlp": 0.01035288, + "balance_loss_clip": 1.03830373, + "balance_loss_mlp": 1.0241363, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 2.1026690871256903, + "language_loss": 0.80030411, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.82167065, + "num_input_tokens_seen": 238041895, + "step": 11029, + "time_per_iteration": 2.447158098220825 + }, + { + "auxiliary_loss_clip": 0.01115954, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.03995752, + "balance_loss_mlp": 1.0232873, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 2.0244936742399906, + "language_loss": 0.76333475, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.78486156, + "num_input_tokens_seen": 238060445, + "step": 11030, + "time_per_iteration": 2.3951337337493896 + }, + { + "auxiliary_loss_clip": 0.01108851, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.04064345, + "balance_loss_mlp": 1.01918721, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 3.200624876679087, + "language_loss": 0.75015914, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.77156997, + "num_input_tokens_seen": 238077080, + "step": 11031, + "time_per_iteration": 2.4136276245117188 + }, + { + "auxiliary_loss_clip": 0.01102345, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.03776598, + "balance_loss_mlp": 1.02321208, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 3.58445942334113, + "language_loss": 0.74825865, + "learning_rate": 1.075903075048228e-06, + "loss": 0.76964498, + "num_input_tokens_seen": 238091045, + "step": 11032, + "time_per_iteration": 2.382577657699585 + }, + { + "auxiliary_loss_clip": 0.01071824, + "auxiliary_loss_mlp": 0.01032141, + "balance_loss_clip": 1.03523183, + "balance_loss_mlp": 1.0201664, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 1.6395413076324972, + "language_loss": 0.80584621, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82688594, + "num_input_tokens_seen": 238110220, + "step": 11033, + "time_per_iteration": 2.58164381980896 + }, + { + "auxiliary_loss_clip": 0.01095968, + "auxiliary_loss_mlp": 0.01032374, + "balance_loss_clip": 1.04041266, + "balance_loss_mlp": 1.01939249, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 1.7030126152652072, + "language_loss": 0.80421001, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82549345, + "num_input_tokens_seen": 238130400, + "step": 11034, + "time_per_iteration": 2.50724196434021 + }, + { + "auxiliary_loss_clip": 0.01095566, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.03715563, + "balance_loss_mlp": 1.01880121, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.5553671801785134, + "language_loss": 0.75948584, + "learning_rate": 1.074867045054166e-06, + "loss": 0.78074843, + "num_input_tokens_seen": 238148165, + "step": 11035, + "time_per_iteration": 2.4632720947265625 + }, + { + "auxiliary_loss_clip": 0.0107859, + "auxiliary_loss_mlp": 0.01028749, + "balance_loss_clip": 1.03451133, + "balance_loss_mlp": 1.01614892, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 1.7007805022146647, + "language_loss": 0.83432829, + "learning_rate": 1.074521771867622e-06, + "loss": 0.85540169, + "num_input_tokens_seen": 238166360, + "step": 11036, + "time_per_iteration": 2.5022127628326416 + }, + { + "auxiliary_loss_clip": 0.01031433, + "auxiliary_loss_mlp": 0.01001448, + "balance_loss_clip": 1.00724244, + "balance_loss_mlp": 1.00009525, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.8203308591107737, + "language_loss": 0.52523392, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54556274, + "num_input_tokens_seen": 238227630, + "step": 11037, + "time_per_iteration": 2.998868227005005 + }, + { + "auxiliary_loss_clip": 0.01063017, + "auxiliary_loss_mlp": 0.01042461, + "balance_loss_clip": 1.03853595, + "balance_loss_mlp": 1.02875805, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 1.6696085787318826, + "language_loss": 0.78921032, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.81026512, + "num_input_tokens_seen": 238248435, + "step": 11038, + "time_per_iteration": 2.639944553375244 + }, + { + "auxiliary_loss_clip": 0.01080718, + "auxiliary_loss_mlp": 0.01041256, + "balance_loss_clip": 1.03707814, + "balance_loss_mlp": 1.02771997, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 1.850676655963153, + "language_loss": 0.63733089, + "learning_rate": 1.073486162925716e-06, + "loss": 0.65855056, + "num_input_tokens_seen": 238268755, + "step": 11039, + "time_per_iteration": 2.660857915878296 + }, + { + "auxiliary_loss_clip": 0.01080524, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.04296041, + "balance_loss_mlp": 1.01706052, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 1.5909743980663238, + "language_loss": 0.63850224, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.65960276, + "num_input_tokens_seen": 238290120, + "step": 11040, + "time_per_iteration": 2.582897663116455 + }, + { + "auxiliary_loss_clip": 0.01073115, + "auxiliary_loss_mlp": 0.01041117, + "balance_loss_clip": 1.03349447, + "balance_loss_mlp": 1.02790236, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 2.0155489009484904, + "language_loss": 0.72196412, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.74310637, + "num_input_tokens_seen": 238309290, + "step": 11041, + "time_per_iteration": 2.54062819480896 + }, + { + "auxiliary_loss_clip": 0.01097242, + "auxiliary_loss_mlp": 0.01046896, + "balance_loss_clip": 1.03560686, + "balance_loss_mlp": 1.03273344, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 2.311097199438913, + "language_loss": 0.62026727, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.64170873, + "num_input_tokens_seen": 238327280, + "step": 11042, + "time_per_iteration": 2.542790174484253 + }, + { + "auxiliary_loss_clip": 0.01105531, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.03793025, + "balance_loss_mlp": 1.01692867, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 6.411231550692272, + "language_loss": 0.68529499, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.70665842, + "num_input_tokens_seen": 238346330, + "step": 11043, + "time_per_iteration": 2.5111215114593506 + }, + { + "auxiliary_loss_clip": 0.01098685, + "auxiliary_loss_mlp": 0.01028219, + "balance_loss_clip": 1.03901553, + "balance_loss_mlp": 1.01730561, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.516941247821142, + "language_loss": 0.83923733, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.86050636, + "num_input_tokens_seen": 238364650, + "step": 11044, + "time_per_iteration": 3.9250993728637695 + }, + { + "auxiliary_loss_clip": 0.01077848, + "auxiliary_loss_mlp": 0.01035667, + "balance_loss_clip": 1.03925312, + "balance_loss_mlp": 1.02232742, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 2.2120417437942765, + "language_loss": 0.69508755, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.7162227, + "num_input_tokens_seen": 238381630, + "step": 11045, + "time_per_iteration": 2.508765459060669 + }, + { + "auxiliary_loss_clip": 0.01101517, + "auxiliary_loss_mlp": 0.01028603, + "balance_loss_clip": 1.03858411, + "balance_loss_mlp": 1.01583576, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 1.4497139046806558, + "language_loss": 0.64509344, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.66639471, + "num_input_tokens_seen": 238402595, + "step": 11046, + "time_per_iteration": 2.511579990386963 + }, + { + "auxiliary_loss_clip": 0.01084002, + "auxiliary_loss_mlp": 0.01026564, + "balance_loss_clip": 1.03951216, + "balance_loss_mlp": 1.01477396, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.4759537692172748, + "language_loss": 0.71503913, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73614478, + "num_input_tokens_seen": 238426860, + "step": 11047, + "time_per_iteration": 2.6975669860839844 + }, + { + "auxiliary_loss_clip": 0.01049199, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.03967118, + "balance_loss_mlp": 1.02270758, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 2.040326727902646, + "language_loss": 0.77327687, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79412842, + "num_input_tokens_seen": 238443990, + "step": 11048, + "time_per_iteration": 2.559532403945923 + }, + { + "auxiliary_loss_clip": 0.01005078, + "auxiliary_loss_mlp": 0.01003821, + "balance_loss_clip": 1.00852942, + "balance_loss_mlp": 1.00248015, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.7548289262244776, + "language_loss": 0.5506056, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57069457, + "num_input_tokens_seen": 238503045, + "step": 11049, + "time_per_iteration": 3.12370228767395 + }, + { + "auxiliary_loss_clip": 0.01102667, + "auxiliary_loss_mlp": 0.01034, + "balance_loss_clip": 1.03976929, + "balance_loss_mlp": 1.0222342, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 2.052607920898293, + "language_loss": 0.64532048, + "learning_rate": 1.069691638104648e-06, + "loss": 0.66668713, + "num_input_tokens_seen": 238527320, + "step": 11050, + "time_per_iteration": 2.5548431873321533 + }, + { + "auxiliary_loss_clip": 0.01110263, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.03823566, + "balance_loss_mlp": 1.02245498, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 2.932445550284249, + "language_loss": 0.7869916, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.80843532, + "num_input_tokens_seen": 238546030, + "step": 11051, + "time_per_iteration": 2.4474034309387207 + }, + { + "auxiliary_loss_clip": 0.01089477, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.03912175, + "balance_loss_mlp": 1.02114999, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 1.803333543837533, + "language_loss": 0.85182953, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87306458, + "num_input_tokens_seen": 238564175, + "step": 11052, + "time_per_iteration": 2.4945504665374756 + }, + { + "auxiliary_loss_clip": 0.01068787, + "auxiliary_loss_mlp": 0.01040114, + "balance_loss_clip": 1.03585339, + "balance_loss_mlp": 1.02602363, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.3828696091322534, + "language_loss": 0.74775839, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.76884735, + "num_input_tokens_seen": 238581010, + "step": 11053, + "time_per_iteration": 3.9912421703338623 + }, + { + "auxiliary_loss_clip": 0.01081315, + "auxiliary_loss_mlp": 0.01031788, + "balance_loss_clip": 1.03669715, + "balance_loss_mlp": 1.01995039, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 1.6724097544757217, + "language_loss": 0.79753184, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81866282, + "num_input_tokens_seen": 238601365, + "step": 11054, + "time_per_iteration": 2.521657705307007 + }, + { + "auxiliary_loss_clip": 0.01067233, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.03704643, + "balance_loss_mlp": 1.02068305, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 2.4303243558294185, + "language_loss": 0.7389338, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.75992537, + "num_input_tokens_seen": 238619850, + "step": 11055, + "time_per_iteration": 2.558354616165161 + }, + { + "auxiliary_loss_clip": 0.01080192, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.03658843, + "balance_loss_mlp": 1.02558947, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 1.7196135833671364, + "language_loss": 0.72989964, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.75109333, + "num_input_tokens_seen": 238637635, + "step": 11056, + "time_per_iteration": 3.8822362422943115 + }, + { + "auxiliary_loss_clip": 0.01068361, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.03550136, + "balance_loss_mlp": 1.02065301, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 2.7230784592721635, + "language_loss": 0.69659114, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.71760118, + "num_input_tokens_seen": 238656200, + "step": 11057, + "time_per_iteration": 2.5802342891693115 + }, + { + "auxiliary_loss_clip": 0.01101184, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.03816593, + "balance_loss_mlp": 1.01741576, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 2.0163708790592785, + "language_loss": 0.80518931, + "learning_rate": 1.066934663776291e-06, + "loss": 0.82650208, + "num_input_tokens_seen": 238675005, + "step": 11058, + "time_per_iteration": 2.4786431789398193 + }, + { + "auxiliary_loss_clip": 0.01008359, + "auxiliary_loss_mlp": 0.01006105, + "balance_loss_clip": 1.0121814, + "balance_loss_mlp": 1.00485349, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.7852243734112925, + "language_loss": 0.62593806, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64608276, + "num_input_tokens_seen": 238731425, + "step": 11059, + "time_per_iteration": 2.9947149753570557 + }, + { + "auxiliary_loss_clip": 0.01098271, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.03700089, + "balance_loss_mlp": 1.02195764, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 1.3759489156123905, + "language_loss": 0.78776848, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.80908728, + "num_input_tokens_seen": 238752020, + "step": 11060, + "time_per_iteration": 2.489473581314087 + }, + { + "auxiliary_loss_clip": 0.01080812, + "auxiliary_loss_mlp": 0.01034571, + "balance_loss_clip": 1.03755736, + "balance_loss_mlp": 1.02176762, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 1.7673861281751173, + "language_loss": 0.78749394, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.80864775, + "num_input_tokens_seen": 238769665, + "step": 11061, + "time_per_iteration": 2.5135514736175537 + }, + { + "auxiliary_loss_clip": 0.01090209, + "auxiliary_loss_mlp": 0.01026199, + "balance_loss_clip": 1.04024625, + "balance_loss_mlp": 1.01424205, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 2.161636323257331, + "language_loss": 0.56571209, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.58687615, + "num_input_tokens_seen": 238782180, + "step": 11062, + "time_per_iteration": 2.454463005065918 + }, + { + "auxiliary_loss_clip": 0.01101571, + "auxiliary_loss_mlp": 0.0103616, + "balance_loss_clip": 1.03606927, + "balance_loss_mlp": 1.0200901, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 1.8719271223409213, + "language_loss": 0.75977069, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.78114802, + "num_input_tokens_seen": 238800315, + "step": 11063, + "time_per_iteration": 2.436211109161377 + }, + { + "auxiliary_loss_clip": 0.01052916, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.03521311, + "balance_loss_mlp": 1.0276978, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 1.2966698767147418, + "language_loss": 0.70758957, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72852755, + "num_input_tokens_seen": 238822250, + "step": 11064, + "time_per_iteration": 2.627359390258789 + }, + { + "auxiliary_loss_clip": 0.01031388, + "auxiliary_loss_mlp": 0.01002544, + "balance_loss_clip": 1.00710082, + "balance_loss_mlp": 1.00122118, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8433256324966333, + "language_loss": 0.63093066, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65126991, + "num_input_tokens_seen": 238877190, + "step": 11065, + "time_per_iteration": 2.9449973106384277 + }, + { + "auxiliary_loss_clip": 0.01098872, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.03671205, + "balance_loss_mlp": 1.02226758, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 3.320997132583704, + "language_loss": 0.62270832, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64405656, + "num_input_tokens_seen": 238896010, + "step": 11066, + "time_per_iteration": 2.463271379470825 + }, + { + "auxiliary_loss_clip": 0.0107539, + "auxiliary_loss_mlp": 0.01036549, + "balance_loss_clip": 1.032251, + "balance_loss_mlp": 1.02158785, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.576563858074125, + "language_loss": 0.69751287, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.71863222, + "num_input_tokens_seen": 238918990, + "step": 11067, + "time_per_iteration": 2.588876485824585 + }, + { + "auxiliary_loss_clip": 0.01016358, + "auxiliary_loss_mlp": 0.01010499, + "balance_loss_clip": 1.01017344, + "balance_loss_mlp": 1.00903308, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.9293366097987829, + "language_loss": 0.72036862, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74063718, + "num_input_tokens_seen": 238975735, + "step": 11068, + "time_per_iteration": 4.478648900985718 + }, + { + "auxiliary_loss_clip": 0.01004688, + "auxiliary_loss_mlp": 0.00999319, + "balance_loss_clip": 1.00888586, + "balance_loss_mlp": 0.99824029, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.704191955748096, + "language_loss": 0.57716119, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.59720123, + "num_input_tokens_seen": 239042360, + "step": 11069, + "time_per_iteration": 3.2165768146514893 + }, + { + "auxiliary_loss_clip": 0.01011555, + "auxiliary_loss_mlp": 0.01000669, + "balance_loss_clip": 1.00640309, + "balance_loss_mlp": 0.99944746, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7527353264931975, + "language_loss": 0.63608408, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65620625, + "num_input_tokens_seen": 239109410, + "step": 11070, + "time_per_iteration": 3.0920493602752686 + }, + { + "auxiliary_loss_clip": 0.01110762, + "auxiliary_loss_mlp": 0.01028641, + "balance_loss_clip": 1.03683674, + "balance_loss_mlp": 1.01621318, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 2.142607374630704, + "language_loss": 0.58596104, + "learning_rate": 1.062459413096116e-06, + "loss": 0.60735506, + "num_input_tokens_seen": 239135345, + "step": 11071, + "time_per_iteration": 2.5841610431671143 + }, + { + "auxiliary_loss_clip": 0.01104709, + "auxiliary_loss_mlp": 0.01031332, + "balance_loss_clip": 1.04101026, + "balance_loss_mlp": 1.01936388, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 2.4895150804970387, + "language_loss": 0.72555882, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.74691927, + "num_input_tokens_seen": 239154340, + "step": 11072, + "time_per_iteration": 2.4609484672546387 + }, + { + "auxiliary_loss_clip": 0.01101748, + "auxiliary_loss_mlp": 0.01031234, + "balance_loss_clip": 1.03999305, + "balance_loss_mlp": 1.01726866, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 1.8460529940306518, + "language_loss": 0.70453185, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.72586167, + "num_input_tokens_seen": 239177815, + "step": 11073, + "time_per_iteration": 2.6117444038391113 + }, + { + "auxiliary_loss_clip": 0.01084138, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.0387311, + "balance_loss_mlp": 1.01914454, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 2.0775683441802624, + "language_loss": 0.55800223, + "learning_rate": 1.061427515134354e-06, + "loss": 0.57916409, + "num_input_tokens_seen": 239195735, + "step": 11074, + "time_per_iteration": 2.5068137645721436 + }, + { + "auxiliary_loss_clip": 0.01113985, + "auxiliary_loss_mlp": 0.00777299, + "balance_loss_clip": 1.04078627, + "balance_loss_mlp": 1.00068402, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 1.5271651767745775, + "language_loss": 0.72336918, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74228203, + "num_input_tokens_seen": 239217535, + "step": 11075, + "time_per_iteration": 2.5407557487487793 + }, + { + "auxiliary_loss_clip": 0.01098146, + "auxiliary_loss_mlp": 0.01030022, + "balance_loss_clip": 1.03748822, + "balance_loss_mlp": 1.01824379, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.6326482370179523, + "language_loss": 0.65787607, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.67915773, + "num_input_tokens_seen": 239241975, + "step": 11076, + "time_per_iteration": 2.602511167526245 + }, + { + "auxiliary_loss_clip": 0.01088713, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.03361726, + "balance_loss_mlp": 1.01942062, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 1.6554861322799508, + "language_loss": 0.75158942, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77280873, + "num_input_tokens_seen": 239262025, + "step": 11077, + "time_per_iteration": 2.520026206970215 + }, + { + "auxiliary_loss_clip": 0.01089959, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.03670764, + "balance_loss_mlp": 1.01775217, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 1.6314477723285687, + "language_loss": 0.6705339, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.69173419, + "num_input_tokens_seen": 239282775, + "step": 11078, + "time_per_iteration": 2.5332353115081787 + }, + { + "auxiliary_loss_clip": 0.01114588, + "auxiliary_loss_mlp": 0.01035065, + "balance_loss_clip": 1.03882885, + "balance_loss_mlp": 1.02161837, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 3.6606383540406973, + "language_loss": 0.69638336, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.71787989, + "num_input_tokens_seen": 239299775, + "step": 11079, + "time_per_iteration": 2.404215097427368 + }, + { + "auxiliary_loss_clip": 0.01090717, + "auxiliary_loss_mlp": 0.01026883, + "balance_loss_clip": 1.03816867, + "balance_loss_mlp": 1.01504493, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.6417982537223734, + "language_loss": 0.80561888, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.82679486, + "num_input_tokens_seen": 239319660, + "step": 11080, + "time_per_iteration": 2.5154709815979004 + }, + { + "auxiliary_loss_clip": 0.01074705, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.03538156, + "balance_loss_mlp": 1.0166446, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 1.8412238194123167, + "language_loss": 0.78717124, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80819917, + "num_input_tokens_seen": 239339215, + "step": 11081, + "time_per_iteration": 2.5508463382720947 + }, + { + "auxiliary_loss_clip": 0.01075115, + "auxiliary_loss_mlp": 0.01041989, + "balance_loss_clip": 1.0369401, + "balance_loss_mlp": 1.02674782, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 1.7746663982981852, + "language_loss": 0.80010998, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.82128102, + "num_input_tokens_seen": 239358545, + "step": 11082, + "time_per_iteration": 2.544071674346924 + }, + { + "auxiliary_loss_clip": 0.01076172, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.03844047, + "balance_loss_mlp": 1.02345395, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.4696778394077519, + "language_loss": 0.84111857, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86223239, + "num_input_tokens_seen": 239376665, + "step": 11083, + "time_per_iteration": 4.020937204360962 + }, + { + "auxiliary_loss_clip": 0.01079333, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.04183638, + "balance_loss_mlp": 1.02000475, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 2.176639263643274, + "language_loss": 0.85471481, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87584335, + "num_input_tokens_seen": 239394345, + "step": 11084, + "time_per_iteration": 2.5232810974121094 + }, + { + "auxiliary_loss_clip": 0.01095459, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.03926253, + "balance_loss_mlp": 1.01929808, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 2.3949928204205957, + "language_loss": 0.72599542, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.74727798, + "num_input_tokens_seen": 239410605, + "step": 11085, + "time_per_iteration": 2.4681272506713867 + }, + { + "auxiliary_loss_clip": 0.01090844, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.03706455, + "balance_loss_mlp": 1.01962352, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 1.8533573670422667, + "language_loss": 0.8037473, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82498378, + "num_input_tokens_seen": 239427155, + "step": 11086, + "time_per_iteration": 2.488872766494751 + }, + { + "auxiliary_loss_clip": 0.01087929, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.03756189, + "balance_loss_mlp": 1.01943862, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 1.9261606579867727, + "language_loss": 0.74573517, + "learning_rate": 1.056959663258702e-06, + "loss": 0.76694363, + "num_input_tokens_seen": 239445510, + "step": 11087, + "time_per_iteration": 2.515547752380371 + }, + { + "auxiliary_loss_clip": 0.01100031, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.03768075, + "balance_loss_mlp": 1.01966679, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 1.6794174706681477, + "language_loss": 0.64877039, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.67009163, + "num_input_tokens_seen": 239464805, + "step": 11088, + "time_per_iteration": 2.465291976928711 + }, + { + "auxiliary_loss_clip": 0.0110213, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.03720021, + "balance_loss_mlp": 1.01553226, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 3.814787548023541, + "language_loss": 0.64010489, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66141176, + "num_input_tokens_seen": 239483890, + "step": 11089, + "time_per_iteration": 2.432889938354492 + }, + { + "auxiliary_loss_clip": 0.01112264, + "auxiliary_loss_mlp": 0.01030233, + "balance_loss_clip": 1.03925776, + "balance_loss_mlp": 1.01816893, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 2.4387830244645423, + "language_loss": 0.8150543, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.83647931, + "num_input_tokens_seen": 239500080, + "step": 11090, + "time_per_iteration": 2.3928160667419434 + }, + { + "auxiliary_loss_clip": 0.01094757, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.03735709, + "balance_loss_mlp": 1.01832998, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 2.067356409113776, + "language_loss": 0.77435702, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.79561996, + "num_input_tokens_seen": 239517335, + "step": 11091, + "time_per_iteration": 2.4606375694274902 + }, + { + "auxiliary_loss_clip": 0.01113024, + "auxiliary_loss_mlp": 0.01033583, + "balance_loss_clip": 1.03974414, + "balance_loss_mlp": 1.02078605, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 2.3668997133159984, + "language_loss": 0.79341656, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81488264, + "num_input_tokens_seen": 239536240, + "step": 11092, + "time_per_iteration": 2.4323811531066895 + }, + { + "auxiliary_loss_clip": 0.01007215, + "auxiliary_loss_mlp": 0.01007322, + "balance_loss_clip": 1.00892949, + "balance_loss_mlp": 1.00604081, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.7583636335248073, + "language_loss": 0.57711065, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.597256, + "num_input_tokens_seen": 239598000, + "step": 11093, + "time_per_iteration": 4.623935222625732 + }, + { + "auxiliary_loss_clip": 0.01112339, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.03956783, + "balance_loss_mlp": 1.01897573, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 2.586520746396946, + "language_loss": 0.76454657, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78599274, + "num_input_tokens_seen": 239617650, + "step": 11094, + "time_per_iteration": 2.459791898727417 + }, + { + "auxiliary_loss_clip": 0.01114253, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.0400337, + "balance_loss_mlp": 1.017138, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 2.046593243608158, + "language_loss": 0.73164445, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75309372, + "num_input_tokens_seen": 239639825, + "step": 11095, + "time_per_iteration": 3.7843594551086426 + }, + { + "auxiliary_loss_clip": 0.01100762, + "auxiliary_loss_mlp": 0.01038989, + "balance_loss_clip": 1.03695059, + "balance_loss_mlp": 1.02638257, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 3.0084623971781417, + "language_loss": 0.73423094, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75562847, + "num_input_tokens_seen": 239656300, + "step": 11096, + "time_per_iteration": 2.4149768352508545 + }, + { + "auxiliary_loss_clip": 0.01076308, + "auxiliary_loss_mlp": 0.01035765, + "balance_loss_clip": 1.04224777, + "balance_loss_mlp": 1.02340293, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 1.7346589287273257, + "language_loss": 0.6428594, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66398013, + "num_input_tokens_seen": 239676655, + "step": 11097, + "time_per_iteration": 2.5593044757843018 + }, + { + "auxiliary_loss_clip": 0.0110226, + "auxiliary_loss_mlp": 0.01034718, + "balance_loss_clip": 1.03910875, + "balance_loss_mlp": 1.02190948, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 1.8996046640048614, + "language_loss": 0.75845528, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77982509, + "num_input_tokens_seen": 239695430, + "step": 11098, + "time_per_iteration": 2.457505226135254 + }, + { + "auxiliary_loss_clip": 0.01115595, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.04080153, + "balance_loss_mlp": 1.0209806, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.568447308741279, + "language_loss": 0.74809396, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76958394, + "num_input_tokens_seen": 239717070, + "step": 11099, + "time_per_iteration": 2.461568832397461 + }, + { + "auxiliary_loss_clip": 0.01097767, + "auxiliary_loss_mlp": 0.01031844, + "balance_loss_clip": 1.03521848, + "balance_loss_mlp": 1.01967239, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 1.9668807790008418, + "language_loss": 0.78139019, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80268633, + "num_input_tokens_seen": 239737105, + "step": 11100, + "time_per_iteration": 2.4620354175567627 + }, + { + "auxiliary_loss_clip": 0.0111157, + "auxiliary_loss_mlp": 0.01038563, + "balance_loss_clip": 1.03887725, + "balance_loss_mlp": 1.02590251, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 2.9894149408876736, + "language_loss": 0.59659994, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.61810124, + "num_input_tokens_seen": 239757835, + "step": 11101, + "time_per_iteration": 2.449589490890503 + }, + { + "auxiliary_loss_clip": 0.0109967, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.03985572, + "balance_loss_mlp": 1.02267385, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 1.7195519099454553, + "language_loss": 0.7167843, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73814797, + "num_input_tokens_seen": 239775425, + "step": 11102, + "time_per_iteration": 2.5177910327911377 + }, + { + "auxiliary_loss_clip": 0.01102647, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.03688574, + "balance_loss_mlp": 1.01717663, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 3.0245224772950663, + "language_loss": 0.84520674, + "learning_rate": 1.051469068021034e-06, + "loss": 0.86653, + "num_input_tokens_seen": 239794605, + "step": 11103, + "time_per_iteration": 2.4726998805999756 + }, + { + "auxiliary_loss_clip": 0.01091924, + "auxiliary_loss_mlp": 0.01025733, + "balance_loss_clip": 1.03682685, + "balance_loss_mlp": 1.01372313, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 2.1294464024913506, + "language_loss": 0.78311169, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.80428827, + "num_input_tokens_seen": 239812135, + "step": 11104, + "time_per_iteration": 2.5674617290496826 + }, + { + "auxiliary_loss_clip": 0.01070496, + "auxiliary_loss_mlp": 0.01032254, + "balance_loss_clip": 1.0392108, + "balance_loss_mlp": 1.01974869, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 1.6380892401352656, + "language_loss": 0.58031213, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60133958, + "num_input_tokens_seen": 239835845, + "step": 11105, + "time_per_iteration": 2.7180984020233154 + }, + { + "auxiliary_loss_clip": 0.01107025, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.03959608, + "balance_loss_mlp": 1.01893723, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 1.8809299729245024, + "language_loss": 0.73359287, + "learning_rate": 1.0504406049066e-06, + "loss": 0.7549935, + "num_input_tokens_seen": 239853820, + "step": 11106, + "time_per_iteration": 2.5148770809173584 + }, + { + "auxiliary_loss_clip": 0.01112801, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.0393703, + "balance_loss_mlp": 1.01634002, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 1.6776805808689148, + "language_loss": 0.76680243, + "learning_rate": 1.0500978558659e-06, + "loss": 0.78822446, + "num_input_tokens_seen": 239873365, + "step": 11107, + "time_per_iteration": 2.4578957557678223 + }, + { + "auxiliary_loss_clip": 0.01087581, + "auxiliary_loss_mlp": 0.01029703, + "balance_loss_clip": 1.03564119, + "balance_loss_mlp": 1.01728773, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.461271590316421, + "language_loss": 0.89833653, + "learning_rate": 1.049755142845583e-06, + "loss": 0.91950935, + "num_input_tokens_seen": 239891215, + "step": 11108, + "time_per_iteration": 3.9848618507385254 + }, + { + "auxiliary_loss_clip": 0.01081075, + "auxiliary_loss_mlp": 0.01024774, + "balance_loss_clip": 1.04049551, + "balance_loss_mlp": 1.01350868, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.4391346351928276, + "language_loss": 0.82736731, + "learning_rate": 1.049412465858646e-06, + "loss": 0.84842587, + "num_input_tokens_seen": 239913490, + "step": 11109, + "time_per_iteration": 2.6761484146118164 + }, + { + "auxiliary_loss_clip": 0.01090463, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.04235947, + "balance_loss_mlp": 1.0178473, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 2.081528432505002, + "language_loss": 0.69847202, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.7196852, + "num_input_tokens_seen": 239931565, + "step": 11110, + "time_per_iteration": 2.481963872909546 + }, + { + "auxiliary_loss_clip": 0.01084753, + "auxiliary_loss_mlp": 0.01033332, + "balance_loss_clip": 1.03903472, + "balance_loss_mlp": 1.01946175, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 2.08642716801561, + "language_loss": 0.7334677, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75464851, + "num_input_tokens_seen": 239952395, + "step": 11111, + "time_per_iteration": 2.53509783744812 + }, + { + "auxiliary_loss_clip": 0.01110287, + "auxiliary_loss_mlp": 0.01028483, + "balance_loss_clip": 1.03822994, + "balance_loss_mlp": 1.01626444, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 2.101632311688441, + "language_loss": 0.65171945, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.67310715, + "num_input_tokens_seen": 239968910, + "step": 11112, + "time_per_iteration": 2.4193766117095947 + }, + { + "auxiliary_loss_clip": 0.0108967, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.0397476, + "balance_loss_mlp": 1.01798081, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 2.775504782453374, + "language_loss": 0.63086939, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65207088, + "num_input_tokens_seen": 239987680, + "step": 11113, + "time_per_iteration": 2.4888904094696045 + }, + { + "auxiliary_loss_clip": 0.01074688, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.04385376, + "balance_loss_mlp": 1.02086616, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.8188350351979514, + "language_loss": 0.66334093, + "learning_rate": 1.047699621879422e-06, + "loss": 0.68441975, + "num_input_tokens_seen": 240005790, + "step": 11114, + "time_per_iteration": 2.5400562286376953 + }, + { + "auxiliary_loss_clip": 0.01104265, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.03900051, + "balance_loss_mlp": 1.02351594, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.5409284933812617, + "language_loss": 0.78259778, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80400407, + "num_input_tokens_seen": 240025895, + "step": 11115, + "time_per_iteration": 2.470918655395508 + }, + { + "auxiliary_loss_clip": 0.01059293, + "auxiliary_loss_mlp": 0.00779136, + "balance_loss_clip": 1.02975106, + "balance_loss_mlp": 1.00075293, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 3.84180678830985, + "language_loss": 0.79399002, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81237435, + "num_input_tokens_seen": 240044880, + "step": 11116, + "time_per_iteration": 2.606604814529419 + }, + { + "auxiliary_loss_clip": 0.01084859, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.03966701, + "balance_loss_mlp": 1.01840806, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 1.6287424186065445, + "language_loss": 0.78976125, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81092715, + "num_input_tokens_seen": 240065785, + "step": 11117, + "time_per_iteration": 2.5925064086914062 + }, + { + "auxiliary_loss_clip": 0.01068324, + "auxiliary_loss_mlp": 0.01034344, + "balance_loss_clip": 1.03801417, + "balance_loss_mlp": 1.02003849, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 2.0389779032433983, + "language_loss": 0.65649688, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.67752355, + "num_input_tokens_seen": 240085130, + "step": 11118, + "time_per_iteration": 2.6027705669403076 + }, + { + "auxiliary_loss_clip": 0.01092406, + "auxiliary_loss_mlp": 0.01027466, + "balance_loss_clip": 1.03980041, + "balance_loss_mlp": 1.01563406, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 1.5425494552810475, + "language_loss": 0.68832362, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.70952225, + "num_input_tokens_seen": 240105495, + "step": 11119, + "time_per_iteration": 2.522925853729248 + }, + { + "auxiliary_loss_clip": 0.01086481, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.03502202, + "balance_loss_mlp": 1.01894093, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 1.7844587694885405, + "language_loss": 0.67049313, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.69168395, + "num_input_tokens_seen": 240125455, + "step": 11120, + "time_per_iteration": 2.579561710357666 + }, + { + "auxiliary_loss_clip": 0.01084567, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.03932786, + "balance_loss_mlp": 1.01844966, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 1.7088328757072082, + "language_loss": 0.71841466, + "learning_rate": 1.045303157347638e-06, + "loss": 0.7395705, + "num_input_tokens_seen": 240143870, + "step": 11121, + "time_per_iteration": 2.5611953735351562 + }, + { + "auxiliary_loss_clip": 0.01091666, + "auxiliary_loss_mlp": 0.01038403, + "balance_loss_clip": 1.03578472, + "balance_loss_mlp": 1.02484334, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 2.8615347781774467, + "language_loss": 0.69232368, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.71362436, + "num_input_tokens_seen": 240161020, + "step": 11122, + "time_per_iteration": 4.030341863632202 + }, + { + "auxiliary_loss_clip": 0.01055254, + "auxiliary_loss_mlp": 0.00780281, + "balance_loss_clip": 1.03329301, + "balance_loss_mlp": 1.00066257, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 2.2254897747663946, + "language_loss": 0.71551764, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73387295, + "num_input_tokens_seen": 240179820, + "step": 11123, + "time_per_iteration": 2.6494622230529785 + }, + { + "auxiliary_loss_clip": 0.01096579, + "auxiliary_loss_mlp": 0.01034009, + "balance_loss_clip": 1.04056621, + "balance_loss_mlp": 1.0207355, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 2.736985262270377, + "language_loss": 0.79691577, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81822169, + "num_input_tokens_seen": 240200130, + "step": 11124, + "time_per_iteration": 2.54018497467041 + }, + { + "auxiliary_loss_clip": 0.01089241, + "auxiliary_loss_mlp": 0.0103877, + "balance_loss_clip": 1.04162598, + "balance_loss_mlp": 1.02602625, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 1.6168831638572858, + "language_loss": 0.74605918, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76733929, + "num_input_tokens_seen": 240217945, + "step": 11125, + "time_per_iteration": 2.508206367492676 + }, + { + "auxiliary_loss_clip": 0.01078577, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.03590214, + "balance_loss_mlp": 1.02412438, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 2.626266455311239, + "language_loss": 0.66401076, + "learning_rate": 1.043592482774116e-06, + "loss": 0.685175, + "num_input_tokens_seen": 240237220, + "step": 11126, + "time_per_iteration": 2.5525906085968018 + }, + { + "auxiliary_loss_clip": 0.01096757, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.03543353, + "balance_loss_mlp": 1.01818991, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 1.7686703289299373, + "language_loss": 0.71005398, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73133969, + "num_input_tokens_seen": 240256000, + "step": 11127, + "time_per_iteration": 2.4719011783599854 + }, + { + "auxiliary_loss_clip": 0.01097904, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.0418582, + "balance_loss_mlp": 1.02204943, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 1.9434063070313503, + "language_loss": 0.80294722, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82428557, + "num_input_tokens_seen": 240275845, + "step": 11128, + "time_per_iteration": 2.5133397579193115 + }, + { + "auxiliary_loss_clip": 0.01113285, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.03788936, + "balance_loss_mlp": 1.01794851, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 1.8880822177181882, + "language_loss": 0.81095219, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83239627, + "num_input_tokens_seen": 240294095, + "step": 11129, + "time_per_iteration": 2.44734525680542 + }, + { + "auxiliary_loss_clip": 0.01094491, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.03684974, + "balance_loss_mlp": 1.02628756, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 1.5479971763459381, + "language_loss": 0.70438033, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72570783, + "num_input_tokens_seen": 240313460, + "step": 11130, + "time_per_iteration": 2.564805746078491 + }, + { + "auxiliary_loss_clip": 0.01086765, + "auxiliary_loss_mlp": 0.01036394, + "balance_loss_clip": 1.03592515, + "balance_loss_mlp": 1.02459788, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 1.6389424935183405, + "language_loss": 0.70293498, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72416657, + "num_input_tokens_seen": 240333540, + "step": 11131, + "time_per_iteration": 2.522221565246582 + }, + { + "auxiliary_loss_clip": 0.01103382, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.03901017, + "balance_loss_mlp": 1.01663494, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 2.1625058904392676, + "language_loss": 0.650985, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.6723274, + "num_input_tokens_seen": 240350085, + "step": 11132, + "time_per_iteration": 4.109698057174683 + }, + { + "auxiliary_loss_clip": 0.01102896, + "auxiliary_loss_mlp": 0.01034183, + "balance_loss_clip": 1.03663146, + "balance_loss_mlp": 1.02019358, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.7074895143081286, + "language_loss": 0.75079632, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.77216709, + "num_input_tokens_seen": 240370015, + "step": 11133, + "time_per_iteration": 2.4661173820495605 + }, + { + "auxiliary_loss_clip": 0.01109009, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.04226148, + "balance_loss_mlp": 1.0202657, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 8.232659323672445, + "language_loss": 0.66497314, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.68640661, + "num_input_tokens_seen": 240390770, + "step": 11134, + "time_per_iteration": 2.499284267425537 + }, + { + "auxiliary_loss_clip": 0.01106259, + "auxiliary_loss_mlp": 0.01037678, + "balance_loss_clip": 1.03861141, + "balance_loss_mlp": 1.02312875, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 2.0491779307302807, + "language_loss": 0.76831722, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.78975654, + "num_input_tokens_seen": 240409590, + "step": 11135, + "time_per_iteration": 3.812983751296997 + }, + { + "auxiliary_loss_clip": 0.01103069, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.03997219, + "balance_loss_mlp": 1.01826119, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.7165998636465243, + "language_loss": 0.74282604, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76416814, + "num_input_tokens_seen": 240428180, + "step": 11136, + "time_per_iteration": 2.4552602767944336 + }, + { + "auxiliary_loss_clip": 0.01108324, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.04161501, + "balance_loss_mlp": 1.02047396, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 1.6775845304147745, + "language_loss": 0.62057287, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.64199924, + "num_input_tokens_seen": 240447815, + "step": 11137, + "time_per_iteration": 2.478872060775757 + }, + { + "auxiliary_loss_clip": 0.01112054, + "auxiliary_loss_mlp": 0.01033489, + "balance_loss_clip": 1.03911066, + "balance_loss_mlp": 1.02028656, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 2.0838807849570355, + "language_loss": 0.66038477, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.68184018, + "num_input_tokens_seen": 240468635, + "step": 11138, + "time_per_iteration": 2.4405975341796875 + }, + { + "auxiliary_loss_clip": 0.01076863, + "auxiliary_loss_mlp": 0.01033619, + "balance_loss_clip": 1.03310275, + "balance_loss_mlp": 1.0211792, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.6827608542584274, + "language_loss": 0.72946835, + "learning_rate": 1.039148976175053e-06, + "loss": 0.75057316, + "num_input_tokens_seen": 240488550, + "step": 11139, + "time_per_iteration": 2.5848796367645264 + }, + { + "auxiliary_loss_clip": 0.0107137, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.03347409, + "balance_loss_mlp": 1.01923943, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 2.684090275676308, + "language_loss": 0.70377833, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.72479951, + "num_input_tokens_seen": 240508330, + "step": 11140, + "time_per_iteration": 2.5488059520721436 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.0102755, + "balance_loss_clip": 1.0358429, + "balance_loss_mlp": 1.0146935, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 2.2520351140753174, + "language_loss": 0.75525439, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.776559, + "num_input_tokens_seen": 240528470, + "step": 11141, + "time_per_iteration": 2.5069172382354736 + }, + { + "auxiliary_loss_clip": 0.0110298, + "auxiliary_loss_mlp": 0.01034865, + "balance_loss_clip": 1.03868723, + "balance_loss_mlp": 1.02165651, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 2.249292265792808, + "language_loss": 0.82249665, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84387505, + "num_input_tokens_seen": 240547815, + "step": 11142, + "time_per_iteration": 2.521704912185669 + }, + { + "auxiliary_loss_clip": 0.01063462, + "auxiliary_loss_mlp": 0.01027147, + "balance_loss_clip": 1.03647423, + "balance_loss_mlp": 1.01466012, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.9839469829566656, + "language_loss": 0.70179021, + "learning_rate": 1.037782980862959e-06, + "loss": 0.7226963, + "num_input_tokens_seen": 240567765, + "step": 11143, + "time_per_iteration": 2.6648573875427246 + }, + { + "auxiliary_loss_clip": 0.01072882, + "auxiliary_loss_mlp": 0.00778141, + "balance_loss_clip": 1.03659117, + "balance_loss_mlp": 1.00064206, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.7095536073639517, + "language_loss": 0.70110345, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.71961367, + "num_input_tokens_seen": 240590750, + "step": 11144, + "time_per_iteration": 2.6342251300811768 + }, + { + "auxiliary_loss_clip": 0.0108612, + "auxiliary_loss_mlp": 0.01028814, + "balance_loss_clip": 1.03476572, + "balance_loss_mlp": 1.01636279, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.6288261723686401, + "language_loss": 0.7424174, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76356673, + "num_input_tokens_seen": 240608875, + "step": 11145, + "time_per_iteration": 2.512587070465088 + }, + { + "auxiliary_loss_clip": 0.01093153, + "auxiliary_loss_mlp": 0.01034163, + "balance_loss_clip": 1.03788555, + "balance_loss_mlp": 1.0207696, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 1.75152304590356, + "language_loss": 0.70976621, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.73103935, + "num_input_tokens_seen": 240628565, + "step": 11146, + "time_per_iteration": 2.5168983936309814 + }, + { + "auxiliary_loss_clip": 0.01107972, + "auxiliary_loss_mlp": 0.00776424, + "balance_loss_clip": 1.03740728, + "balance_loss_mlp": 1.00074732, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 1.9036624965531959, + "language_loss": 0.78388095, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.80272484, + "num_input_tokens_seen": 240646325, + "step": 11147, + "time_per_iteration": 3.977983236312866 + }, + { + "auxiliary_loss_clip": 0.01104291, + "auxiliary_loss_mlp": 0.00777711, + "balance_loss_clip": 1.04045033, + "balance_loss_mlp": 1.00073361, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 1.803906330023228, + "language_loss": 0.70333159, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.72215152, + "num_input_tokens_seen": 240666145, + "step": 11148, + "time_per_iteration": 2.4659807682037354 + }, + { + "auxiliary_loss_clip": 0.01094564, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.03720856, + "balance_loss_mlp": 1.0245055, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 1.793826346752853, + "language_loss": 0.70326054, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72457558, + "num_input_tokens_seen": 240685570, + "step": 11149, + "time_per_iteration": 2.5060994625091553 + }, + { + "auxiliary_loss_clip": 0.01092816, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.03753245, + "balance_loss_mlp": 1.01794863, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 2.015600700559878, + "language_loss": 0.73723459, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.75845647, + "num_input_tokens_seen": 240706945, + "step": 11150, + "time_per_iteration": 2.5308873653411865 + }, + { + "auxiliary_loss_clip": 0.01104904, + "auxiliary_loss_mlp": 0.01032876, + "balance_loss_clip": 1.04251802, + "balance_loss_mlp": 1.02044272, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 1.983198786941353, + "language_loss": 0.7821635, + "learning_rate": 1.035052742460671e-06, + "loss": 0.8035413, + "num_input_tokens_seen": 240727990, + "step": 11151, + "time_per_iteration": 2.484978675842285 + }, + { + "auxiliary_loss_clip": 0.00993255, + "auxiliary_loss_mlp": 0.01008698, + "balance_loss_clip": 1.0133388, + "balance_loss_mlp": 1.00735128, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.8149164522781981, + "language_loss": 0.55507553, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57509494, + "num_input_tokens_seen": 240790380, + "step": 11152, + "time_per_iteration": 3.379040241241455 + }, + { + "auxiliary_loss_clip": 0.01091544, + "auxiliary_loss_mlp": 0.01038276, + "balance_loss_clip": 1.03900504, + "balance_loss_mlp": 1.02510357, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 1.9599546020506113, + "language_loss": 0.80820012, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.82949829, + "num_input_tokens_seen": 240811545, + "step": 11153, + "time_per_iteration": 3.4814114570617676 + }, + { + "auxiliary_loss_clip": 0.01075509, + "auxiliary_loss_mlp": 0.00777763, + "balance_loss_clip": 1.03937292, + "balance_loss_mlp": 1.00067902, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 1.6563582429324168, + "language_loss": 0.76398599, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.78251874, + "num_input_tokens_seen": 240831380, + "step": 11154, + "time_per_iteration": 2.5590028762817383 + }, + { + "auxiliary_loss_clip": 0.01094905, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_clip": 1.03646684, + "balance_loss_mlp": 1.02868545, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.4475162539787534, + "language_loss": 0.76351976, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78488511, + "num_input_tokens_seen": 240851855, + "step": 11155, + "time_per_iteration": 2.51983904838562 + }, + { + "auxiliary_loss_clip": 0.01116351, + "auxiliary_loss_mlp": 0.01036415, + "balance_loss_clip": 1.04194069, + "balance_loss_mlp": 1.02384472, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 2.0455055268441242, + "language_loss": 0.81878817, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84031582, + "num_input_tokens_seen": 240869980, + "step": 11156, + "time_per_iteration": 2.4651265144348145 + }, + { + "auxiliary_loss_clip": 0.01112559, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.03952324, + "balance_loss_mlp": 1.02393615, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 2.15549847305955, + "language_loss": 0.74690616, + "learning_rate": 1.033006600114165e-06, + "loss": 0.76839519, + "num_input_tokens_seen": 240888680, + "step": 11157, + "time_per_iteration": 2.4413840770721436 + }, + { + "auxiliary_loss_clip": 0.01109454, + "auxiliary_loss_mlp": 0.01036463, + "balance_loss_clip": 1.04300714, + "balance_loss_mlp": 1.02321303, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 1.7989554226174866, + "language_loss": 0.74495202, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76641119, + "num_input_tokens_seen": 240909050, + "step": 11158, + "time_per_iteration": 2.4919283390045166 + }, + { + "auxiliary_loss_clip": 0.01117173, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.04130888, + "balance_loss_mlp": 1.02222681, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 2.049170743953811, + "language_loss": 0.81618929, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.83771944, + "num_input_tokens_seen": 240930035, + "step": 11159, + "time_per_iteration": 2.4529614448547363 + }, + { + "auxiliary_loss_clip": 0.01092281, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.03789973, + "balance_loss_mlp": 1.01973128, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 7.940980699842852, + "language_loss": 0.77163136, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79287386, + "num_input_tokens_seen": 240948895, + "step": 11160, + "time_per_iteration": 2.4684805870056152 + }, + { + "auxiliary_loss_clip": 0.01090786, + "auxiliary_loss_mlp": 0.01029243, + "balance_loss_clip": 1.04082406, + "balance_loss_mlp": 1.01733351, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 1.7265183309469867, + "language_loss": 0.7334584, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.7546587, + "num_input_tokens_seen": 240967770, + "step": 11161, + "time_per_iteration": 2.51302433013916 + }, + { + "auxiliary_loss_clip": 0.0109525, + "auxiliary_loss_mlp": 0.01039469, + "balance_loss_clip": 1.03746331, + "balance_loss_mlp": 1.0255574, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 1.64318586933888, + "language_loss": 0.68003106, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70137823, + "num_input_tokens_seen": 240988985, + "step": 11162, + "time_per_iteration": 3.986891031265259 + }, + { + "auxiliary_loss_clip": 0.01089507, + "auxiliary_loss_mlp": 0.01039783, + "balance_loss_clip": 1.03474689, + "balance_loss_mlp": 1.02754569, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 1.8188401444101752, + "language_loss": 0.69926143, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72055435, + "num_input_tokens_seen": 241005455, + "step": 11163, + "time_per_iteration": 2.4807496070861816 + }, + { + "auxiliary_loss_clip": 0.01112669, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.04153895, + "balance_loss_mlp": 1.01963353, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 1.6134351924815853, + "language_loss": 0.75630844, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.77775478, + "num_input_tokens_seen": 241026175, + "step": 11164, + "time_per_iteration": 2.4682419300079346 + }, + { + "auxiliary_loss_clip": 0.01112923, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.03975177, + "balance_loss_mlp": 1.02151275, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 1.8880453276854967, + "language_loss": 0.64869213, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.6701628, + "num_input_tokens_seen": 241044040, + "step": 11165, + "time_per_iteration": 2.4148597717285156 + }, + { + "auxiliary_loss_clip": 0.01111781, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.03967857, + "balance_loss_mlp": 1.01893592, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 2.1477381345491446, + "language_loss": 0.71748531, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.73891926, + "num_input_tokens_seen": 241063615, + "step": 11166, + "time_per_iteration": 2.4166252613067627 + }, + { + "auxiliary_loss_clip": 0.01110951, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.03971362, + "balance_loss_mlp": 1.02275991, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 2.130620248635027, + "language_loss": 0.77075332, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79220545, + "num_input_tokens_seen": 241082520, + "step": 11167, + "time_per_iteration": 2.4460291862487793 + }, + { + "auxiliary_loss_clip": 0.01100442, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.03525794, + "balance_loss_mlp": 1.02332997, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 3.8420432826286195, + "language_loss": 0.69051152, + "learning_rate": 1.029258769662629e-06, + "loss": 0.71187574, + "num_input_tokens_seen": 241103505, + "step": 11168, + "time_per_iteration": 2.55918550491333 + }, + { + "auxiliary_loss_clip": 0.01079633, + "auxiliary_loss_mlp": 0.01039358, + "balance_loss_clip": 1.03716385, + "balance_loss_mlp": 1.02594137, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 2.2588310860322434, + "language_loss": 0.73100948, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.75219941, + "num_input_tokens_seen": 241122885, + "step": 11169, + "time_per_iteration": 2.5433011054992676 + }, + { + "auxiliary_loss_clip": 0.01103738, + "auxiliary_loss_mlp": 0.01034392, + "balance_loss_clip": 1.03736782, + "balance_loss_mlp": 1.0208497, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 2.6614879292741667, + "language_loss": 0.76086819, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78224957, + "num_input_tokens_seen": 241140865, + "step": 11170, + "time_per_iteration": 2.422952890396118 + }, + { + "auxiliary_loss_clip": 0.01094369, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.0387001, + "balance_loss_mlp": 1.01841235, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 2.777621028216612, + "language_loss": 0.74911362, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.77036786, + "num_input_tokens_seen": 241158225, + "step": 11171, + "time_per_iteration": 2.4515016078948975 + }, + { + "auxiliary_loss_clip": 0.01080885, + "auxiliary_loss_mlp": 0.0104278, + "balance_loss_clip": 1.04118431, + "balance_loss_mlp": 1.0276289, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 2.2975397262767285, + "language_loss": 0.86202085, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.88325751, + "num_input_tokens_seen": 241175215, + "step": 11172, + "time_per_iteration": 4.035335302352905 + }, + { + "auxiliary_loss_clip": 0.01099098, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.03533328, + "balance_loss_mlp": 1.0245707, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.72326071517532, + "language_loss": 0.63580036, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.65716505, + "num_input_tokens_seen": 241195250, + "step": 11173, + "time_per_iteration": 2.4578399658203125 + }, + { + "auxiliary_loss_clip": 0.01111017, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.04041076, + "balance_loss_mlp": 1.02178848, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2.2590691383798323, + "language_loss": 0.71357036, + "learning_rate": 1.02721637475002e-06, + "loss": 0.73504376, + "num_input_tokens_seen": 241210720, + "step": 11174, + "time_per_iteration": 3.7457242012023926 + }, + { + "auxiliary_loss_clip": 0.01079725, + "auxiliary_loss_mlp": 0.01031699, + "balance_loss_clip": 1.04112148, + "balance_loss_mlp": 1.01934934, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 2.545889768119015, + "language_loss": 0.68869758, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.70981175, + "num_input_tokens_seen": 241227395, + "step": 11175, + "time_per_iteration": 2.5030977725982666 + }, + { + "auxiliary_loss_clip": 0.01086236, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.03931093, + "balance_loss_mlp": 1.02409148, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 2.228337539832551, + "language_loss": 0.7406584, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.76188552, + "num_input_tokens_seen": 241246355, + "step": 11176, + "time_per_iteration": 2.4618067741394043 + }, + { + "auxiliary_loss_clip": 0.01093447, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.0391922, + "balance_loss_mlp": 1.01793814, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 1.9193738430854927, + "language_loss": 0.73067784, + "learning_rate": 1.026195675108182e-06, + "loss": 0.75193048, + "num_input_tokens_seen": 241264180, + "step": 11177, + "time_per_iteration": 2.4832935333251953 + }, + { + "auxiliary_loss_clip": 0.01114464, + "auxiliary_loss_mlp": 0.01036549, + "balance_loss_clip": 1.03983867, + "balance_loss_mlp": 1.02283347, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 2.5015037035593433, + "language_loss": 0.76583713, + "learning_rate": 1.025855515730551e-06, + "loss": 0.7873472, + "num_input_tokens_seen": 241282245, + "step": 11178, + "time_per_iteration": 2.443988561630249 + }, + { + "auxiliary_loss_clip": 0.01106761, + "auxiliary_loss_mlp": 0.01031254, + "balance_loss_clip": 1.04220498, + "balance_loss_mlp": 1.01914263, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 1.8546871816429478, + "language_loss": 0.69586772, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.71724784, + "num_input_tokens_seen": 241300745, + "step": 11179, + "time_per_iteration": 2.431818723678589 + }, + { + "auxiliary_loss_clip": 0.0106868, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.03784847, + "balance_loss_mlp": 1.01748204, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.56886796564317, + "language_loss": 0.73892409, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.75990617, + "num_input_tokens_seen": 241319320, + "step": 11180, + "time_per_iteration": 2.583116054534912 + }, + { + "auxiliary_loss_clip": 0.01094782, + "auxiliary_loss_mlp": 0.01033811, + "balance_loss_clip": 1.03978837, + "balance_loss_mlp": 1.02135384, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.6694654467019685, + "language_loss": 0.7514388, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77272475, + "num_input_tokens_seen": 241342225, + "step": 11181, + "time_per_iteration": 2.5566680431365967 + }, + { + "auxiliary_loss_clip": 0.01098368, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.04180527, + "balance_loss_mlp": 1.01609528, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 2.45341202700266, + "language_loss": 0.74323446, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76449865, + "num_input_tokens_seen": 241358240, + "step": 11182, + "time_per_iteration": 2.475473642349243 + }, + { + "auxiliary_loss_clip": 0.01098224, + "auxiliary_loss_mlp": 0.01030199, + "balance_loss_clip": 1.03780305, + "balance_loss_mlp": 1.01868892, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 1.809794400703546, + "language_loss": 0.69847023, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.71975446, + "num_input_tokens_seen": 241378420, + "step": 11183, + "time_per_iteration": 2.4761486053466797 + }, + { + "auxiliary_loss_clip": 0.01067202, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.03833818, + "balance_loss_mlp": 1.01684856, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 1.6613840883904585, + "language_loss": 0.7762022, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79716772, + "num_input_tokens_seen": 241397185, + "step": 11184, + "time_per_iteration": 2.56542706489563 + }, + { + "auxiliary_loss_clip": 0.01097708, + "auxiliary_loss_mlp": 0.00784731, + "balance_loss_clip": 1.04097128, + "balance_loss_mlp": 1.00080633, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 1.9157027209708561, + "language_loss": 0.65982664, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.67865098, + "num_input_tokens_seen": 241415785, + "step": 11185, + "time_per_iteration": 2.4946014881134033 + }, + { + "auxiliary_loss_clip": 0.01079701, + "auxiliary_loss_mlp": 0.01037567, + "balance_loss_clip": 1.0373677, + "balance_loss_mlp": 1.02382851, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 1.7335106910158484, + "language_loss": 0.80288589, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82405865, + "num_input_tokens_seen": 241437390, + "step": 11186, + "time_per_iteration": 4.14688515663147 + }, + { + "auxiliary_loss_clip": 0.01101478, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.04114866, + "balance_loss_mlp": 1.02182293, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 1.772394568812466, + "language_loss": 0.80046022, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82180631, + "num_input_tokens_seen": 241458085, + "step": 11187, + "time_per_iteration": 2.488560914993286 + }, + { + "auxiliary_loss_clip": 0.01075815, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.04542375, + "balance_loss_mlp": 1.02082944, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 2.2916590078503623, + "language_loss": 0.70511109, + "learning_rate": 1.022455955762965e-06, + "loss": 0.72621477, + "num_input_tokens_seen": 241476880, + "step": 11188, + "time_per_iteration": 2.5723180770874023 + }, + { + "auxiliary_loss_clip": 0.01057946, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.04560351, + "balance_loss_mlp": 1.02045083, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.7896575171363511, + "language_loss": 0.75679338, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.77769423, + "num_input_tokens_seen": 241496535, + "step": 11189, + "time_per_iteration": 2.671767234802246 + }, + { + "auxiliary_loss_clip": 0.01116363, + "auxiliary_loss_mlp": 0.01033484, + "balance_loss_clip": 1.03878701, + "balance_loss_mlp": 1.01902425, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 2.4536963918768357, + "language_loss": 0.75006765, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.77156609, + "num_input_tokens_seen": 241513465, + "step": 11190, + "time_per_iteration": 2.408818244934082 + }, + { + "auxiliary_loss_clip": 0.01051259, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.03352499, + "balance_loss_mlp": 1.02181125, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 1.5072704877498366, + "language_loss": 0.76872694, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.78959358, + "num_input_tokens_seen": 241534125, + "step": 11191, + "time_per_iteration": 2.647677421569824 + }, + { + "auxiliary_loss_clip": 0.01110642, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.03835917, + "balance_loss_mlp": 1.0198102, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 1.9021884872085661, + "language_loss": 0.86472917, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.8861593, + "num_input_tokens_seen": 241556340, + "step": 11192, + "time_per_iteration": 2.551910400390625 + }, + { + "auxiliary_loss_clip": 0.01103811, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.03805363, + "balance_loss_mlp": 1.02290356, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 1.9322683071593827, + "language_loss": 0.75493914, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.77634013, + "num_input_tokens_seen": 241575185, + "step": 11193, + "time_per_iteration": 2.4820237159729004 + }, + { + "auxiliary_loss_clip": 0.01080345, + "auxiliary_loss_mlp": 0.01034448, + "balance_loss_clip": 1.04071999, + "balance_loss_mlp": 1.02174067, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 2.4666356155625007, + "language_loss": 0.78757787, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.80872577, + "num_input_tokens_seen": 241592970, + "step": 11194, + "time_per_iteration": 2.5250797271728516 + }, + { + "auxiliary_loss_clip": 0.01102573, + "auxiliary_loss_mlp": 0.01029728, + "balance_loss_clip": 1.03841472, + "balance_loss_mlp": 1.01791453, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 2.0244039959373934, + "language_loss": 0.90402514, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.9253481, + "num_input_tokens_seen": 241610245, + "step": 11195, + "time_per_iteration": 2.4806761741638184 + }, + { + "auxiliary_loss_clip": 0.01100674, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.03617501, + "balance_loss_mlp": 1.02021194, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 1.7249653777070553, + "language_loss": 0.72369695, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74502766, + "num_input_tokens_seen": 241630350, + "step": 11196, + "time_per_iteration": 2.5258047580718994 + }, + { + "auxiliary_loss_clip": 0.00990046, + "auxiliary_loss_mlp": 0.01006357, + "balance_loss_clip": 1.02887833, + "balance_loss_mlp": 1.00516462, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7685075389510999, + "language_loss": 0.56461215, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58457625, + "num_input_tokens_seen": 241692380, + "step": 11197, + "time_per_iteration": 3.479853630065918 + }, + { + "auxiliary_loss_clip": 0.01092605, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.04088438, + "balance_loss_mlp": 1.01877999, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 3.000010677385184, + "language_loss": 0.75470418, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77593476, + "num_input_tokens_seen": 241710430, + "step": 11198, + "time_per_iteration": 2.916717529296875 + }, + { + "auxiliary_loss_clip": 0.01101302, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.03554344, + "balance_loss_mlp": 1.01765728, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 2.1335462807145356, + "language_loss": 0.81714571, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.83847004, + "num_input_tokens_seen": 241724775, + "step": 11199, + "time_per_iteration": 2.4602131843566895 + }, + { + "auxiliary_loss_clip": 0.01061396, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.03704524, + "balance_loss_mlp": 1.02234793, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 2.1387579657086433, + "language_loss": 0.71567225, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73664153, + "num_input_tokens_seen": 241744440, + "step": 11200, + "time_per_iteration": 2.76377010345459 + }, + { + "auxiliary_loss_clip": 0.01113698, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.03921413, + "balance_loss_mlp": 1.02137506, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 1.804130512906506, + "language_loss": 0.64840257, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66988099, + "num_input_tokens_seen": 241771705, + "step": 11201, + "time_per_iteration": 2.848165988922119 + }, + { + "auxiliary_loss_clip": 0.0109481, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.03933799, + "balance_loss_mlp": 1.01927733, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 2.148307526246571, + "language_loss": 0.63525808, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.65652931, + "num_input_tokens_seen": 241790830, + "step": 11202, + "time_per_iteration": 4.1642005443573 + }, + { + "auxiliary_loss_clip": 0.01112355, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.03746831, + "balance_loss_mlp": 1.01865101, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 2.169209251852396, + "language_loss": 0.74595225, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76738536, + "num_input_tokens_seen": 241808165, + "step": 11203, + "time_per_iteration": 2.5103516578674316 + }, + { + "auxiliary_loss_clip": 0.01098532, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.03929663, + "balance_loss_mlp": 1.01970077, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 2.6196944304259615, + "language_loss": 0.68093264, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.70225441, + "num_input_tokens_seen": 241826925, + "step": 11204, + "time_per_iteration": 2.6897430419921875 + }, + { + "auxiliary_loss_clip": 0.01109683, + "auxiliary_loss_mlp": 0.01034644, + "balance_loss_clip": 1.04141021, + "balance_loss_mlp": 1.02065539, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 1.5949879668948612, + "language_loss": 0.73942083, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76086408, + "num_input_tokens_seen": 241845525, + "step": 11205, + "time_per_iteration": 2.639225482940674 + }, + { + "auxiliary_loss_clip": 0.01107651, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.03623128, + "balance_loss_mlp": 1.02353048, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.6421056409728934, + "language_loss": 0.71704727, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73848623, + "num_input_tokens_seen": 241866815, + "step": 11206, + "time_per_iteration": 3.334144115447998 + }, + { + "auxiliary_loss_clip": 0.01079379, + "auxiliary_loss_mlp": 0.00781105, + "balance_loss_clip": 1.03753567, + "balance_loss_mlp": 1.00075531, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 3.1245092489136694, + "language_loss": 0.67726433, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69586915, + "num_input_tokens_seen": 241887050, + "step": 11207, + "time_per_iteration": 2.652461290359497 + }, + { + "auxiliary_loss_clip": 0.01063318, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.03584468, + "balance_loss_mlp": 1.01945806, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 2.178311430188471, + "language_loss": 0.74145776, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.76240802, + "num_input_tokens_seen": 241904280, + "step": 11208, + "time_per_iteration": 2.615368366241455 + }, + { + "auxiliary_loss_clip": 0.01097563, + "auxiliary_loss_mlp": 0.01045627, + "balance_loss_clip": 1.03425562, + "balance_loss_mlp": 1.02902079, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 8.730831449102043, + "language_loss": 0.7541306, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.77556252, + "num_input_tokens_seen": 241919190, + "step": 11209, + "time_per_iteration": 2.4609639644622803 + }, + { + "auxiliary_loss_clip": 0.0107194, + "auxiliary_loss_mlp": 0.01028479, + "balance_loss_clip": 1.03582215, + "balance_loss_mlp": 1.01658821, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 1.7979817938240434, + "language_loss": 0.66157222, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68257642, + "num_input_tokens_seen": 241940525, + "step": 11210, + "time_per_iteration": 2.571530818939209 + }, + { + "auxiliary_loss_clip": 0.01107281, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.03531492, + "balance_loss_mlp": 1.01729739, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 2.317100706168738, + "language_loss": 0.80109859, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82246631, + "num_input_tokens_seen": 241959290, + "step": 11211, + "time_per_iteration": 2.4624531269073486 + }, + { + "auxiliary_loss_clip": 0.0106522, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.03699315, + "balance_loss_mlp": 1.01784921, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.611051000014967, + "language_loss": 0.7657603, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78672242, + "num_input_tokens_seen": 241980715, + "step": 11212, + "time_per_iteration": 4.567629337310791 + }, + { + "auxiliary_loss_clip": 0.01075045, + "auxiliary_loss_mlp": 0.00783378, + "balance_loss_clip": 1.03679335, + "balance_loss_mlp": 1.00066805, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 1.5762673326438414, + "language_loss": 0.77765721, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.7962414, + "num_input_tokens_seen": 241999985, + "step": 11213, + "time_per_iteration": 2.664360523223877 + }, + { + "auxiliary_loss_clip": 0.0106786, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.04241729, + "balance_loss_mlp": 1.01880956, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 1.9171446879981113, + "language_loss": 0.67537081, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.69637692, + "num_input_tokens_seen": 242018990, + "step": 11214, + "time_per_iteration": 3.8531956672668457 + }, + { + "auxiliary_loss_clip": 0.01111946, + "auxiliary_loss_mlp": 0.00778414, + "balance_loss_clip": 1.03753543, + "balance_loss_mlp": 1.00077391, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 1.9762330148499048, + "language_loss": 0.7308495, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.74975306, + "num_input_tokens_seen": 242039340, + "step": 11215, + "time_per_iteration": 2.5716288089752197 + }, + { + "auxiliary_loss_clip": 0.01101799, + "auxiliary_loss_mlp": 0.00778507, + "balance_loss_clip": 1.03576708, + "balance_loss_mlp": 1.00067282, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 2.261350220872376, + "language_loss": 0.67241889, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.69122195, + "num_input_tokens_seen": 242062215, + "step": 11216, + "time_per_iteration": 2.632289409637451 + }, + { + "auxiliary_loss_clip": 0.01031514, + "auxiliary_loss_mlp": 0.01002671, + "balance_loss_clip": 1.00690305, + "balance_loss_mlp": 1.00141943, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6742688302235226, + "language_loss": 0.56228685, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58262867, + "num_input_tokens_seen": 242131130, + "step": 11217, + "time_per_iteration": 3.139369249343872 + }, + { + "auxiliary_loss_clip": 0.01100461, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.03755307, + "balance_loss_mlp": 1.01860881, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 1.8664486736207668, + "language_loss": 0.74191225, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76323205, + "num_input_tokens_seen": 242149720, + "step": 11218, + "time_per_iteration": 2.601823568344116 + }, + { + "auxiliary_loss_clip": 0.01082446, + "auxiliary_loss_mlp": 0.01048166, + "balance_loss_clip": 1.03902626, + "balance_loss_mlp": 1.03346133, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.748502918711854, + "language_loss": 0.65845531, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.67976141, + "num_input_tokens_seen": 242168875, + "step": 11219, + "time_per_iteration": 2.6719980239868164 + }, + { + "auxiliary_loss_clip": 0.01073746, + "auxiliary_loss_mlp": 0.01045388, + "balance_loss_clip": 1.03307939, + "balance_loss_mlp": 1.03045726, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 1.8439334037756037, + "language_loss": 0.75203621, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.77322757, + "num_input_tokens_seen": 242188465, + "step": 11220, + "time_per_iteration": 2.568466901779175 + }, + { + "auxiliary_loss_clip": 0.01097985, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.03951132, + "balance_loss_mlp": 1.01801503, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 1.5686894370465747, + "language_loss": 0.70466816, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72596079, + "num_input_tokens_seen": 242208675, + "step": 11221, + "time_per_iteration": 2.4985718727111816 + }, + { + "auxiliary_loss_clip": 0.01077377, + "auxiliary_loss_mlp": 0.0102864, + "balance_loss_clip": 1.03467178, + "balance_loss_mlp": 1.01662362, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 1.9331318704231768, + "language_loss": 0.57936817, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60042834, + "num_input_tokens_seen": 242227440, + "step": 11222, + "time_per_iteration": 2.569182872772217 + }, + { + "auxiliary_loss_clip": 0.01099063, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.03850031, + "balance_loss_mlp": 1.02253962, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 1.7036415245015282, + "language_loss": 0.7667008, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.78804654, + "num_input_tokens_seen": 242245240, + "step": 11223, + "time_per_iteration": 2.491373062133789 + }, + { + "auxiliary_loss_clip": 0.01107557, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.04098713, + "balance_loss_mlp": 1.01967037, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 1.7491717557229938, + "language_loss": 0.75665927, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77806425, + "num_input_tokens_seen": 242263435, + "step": 11224, + "time_per_iteration": 2.5205066204071045 + }, + { + "auxiliary_loss_clip": 0.01064156, + "auxiliary_loss_mlp": 0.01026906, + "balance_loss_clip": 1.03986049, + "balance_loss_mlp": 1.01579595, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 1.6592230565663593, + "language_loss": 0.6305787, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65148932, + "num_input_tokens_seen": 242282765, + "step": 11225, + "time_per_iteration": 2.6605849266052246 + }, + { + "auxiliary_loss_clip": 0.01108639, + "auxiliary_loss_mlp": 0.00776329, + "balance_loss_clip": 1.03853106, + "balance_loss_mlp": 1.00063419, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 1.7478712449324096, + "language_loss": 0.63761199, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.65646166, + "num_input_tokens_seen": 242298980, + "step": 11226, + "time_per_iteration": 3.9747703075408936 + }, + { + "auxiliary_loss_clip": 0.01103281, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.03764617, + "balance_loss_mlp": 1.01960278, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.3774471654964104, + "language_loss": 0.71570218, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.73706323, + "num_input_tokens_seen": 242315420, + "step": 11227, + "time_per_iteration": 2.442192554473877 + }, + { + "auxiliary_loss_clip": 0.01084685, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.03448081, + "balance_loss_mlp": 1.02171826, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 2.009117391408565, + "language_loss": 0.71344304, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73464549, + "num_input_tokens_seen": 242332805, + "step": 11228, + "time_per_iteration": 2.4847724437713623 + }, + { + "auxiliary_loss_clip": 0.01023551, + "auxiliary_loss_mlp": 0.01000514, + "balance_loss_clip": 1.02305603, + "balance_loss_mlp": 0.99944067, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7571294946021189, + "language_loss": 0.53258079, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.5528214, + "num_input_tokens_seen": 242396160, + "step": 11229, + "time_per_iteration": 3.1240994930267334 + }, + { + "auxiliary_loss_clip": 0.01102104, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.03802085, + "balance_loss_mlp": 1.01972187, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 1.6937748351466442, + "language_loss": 0.80202043, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82336199, + "num_input_tokens_seen": 242414660, + "step": 11230, + "time_per_iteration": 2.505711793899536 + }, + { + "auxiliary_loss_clip": 0.0108052, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.03440547, + "balance_loss_mlp": 1.01982975, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.5015599449179882, + "language_loss": 0.65714169, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.67826563, + "num_input_tokens_seen": 242434225, + "step": 11231, + "time_per_iteration": 2.516974925994873 + }, + { + "auxiliary_loss_clip": 0.0107978, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.04396749, + "balance_loss_mlp": 1.02376676, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 2.456478760955268, + "language_loss": 0.66802967, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.68920624, + "num_input_tokens_seen": 242454355, + "step": 11232, + "time_per_iteration": 2.6724393367767334 + }, + { + "auxiliary_loss_clip": 0.0107134, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.03291845, + "balance_loss_mlp": 1.0159061, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 1.6717384143558502, + "language_loss": 0.72560877, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74660838, + "num_input_tokens_seen": 242474935, + "step": 11233, + "time_per_iteration": 2.54854154586792 + }, + { + "auxiliary_loss_clip": 0.01102714, + "auxiliary_loss_mlp": 0.01029165, + "balance_loss_clip": 1.0379014, + "balance_loss_mlp": 1.01640916, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 1.582027636525046, + "language_loss": 0.76912349, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.79044223, + "num_input_tokens_seen": 242495530, + "step": 11234, + "time_per_iteration": 2.50852632522583 + }, + { + "auxiliary_loss_clip": 0.01111578, + "auxiliary_loss_mlp": 0.01033427, + "balance_loss_clip": 1.03898036, + "balance_loss_mlp": 1.02049923, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.5227430370982846, + "language_loss": 0.7517764, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.7732265, + "num_input_tokens_seen": 242514550, + "step": 11235, + "time_per_iteration": 2.474365472793579 + }, + { + "auxiliary_loss_clip": 0.01024349, + "auxiliary_loss_mlp": 0.0100785, + "balance_loss_clip": 1.00933862, + "balance_loss_mlp": 1.00649154, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7826634117690493, + "language_loss": 0.51342463, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.5337466, + "num_input_tokens_seen": 242569200, + "step": 11236, + "time_per_iteration": 2.9973113536834717 + }, + { + "auxiliary_loss_clip": 0.01079836, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.03679109, + "balance_loss_mlp": 1.01903784, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 2.094986987493446, + "language_loss": 0.75604832, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77718192, + "num_input_tokens_seen": 242586950, + "step": 11237, + "time_per_iteration": 2.5669658184051514 + }, + { + "auxiliary_loss_clip": 0.01085979, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.03692365, + "balance_loss_mlp": 1.0258255, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 1.8737861280067913, + "language_loss": 0.77007258, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79131627, + "num_input_tokens_seen": 242607380, + "step": 11238, + "time_per_iteration": 2.6390926837921143 + }, + { + "auxiliary_loss_clip": 0.01101231, + "auxiliary_loss_mlp": 0.01035779, + "balance_loss_clip": 1.03563869, + "balance_loss_mlp": 1.02234411, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 1.722539253147427, + "language_loss": 0.66554838, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.6869185, + "num_input_tokens_seen": 242628025, + "step": 11239, + "time_per_iteration": 2.5768837928771973 + }, + { + "auxiliary_loss_clip": 0.0109301, + "auxiliary_loss_mlp": 0.01027087, + "balance_loss_clip": 1.04308999, + "balance_loss_mlp": 1.01406932, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 2.252261961594924, + "language_loss": 0.83105242, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85225332, + "num_input_tokens_seen": 242643825, + "step": 11240, + "time_per_iteration": 2.511207342147827 + }, + { + "auxiliary_loss_clip": 0.01093462, + "auxiliary_loss_mlp": 0.0103658, + "balance_loss_clip": 1.04300451, + "balance_loss_mlp": 1.02114248, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 2.087352062520805, + "language_loss": 0.74619299, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76749343, + "num_input_tokens_seen": 242661820, + "step": 11241, + "time_per_iteration": 4.061621427536011 + }, + { + "auxiliary_loss_clip": 0.01065689, + "auxiliary_loss_mlp": 0.01035188, + "balance_loss_clip": 1.03938556, + "balance_loss_mlp": 1.0224328, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 4.347538688013316, + "language_loss": 0.79902399, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.82003278, + "num_input_tokens_seen": 242679890, + "step": 11242, + "time_per_iteration": 2.655247211456299 + }, + { + "auxiliary_loss_clip": 0.01095906, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.03434443, + "balance_loss_mlp": 1.023422, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 1.8778034499181289, + "language_loss": 0.72840595, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.74973899, + "num_input_tokens_seen": 242699495, + "step": 11243, + "time_per_iteration": 2.522336483001709 + }, + { + "auxiliary_loss_clip": 0.01099728, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.03767014, + "balance_loss_mlp": 1.02319765, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 1.6654593814628107, + "language_loss": 0.72946835, + "learning_rate": 1.003487287162221e-06, + "loss": 0.75081378, + "num_input_tokens_seen": 242719500, + "step": 11244, + "time_per_iteration": 2.530258893966675 + }, + { + "auxiliary_loss_clip": 0.01113888, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.03868675, + "balance_loss_mlp": 1.0274421, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 1.8703321336026208, + "language_loss": 0.85549873, + "learning_rate": 1.003149631190393e-06, + "loss": 0.87704289, + "num_input_tokens_seen": 242738325, + "step": 11245, + "time_per_iteration": 2.445305824279785 + }, + { + "auxiliary_loss_clip": 0.01115257, + "auxiliary_loss_mlp": 0.00778839, + "balance_loss_clip": 1.03761196, + "balance_loss_mlp": 1.00068855, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 1.965097178851616, + "language_loss": 0.73564839, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.75458932, + "num_input_tokens_seen": 242756620, + "step": 11246, + "time_per_iteration": 2.450375556945801 + }, + { + "auxiliary_loss_clip": 0.01094723, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.03525484, + "balance_loss_mlp": 1.01792753, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 1.828835097538207, + "language_loss": 0.87903643, + "learning_rate": 1.002474432661539e-06, + "loss": 0.90028632, + "num_input_tokens_seen": 242774505, + "step": 11247, + "time_per_iteration": 2.4689719676971436 + }, + { + "auxiliary_loss_clip": 0.0102145, + "auxiliary_loss_mlp": 0.01001189, + "balance_loss_clip": 1.0206449, + "balance_loss_mlp": 1.00005019, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8191608192255504, + "language_loss": 0.53951991, + "learning_rate": 1.002136890130115e-06, + "loss": 0.55974633, + "num_input_tokens_seen": 242828645, + "step": 11248, + "time_per_iteration": 3.1242454051971436 + }, + { + "auxiliary_loss_clip": 0.01059012, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.04396462, + "balance_loss_mlp": 1.01813805, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 1.688697541897043, + "language_loss": 0.73466271, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75555146, + "num_input_tokens_seen": 242850100, + "step": 11249, + "time_per_iteration": 2.723102569580078 + }, + { + "auxiliary_loss_clip": 0.01102577, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.03657508, + "balance_loss_mlp": 1.02439487, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 2.2652737798682465, + "language_loss": 0.73858076, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.75998652, + "num_input_tokens_seen": 242867775, + "step": 11250, + "time_per_iteration": 2.4605519771575928 + }, + { + "auxiliary_loss_clip": 0.01112431, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.03787386, + "balance_loss_mlp": 1.01636076, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 1.883892553820218, + "language_loss": 0.74631137, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.76772523, + "num_input_tokens_seen": 242886865, + "step": 11251, + "time_per_iteration": 3.8987717628479004 + }, + { + "auxiliary_loss_clip": 0.01081551, + "auxiliary_loss_mlp": 0.01029002, + "balance_loss_clip": 1.04090095, + "balance_loss_mlp": 1.01679504, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 1.9900068787465974, + "language_loss": 0.70110118, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.72220671, + "num_input_tokens_seen": 242906705, + "step": 11252, + "time_per_iteration": 3.8168458938598633 + }, + { + "auxiliary_loss_clip": 0.01068929, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.04055595, + "balance_loss_mlp": 1.01857281, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 1.8517139897165167, + "language_loss": 0.67140049, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.69239807, + "num_input_tokens_seen": 242925215, + "step": 11253, + "time_per_iteration": 2.6715190410614014 + }, + { + "auxiliary_loss_clip": 0.01073316, + "auxiliary_loss_mlp": 0.00785334, + "balance_loss_clip": 1.03401089, + "balance_loss_mlp": 1.0006671, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 3.289597149493642, + "language_loss": 0.77172422, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.79031074, + "num_input_tokens_seen": 242944750, + "step": 11254, + "time_per_iteration": 2.5307140350341797 + }, + { + "auxiliary_loss_clip": 0.01101324, + "auxiliary_loss_mlp": 0.01032248, + "balance_loss_clip": 1.03714955, + "balance_loss_mlp": 1.01906323, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 2.6604480015329544, + "language_loss": 0.7222349, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74357057, + "num_input_tokens_seen": 242963860, + "step": 11255, + "time_per_iteration": 2.4586310386657715 + }, + { + "auxiliary_loss_clip": 0.01065767, + "auxiliary_loss_mlp": 0.00780375, + "balance_loss_clip": 1.03435445, + "balance_loss_mlp": 1.00060987, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 2.374186136819413, + "language_loss": 0.75851595, + "learning_rate": 9.994379131600828e-07, + "loss": 0.77697736, + "num_input_tokens_seen": 242983050, + "step": 11256, + "time_per_iteration": 2.621739387512207 + }, + { + "auxiliary_loss_clip": 0.01102773, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.03907061, + "balance_loss_mlp": 1.01926112, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.4568761653318414, + "language_loss": 0.64532214, + "learning_rate": 9.991007116408965e-07, + "loss": 0.66666692, + "num_input_tokens_seen": 243001125, + "step": 11257, + "time_per_iteration": 2.451261281967163 + }, + { + "auxiliary_loss_clip": 0.01067062, + "auxiliary_loss_mlp": 0.01031048, + "balance_loss_clip": 1.04123902, + "balance_loss_mlp": 1.01909733, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.4894372811111756, + "language_loss": 0.75457537, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77555645, + "num_input_tokens_seen": 243021865, + "step": 11258, + "time_per_iteration": 2.6123239994049072 + }, + { + "auxiliary_loss_clip": 0.01088633, + "auxiliary_loss_mlp": 0.01033514, + "balance_loss_clip": 1.0385921, + "balance_loss_mlp": 1.02154565, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.5805607316295973, + "language_loss": 0.66906071, + "learning_rate": 9.984264224779127e-07, + "loss": 0.69028223, + "num_input_tokens_seen": 243042970, + "step": 11259, + "time_per_iteration": 2.6073403358459473 + }, + { + "auxiliary_loss_clip": 0.01091161, + "auxiliary_loss_mlp": 0.0103229, + "balance_loss_clip": 1.03662884, + "balance_loss_mlp": 1.01967227, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.160828136127085, + "language_loss": 0.85669589, + "learning_rate": 9.980893348596839e-07, + "loss": 0.8779304, + "num_input_tokens_seen": 243058470, + "step": 11260, + "time_per_iteration": 2.5142972469329834 + }, + { + "auxiliary_loss_clip": 0.01087401, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.03468657, + "balance_loss_mlp": 1.02404118, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 2.630812018237951, + "language_loss": 0.77184772, + "learning_rate": 9.977522852340081e-07, + "loss": 0.79309934, + "num_input_tokens_seen": 243076630, + "step": 11261, + "time_per_iteration": 2.479365348815918 + }, + { + "auxiliary_loss_clip": 0.01093072, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.03869772, + "balance_loss_mlp": 1.0227164, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 1.633741991163507, + "language_loss": 0.87734532, + "learning_rate": 9.97415273613666e-07, + "loss": 0.89863163, + "num_input_tokens_seen": 243092260, + "step": 11262, + "time_per_iteration": 2.501337766647339 + }, + { + "auxiliary_loss_clip": 0.01090097, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.03844452, + "balance_loss_mlp": 1.01896, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 2.8923049778359244, + "language_loss": 0.74098742, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76221347, + "num_input_tokens_seen": 243109405, + "step": 11263, + "time_per_iteration": 2.4680538177490234 + }, + { + "auxiliary_loss_clip": 0.01107649, + "auxiliary_loss_mlp": 0.01036924, + "balance_loss_clip": 1.03919232, + "balance_loss_mlp": 1.02242851, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 2.656493841277849, + "language_loss": 0.68306422, + "learning_rate": 9.967413644401016e-07, + "loss": 0.70450997, + "num_input_tokens_seen": 243128135, + "step": 11264, + "time_per_iteration": 2.4840753078460693 + }, + { + "auxiliary_loss_clip": 0.01091268, + "auxiliary_loss_mlp": 0.0103548, + "balance_loss_clip": 1.03958297, + "balance_loss_mlp": 1.02216434, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 2.0605406367895984, + "language_loss": 0.73127449, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75254196, + "num_input_tokens_seen": 243146785, + "step": 11265, + "time_per_iteration": 2.527714490890503 + }, + { + "auxiliary_loss_clip": 0.01075243, + "auxiliary_loss_mlp": 0.01037918, + "balance_loss_clip": 1.03837371, + "balance_loss_mlp": 1.02449501, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 1.5601033637752655, + "language_loss": 0.61638641, + "learning_rate": 9.96067607441207e-07, + "loss": 0.63751793, + "num_input_tokens_seen": 243165275, + "step": 11266, + "time_per_iteration": 4.039307117462158 + }, + { + "auxiliary_loss_clip": 0.01086731, + "auxiliary_loss_mlp": 0.01035886, + "balance_loss_clip": 1.0406003, + "balance_loss_mlp": 1.02263582, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 1.944113025692587, + "language_loss": 0.70046538, + "learning_rate": 9.957307860391976e-07, + "loss": 0.72169161, + "num_input_tokens_seen": 243182845, + "step": 11267, + "time_per_iteration": 2.5120060443878174 + }, + { + "auxiliary_loss_clip": 0.01113585, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.03825593, + "balance_loss_mlp": 1.02117431, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 1.9531167435017653, + "language_loss": 0.71090138, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73237729, + "num_input_tokens_seen": 243201475, + "step": 11268, + "time_per_iteration": 2.450836181640625 + }, + { + "auxiliary_loss_clip": 0.01089103, + "auxiliary_loss_mlp": 0.01033239, + "balance_loss_clip": 1.03533483, + "balance_loss_mlp": 1.01968551, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.5476384695044374, + "language_loss": 0.77116287, + "learning_rate": 9.950572574939194e-07, + "loss": 0.79238623, + "num_input_tokens_seen": 243221850, + "step": 11269, + "time_per_iteration": 2.5073578357696533 + }, + { + "auxiliary_loss_clip": 0.01080988, + "auxiliary_loss_mlp": 0.01043481, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.02996314, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 2.402853054165338, + "language_loss": 0.74497998, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76622462, + "num_input_tokens_seen": 243239855, + "step": 11270, + "time_per_iteration": 2.552450656890869 + }, + { + "auxiliary_loss_clip": 0.01059737, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.04103804, + "balance_loss_mlp": 1.02503896, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 1.8808109345351154, + "language_loss": 0.73007166, + "learning_rate": 9.94383881378756e-07, + "loss": 0.75105333, + "num_input_tokens_seen": 243260085, + "step": 11271, + "time_per_iteration": 2.577286720275879 + }, + { + "auxiliary_loss_clip": 0.01112788, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.03868508, + "balance_loss_mlp": 1.02177095, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 1.7749297337753498, + "language_loss": 0.67900449, + "learning_rate": 9.94047250514387e-07, + "loss": 0.70047557, + "num_input_tokens_seen": 243280065, + "step": 11272, + "time_per_iteration": 2.4730007648468018 + }, + { + "auxiliary_loss_clip": 0.01102936, + "auxiliary_loss_mlp": 0.01035117, + "balance_loss_clip": 1.03950024, + "balance_loss_mlp": 1.02074027, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 1.9436521323722982, + "language_loss": 0.73642969, + "learning_rate": 9.937106577958481e-07, + "loss": 0.75781018, + "num_input_tokens_seen": 243297775, + "step": 11273, + "time_per_iteration": 2.437814235687256 + }, + { + "auxiliary_loss_clip": 0.01095702, + "auxiliary_loss_mlp": 0.01041928, + "balance_loss_clip": 1.03908014, + "balance_loss_mlp": 1.0283618, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 1.6428344670521133, + "language_loss": 0.70328939, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72466564, + "num_input_tokens_seen": 243315760, + "step": 11274, + "time_per_iteration": 2.4833505153656006 + }, + { + "auxiliary_loss_clip": 0.0111489, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.0389607, + "balance_loss_mlp": 1.01990604, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 1.639207424623704, + "language_loss": 0.6539163, + "learning_rate": 9.930375868473093e-07, + "loss": 0.67539787, + "num_input_tokens_seen": 243335715, + "step": 11275, + "time_per_iteration": 2.423048496246338 + }, + { + "auxiliary_loss_clip": 0.01105621, + "auxiliary_loss_mlp": 0.0103305, + "balance_loss_clip": 1.04044366, + "balance_loss_mlp": 1.02092636, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 1.723871669109765, + "language_loss": 0.72564387, + "learning_rate": 9.927011086428335e-07, + "loss": 0.74703062, + "num_input_tokens_seen": 243356935, + "step": 11276, + "time_per_iteration": 2.5379834175109863 + }, + { + "auxiliary_loss_clip": 0.01088172, + "auxiliary_loss_mlp": 0.00779059, + "balance_loss_clip": 1.03485489, + "balance_loss_mlp": 1.00053549, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.6964286126174704, + "language_loss": 0.76841879, + "learning_rate": 9.923646686352317e-07, + "loss": 0.78709114, + "num_input_tokens_seen": 243375625, + "step": 11277, + "time_per_iteration": 2.4975156784057617 + }, + { + "auxiliary_loss_clip": 0.01094335, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.03856468, + "balance_loss_mlp": 1.01742172, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 2.648013235604961, + "language_loss": 0.8340345, + "learning_rate": 9.920282668372627e-07, + "loss": 0.8552857, + "num_input_tokens_seen": 243390195, + "step": 11278, + "time_per_iteration": 2.5137689113616943 + }, + { + "auxiliary_loss_clip": 0.01088097, + "auxiliary_loss_mlp": 0.0077599, + "balance_loss_clip": 1.04348636, + "balance_loss_mlp": 1.00054216, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.543042576405404, + "language_loss": 0.70358348, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72222435, + "num_input_tokens_seen": 243411690, + "step": 11279, + "time_per_iteration": 2.552454710006714 + }, + { + "auxiliary_loss_clip": 0.01106486, + "auxiliary_loss_mlp": 0.01032114, + "balance_loss_clip": 1.04079783, + "balance_loss_mlp": 1.01883984, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 4.29148225591448, + "language_loss": 0.73841357, + "learning_rate": 9.913555779212485e-07, + "loss": 0.75979954, + "num_input_tokens_seen": 243430280, + "step": 11280, + "time_per_iteration": 3.9310522079467773 + }, + { + "auxiliary_loss_clip": 0.01104, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.03696585, + "balance_loss_mlp": 1.01888156, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 1.8573253274883497, + "language_loss": 0.7006554, + "learning_rate": 9.910192908287104e-07, + "loss": 0.72201777, + "num_input_tokens_seen": 243448690, + "step": 11281, + "time_per_iteration": 2.4528658390045166 + }, + { + "auxiliary_loss_clip": 0.01110851, + "auxiliary_loss_mlp": 0.01033362, + "balance_loss_clip": 1.0395422, + "balance_loss_mlp": 1.02131629, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 1.6477331491249922, + "language_loss": 0.63991863, + "learning_rate": 9.906830419968217e-07, + "loss": 0.66136074, + "num_input_tokens_seen": 243470695, + "step": 11282, + "time_per_iteration": 2.5079238414764404 + }, + { + "auxiliary_loss_clip": 0.01078365, + "auxiliary_loss_mlp": 0.0104566, + "balance_loss_clip": 1.03308463, + "balance_loss_mlp": 1.02932847, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 1.5607140901053604, + "language_loss": 0.7449035, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76614374, + "num_input_tokens_seen": 243493345, + "step": 11283, + "time_per_iteration": 2.621339797973633 + }, + { + "auxiliary_loss_clip": 0.01102557, + "auxiliary_loss_mlp": 0.01027023, + "balance_loss_clip": 1.03801537, + "balance_loss_mlp": 1.01485217, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.7296954247194645, + "language_loss": 0.57079566, + "learning_rate": 9.900106591659948e-07, + "loss": 0.5920915, + "num_input_tokens_seen": 243515670, + "step": 11284, + "time_per_iteration": 2.5999135971069336 + }, + { + "auxiliary_loss_clip": 0.0109252, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.0400753, + "balance_loss_mlp": 1.01909792, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 1.9360435092042563, + "language_loss": 0.75135136, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77259469, + "num_input_tokens_seen": 243533625, + "step": 11285, + "time_per_iteration": 2.4914309978485107 + }, + { + "auxiliary_loss_clip": 0.01112004, + "auxiliary_loss_mlp": 0.01029755, + "balance_loss_clip": 1.04016268, + "balance_loss_mlp": 1.01708865, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 1.553548630015208, + "language_loss": 0.66424954, + "learning_rate": 9.893384295307557e-07, + "loss": 0.6856671, + "num_input_tokens_seen": 243553040, + "step": 11286, + "time_per_iteration": 2.465801954269409 + }, + { + "auxiliary_loss_clip": 0.01090857, + "auxiliary_loss_mlp": 0.0102852, + "balance_loss_clip": 1.03702152, + "balance_loss_mlp": 1.0152992, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 3.2102036176039137, + "language_loss": 0.53074598, + "learning_rate": 9.890023721933447e-07, + "loss": 0.55193979, + "num_input_tokens_seen": 243572590, + "step": 11287, + "time_per_iteration": 2.55023455619812 + }, + { + "auxiliary_loss_clip": 0.01071149, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.03685653, + "balance_loss_mlp": 1.02204013, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 1.5409258863653825, + "language_loss": 0.7728647, + "learning_rate": 9.886663531930655e-07, + "loss": 0.7939257, + "num_input_tokens_seen": 243594140, + "step": 11288, + "time_per_iteration": 2.6153759956359863 + }, + { + "auxiliary_loss_clip": 0.01106075, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.04119337, + "balance_loss_mlp": 1.02321386, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 1.9463817564973007, + "language_loss": 0.73832691, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75974053, + "num_input_tokens_seen": 243615170, + "step": 11289, + "time_per_iteration": 2.502958059310913 + }, + { + "auxiliary_loss_clip": 0.01113943, + "auxiliary_loss_mlp": 0.01034038, + "balance_loss_clip": 1.03911769, + "balance_loss_mlp": 1.02052593, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 1.6298525642005979, + "language_loss": 0.79950202, + "learning_rate": 9.879944302548682e-07, + "loss": 0.8209818, + "num_input_tokens_seen": 243635675, + "step": 11290, + "time_per_iteration": 2.460568428039551 + }, + { + "auxiliary_loss_clip": 0.01099863, + "auxiliary_loss_mlp": 0.01031573, + "balance_loss_clip": 1.03967774, + "balance_loss_mlp": 1.01974797, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 1.475510537330819, + "language_loss": 0.75040495, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77171934, + "num_input_tokens_seen": 243654950, + "step": 11291, + "time_per_iteration": 3.980491876602173 + }, + { + "auxiliary_loss_clip": 0.01095275, + "auxiliary_loss_mlp": 0.00778117, + "balance_loss_clip": 1.04028976, + "balance_loss_mlp": 1.00057983, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 1.7136360516450453, + "language_loss": 0.75970507, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77843899, + "num_input_tokens_seen": 243674970, + "step": 11292, + "time_per_iteration": 4.168668508529663 + }, + { + "auxiliary_loss_clip": 0.01072621, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.03472054, + "balance_loss_mlp": 1.0218184, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 3.525426554878319, + "language_loss": 0.83650196, + "learning_rate": 9.869868336945556e-07, + "loss": 0.85758245, + "num_input_tokens_seen": 243693440, + "step": 11293, + "time_per_iteration": 2.5261082649230957 + }, + { + "auxiliary_loss_clip": 0.01119199, + "auxiliary_loss_mlp": 0.01039953, + "balance_loss_clip": 1.0415988, + "balance_loss_mlp": 1.02548134, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 2.6721852151018854, + "language_loss": 0.79289794, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81448948, + "num_input_tokens_seen": 243710055, + "step": 11294, + "time_per_iteration": 2.4846713542938232 + }, + { + "auxiliary_loss_clip": 0.01092546, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.04016972, + "balance_loss_mlp": 1.01608062, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.6164120910032482, + "language_loss": 0.78852504, + "learning_rate": 9.86315294700924e-07, + "loss": 0.80973101, + "num_input_tokens_seen": 243728635, + "step": 11295, + "time_per_iteration": 2.51304292678833 + }, + { + "auxiliary_loss_clip": 0.01084183, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.03905821, + "balance_loss_mlp": 1.0190258, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 2.2647965917637234, + "language_loss": 0.71495724, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73609507, + "num_input_tokens_seen": 243748330, + "step": 11296, + "time_per_iteration": 2.506761074066162 + }, + { + "auxiliary_loss_clip": 0.0110026, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.03731203, + "balance_loss_mlp": 1.01776278, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.6987972540675271, + "language_loss": 0.70658368, + "learning_rate": 9.856439094633949e-07, + "loss": 0.72788608, + "num_input_tokens_seen": 243769380, + "step": 11297, + "time_per_iteration": 2.4972705841064453 + }, + { + "auxiliary_loss_clip": 0.01087294, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.03908348, + "balance_loss_mlp": 1.02162504, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 4.486336747424935, + "language_loss": 0.66731948, + "learning_rate": 9.853082745349918e-07, + "loss": 0.6885432, + "num_input_tokens_seen": 243785510, + "step": 11298, + "time_per_iteration": 2.52493953704834 + }, + { + "auxiliary_loss_clip": 0.01104826, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.03903699, + "balance_loss_mlp": 1.01717925, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 2.187541292773019, + "language_loss": 0.71745497, + "learning_rate": 9.84972678083801e-07, + "loss": 0.73879051, + "num_input_tokens_seen": 243805545, + "step": 11299, + "time_per_iteration": 2.5417940616607666 + }, + { + "auxiliary_loss_clip": 0.01115006, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.04030108, + "balance_loss_mlp": 1.02286649, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 1.4743564662394313, + "language_loss": 0.77324367, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79475605, + "num_input_tokens_seen": 243825185, + "step": 11300, + "time_per_iteration": 2.47121524810791 + }, + { + "auxiliary_loss_clip": 0.01100043, + "auxiliary_loss_mlp": 0.0103066, + "balance_loss_clip": 1.03690815, + "balance_loss_mlp": 1.01766038, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 2.126928135234906, + "language_loss": 0.63124549, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65255249, + "num_input_tokens_seen": 243841600, + "step": 11301, + "time_per_iteration": 2.4175047874450684 + }, + { + "auxiliary_loss_clip": 0.01099288, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.03700686, + "balance_loss_mlp": 1.01850593, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.7780171705113423, + "language_loss": 0.82643914, + "learning_rate": 9.839661197207525e-07, + "loss": 0.84773993, + "num_input_tokens_seen": 243862250, + "step": 11302, + "time_per_iteration": 2.524960517883301 + }, + { + "auxiliary_loss_clip": 0.01102877, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.03591812, + "balance_loss_mlp": 1.01781583, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 1.889853065362866, + "language_loss": 0.69763845, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71896839, + "num_input_tokens_seen": 243880560, + "step": 11303, + "time_per_iteration": 2.4424054622650146 + }, + { + "auxiliary_loss_clip": 0.01085427, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.04023123, + "balance_loss_mlp": 1.02075231, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 3.20642048445051, + "language_loss": 0.69990945, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72110116, + "num_input_tokens_seen": 243900635, + "step": 11304, + "time_per_iteration": 2.526609182357788 + }, + { + "auxiliary_loss_clip": 0.01102874, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.03847957, + "balance_loss_mlp": 1.02409935, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 2.1064230385704548, + "language_loss": 0.72619998, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74760431, + "num_input_tokens_seen": 243920160, + "step": 11305, + "time_per_iteration": 3.9664738178253174 + }, + { + "auxiliary_loss_clip": 0.01091473, + "auxiliary_loss_mlp": 0.01026942, + "balance_loss_clip": 1.03922117, + "balance_loss_mlp": 1.01425242, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 1.86207329268558, + "language_loss": 0.65478241, + "learning_rate": 9.826245813561882e-07, + "loss": 0.67596662, + "num_input_tokens_seen": 243939015, + "step": 11306, + "time_per_iteration": 2.5389649868011475 + }, + { + "auxiliary_loss_clip": 0.01089015, + "auxiliary_loss_mlp": 0.01029929, + "balance_loss_clip": 1.03877008, + "balance_loss_mlp": 1.0165838, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 3.0128132486017463, + "language_loss": 0.79686177, + "learning_rate": 9.822892931807021e-07, + "loss": 0.81805122, + "num_input_tokens_seen": 243958470, + "step": 11307, + "time_per_iteration": 2.5130650997161865 + }, + { + "auxiliary_loss_clip": 0.01085963, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.03551078, + "balance_loss_mlp": 1.02199578, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 1.6267378567815782, + "language_loss": 0.89062464, + "learning_rate": 9.819540435969066e-07, + "loss": 0.91184092, + "num_input_tokens_seen": 243975450, + "step": 11308, + "time_per_iteration": 2.4563841819763184 + }, + { + "auxiliary_loss_clip": 0.01077568, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.03402483, + "balance_loss_mlp": 1.02303088, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 1.9347392681519577, + "language_loss": 0.71295285, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73410428, + "num_input_tokens_seen": 243994355, + "step": 11309, + "time_per_iteration": 2.5330066680908203 + }, + { + "auxiliary_loss_clip": 0.0108294, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.04003775, + "balance_loss_mlp": 1.02344406, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 2.2792270536274484, + "language_loss": 0.84435409, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86554331, + "num_input_tokens_seen": 244011620, + "step": 11310, + "time_per_iteration": 2.541029453277588 + }, + { + "auxiliary_loss_clip": 0.0108464, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.03884125, + "balance_loss_mlp": 1.0182569, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 2.3061787950024413, + "language_loss": 0.83243978, + "learning_rate": 9.80948526522792e-07, + "loss": 0.85358584, + "num_input_tokens_seen": 244029925, + "step": 11311, + "time_per_iteration": 2.518479585647583 + }, + { + "auxiliary_loss_clip": 0.01068828, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.03298211, + "balance_loss_mlp": 1.01694465, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 1.7002670027601103, + "language_loss": 0.76167393, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78268033, + "num_input_tokens_seen": 244051225, + "step": 11312, + "time_per_iteration": 2.59651780128479 + }, + { + "auxiliary_loss_clip": 0.0103723, + "auxiliary_loss_mlp": 0.01000507, + "balance_loss_clip": 1.01293659, + "balance_loss_mlp": 0.99912375, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.6566518223382771, + "language_loss": 0.57157636, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59195369, + "num_input_tokens_seen": 244115930, + "step": 11313, + "time_per_iteration": 3.122354030609131 + }, + { + "auxiliary_loss_clip": 0.01100053, + "auxiliary_loss_mlp": 0.01027941, + "balance_loss_clip": 1.03577423, + "balance_loss_mlp": 1.01492918, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 5.32109378905349, + "language_loss": 0.69166327, + "learning_rate": 9.799433572314754e-07, + "loss": 0.71294326, + "num_input_tokens_seen": 244137320, + "step": 11314, + "time_per_iteration": 2.519779920578003 + }, + { + "auxiliary_loss_clip": 0.01098643, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.03502727, + "balance_loss_mlp": 1.01792574, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 1.7664623777830377, + "language_loss": 0.81738561, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83866823, + "num_input_tokens_seen": 244152755, + "step": 11315, + "time_per_iteration": 2.4308300018310547 + }, + { + "auxiliary_loss_clip": 0.01078715, + "auxiliary_loss_mlp": 0.01029665, + "balance_loss_clip": 1.04391325, + "balance_loss_mlp": 1.01692116, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 1.6702572866882093, + "language_loss": 0.70188624, + "learning_rate": 9.792734377526718e-07, + "loss": 0.72297001, + "num_input_tokens_seen": 244171480, + "step": 11316, + "time_per_iteration": 2.5826780796051025 + }, + { + "auxiliary_loss_clip": 0.01102656, + "auxiliary_loss_mlp": 0.01027121, + "balance_loss_clip": 1.04085326, + "balance_loss_mlp": 1.01562309, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 2.2347469823057633, + "language_loss": 0.66886806, + "learning_rate": 9.789385360660003e-07, + "loss": 0.69016576, + "num_input_tokens_seen": 244187920, + "step": 11317, + "time_per_iteration": 2.4382452964782715 + }, + { + "auxiliary_loss_clip": 0.01104376, + "auxiliary_loss_mlp": 0.01040466, + "balance_loss_clip": 1.04123068, + "balance_loss_mlp": 1.0280863, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 1.6605061012614997, + "language_loss": 0.74759459, + "learning_rate": 9.78603673098082e-07, + "loss": 0.76904309, + "num_input_tokens_seen": 244209565, + "step": 11318, + "time_per_iteration": 2.5108978748321533 + }, + { + "auxiliary_loss_clip": 0.01081798, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.03435659, + "balance_loss_mlp": 1.01919007, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 1.6974853222277462, + "language_loss": 0.67914599, + "learning_rate": 9.782688488616143e-07, + "loss": 0.70027125, + "num_input_tokens_seen": 244228015, + "step": 11319, + "time_per_iteration": 2.484835624694824 + }, + { + "auxiliary_loss_clip": 0.01073871, + "auxiliary_loss_mlp": 0.00777501, + "balance_loss_clip": 1.04111028, + "balance_loss_mlp": 1.00058866, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 1.6559764370906218, + "language_loss": 0.76526403, + "learning_rate": 9.779340633692945e-07, + "loss": 0.78377777, + "num_input_tokens_seen": 244245615, + "step": 11320, + "time_per_iteration": 4.123321533203125 + }, + { + "auxiliary_loss_clip": 0.01086288, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.03930628, + "balance_loss_mlp": 1.0205617, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 2.0231711624545157, + "language_loss": 0.74851102, + "learning_rate": 9.77599316633817e-07, + "loss": 0.76970458, + "num_input_tokens_seen": 244263625, + "step": 11321, + "time_per_iteration": 2.5403928756713867 + }, + { + "auxiliary_loss_clip": 0.01095371, + "auxiliary_loss_mlp": 0.01036622, + "balance_loss_clip": 1.04281235, + "balance_loss_mlp": 1.02405167, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 1.8240977490445909, + "language_loss": 0.72798836, + "learning_rate": 9.772646086678758e-07, + "loss": 0.74930829, + "num_input_tokens_seen": 244282745, + "step": 11322, + "time_per_iteration": 2.515331745147705 + }, + { + "auxiliary_loss_clip": 0.01066458, + "auxiliary_loss_mlp": 0.00779623, + "balance_loss_clip": 1.03795505, + "balance_loss_mlp": 1.00065315, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 2.4146428253311765, + "language_loss": 0.78514373, + "learning_rate": 9.769299394841638e-07, + "loss": 0.80360448, + "num_input_tokens_seen": 244303770, + "step": 11323, + "time_per_iteration": 2.585524082183838 + }, + { + "auxiliary_loss_clip": 0.01010099, + "auxiliary_loss_mlp": 0.01002792, + "balance_loss_clip": 1.01835835, + "balance_loss_mlp": 1.00154006, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.7812281772901747, + "language_loss": 0.57137656, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59150541, + "num_input_tokens_seen": 244355910, + "step": 11324, + "time_per_iteration": 2.9131076335906982 + }, + { + "auxiliary_loss_clip": 0.01097618, + "auxiliary_loss_mlp": 0.01040446, + "balance_loss_clip": 1.04185832, + "balance_loss_mlp": 1.02683282, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 3.2416793201744065, + "language_loss": 0.68216181, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70354241, + "num_input_tokens_seen": 244376610, + "step": 11325, + "time_per_iteration": 2.552553653717041 + }, + { + "auxiliary_loss_clip": 0.01105511, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.03669047, + "balance_loss_mlp": 1.01960063, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 2.22867159728502, + "language_loss": 0.70066398, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72205269, + "num_input_tokens_seen": 244393000, + "step": 11326, + "time_per_iteration": 2.4419057369232178 + }, + { + "auxiliary_loss_clip": 0.01113244, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.03930676, + "balance_loss_mlp": 1.02095103, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.849484325189097, + "language_loss": 0.7292999, + "learning_rate": 9.75591650825392e-07, + "loss": 0.75076771, + "num_input_tokens_seen": 244409515, + "step": 11327, + "time_per_iteration": 2.4481871128082275 + }, + { + "auxiliary_loss_clip": 0.01100214, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.03845644, + "balance_loss_mlp": 1.0169878, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 1.8394756123194036, + "language_loss": 0.77350813, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79480898, + "num_input_tokens_seen": 244427165, + "step": 11328, + "time_per_iteration": 2.435595989227295 + }, + { + "auxiliary_loss_clip": 0.01114037, + "auxiliary_loss_mlp": 0.01028794, + "balance_loss_clip": 1.03894758, + "balance_loss_mlp": 1.01613402, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 1.9305703917101102, + "language_loss": 0.64012671, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66155493, + "num_input_tokens_seen": 244445705, + "step": 11329, + "time_per_iteration": 2.3914568424224854 + }, + { + "auxiliary_loss_clip": 0.01058624, + "auxiliary_loss_mlp": 0.00778196, + "balance_loss_clip": 1.03829753, + "balance_loss_mlp": 1.00058866, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 2.1113978139780345, + "language_loss": 0.79496396, + "learning_rate": 9.745883421664096e-07, + "loss": 0.8133322, + "num_input_tokens_seen": 244460415, + "step": 11330, + "time_per_iteration": 2.580540418624878 + }, + { + "auxiliary_loss_clip": 0.011028, + "auxiliary_loss_mlp": 0.01034312, + "balance_loss_clip": 1.0391798, + "balance_loss_mlp": 1.0211215, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 1.822047028431717, + "language_loss": 0.63611114, + "learning_rate": 9.742539836972665e-07, + "loss": 0.65748227, + "num_input_tokens_seen": 244480555, + "step": 11331, + "time_per_iteration": 5.245110273361206 + }, + { + "auxiliary_loss_clip": 0.01067765, + "auxiliary_loss_mlp": 0.01038941, + "balance_loss_clip": 1.03661895, + "balance_loss_mlp": 1.024266, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.6177176237840432, + "language_loss": 0.72195548, + "learning_rate": 9.739196641245148e-07, + "loss": 0.74302256, + "num_input_tokens_seen": 244498540, + "step": 11332, + "time_per_iteration": 2.5427563190460205 + }, + { + "auxiliary_loss_clip": 0.01102878, + "auxiliary_loss_mlp": 0.01035091, + "balance_loss_clip": 1.04009938, + "balance_loss_mlp": 1.02188265, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 2.3911010273529505, + "language_loss": 0.74821055, + "learning_rate": 9.735853834608326e-07, + "loss": 0.76959026, + "num_input_tokens_seen": 244517015, + "step": 11333, + "time_per_iteration": 2.4643945693969727 + }, + { + "auxiliary_loss_clip": 0.01106711, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.03983879, + "balance_loss_mlp": 1.01798487, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 1.501121203076138, + "language_loss": 0.72139239, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74277693, + "num_input_tokens_seen": 244537450, + "step": 11334, + "time_per_iteration": 2.494006633758545 + }, + { + "auxiliary_loss_clip": 0.01099597, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.04169989, + "balance_loss_mlp": 1.01944172, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 1.5974412123911912, + "language_loss": 0.85885125, + "learning_rate": 9.729169389113791e-07, + "loss": 0.8801707, + "num_input_tokens_seen": 244555640, + "step": 11335, + "time_per_iteration": 2.4738948345184326 + }, + { + "auxiliary_loss_clip": 0.01095262, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.03535998, + "balance_loss_mlp": 1.02114987, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 1.635314265864847, + "language_loss": 0.8206014, + "learning_rate": 9.725827750509542e-07, + "loss": 0.8418799, + "num_input_tokens_seen": 244574005, + "step": 11336, + "time_per_iteration": 2.5079264640808105 + }, + { + "auxiliary_loss_clip": 0.01068607, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.03535938, + "balance_loss_mlp": 1.02377725, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 1.7908337120965012, + "language_loss": 0.82001245, + "learning_rate": 9.72248650150294e-07, + "loss": 0.84106427, + "num_input_tokens_seen": 244591395, + "step": 11337, + "time_per_iteration": 2.574979782104492 + }, + { + "auxiliary_loss_clip": 0.01069895, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.04000068, + "balance_loss_mlp": 1.0191015, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 1.969597470953315, + "language_loss": 0.7237184, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74472809, + "num_input_tokens_seen": 244610400, + "step": 11338, + "time_per_iteration": 2.5631043910980225 + }, + { + "auxiliary_loss_clip": 0.01071638, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.03481889, + "balance_loss_mlp": 1.02258039, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 1.6511446903766862, + "language_loss": 0.77613592, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79720885, + "num_input_tokens_seen": 244630400, + "step": 11339, + "time_per_iteration": 2.60329270362854 + }, + { + "auxiliary_loss_clip": 0.01079736, + "auxiliary_loss_mlp": 0.0103673, + "balance_loss_clip": 1.03379726, + "balance_loss_mlp": 1.02330732, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 1.845592254625795, + "language_loss": 0.70472467, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72588933, + "num_input_tokens_seen": 244649155, + "step": 11340, + "time_per_iteration": 2.6171860694885254 + }, + { + "auxiliary_loss_clip": 0.01095418, + "auxiliary_loss_mlp": 0.01033926, + "balance_loss_clip": 1.04041612, + "balance_loss_mlp": 1.0209744, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 2.453474701967545, + "language_loss": 0.83863235, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85992575, + "num_input_tokens_seen": 244665470, + "step": 11341, + "time_per_iteration": 2.53013277053833 + }, + { + "auxiliary_loss_clip": 0.01076917, + "auxiliary_loss_mlp": 0.01035875, + "balance_loss_clip": 1.03611922, + "balance_loss_mlp": 1.02177298, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 1.91526879683783, + "language_loss": 0.68651164, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70763958, + "num_input_tokens_seen": 244684390, + "step": 11342, + "time_per_iteration": 2.5426111221313477 + }, + { + "auxiliary_loss_clip": 0.01060001, + "auxiliary_loss_mlp": 0.01034005, + "balance_loss_clip": 1.03804827, + "balance_loss_mlp": 1.02120209, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 1.727301743617447, + "language_loss": 0.7472291, + "learning_rate": 9.702447196107963e-07, + "loss": 0.76816916, + "num_input_tokens_seen": 244703370, + "step": 11343, + "time_per_iteration": 2.5712268352508545 + }, + { + "auxiliary_loss_clip": 0.01076481, + "auxiliary_loss_mlp": 0.01042983, + "balance_loss_clip": 1.0395788, + "balance_loss_mlp": 1.02818358, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 1.6370339598631654, + "language_loss": 0.79918998, + "learning_rate": 9.699108677831639e-07, + "loss": 0.82038462, + "num_input_tokens_seen": 244723325, + "step": 11344, + "time_per_iteration": 4.092145681381226 + }, + { + "auxiliary_loss_clip": 0.01083653, + "auxiliary_loss_mlp": 0.01033394, + "balance_loss_clip": 1.04360378, + "balance_loss_mlp": 1.0205791, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 1.9461345239815016, + "language_loss": 0.6651746, + "learning_rate": 9.695770550166136e-07, + "loss": 0.6863451, + "num_input_tokens_seen": 244745650, + "step": 11345, + "time_per_iteration": 2.6484756469726562 + }, + { + "auxiliary_loss_clip": 0.01095057, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.04033065, + "balance_loss_mlp": 1.02007413, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.9056516091194564, + "language_loss": 0.65165013, + "learning_rate": 9.692432813238054e-07, + "loss": 0.6729337, + "num_input_tokens_seen": 244760270, + "step": 11346, + "time_per_iteration": 2.488450050354004 + }, + { + "auxiliary_loss_clip": 0.01052731, + "auxiliary_loss_mlp": 0.0078426, + "balance_loss_clip": 1.03037858, + "balance_loss_mlp": 1.00056386, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.486432342833362, + "language_loss": 0.78402489, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80239481, + "num_input_tokens_seen": 244779565, + "step": 11347, + "time_per_iteration": 2.6613800525665283 + }, + { + "auxiliary_loss_clip": 0.01024141, + "auxiliary_loss_mlp": 0.01003963, + "balance_loss_clip": 1.00861692, + "balance_loss_mlp": 1.00259197, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7158470370341697, + "language_loss": 0.52524734, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54552835, + "num_input_tokens_seen": 244838480, + "step": 11348, + "time_per_iteration": 3.037203550338745 + }, + { + "auxiliary_loss_clip": 0.01111838, + "auxiliary_loss_mlp": 0.01037107, + "balance_loss_clip": 1.0392108, + "balance_loss_mlp": 1.02437556, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.8106148929639378, + "language_loss": 0.79723185, + "learning_rate": 9.682421948143873e-07, + "loss": 0.81872129, + "num_input_tokens_seen": 244855265, + "step": 11349, + "time_per_iteration": 2.4353370666503906 + }, + { + "auxiliary_loss_clip": 0.01110246, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.04125369, + "balance_loss_mlp": 1.01658225, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 1.7564608004620472, + "language_loss": 0.73802423, + "learning_rate": 9.67908577543096e-07, + "loss": 0.75945371, + "num_input_tokens_seen": 244875555, + "step": 11350, + "time_per_iteration": 2.593207597732544 + }, + { + "auxiliary_loss_clip": 0.01111904, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.03873134, + "balance_loss_mlp": 1.01815629, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 1.563948731062413, + "language_loss": 0.79386604, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81530261, + "num_input_tokens_seen": 244895270, + "step": 11351, + "time_per_iteration": 2.4583704471588135 + }, + { + "auxiliary_loss_clip": 0.01101449, + "auxiliary_loss_mlp": 0.01033815, + "balance_loss_clip": 1.03842604, + "balance_loss_mlp": 1.02114296, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 1.6620882590445532, + "language_loss": 0.73175603, + "learning_rate": 9.672414604241954e-07, + "loss": 0.75310862, + "num_input_tokens_seen": 244914535, + "step": 11352, + "time_per_iteration": 2.4660043716430664 + }, + { + "auxiliary_loss_clip": 0.01070773, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.03628302, + "balance_loss_mlp": 1.02222753, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 1.4606919169301915, + "language_loss": 0.80158561, + "learning_rate": 9.669079606018814e-07, + "loss": 0.82266533, + "num_input_tokens_seen": 244936095, + "step": 11353, + "time_per_iteration": 2.6274826526641846 + }, + { + "auxiliary_loss_clip": 0.01101549, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.03683984, + "balance_loss_mlp": 1.01503992, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.7489124862100212, + "language_loss": 0.78339422, + "learning_rate": 9.665744999545218e-07, + "loss": 0.80469131, + "num_input_tokens_seen": 244955290, + "step": 11354, + "time_per_iteration": 2.464592218399048 + }, + { + "auxiliary_loss_clip": 0.01056477, + "auxiliary_loss_mlp": 0.01027956, + "balance_loss_clip": 1.04024506, + "balance_loss_mlp": 1.01619601, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 2.2576264166221853, + "language_loss": 0.62054825, + "learning_rate": 9.662410784947599e-07, + "loss": 0.64139259, + "num_input_tokens_seen": 244972935, + "step": 11355, + "time_per_iteration": 2.570246934890747 + }, + { + "auxiliary_loss_clip": 0.01059964, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.03070307, + "balance_loss_mlp": 1.01761293, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 1.7432420519010692, + "language_loss": 0.82157451, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84248555, + "num_input_tokens_seen": 244989440, + "step": 11356, + "time_per_iteration": 2.5478241443634033 + }, + { + "auxiliary_loss_clip": 0.01095645, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.04032123, + "balance_loss_mlp": 1.01807356, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 2.7883238433404487, + "language_loss": 0.78505152, + "learning_rate": 9.655743531886052e-07, + "loss": 0.80631709, + "num_input_tokens_seen": 245007830, + "step": 11357, + "time_per_iteration": 2.5363168716430664 + }, + { + "auxiliary_loss_clip": 0.01014897, + "auxiliary_loss_mlp": 0.01009145, + "balance_loss_clip": 1.00732875, + "balance_loss_mlp": 1.0075953, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8225197093941674, + "language_loss": 0.59625173, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61649203, + "num_input_tokens_seen": 245070720, + "step": 11358, + "time_per_iteration": 3.1209585666656494 + }, + { + "auxiliary_loss_clip": 0.01079852, + "auxiliary_loss_mlp": 0.01046958, + "balance_loss_clip": 1.03331304, + "balance_loss_mlp": 1.03093624, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 1.7837694690479529, + "language_loss": 0.7863462, + "learning_rate": 9.64907784784544e-07, + "loss": 0.80761433, + "num_input_tokens_seen": 245089070, + "step": 11359, + "time_per_iteration": 2.547508478164673 + }, + { + "auxiliary_loss_clip": 0.01103942, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.04029942, + "balance_loss_mlp": 1.02094293, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 1.9085227156817917, + "language_loss": 0.81650788, + "learning_rate": 9.645745594523958e-07, + "loss": 0.83788508, + "num_input_tokens_seen": 245106500, + "step": 11360, + "time_per_iteration": 3.9720208644866943 + }, + { + "auxiliary_loss_clip": 0.0110164, + "auxiliary_loss_mlp": 0.01038544, + "balance_loss_clip": 1.03790784, + "balance_loss_mlp": 1.023965, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 1.7520635694335718, + "language_loss": 0.75474024, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77614212, + "num_input_tokens_seen": 245125260, + "step": 11361, + "time_per_iteration": 2.505009651184082 + }, + { + "auxiliary_loss_clip": 0.01028755, + "auxiliary_loss_mlp": 0.01008564, + "balance_loss_clip": 1.03016484, + "balance_loss_mlp": 1.00746095, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.8784547584512573, + "language_loss": 0.59619075, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61656398, + "num_input_tokens_seen": 245188730, + "step": 11362, + "time_per_iteration": 3.165642499923706 + }, + { + "auxiliary_loss_clip": 0.01093073, + "auxiliary_loss_mlp": 0.01032143, + "balance_loss_clip": 1.03999126, + "balance_loss_mlp": 1.0180589, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 3.05998776498808, + "language_loss": 0.74942124, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77067339, + "num_input_tokens_seen": 245205065, + "step": 11363, + "time_per_iteration": 2.4912402629852295 + }, + { + "auxiliary_loss_clip": 0.01088913, + "auxiliary_loss_mlp": 0.0103811, + "balance_loss_clip": 1.03689814, + "balance_loss_mlp": 1.02474666, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 2.372957722239864, + "language_loss": 0.89353305, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91480327, + "num_input_tokens_seen": 245224265, + "step": 11364, + "time_per_iteration": 2.520279884338379 + }, + { + "auxiliary_loss_clip": 0.0108883, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.03449988, + "balance_loss_mlp": 1.02311242, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 2.2035000329717023, + "language_loss": 0.88308597, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90432918, + "num_input_tokens_seen": 245243360, + "step": 11365, + "time_per_iteration": 2.5102102756500244 + }, + { + "auxiliary_loss_clip": 0.010842, + "auxiliary_loss_mlp": 0.01040896, + "balance_loss_clip": 1.04278374, + "balance_loss_mlp": 1.02620363, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.479203896438628, + "language_loss": 0.81201261, + "learning_rate": 9.625760324338272e-07, + "loss": 0.83326358, + "num_input_tokens_seen": 245256350, + "step": 11366, + "time_per_iteration": 2.5489273071289062 + }, + { + "auxiliary_loss_clip": 0.01092825, + "auxiliary_loss_mlp": 0.01032812, + "balance_loss_clip": 1.03687203, + "balance_loss_mlp": 1.01957393, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 1.6196620741272862, + "language_loss": 0.76715136, + "learning_rate": 9.622430822110062e-07, + "loss": 0.78840774, + "num_input_tokens_seen": 245277575, + "step": 11367, + "time_per_iteration": 2.5605993270874023 + }, + { + "auxiliary_loss_clip": 0.01088153, + "auxiliary_loss_mlp": 0.01037757, + "balance_loss_clip": 1.03894997, + "balance_loss_mlp": 1.02407229, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 1.604766383361815, + "language_loss": 0.69225794, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71351707, + "num_input_tokens_seen": 245296615, + "step": 11368, + "time_per_iteration": 2.5302789211273193 + }, + { + "auxiliary_loss_clip": 0.01075379, + "auxiliary_loss_mlp": 0.01038281, + "balance_loss_clip": 1.03173459, + "balance_loss_mlp": 1.02476847, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 2.0032287317240156, + "language_loss": 0.73412073, + "learning_rate": 9.615772998335261e-07, + "loss": 0.75525731, + "num_input_tokens_seen": 245316275, + "step": 11369, + "time_per_iteration": 2.5733377933502197 + }, + { + "auxiliary_loss_clip": 0.01102417, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.04115343, + "balance_loss_mlp": 1.02040696, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 1.8462569958348192, + "language_loss": 0.78911185, + "learning_rate": 9.612444677041138e-07, + "loss": 0.810471, + "num_input_tokens_seen": 245334595, + "step": 11370, + "time_per_iteration": 4.370043039321899 + }, + { + "auxiliary_loss_clip": 0.01023537, + "auxiliary_loss_mlp": 0.01003967, + "balance_loss_clip": 1.00784373, + "balance_loss_mlp": 1.00239372, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.766288527736935, + "language_loss": 0.59804326, + "learning_rate": 9.609116749644162e-07, + "loss": 0.61831832, + "num_input_tokens_seen": 245389750, + "step": 11371, + "time_per_iteration": 4.459416627883911 + }, + { + "auxiliary_loss_clip": 0.01088161, + "auxiliary_loss_mlp": 0.01028109, + "balance_loss_clip": 1.04176092, + "balance_loss_mlp": 1.01604533, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 1.5393819944570788, + "language_loss": 0.63784337, + "learning_rate": 9.605789216270511e-07, + "loss": 0.65900612, + "num_input_tokens_seen": 245407530, + "step": 11372, + "time_per_iteration": 2.5285165309906006 + }, + { + "auxiliary_loss_clip": 0.01098651, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.0384016, + "balance_loss_mlp": 1.01788878, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 1.6110055404594723, + "language_loss": 0.72042584, + "learning_rate": 9.602462077046375e-07, + "loss": 0.74172199, + "num_input_tokens_seen": 245427000, + "step": 11373, + "time_per_iteration": 2.486175060272217 + }, + { + "auxiliary_loss_clip": 0.01010338, + "auxiliary_loss_mlp": 0.01003236, + "balance_loss_clip": 1.01356125, + "balance_loss_mlp": 1.00177598, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.2556572911773671, + "language_loss": 0.56676781, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58690357, + "num_input_tokens_seen": 245491620, + "step": 11374, + "time_per_iteration": 3.273775100708008 + }, + { + "auxiliary_loss_clip": 0.01104425, + "auxiliary_loss_mlp": 0.01027905, + "balance_loss_clip": 1.03888822, + "balance_loss_mlp": 1.01450586, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 1.5188917078951643, + "language_loss": 0.73857802, + "learning_rate": 9.595808981551312e-07, + "loss": 0.7599014, + "num_input_tokens_seen": 245511285, + "step": 11375, + "time_per_iteration": 2.473130226135254 + }, + { + "auxiliary_loss_clip": 0.01094025, + "auxiliary_loss_mlp": 0.01034703, + "balance_loss_clip": 1.03971374, + "balance_loss_mlp": 1.02193558, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 1.8693429421078593, + "language_loss": 0.70624429, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72753155, + "num_input_tokens_seen": 245532910, + "step": 11376, + "time_per_iteration": 2.5685458183288574 + }, + { + "auxiliary_loss_clip": 0.01114955, + "auxiliary_loss_mlp": 0.0103766, + "balance_loss_clip": 1.03820109, + "balance_loss_mlp": 1.0239687, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 1.7947391360317788, + "language_loss": 0.74420786, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76573396, + "num_input_tokens_seen": 245550540, + "step": 11377, + "time_per_iteration": 2.4538052082061768 + }, + { + "auxiliary_loss_clip": 0.01017643, + "auxiliary_loss_mlp": 0.01002022, + "balance_loss_clip": 1.01121509, + "balance_loss_mlp": 1.00058508, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7223412789190434, + "language_loss": 0.56820357, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58840013, + "num_input_tokens_seen": 245619570, + "step": 11378, + "time_per_iteration": 3.2010879516601562 + }, + { + "auxiliary_loss_clip": 0.01114461, + "auxiliary_loss_mlp": 0.01037653, + "balance_loss_clip": 1.03855634, + "balance_loss_mlp": 1.02377152, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.6675093322608057, + "language_loss": 0.78390098, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80542219, + "num_input_tokens_seen": 245637980, + "step": 11379, + "time_per_iteration": 2.4401819705963135 + }, + { + "auxiliary_loss_clip": 0.01107551, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.03852129, + "balance_loss_mlp": 1.02218843, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 2.256417089204079, + "language_loss": 0.69252908, + "learning_rate": 9.57918314925988e-07, + "loss": 0.71393585, + "num_input_tokens_seen": 245655690, + "step": 11380, + "time_per_iteration": 2.4232287406921387 + }, + { + "auxiliary_loss_clip": 0.01089465, + "auxiliary_loss_mlp": 0.01034026, + "balance_loss_clip": 1.03763366, + "balance_loss_mlp": 1.02058494, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 2.8346819319783374, + "language_loss": 0.78342724, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80466211, + "num_input_tokens_seen": 245671525, + "step": 11381, + "time_per_iteration": 2.4960999488830566 + }, + { + "auxiliary_loss_clip": 0.0102336, + "auxiliary_loss_mlp": 0.01006789, + "balance_loss_clip": 1.00910115, + "balance_loss_mlp": 1.00543594, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.8650703763874499, + "language_loss": 0.67144579, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69174731, + "num_input_tokens_seen": 245724115, + "step": 11382, + "time_per_iteration": 2.903791904449463 + }, + { + "auxiliary_loss_clip": 0.01024073, + "auxiliary_loss_mlp": 0.01002378, + "balance_loss_clip": 1.00982785, + "balance_loss_mlp": 1.00095356, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.815664569649614, + "language_loss": 0.58065844, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60092294, + "num_input_tokens_seen": 245789245, + "step": 11383, + "time_per_iteration": 4.7084925174713135 + }, + { + "auxiliary_loss_clip": 0.01060143, + "auxiliary_loss_mlp": 0.01036055, + "balance_loss_clip": 1.03205812, + "balance_loss_mlp": 1.02224445, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 2.2686317045079734, + "language_loss": 0.7958563, + "learning_rate": 9.565889595521517e-07, + "loss": 0.8168183, + "num_input_tokens_seen": 245812420, + "step": 11384, + "time_per_iteration": 2.7237541675567627 + }, + { + "auxiliary_loss_clip": 0.01103199, + "auxiliary_loss_mlp": 0.01040829, + "balance_loss_clip": 1.03716469, + "balance_loss_mlp": 1.02786469, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 1.7405493596725652, + "language_loss": 0.77234173, + "learning_rate": 9.562567195928187e-07, + "loss": 0.793782, + "num_input_tokens_seen": 245829135, + "step": 11385, + "time_per_iteration": 2.572976589202881 + }, + { + "auxiliary_loss_clip": 0.01082923, + "auxiliary_loss_mlp": 0.01038815, + "balance_loss_clip": 1.03849113, + "balance_loss_mlp": 1.02378225, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 2.040483624256176, + "language_loss": 0.84559405, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86681139, + "num_input_tokens_seen": 245847140, + "step": 11386, + "time_per_iteration": 2.557434320449829 + }, + { + "auxiliary_loss_clip": 0.01102647, + "auxiliary_loss_mlp": 0.01038465, + "balance_loss_clip": 1.03933215, + "balance_loss_mlp": 1.02650237, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 2.248615494625311, + "language_loss": 0.83533382, + "learning_rate": 9.555923584232984e-07, + "loss": 0.85674495, + "num_input_tokens_seen": 245862855, + "step": 11387, + "time_per_iteration": 2.495614528656006 + }, + { + "auxiliary_loss_clip": 0.01095045, + "auxiliary_loss_mlp": 0.01035715, + "balance_loss_clip": 1.03466296, + "balance_loss_mlp": 1.02276897, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.527468446802879, + "language_loss": 0.72309756, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74440515, + "num_input_tokens_seen": 245885415, + "step": 11388, + "time_per_iteration": 2.6336162090301514 + }, + { + "auxiliary_loss_clip": 0.01097225, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.04025984, + "balance_loss_mlp": 1.01530123, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 2.176706202859443, + "language_loss": 0.627065, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64831531, + "num_input_tokens_seen": 245906285, + "step": 11389, + "time_per_iteration": 2.6968345642089844 + }, + { + "auxiliary_loss_clip": 0.01014182, + "auxiliary_loss_mlp": 0.01000755, + "balance_loss_clip": 1.00990868, + "balance_loss_mlp": 0.99944937, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7292280981480441, + "language_loss": 0.56017518, + "learning_rate": 9.54596113730818e-07, + "loss": 0.58032453, + "num_input_tokens_seen": 245967620, + "step": 11390, + "time_per_iteration": 3.1642017364501953 + }, + { + "auxiliary_loss_clip": 0.01079511, + "auxiliary_loss_mlp": 0.00779686, + "balance_loss_clip": 1.03995681, + "balance_loss_mlp": 1.00055039, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 1.9242498246643729, + "language_loss": 0.87879956, + "learning_rate": 9.542641114335109e-07, + "loss": 0.89739156, + "num_input_tokens_seen": 245985075, + "step": 11391, + "time_per_iteration": 2.5415236949920654 + }, + { + "auxiliary_loss_clip": 0.01074821, + "auxiliary_loss_mlp": 0.01042576, + "balance_loss_clip": 1.03737533, + "balance_loss_mlp": 1.02959967, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 3.152728181428427, + "language_loss": 0.79051065, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81168461, + "num_input_tokens_seen": 246003560, + "step": 11392, + "time_per_iteration": 2.6608901023864746 + }, + { + "auxiliary_loss_clip": 0.01089181, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.03702307, + "balance_loss_mlp": 1.01787448, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 4.42576317085665, + "language_loss": 0.70594352, + "learning_rate": 9.536002258147104e-07, + "loss": 0.72714317, + "num_input_tokens_seen": 246019600, + "step": 11393, + "time_per_iteration": 2.475961685180664 + }, + { + "auxiliary_loss_clip": 0.01076545, + "auxiliary_loss_mlp": 0.01034618, + "balance_loss_clip": 1.0378716, + "balance_loss_mlp": 1.02055156, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 2.1553902119471133, + "language_loss": 0.64649117, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66760278, + "num_input_tokens_seen": 246038920, + "step": 11394, + "time_per_iteration": 2.6242895126342773 + }, + { + "auxiliary_loss_clip": 0.0109003, + "auxiliary_loss_mlp": 0.00781655, + "balance_loss_clip": 1.03810906, + "balance_loss_mlp": 1.00054634, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 2.1897514517630405, + "language_loss": 0.80854779, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82726467, + "num_input_tokens_seen": 246060490, + "step": 11395, + "time_per_iteration": 2.586789131164551 + }, + { + "auxiliary_loss_clip": 0.01076331, + "auxiliary_loss_mlp": 0.01034978, + "balance_loss_clip": 1.03724515, + "balance_loss_mlp": 1.02086401, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 2.771958342325141, + "language_loss": 0.72799492, + "learning_rate": 9.526046950148527e-07, + "loss": 0.74910802, + "num_input_tokens_seen": 246081465, + "step": 11396, + "time_per_iteration": 2.568061351776123 + }, + { + "auxiliary_loss_clip": 0.01082353, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.03575134, + "balance_loss_mlp": 1.01563835, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 2.206929837480905, + "language_loss": 0.79115468, + "learning_rate": 9.522729308327931e-07, + "loss": 0.81226975, + "num_input_tokens_seen": 246096110, + "step": 11397, + "time_per_iteration": 2.5393824577331543 + }, + { + "auxiliary_loss_clip": 0.01041834, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.03550839, + "balance_loss_mlp": 1.01908731, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 1.7555767562137463, + "language_loss": 0.71118438, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73193401, + "num_input_tokens_seen": 246114785, + "step": 11398, + "time_per_iteration": 2.7624073028564453 + }, + { + "auxiliary_loss_clip": 0.01068565, + "auxiliary_loss_mlp": 0.01032591, + "balance_loss_clip": 1.04212952, + "balance_loss_mlp": 1.0206579, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 1.7006113336309816, + "language_loss": 0.70755959, + "learning_rate": 9.516095216709996e-07, + "loss": 0.72857118, + "num_input_tokens_seen": 246136375, + "step": 11399, + "time_per_iteration": 4.283491849899292 + }, + { + "auxiliary_loss_clip": 0.01099502, + "auxiliary_loss_mlp": 0.01036877, + "balance_loss_clip": 1.03733587, + "balance_loss_mlp": 1.02329278, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 1.484394202173219, + "language_loss": 0.70390201, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72526574, + "num_input_tokens_seen": 246155090, + "step": 11400, + "time_per_iteration": 2.5257747173309326 + }, + { + "auxiliary_loss_clip": 0.01084196, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.04596019, + "balance_loss_mlp": 1.02082253, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 2.162290336667503, + "language_loss": 0.7815299, + "learning_rate": 9.509462715294927e-07, + "loss": 0.80274296, + "num_input_tokens_seen": 246172645, + "step": 11401, + "time_per_iteration": 2.6728787422180176 + }, + { + "auxiliary_loss_clip": 0.01112409, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.03844833, + "balance_loss_mlp": 1.02175319, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 1.841234273175723, + "language_loss": 0.7531426, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77461201, + "num_input_tokens_seen": 246189055, + "step": 11402, + "time_per_iteration": 2.4439477920532227 + }, + { + "auxiliary_loss_clip": 0.01098757, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.03566122, + "balance_loss_mlp": 1.02870727, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 1.5549222696653884, + "language_loss": 0.72669494, + "learning_rate": 9.502831805088742e-07, + "loss": 0.74810958, + "num_input_tokens_seen": 246207990, + "step": 11403, + "time_per_iteration": 2.5184500217437744 + }, + { + "auxiliary_loss_clip": 0.01110874, + "auxiliary_loss_mlp": 0.01033283, + "balance_loss_clip": 1.03804803, + "balance_loss_mlp": 1.02017045, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 2.1417053763587726, + "language_loss": 0.81633174, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83777332, + "num_input_tokens_seen": 246221595, + "step": 11404, + "time_per_iteration": 2.4019484519958496 + }, + { + "auxiliary_loss_clip": 0.01086206, + "auxiliary_loss_mlp": 0.01039135, + "balance_loss_clip": 1.0367161, + "balance_loss_mlp": 1.02542007, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.3934558294442583, + "language_loss": 0.77402008, + "learning_rate": 9.496202487097222e-07, + "loss": 0.79527342, + "num_input_tokens_seen": 246242970, + "step": 11405, + "time_per_iteration": 2.541053533554077 + }, + { + "auxiliary_loss_clip": 0.01023962, + "auxiliary_loss_mlp": 0.01001271, + "balance_loss_clip": 1.00857425, + "balance_loss_mlp": 0.99985808, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.7915992131758379, + "language_loss": 0.60982811, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63008034, + "num_input_tokens_seen": 246300405, + "step": 11406, + "time_per_iteration": 3.127819299697876 + }, + { + "auxiliary_loss_clip": 0.01081037, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.03703725, + "balance_loss_mlp": 1.01869822, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 2.271983386252065, + "language_loss": 0.7692408, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79037356, + "num_input_tokens_seen": 246318780, + "step": 11407, + "time_per_iteration": 2.583085775375366 + }, + { + "auxiliary_loss_clip": 0.01092776, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.03894842, + "balance_loss_mlp": 1.02689672, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 2.4823874284078125, + "language_loss": 0.71330905, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73465264, + "num_input_tokens_seen": 246339405, + "step": 11408, + "time_per_iteration": 2.5638978481292725 + }, + { + "auxiliary_loss_clip": 0.01105212, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.03901124, + "balance_loss_mlp": 1.01535118, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 1.7677868000953523, + "language_loss": 0.70006615, + "learning_rate": 9.482948631780087e-07, + "loss": 0.72140735, + "num_input_tokens_seen": 246357055, + "step": 11409, + "time_per_iteration": 3.9222521781921387 + }, + { + "auxiliary_loss_clip": 0.01068181, + "auxiliary_loss_mlp": 0.01027819, + "balance_loss_clip": 1.03828275, + "balance_loss_mlp": 1.01590419, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 1.6045440746935427, + "language_loss": 0.78281963, + "learning_rate": 9.479636164655825e-07, + "loss": 0.8037796, + "num_input_tokens_seen": 246374050, + "step": 11410, + "time_per_iteration": 2.572587728500366 + }, + { + "auxiliary_loss_clip": 0.01104961, + "auxiliary_loss_mlp": 0.0103705, + "balance_loss_clip": 1.03720975, + "balance_loss_mlp": 1.0228641, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 2.4522933303589056, + "language_loss": 0.71813339, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73955351, + "num_input_tokens_seen": 246392910, + "step": 11411, + "time_per_iteration": 3.9584076404571533 + }, + { + "auxiliary_loss_clip": 0.01067824, + "auxiliary_loss_mlp": 0.01037086, + "balance_loss_clip": 1.03854966, + "balance_loss_mlp": 1.0220058, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 1.8704812323892035, + "language_loss": 0.70188624, + "learning_rate": 9.473012427332654e-07, + "loss": 0.72293532, + "num_input_tokens_seen": 246411540, + "step": 11412, + "time_per_iteration": 2.5615763664245605 + }, + { + "auxiliary_loss_clip": 0.01114302, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.03902745, + "balance_loss_mlp": 1.01618457, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 3.0011544408863835, + "language_loss": 0.71562934, + "learning_rate": 9.469701157384919e-07, + "loss": 0.73706961, + "num_input_tokens_seen": 246423295, + "step": 11413, + "time_per_iteration": 2.3759219646453857 + }, + { + "auxiliary_loss_clip": 0.01103755, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.03902793, + "balance_loss_mlp": 1.02042103, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 1.6511861110677726, + "language_loss": 0.73530012, + "learning_rate": 9.466390286747164e-07, + "loss": 0.75666684, + "num_input_tokens_seen": 246441045, + "step": 11414, + "time_per_iteration": 2.459766149520874 + }, + { + "auxiliary_loss_clip": 0.01096868, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.04367542, + "balance_loss_mlp": 1.01715004, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 4.374322577565271, + "language_loss": 0.86689341, + "learning_rate": 9.46307981554495e-07, + "loss": 0.88817078, + "num_input_tokens_seen": 246456905, + "step": 11415, + "time_per_iteration": 2.472252130508423 + }, + { + "auxiliary_loss_clip": 0.01106109, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.03975475, + "balance_loss_mlp": 1.02224493, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.7057533912185496, + "language_loss": 0.66982079, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69124019, + "num_input_tokens_seen": 246477545, + "step": 11416, + "time_per_iteration": 2.502162218093872 + }, + { + "auxiliary_loss_clip": 0.01087915, + "auxiliary_loss_mlp": 0.0103898, + "balance_loss_clip": 1.03470218, + "balance_loss_mlp": 1.0252893, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.297850725799008, + "language_loss": 0.75876522, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78003412, + "num_input_tokens_seen": 246496705, + "step": 11417, + "time_per_iteration": 2.490151882171631 + }, + { + "auxiliary_loss_clip": 0.01088556, + "auxiliary_loss_mlp": 0.01037763, + "balance_loss_clip": 1.03617859, + "balance_loss_mlp": 1.02411318, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 1.8070759865903907, + "language_loss": 0.7773183, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79858148, + "num_input_tokens_seen": 246514860, + "step": 11418, + "time_per_iteration": 2.474768877029419 + }, + { + "auxiliary_loss_clip": 0.01074367, + "auxiliary_loss_mlp": 0.01027594, + "balance_loss_clip": 1.04559422, + "balance_loss_mlp": 1.01518404, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 1.868557504440539, + "language_loss": 0.76455879, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78557843, + "num_input_tokens_seen": 246536145, + "step": 11419, + "time_per_iteration": 2.646824359893799 + }, + { + "auxiliary_loss_clip": 0.01111383, + "auxiliary_loss_mlp": 0.01034886, + "balance_loss_clip": 1.03793466, + "balance_loss_mlp": 1.02260125, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 1.7558596822705905, + "language_loss": 0.71580398, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73726666, + "num_input_tokens_seen": 246553265, + "step": 11420, + "time_per_iteration": 2.4115793704986572 + }, + { + "auxiliary_loss_clip": 0.01073935, + "auxiliary_loss_mlp": 0.01030954, + "balance_loss_clip": 1.03422606, + "balance_loss_mlp": 1.01802528, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 1.3645602394644107, + "language_loss": 0.74546039, + "learning_rate": 9.443225383506712e-07, + "loss": 0.76650929, + "num_input_tokens_seen": 246575130, + "step": 11421, + "time_per_iteration": 2.6283516883850098 + }, + { + "auxiliary_loss_clip": 0.01098241, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.03780782, + "balance_loss_mlp": 1.01973867, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 1.9623886925157223, + "language_loss": 0.76927644, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79058003, + "num_input_tokens_seen": 246593095, + "step": 11422, + "time_per_iteration": 3.9928722381591797 + }, + { + "auxiliary_loss_clip": 0.01102871, + "auxiliary_loss_mlp": 0.0103617, + "balance_loss_clip": 1.03813243, + "balance_loss_mlp": 1.02284276, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 1.7707795484164783, + "language_loss": 0.77157032, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79296076, + "num_input_tokens_seen": 246612165, + "step": 11423, + "time_per_iteration": 2.510999917984009 + }, + { + "auxiliary_loss_clip": 0.0108347, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.03889394, + "balance_loss_mlp": 1.01985443, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1.5839728028191908, + "language_loss": 0.73213774, + "learning_rate": 9.433303570032129e-07, + "loss": 0.75330317, + "num_input_tokens_seen": 246632065, + "step": 11424, + "time_per_iteration": 2.5505101680755615 + }, + { + "auxiliary_loss_clip": 0.01092142, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.03894281, + "balance_loss_mlp": 1.01682568, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 1.8432520786702384, + "language_loss": 0.65361047, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67482561, + "num_input_tokens_seen": 246651245, + "step": 11425, + "time_per_iteration": 2.541224479675293 + }, + { + "auxiliary_loss_clip": 0.01085279, + "auxiliary_loss_mlp": 0.01026394, + "balance_loss_clip": 1.04912674, + "balance_loss_mlp": 1.01395464, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.6187520882298598, + "language_loss": 0.7187947, + "learning_rate": 9.426691030957657e-07, + "loss": 0.7399115, + "num_input_tokens_seen": 246672225, + "step": 11426, + "time_per_iteration": 2.547327995300293 + }, + { + "auxiliary_loss_clip": 0.01061676, + "auxiliary_loss_mlp": 0.01037328, + "balance_loss_clip": 1.03463411, + "balance_loss_mlp": 1.02352953, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 2.5012789960232786, + "language_loss": 0.85165429, + "learning_rate": 9.423385362769136e-07, + "loss": 0.87264431, + "num_input_tokens_seen": 246688385, + "step": 11427, + "time_per_iteration": 2.532860279083252 + }, + { + "auxiliary_loss_clip": 0.01099906, + "auxiliary_loss_mlp": 0.0103032, + "balance_loss_clip": 1.03803015, + "balance_loss_mlp": 1.0181663, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.680435124031696, + "language_loss": 0.76111722, + "learning_rate": 9.420080095646909e-07, + "loss": 0.78241944, + "num_input_tokens_seen": 246710730, + "step": 11428, + "time_per_iteration": 2.520453453063965 + }, + { + "auxiliary_loss_clip": 0.01084589, + "auxiliary_loss_mlp": 0.0103475, + "balance_loss_clip": 1.04180884, + "balance_loss_mlp": 1.02075505, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 2.261857584559229, + "language_loss": 0.73032731, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75152075, + "num_input_tokens_seen": 246730350, + "step": 11429, + "time_per_iteration": 2.5416312217712402 + }, + { + "auxiliary_loss_clip": 0.01091441, + "auxiliary_loss_mlp": 0.01029386, + "balance_loss_clip": 1.03776193, + "balance_loss_mlp": 1.01650572, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 1.7538404743216331, + "language_loss": 0.83580732, + "learning_rate": 9.413470765102643e-07, + "loss": 0.85701561, + "num_input_tokens_seen": 246751700, + "step": 11430, + "time_per_iteration": 2.5694069862365723 + }, + { + "auxiliary_loss_clip": 0.01103307, + "auxiliary_loss_mlp": 0.01033018, + "balance_loss_clip": 1.03752339, + "balance_loss_mlp": 1.02039349, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 2.4709554556697033, + "language_loss": 0.70208406, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72344732, + "num_input_tokens_seen": 246769860, + "step": 11431, + "time_per_iteration": 2.4929583072662354 + }, + { + "auxiliary_loss_clip": 0.01093531, + "auxiliary_loss_mlp": 0.00779097, + "balance_loss_clip": 1.03632331, + "balance_loss_mlp": 1.00052655, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 1.9559275224170563, + "language_loss": 0.79931021, + "learning_rate": 9.406863040327355e-07, + "loss": 0.81803644, + "num_input_tokens_seen": 246789905, + "step": 11432, + "time_per_iteration": 2.5561037063598633 + }, + { + "auxiliary_loss_clip": 0.01091076, + "auxiliary_loss_mlp": 0.01029958, + "balance_loss_clip": 1.03953969, + "balance_loss_mlp": 1.0178287, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 1.5625929926207665, + "language_loss": 0.67911214, + "learning_rate": 9.403559780416295e-07, + "loss": 0.70032251, + "num_input_tokens_seen": 246808815, + "step": 11433, + "time_per_iteration": 2.5384058952331543 + }, + { + "auxiliary_loss_clip": 0.01103902, + "auxiliary_loss_mlp": 0.01039006, + "balance_loss_clip": 1.04069602, + "balance_loss_mlp": 1.02641153, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 1.9148689102184502, + "language_loss": 0.73188865, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75331771, + "num_input_tokens_seen": 246829775, + "step": 11434, + "time_per_iteration": 2.5770435333251953 + }, + { + "auxiliary_loss_clip": 0.01079829, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.04200912, + "balance_loss_mlp": 1.02060795, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 2.1202536039019884, + "language_loss": 0.80736065, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82849562, + "num_input_tokens_seen": 246848045, + "step": 11435, + "time_per_iteration": 2.5266499519348145 + }, + { + "auxiliary_loss_clip": 0.01113892, + "auxiliary_loss_mlp": 0.01035416, + "balance_loss_clip": 1.03803325, + "balance_loss_mlp": 1.02207065, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 2.313128387139054, + "language_loss": 0.80854309, + "learning_rate": 9.393652412092538e-07, + "loss": 0.83003616, + "num_input_tokens_seen": 246866095, + "step": 11436, + "time_per_iteration": 2.4188199043273926 + }, + { + "auxiliary_loss_clip": 0.01070739, + "auxiliary_loss_mlp": 0.01039668, + "balance_loss_clip": 1.03638363, + "balance_loss_mlp": 1.02762198, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 1.7407945596453926, + "language_loss": 0.8222754, + "learning_rate": 9.390350760205183e-07, + "loss": 0.84337956, + "num_input_tokens_seen": 246883975, + "step": 11437, + "time_per_iteration": 2.5786211490631104 + }, + { + "auxiliary_loss_clip": 0.01097701, + "auxiliary_loss_mlp": 0.01036947, + "balance_loss_clip": 1.03901982, + "balance_loss_mlp": 1.02296972, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 4.13417696416248, + "language_loss": 0.78901589, + "learning_rate": 9.387049510636793e-07, + "loss": 0.81036246, + "num_input_tokens_seen": 246901560, + "step": 11438, + "time_per_iteration": 3.96474552154541 + }, + { + "auxiliary_loss_clip": 0.01108181, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.03769147, + "balance_loss_mlp": 1.01948035, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.604467612414375, + "language_loss": 0.72431076, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74570644, + "num_input_tokens_seen": 246922655, + "step": 11439, + "time_per_iteration": 2.4717588424682617 + }, + { + "auxiliary_loss_clip": 0.01102074, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.03986645, + "balance_loss_mlp": 1.01687872, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 2.108513391042072, + "language_loss": 0.75831276, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77962691, + "num_input_tokens_seen": 246940100, + "step": 11440, + "time_per_iteration": 2.428501844406128 + }, + { + "auxiliary_loss_clip": 0.01066165, + "auxiliary_loss_mlp": 0.01036437, + "balance_loss_clip": 1.03458571, + "balance_loss_mlp": 1.02438498, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 1.5175312062485053, + "language_loss": 0.7181564, + "learning_rate": 9.377148177097167e-07, + "loss": 0.73918241, + "num_input_tokens_seen": 246958545, + "step": 11441, + "time_per_iteration": 2.518042802810669 + }, + { + "auxiliary_loss_clip": 0.01082416, + "auxiliary_loss_mlp": 0.01040885, + "balance_loss_clip": 1.04083323, + "balance_loss_mlp": 1.02566791, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.6100617111097735, + "language_loss": 0.66954148, + "learning_rate": 9.373848538056317e-07, + "loss": 0.69077444, + "num_input_tokens_seen": 246974805, + "step": 11442, + "time_per_iteration": 2.508415937423706 + }, + { + "auxiliary_loss_clip": 0.01101878, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.04289842, + "balance_loss_mlp": 1.01580274, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 1.9752495753923895, + "language_loss": 0.69682443, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71812403, + "num_input_tokens_seen": 246992505, + "step": 11443, + "time_per_iteration": 2.4778807163238525 + }, + { + "auxiliary_loss_clip": 0.01095707, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.03871644, + "balance_loss_mlp": 1.02068353, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 1.4237936640892082, + "language_loss": 0.7628352, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78413308, + "num_input_tokens_seen": 247013370, + "step": 11444, + "time_per_iteration": 2.5395519733428955 + }, + { + "auxiliary_loss_clip": 0.01109198, + "auxiliary_loss_mlp": 0.01031705, + "balance_loss_clip": 1.03742099, + "balance_loss_mlp": 1.02012396, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 2.063204859691436, + "language_loss": 0.76360452, + "learning_rate": 9.363952039102536e-07, + "loss": 0.78501356, + "num_input_tokens_seen": 247029855, + "step": 11445, + "time_per_iteration": 2.423051357269287 + }, + { + "auxiliary_loss_clip": 0.01024826, + "auxiliary_loss_mlp": 0.01004771, + "balance_loss_clip": 1.00922978, + "balance_loss_mlp": 1.0035671, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.8192727114897845, + "language_loss": 0.58381724, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60411322, + "num_input_tokens_seen": 247085030, + "step": 11446, + "time_per_iteration": 3.0793862342834473 + }, + { + "auxiliary_loss_clip": 0.01101817, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.03547692, + "balance_loss_mlp": 1.01917207, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.7146989256327434, + "language_loss": 0.75978786, + "learning_rate": 9.357356389524886e-07, + "loss": 0.78112781, + "num_input_tokens_seen": 247104840, + "step": 11447, + "time_per_iteration": 2.467820882797241 + }, + { + "auxiliary_loss_clip": 0.01093355, + "auxiliary_loss_mlp": 0.01032593, + "balance_loss_clip": 1.03747821, + "balance_loss_mlp": 1.02043366, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 2.1711947585818447, + "language_loss": 0.73610133, + "learning_rate": 9.354059170028705e-07, + "loss": 0.75736082, + "num_input_tokens_seen": 247121905, + "step": 11448, + "time_per_iteration": 2.485809326171875 + }, + { + "auxiliary_loss_clip": 0.01101288, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.03598237, + "balance_loss_mlp": 1.02968311, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 1.8305191594077144, + "language_loss": 0.74524724, + "learning_rate": 9.350762354227673e-07, + "loss": 0.76671028, + "num_input_tokens_seen": 247142375, + "step": 11449, + "time_per_iteration": 3.986840009689331 + }, + { + "auxiliary_loss_clip": 0.01108966, + "auxiliary_loss_mlp": 0.01035635, + "balance_loss_clip": 1.03705788, + "balance_loss_mlp": 1.02372575, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 1.7424891758081638, + "language_loss": 0.70103145, + "learning_rate": 9.34746594224679e-07, + "loss": 0.72247744, + "num_input_tokens_seen": 247161095, + "step": 11450, + "time_per_iteration": 3.8325350284576416 + }, + { + "auxiliary_loss_clip": 0.01075512, + "auxiliary_loss_mlp": 0.01037702, + "balance_loss_clip": 1.03738725, + "balance_loss_mlp": 1.02424312, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 1.9619770631465516, + "language_loss": 0.76475012, + "learning_rate": 9.344169934211068e-07, + "loss": 0.78588223, + "num_input_tokens_seen": 247178565, + "step": 11451, + "time_per_iteration": 2.4866018295288086 + }, + { + "auxiliary_loss_clip": 0.01098951, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.0376972, + "balance_loss_mlp": 1.01586187, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 1.5311093609229023, + "language_loss": 0.69481623, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71608663, + "num_input_tokens_seen": 247202345, + "step": 11452, + "time_per_iteration": 2.5029282569885254 + }, + { + "auxiliary_loss_clip": 0.01110504, + "auxiliary_loss_mlp": 0.01035433, + "balance_loss_clip": 1.03807104, + "balance_loss_mlp": 1.02125907, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 1.854788948247744, + "language_loss": 0.71824884, + "learning_rate": 9.337579130475042e-07, + "loss": 0.73970824, + "num_input_tokens_seen": 247219240, + "step": 11453, + "time_per_iteration": 2.403351306915283 + }, + { + "auxiliary_loss_clip": 0.01023202, + "auxiliary_loss_mlp": 0.00754012, + "balance_loss_clip": 1.00788438, + "balance_loss_mlp": 1.0003804, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.7818755308089075, + "language_loss": 0.50684869, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52462077, + "num_input_tokens_seen": 247272010, + "step": 11454, + "time_per_iteration": 2.8929102420806885 + }, + { + "auxiliary_loss_clip": 0.01097538, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.03828478, + "balance_loss_mlp": 1.01696873, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 1.7975269501869982, + "language_loss": 0.75280601, + "learning_rate": 9.330989944019263e-07, + "loss": 0.7740711, + "num_input_tokens_seen": 247290630, + "step": 11455, + "time_per_iteration": 2.4493801593780518 + }, + { + "auxiliary_loss_clip": 0.01092559, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_clip": 1.03550601, + "balance_loss_mlp": 1.02209592, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.3851885528902597, + "language_loss": 0.72599518, + "learning_rate": 9.327695957583803e-07, + "loss": 0.7472769, + "num_input_tokens_seen": 247304800, + "step": 11456, + "time_per_iteration": 2.4361886978149414 + }, + { + "auxiliary_loss_clip": 0.01088573, + "auxiliary_loss_mlp": 0.01035458, + "balance_loss_clip": 1.03758788, + "balance_loss_mlp": 1.02322078, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 1.9358095262640855, + "language_loss": 0.80714613, + "learning_rate": 9.32440237584319e-07, + "loss": 0.82838643, + "num_input_tokens_seen": 247323450, + "step": 11457, + "time_per_iteration": 2.563282012939453 + }, + { + "auxiliary_loss_clip": 0.0110472, + "auxiliary_loss_mlp": 0.00778211, + "balance_loss_clip": 1.03955173, + "balance_loss_mlp": 1.00055647, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.563945299184081, + "language_loss": 0.76089513, + "learning_rate": 9.321109198922301e-07, + "loss": 0.77972442, + "num_input_tokens_seen": 247343845, + "step": 11458, + "time_per_iteration": 2.491757869720459 + }, + { + "auxiliary_loss_clip": 0.01112807, + "auxiliary_loss_mlp": 0.01036082, + "balance_loss_clip": 1.03892183, + "balance_loss_mlp": 1.02411938, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 2.8410983948557615, + "language_loss": 0.67995679, + "learning_rate": 9.31781642694603e-07, + "loss": 0.7014457, + "num_input_tokens_seen": 247356650, + "step": 11459, + "time_per_iteration": 2.39015531539917 + }, + { + "auxiliary_loss_clip": 0.01068988, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.03888369, + "balance_loss_mlp": 1.02194953, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 1.4401358833044844, + "language_loss": 0.68554831, + "learning_rate": 9.314524060039221e-07, + "loss": 0.70657754, + "num_input_tokens_seen": 247377340, + "step": 11460, + "time_per_iteration": 2.558039903640747 + }, + { + "auxiliary_loss_clip": 0.01087547, + "auxiliary_loss_mlp": 0.01034033, + "balance_loss_clip": 1.04018712, + "balance_loss_mlp": 1.01994228, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 2.5442489959101295, + "language_loss": 0.77026415, + "learning_rate": 9.311232098326731e-07, + "loss": 0.79147995, + "num_input_tokens_seen": 247395805, + "step": 11461, + "time_per_iteration": 2.5605969429016113 + }, + { + "auxiliary_loss_clip": 0.01091226, + "auxiliary_loss_mlp": 0.01037457, + "balance_loss_clip": 1.03736067, + "balance_loss_mlp": 1.02458215, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 2.0573717196867016, + "language_loss": 0.70049667, + "learning_rate": 9.307940541933401e-07, + "loss": 0.7217834, + "num_input_tokens_seen": 247413165, + "step": 11462, + "time_per_iteration": 4.028581142425537 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.01027201, + "balance_loss_clip": 1.04170179, + "balance_loss_mlp": 1.01446986, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.4370332819165288, + "language_loss": 0.87331986, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89464051, + "num_input_tokens_seen": 247433140, + "step": 11463, + "time_per_iteration": 2.4974172115325928 + }, + { + "auxiliary_loss_clip": 0.01064608, + "auxiliary_loss_mlp": 0.01026454, + "balance_loss_clip": 1.04093099, + "balance_loss_mlp": 1.01579642, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 1.7898780741392388, + "language_loss": 0.68390238, + "learning_rate": 9.301358645603428e-07, + "loss": 0.704813, + "num_input_tokens_seen": 247451265, + "step": 11464, + "time_per_iteration": 2.569575309753418 + }, + { + "auxiliary_loss_clip": 0.01101022, + "auxiliary_loss_mlp": 0.01037519, + "balance_loss_clip": 1.03811169, + "balance_loss_mlp": 1.02490044, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 1.7712387339480837, + "language_loss": 0.65202445, + "learning_rate": 9.298068305916373e-07, + "loss": 0.67340988, + "num_input_tokens_seen": 247471645, + "step": 11465, + "time_per_iteration": 2.5449306964874268 + }, + { + "auxiliary_loss_clip": 0.01103933, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.0381608, + "balance_loss_mlp": 1.02365875, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 2.1933811277140784, + "language_loss": 0.72482717, + "learning_rate": 9.294778372047649e-07, + "loss": 0.74623013, + "num_input_tokens_seen": 247491170, + "step": 11466, + "time_per_iteration": 2.496964454650879 + }, + { + "auxiliary_loss_clip": 0.01114512, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.03994215, + "balance_loss_mlp": 1.01776624, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 3.828231984710499, + "language_loss": 0.71985406, + "learning_rate": 9.291488844121995e-07, + "loss": 0.74130261, + "num_input_tokens_seen": 247509005, + "step": 11467, + "time_per_iteration": 2.401144027709961 + }, + { + "auxiliary_loss_clip": 0.01095969, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.04031491, + "balance_loss_mlp": 1.0189954, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 2.0340593321848264, + "language_loss": 0.81258798, + "learning_rate": 9.288199722264156e-07, + "loss": 0.83387703, + "num_input_tokens_seen": 247527050, + "step": 11468, + "time_per_iteration": 2.477362632751465 + }, + { + "auxiliary_loss_clip": 0.01115375, + "auxiliary_loss_mlp": 0.01037385, + "balance_loss_clip": 1.03972459, + "balance_loss_mlp": 1.02450407, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.507000182448515, + "language_loss": 0.6622895, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68381709, + "num_input_tokens_seen": 247547765, + "step": 11469, + "time_per_iteration": 2.5513930320739746 + }, + { + "auxiliary_loss_clip": 0.01024845, + "auxiliary_loss_mlp": 0.01001066, + "balance_loss_clip": 1.01034403, + "balance_loss_mlp": 0.99956971, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.804907028441136, + "language_loss": 0.5514611, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57172024, + "num_input_tokens_seen": 247603515, + "step": 11470, + "time_per_iteration": 2.928746223449707 + }, + { + "auxiliary_loss_clip": 0.01097503, + "auxiliary_loss_mlp": 0.0103138, + "balance_loss_clip": 1.03812194, + "balance_loss_mlp": 1.02097881, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 1.7275430460398178, + "language_loss": 0.78073502, + "learning_rate": 9.278334794344715e-07, + "loss": 0.80202383, + "num_input_tokens_seen": 247622110, + "step": 11471, + "time_per_iteration": 2.43670916557312 + }, + { + "auxiliary_loss_clip": 0.0108996, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.03475034, + "balance_loss_mlp": 1.02315068, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 2.4862129783872353, + "language_loss": 0.78436995, + "learning_rate": 9.275047298005232e-07, + "loss": 0.80562437, + "num_input_tokens_seen": 247641905, + "step": 11472, + "time_per_iteration": 2.501767158508301 + }, + { + "auxiliary_loss_clip": 0.01082006, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.03534913, + "balance_loss_mlp": 1.02021074, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.7538611539625353, + "language_loss": 0.76171672, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78285551, + "num_input_tokens_seen": 247660945, + "step": 11473, + "time_per_iteration": 2.5176422595977783 + }, + { + "auxiliary_loss_clip": 0.01075407, + "auxiliary_loss_mlp": 0.01047611, + "balance_loss_clip": 1.03620696, + "balance_loss_mlp": 1.03140473, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 5.133119659140262, + "language_loss": 0.75394064, + "learning_rate": 9.268473525524751e-07, + "loss": 0.77517086, + "num_input_tokens_seen": 247678395, + "step": 11474, + "time_per_iteration": 2.516327381134033 + }, + { + "auxiliary_loss_clip": 0.01071636, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.04715586, + "balance_loss_mlp": 1.01856756, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.5659617222102111, + "language_loss": 0.74198848, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76301098, + "num_input_tokens_seen": 247698380, + "step": 11475, + "time_per_iteration": 2.65647292137146 + }, + { + "auxiliary_loss_clip": 0.01084194, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.03572607, + "balance_loss_mlp": 1.01873684, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 2.3361630731215484, + "language_loss": 0.88515544, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90631837, + "num_input_tokens_seen": 247716370, + "step": 11476, + "time_per_iteration": 2.5416834354400635 + }, + { + "auxiliary_loss_clip": 0.01110231, + "auxiliary_loss_mlp": 0.01035141, + "balance_loss_clip": 1.03785729, + "balance_loss_mlp": 1.02327943, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.464300152241672, + "language_loss": 0.70221496, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72366869, + "num_input_tokens_seen": 247737335, + "step": 11477, + "time_per_iteration": 3.938026189804077 + }, + { + "auxiliary_loss_clip": 0.01106901, + "auxiliary_loss_mlp": 0.01043682, + "balance_loss_clip": 1.03981566, + "balance_loss_mlp": 1.03002644, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.018756678935298, + "language_loss": 0.6886766, + "learning_rate": 9.255330864847313e-07, + "loss": 0.71018237, + "num_input_tokens_seen": 247756680, + "step": 11478, + "time_per_iteration": 2.5232980251312256 + }, + { + "auxiliary_loss_clip": 0.01103823, + "auxiliary_loss_mlp": 0.01032544, + "balance_loss_clip": 1.0401684, + "balance_loss_mlp": 1.02024114, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 2.0346741776159303, + "language_loss": 0.76409298, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78545666, + "num_input_tokens_seen": 247774265, + "step": 11479, + "time_per_iteration": 2.4517292976379395 + }, + { + "auxiliary_loss_clip": 0.01103329, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.03669322, + "balance_loss_mlp": 1.01951313, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 2.005911144244193, + "language_loss": 0.78595173, + "learning_rate": 9.248761978643856e-07, + "loss": 0.8073138, + "num_input_tokens_seen": 247792395, + "step": 11480, + "time_per_iteration": 2.500659465789795 + }, + { + "auxiliary_loss_clip": 0.01073503, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.03405833, + "balance_loss_mlp": 1.01865411, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 1.6244566425698932, + "language_loss": 0.75414586, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77519923, + "num_input_tokens_seen": 247811985, + "step": 11481, + "time_per_iteration": 2.6276144981384277 + }, + { + "auxiliary_loss_clip": 0.01077868, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.0396595, + "balance_loss_mlp": 1.01715219, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.8092974154228973, + "language_loss": 0.69194978, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71302485, + "num_input_tokens_seen": 247831880, + "step": 11482, + "time_per_iteration": 2.6304640769958496 + }, + { + "auxiliary_loss_clip": 0.01113076, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.03909802, + "balance_loss_mlp": 1.02169645, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 2.0231543252011104, + "language_loss": 0.82662606, + "learning_rate": 9.238911707310096e-07, + "loss": 0.84809935, + "num_input_tokens_seen": 247851170, + "step": 11483, + "time_per_iteration": 2.439836263656616 + }, + { + "auxiliary_loss_clip": 0.01113995, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.03862405, + "balance_loss_mlp": 1.02072668, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 2.255200848558164, + "language_loss": 0.65952313, + "learning_rate": 9.235629099489273e-07, + "loss": 0.68098849, + "num_input_tokens_seen": 247868950, + "step": 11484, + "time_per_iteration": 2.47426438331604 + }, + { + "auxiliary_loss_clip": 0.01080133, + "auxiliary_loss_mlp": 0.01044463, + "balance_loss_clip": 1.03603518, + "balance_loss_mlp": 1.03078985, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.5851904395342884, + "language_loss": 0.7361483, + "learning_rate": 9.232346899854479e-07, + "loss": 0.75739425, + "num_input_tokens_seen": 247889805, + "step": 11485, + "time_per_iteration": 2.595158815383911 + }, + { + "auxiliary_loss_clip": 0.01104078, + "auxiliary_loss_mlp": 0.0077869, + "balance_loss_clip": 1.0435648, + "balance_loss_mlp": 1.0005511, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 2.3470322286917327, + "language_loss": 0.85142541, + "learning_rate": 9.22906510853017e-07, + "loss": 0.87025309, + "num_input_tokens_seen": 247908585, + "step": 11486, + "time_per_iteration": 2.4807398319244385 + }, + { + "auxiliary_loss_clip": 0.01053494, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.03475368, + "balance_loss_mlp": 1.02210927, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.4443508171461497, + "language_loss": 0.72706622, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74794918, + "num_input_tokens_seen": 247928480, + "step": 11487, + "time_per_iteration": 2.6261744499206543 + }, + { + "auxiliary_loss_clip": 0.01020328, + "auxiliary_loss_mlp": 0.01005645, + "balance_loss_clip": 1.01458728, + "balance_loss_mlp": 1.00430965, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.8939961652801243, + "language_loss": 0.66649783, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68675756, + "num_input_tokens_seen": 247988855, + "step": 11488, + "time_per_iteration": 4.528810024261475 + }, + { + "auxiliary_loss_clip": 0.01090251, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.0391233, + "balance_loss_mlp": 1.02372026, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 1.7703139440476976, + "language_loss": 0.74674803, + "learning_rate": 9.219222185664519e-07, + "loss": 0.76803011, + "num_input_tokens_seen": 248007685, + "step": 11489, + "time_per_iteration": 2.4984772205352783 + }, + { + "auxiliary_loss_clip": 0.01102366, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.03654408, + "balance_loss_mlp": 1.02161312, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 1.9761578398896642, + "language_loss": 0.62331903, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64469683, + "num_input_tokens_seen": 248025145, + "step": 11490, + "time_per_iteration": 3.8446710109710693 + }, + { + "auxiliary_loss_clip": 0.01092617, + "auxiliary_loss_mlp": 0.01030284, + "balance_loss_clip": 1.04061246, + "balance_loss_mlp": 1.01786232, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 1.628501040114125, + "language_loss": 0.72923613, + "learning_rate": 9.212662280920937e-07, + "loss": 0.7504651, + "num_input_tokens_seen": 248043750, + "step": 11491, + "time_per_iteration": 2.562941312789917 + }, + { + "auxiliary_loss_clip": 0.01089189, + "auxiliary_loss_mlp": 0.0077788, + "balance_loss_clip": 1.03759193, + "balance_loss_mlp": 1.00039601, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.4840950778966955, + "language_loss": 0.70236087, + "learning_rate": 9.20938294207235e-07, + "loss": 0.72103155, + "num_input_tokens_seen": 248065765, + "step": 11492, + "time_per_iteration": 2.6115779876708984 + }, + { + "auxiliary_loss_clip": 0.01080295, + "auxiliary_loss_mlp": 0.01034709, + "balance_loss_clip": 1.04488349, + "balance_loss_mlp": 1.02209067, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.944632367922043, + "language_loss": 0.74424398, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76539397, + "num_input_tokens_seen": 248083810, + "step": 11493, + "time_per_iteration": 2.59448504447937 + }, + { + "auxiliary_loss_clip": 0.01113047, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.03960323, + "balance_loss_mlp": 1.01882696, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 4.14748131893112, + "language_loss": 0.74119043, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76263535, + "num_input_tokens_seen": 248103185, + "step": 11494, + "time_per_iteration": 2.4294440746307373 + }, + { + "auxiliary_loss_clip": 0.0108857, + "auxiliary_loss_mlp": 0.01031917, + "balance_loss_clip": 1.03904116, + "balance_loss_mlp": 1.0191381, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.8661207966094189, + "language_loss": 0.68249655, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70370138, + "num_input_tokens_seen": 248125665, + "step": 11495, + "time_per_iteration": 2.6094796657562256 + }, + { + "auxiliary_loss_clip": 0.01091311, + "auxiliary_loss_mlp": 0.01031698, + "balance_loss_clip": 1.03731513, + "balance_loss_mlp": 1.01898468, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 1.8920126852059185, + "language_loss": 0.74331117, + "learning_rate": 9.196269679734119e-07, + "loss": 0.76454127, + "num_input_tokens_seen": 248142545, + "step": 11496, + "time_per_iteration": 2.50315260887146 + }, + { + "auxiliary_loss_clip": 0.01075456, + "auxiliary_loss_mlp": 0.01032535, + "balance_loss_clip": 1.03223729, + "balance_loss_mlp": 1.02038205, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 1.7179781337486433, + "language_loss": 0.80382669, + "learning_rate": 9.19299238803515e-07, + "loss": 0.82490665, + "num_input_tokens_seen": 248160225, + "step": 11497, + "time_per_iteration": 2.512352705001831 + }, + { + "auxiliary_loss_clip": 0.01075087, + "auxiliary_loss_mlp": 0.01040807, + "balance_loss_clip": 1.03519762, + "balance_loss_mlp": 1.02765858, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.5770188049164235, + "language_loss": 0.80652368, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82768261, + "num_input_tokens_seen": 248180430, + "step": 11498, + "time_per_iteration": 2.564617872238159 + }, + { + "auxiliary_loss_clip": 0.01100117, + "auxiliary_loss_mlp": 0.01035008, + "balance_loss_clip": 1.03886926, + "balance_loss_mlp": 1.02261627, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.5440295975716365, + "language_loss": 0.86128402, + "learning_rate": 9.186439034169915e-07, + "loss": 0.8826353, + "num_input_tokens_seen": 248202365, + "step": 11499, + "time_per_iteration": 2.576618194580078 + }, + { + "auxiliary_loss_clip": 0.01083505, + "auxiliary_loss_mlp": 0.0077738, + "balance_loss_clip": 1.04198837, + "balance_loss_mlp": 1.00046539, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.6280786761616943, + "language_loss": 0.75554448, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77415335, + "num_input_tokens_seen": 248221750, + "step": 11500, + "time_per_iteration": 2.5477676391601562 + }, + { + "auxiliary_loss_clip": 0.01061322, + "auxiliary_loss_mlp": 0.01041821, + "balance_loss_clip": 1.036219, + "balance_loss_mlp": 1.02739632, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 1.8754365202548344, + "language_loss": 0.77174491, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79277635, + "num_input_tokens_seen": 248239535, + "step": 11501, + "time_per_iteration": 4.185120105743408 + }, + { + "auxiliary_loss_clip": 0.01099464, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.03726757, + "balance_loss_mlp": 1.02805328, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.7916730238729068, + "language_loss": 0.73949695, + "learning_rate": 9.176612079067458e-07, + "loss": 0.76090765, + "num_input_tokens_seen": 248259055, + "step": 11502, + "time_per_iteration": 2.5144166946411133 + }, + { + "auxiliary_loss_clip": 0.01046409, + "auxiliary_loss_mlp": 0.01041545, + "balance_loss_clip": 1.03592467, + "balance_loss_mlp": 1.02663207, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 2.501261365090615, + "language_loss": 0.73547208, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75635159, + "num_input_tokens_seen": 248276765, + "step": 11503, + "time_per_iteration": 2.659975528717041 + }, + { + "auxiliary_loss_clip": 0.01098316, + "auxiliary_loss_mlp": 0.0103817, + "balance_loss_clip": 1.036659, + "balance_loss_mlp": 1.02487206, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 1.740950082729235, + "language_loss": 0.76998115, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79134607, + "num_input_tokens_seen": 248295310, + "step": 11504, + "time_per_iteration": 2.504387140274048 + }, + { + "auxiliary_loss_clip": 0.01070027, + "auxiliary_loss_mlp": 0.01039326, + "balance_loss_clip": 1.03228784, + "balance_loss_mlp": 1.02488995, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 1.7089577558153324, + "language_loss": 0.7346614, + "learning_rate": 9.166788817780499e-07, + "loss": 0.75575495, + "num_input_tokens_seen": 248315230, + "step": 11505, + "time_per_iteration": 2.5927655696868896 + }, + { + "auxiliary_loss_clip": 0.01052952, + "auxiliary_loss_mlp": 0.00779657, + "balance_loss_clip": 1.03094614, + "balance_loss_mlp": 1.00044858, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 1.992575572980247, + "language_loss": 0.87996203, + "learning_rate": 9.163515218778886e-07, + "loss": 0.89828813, + "num_input_tokens_seen": 248332980, + "step": 11506, + "time_per_iteration": 2.6465110778808594 + }, + { + "auxiliary_loss_clip": 0.01090743, + "auxiliary_loss_mlp": 0.01027647, + "balance_loss_clip": 1.04364204, + "balance_loss_mlp": 1.01567793, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 2.207503381210473, + "language_loss": 0.70153606, + "learning_rate": 9.160242030697856e-07, + "loss": 0.72272003, + "num_input_tokens_seen": 248352865, + "step": 11507, + "time_per_iteration": 2.6618876457214355 + }, + { + "auxiliary_loss_clip": 0.01087335, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.0365206, + "balance_loss_mlp": 1.0229435, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 2.149565332025701, + "language_loss": 0.77100897, + "learning_rate": 9.156969253661538e-07, + "loss": 0.79224062, + "num_input_tokens_seen": 248371125, + "step": 11508, + "time_per_iteration": 2.5635552406311035 + }, + { + "auxiliary_loss_clip": 0.01093207, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.03401875, + "balance_loss_mlp": 1.01854777, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 1.587245042266412, + "language_loss": 0.75105643, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77229524, + "num_input_tokens_seen": 248390455, + "step": 11509, + "time_per_iteration": 2.5139715671539307 + }, + { + "auxiliary_loss_clip": 0.01060254, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.03714132, + "balance_loss_mlp": 1.02096474, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 1.4971203389441334, + "language_loss": 0.63933396, + "learning_rate": 9.150424933219425e-07, + "loss": 0.660267, + "num_input_tokens_seen": 248411305, + "step": 11510, + "time_per_iteration": 2.6267127990722656 + }, + { + "auxiliary_loss_clip": 0.01082844, + "auxiliary_loss_mlp": 0.01036371, + "balance_loss_clip": 1.03742099, + "balance_loss_mlp": 1.02206624, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 1.7118524623053764, + "language_loss": 0.75634849, + "learning_rate": 9.147153390061788e-07, + "loss": 0.77754062, + "num_input_tokens_seen": 248430190, + "step": 11511, + "time_per_iteration": 2.540635347366333 + }, + { + "auxiliary_loss_clip": 0.01085688, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.04161072, + "balance_loss_mlp": 1.02043605, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 1.549595796989009, + "language_loss": 0.62561613, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64679229, + "num_input_tokens_seen": 248450830, + "step": 11512, + "time_per_iteration": 2.6447699069976807 + }, + { + "auxiliary_loss_clip": 0.01081576, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.03751123, + "balance_loss_mlp": 1.01912308, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 1.7198425070254988, + "language_loss": 0.83132207, + "learning_rate": 9.140611538493666e-07, + "loss": 0.85245609, + "num_input_tokens_seen": 248468585, + "step": 11513, + "time_per_iteration": 2.526773452758789 + }, + { + "auxiliary_loss_clip": 0.01050492, + "auxiliary_loss_mlp": 0.01026379, + "balance_loss_clip": 1.03749478, + "balance_loss_mlp": 1.01516104, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 1.4415395721129067, + "language_loss": 0.78375721, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80452597, + "num_input_tokens_seen": 248490535, + "step": 11514, + "time_per_iteration": 2.725203037261963 + }, + { + "auxiliary_loss_clip": 0.01067152, + "auxiliary_loss_mlp": 0.01031925, + "balance_loss_clip": 1.03351808, + "balance_loss_mlp": 1.01939583, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 1.9767969223520394, + "language_loss": 0.74637544, + "learning_rate": 9.134071334081907e-07, + "loss": 0.76736623, + "num_input_tokens_seen": 248508575, + "step": 11515, + "time_per_iteration": 2.5846121311187744 + }, + { + "auxiliary_loss_clip": 0.01068171, + "auxiliary_loss_mlp": 0.01029522, + "balance_loss_clip": 1.03688991, + "balance_loss_mlp": 1.0176425, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 2.5393020156816695, + "language_loss": 0.54035234, + "learning_rate": 9.130801849869694e-07, + "loss": 0.56132925, + "num_input_tokens_seen": 248527025, + "step": 11516, + "time_per_iteration": 2.6707708835601807 + }, + { + "auxiliary_loss_clip": 0.0109612, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.03538895, + "balance_loss_mlp": 1.02311933, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.7939976489301792, + "language_loss": 0.73276734, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75409091, + "num_input_tokens_seen": 248544275, + "step": 11517, + "time_per_iteration": 3.9128801822662354 + }, + { + "auxiliary_loss_clip": 0.01113322, + "auxiliary_loss_mlp": 0.01036394, + "balance_loss_clip": 1.03785253, + "balance_loss_mlp": 1.02356744, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.8064955175149935, + "language_loss": 0.76179254, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78328967, + "num_input_tokens_seen": 248561870, + "step": 11518, + "time_per_iteration": 2.4201700687408447 + }, + { + "auxiliary_loss_clip": 0.01106682, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.04056501, + "balance_loss_mlp": 1.01922357, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 2.2317281014070165, + "language_loss": 0.64419162, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66559386, + "num_input_tokens_seen": 248588190, + "step": 11519, + "time_per_iteration": 2.6824865341186523 + }, + { + "auxiliary_loss_clip": 0.01080636, + "auxiliary_loss_mlp": 0.01039754, + "balance_loss_clip": 1.03606379, + "balance_loss_mlp": 1.02678394, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 1.7430520826714169, + "language_loss": 0.6295473, + "learning_rate": 9.117728035871212e-07, + "loss": 0.65075123, + "num_input_tokens_seen": 248606460, + "step": 11520, + "time_per_iteration": 2.548567056655884 + }, + { + "auxiliary_loss_clip": 0.01079712, + "auxiliary_loss_mlp": 0.01037385, + "balance_loss_clip": 1.03809655, + "balance_loss_mlp": 1.02280557, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 6.71482663865699, + "language_loss": 0.77815807, + "learning_rate": 9.114460613703887e-07, + "loss": 0.79932904, + "num_input_tokens_seen": 248623715, + "step": 11521, + "time_per_iteration": 2.5366008281707764 + }, + { + "auxiliary_loss_clip": 0.01099166, + "auxiliary_loss_mlp": 0.01038412, + "balance_loss_clip": 1.03474343, + "balance_loss_mlp": 1.02345133, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 1.8611426482747429, + "language_loss": 0.81973422, + "learning_rate": 9.111193604317304e-07, + "loss": 0.84110999, + "num_input_tokens_seen": 248640575, + "step": 11522, + "time_per_iteration": 2.438030481338501 + }, + { + "auxiliary_loss_clip": 0.01098436, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.04189324, + "balance_loss_mlp": 1.01920784, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.4200771968756996, + "language_loss": 0.76722229, + "learning_rate": 9.107927007835361e-07, + "loss": 0.78851855, + "num_input_tokens_seen": 248663535, + "step": 11523, + "time_per_iteration": 2.597172498703003 + }, + { + "auxiliary_loss_clip": 0.01079796, + "auxiliary_loss_mlp": 0.01033258, + "balance_loss_clip": 1.03557444, + "balance_loss_mlp": 1.02182007, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 1.7497258756026635, + "language_loss": 0.68402052, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70515108, + "num_input_tokens_seen": 248681125, + "step": 11524, + "time_per_iteration": 2.523315906524658 + }, + { + "auxiliary_loss_clip": 0.01082697, + "auxiliary_loss_mlp": 0.01037712, + "balance_loss_clip": 1.03853369, + "balance_loss_mlp": 1.02408624, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.8223739152346674, + "language_loss": 0.64251125, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66371524, + "num_input_tokens_seen": 248700555, + "step": 11525, + "time_per_iteration": 2.5486228466033936 + }, + { + "auxiliary_loss_clip": 0.01077825, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.04229617, + "balance_loss_mlp": 1.02312577, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 3.0236917142771524, + "language_loss": 0.70370448, + "learning_rate": 9.098129697055907e-07, + "loss": 0.72484457, + "num_input_tokens_seen": 248716095, + "step": 11526, + "time_per_iteration": 2.625304937362671 + }, + { + "auxiliary_loss_clip": 0.01088637, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.03611207, + "balance_loss_mlp": 1.01846004, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 1.6482498751759374, + "language_loss": 0.76593411, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78712797, + "num_input_tokens_seen": 248735330, + "step": 11527, + "time_per_iteration": 4.076358079910278 + }, + { + "auxiliary_loss_clip": 0.01083467, + "auxiliary_loss_mlp": 0.0103499, + "balance_loss_clip": 1.03350735, + "balance_loss_mlp": 1.02237153, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 1.6289126285365783, + "language_loss": 0.79419756, + "learning_rate": 9.091600223329952e-07, + "loss": 0.81538218, + "num_input_tokens_seen": 248754530, + "step": 11528, + "time_per_iteration": 2.6059179306030273 + }, + { + "auxiliary_loss_clip": 0.01096609, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.0373466, + "balance_loss_mlp": 1.02043343, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.3765051513072055, + "language_loss": 0.76001382, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78130674, + "num_input_tokens_seen": 248775825, + "step": 11529, + "time_per_iteration": 3.9292101860046387 + }, + { + "auxiliary_loss_clip": 0.01108787, + "auxiliary_loss_mlp": 0.00776807, + "balance_loss_clip": 1.03750396, + "balance_loss_mlp": 1.00056899, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 1.5909790175834846, + "language_loss": 0.72645211, + "learning_rate": 9.085072404194436e-07, + "loss": 0.74530804, + "num_input_tokens_seen": 248796180, + "step": 11530, + "time_per_iteration": 2.5404932498931885 + }, + { + "auxiliary_loss_clip": 0.01097294, + "auxiliary_loss_mlp": 0.01035127, + "balance_loss_clip": 1.04143751, + "balance_loss_mlp": 1.02023757, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 1.6273790004866422, + "language_loss": 0.7828337, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80415791, + "num_input_tokens_seen": 248814735, + "step": 11531, + "time_per_iteration": 2.5107815265655518 + }, + { + "auxiliary_loss_clip": 0.01099667, + "auxiliary_loss_mlp": 0.01033963, + "balance_loss_clip": 1.04260993, + "balance_loss_mlp": 1.02248371, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 1.3587102140109186, + "language_loss": 0.69582164, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71715796, + "num_input_tokens_seen": 248839140, + "step": 11532, + "time_per_iteration": 2.621000289916992 + }, + { + "auxiliary_loss_clip": 0.01086352, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.0352838, + "balance_loss_mlp": 1.01738596, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 1.368005824305412, + "language_loss": 0.66826987, + "learning_rate": 9.075283780014082e-07, + "loss": 0.6894412, + "num_input_tokens_seen": 248858300, + "step": 11533, + "time_per_iteration": 2.577910900115967 + }, + { + "auxiliary_loss_clip": 0.01088503, + "auxiliary_loss_mlp": 0.01035058, + "balance_loss_clip": 1.04028642, + "balance_loss_mlp": 1.02208209, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 2.9438982106114566, + "language_loss": 0.59087253, + "learning_rate": 9.072021733655007e-07, + "loss": 0.61210811, + "num_input_tokens_seen": 248876310, + "step": 11534, + "time_per_iteration": 2.5045955181121826 + }, + { + "auxiliary_loss_clip": 0.01081409, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.03961527, + "balance_loss_mlp": 1.01759338, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 2.0099273843084955, + "language_loss": 0.71031964, + "learning_rate": 9.068760101685971e-07, + "loss": 0.7314446, + "num_input_tokens_seen": 248895650, + "step": 11535, + "time_per_iteration": 2.5350592136383057 + }, + { + "auxiliary_loss_clip": 0.01013544, + "auxiliary_loss_mlp": 0.01005866, + "balance_loss_clip": 1.00786376, + "balance_loss_mlp": 1.0044291, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.7138198470971929, + "language_loss": 0.59071553, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61090958, + "num_input_tokens_seen": 248963920, + "step": 11536, + "time_per_iteration": 3.173114776611328 + }, + { + "auxiliary_loss_clip": 0.01101119, + "auxiliary_loss_mlp": 0.00777722, + "balance_loss_clip": 1.04060245, + "balance_loss_mlp": 1.00058794, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 1.6605276780011318, + "language_loss": 0.72666091, + "learning_rate": 9.062238081412692e-07, + "loss": 0.7454493, + "num_input_tokens_seen": 248983380, + "step": 11537, + "time_per_iteration": 2.472898483276367 + }, + { + "auxiliary_loss_clip": 0.01024099, + "auxiliary_loss_mlp": 0.00754202, + "balance_loss_clip": 1.00850844, + "balance_loss_mlp": 1.00023794, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7577373917908445, + "language_loss": 0.55568039, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57346338, + "num_input_tokens_seen": 249044680, + "step": 11538, + "time_per_iteration": 3.0424158573150635 + }, + { + "auxiliary_loss_clip": 0.01095291, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.03646636, + "balance_loss_mlp": 1.02229691, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.5178907747816768, + "language_loss": 0.77759671, + "learning_rate": 9.055717720183505e-07, + "loss": 0.79888541, + "num_input_tokens_seen": 249061060, + "step": 11539, + "time_per_iteration": 2.5046331882476807 + }, + { + "auxiliary_loss_clip": 0.01087976, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.03665137, + "balance_loss_mlp": 1.01701128, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 1.731578505608482, + "language_loss": 0.64280629, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66397059, + "num_input_tokens_seen": 249081430, + "step": 11540, + "time_per_iteration": 4.077820301055908 + }, + { + "auxiliary_loss_clip": 0.01066716, + "auxiliary_loss_mlp": 0.01030713, + "balance_loss_clip": 1.03498793, + "balance_loss_mlp": 1.01797533, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 2.8141988659936956, + "language_loss": 0.86918414, + "learning_rate": 9.049199018987437e-07, + "loss": 0.89015841, + "num_input_tokens_seen": 249103020, + "step": 11541, + "time_per_iteration": 2.5899791717529297 + }, + { + "auxiliary_loss_clip": 0.01111107, + "auxiliary_loss_mlp": 0.0077746, + "balance_loss_clip": 1.03786278, + "balance_loss_mlp": 1.00049925, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 2.032557458437275, + "language_loss": 0.84380352, + "learning_rate": 9.04594029121081e-07, + "loss": 0.8626892, + "num_input_tokens_seen": 249120810, + "step": 11542, + "time_per_iteration": 2.424868583679199 + }, + { + "auxiliary_loss_clip": 0.01100267, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.03777087, + "balance_loss_mlp": 1.0190773, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 2.3538496982070285, + "language_loss": 0.75072163, + "learning_rate": 9.04268197881323e-07, + "loss": 0.77204931, + "num_input_tokens_seen": 249138050, + "step": 11543, + "time_per_iteration": 2.4652163982391357 + }, + { + "auxiliary_loss_clip": 0.01090531, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.03756082, + "balance_loss_mlp": 1.01809359, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 1.7059858762462234, + "language_loss": 0.76201111, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78321636, + "num_input_tokens_seen": 249155570, + "step": 11544, + "time_per_iteration": 2.4845409393310547 + }, + { + "auxiliary_loss_clip": 0.0106345, + "auxiliary_loss_mlp": 0.01040966, + "balance_loss_clip": 1.03651845, + "balance_loss_mlp": 1.02695858, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 1.788456925989345, + "language_loss": 0.71135485, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73239899, + "num_input_tokens_seen": 249172960, + "step": 11545, + "time_per_iteration": 2.5298879146575928 + }, + { + "auxiliary_loss_clip": 0.01097825, + "auxiliary_loss_mlp": 0.01025411, + "balance_loss_clip": 1.0388279, + "balance_loss_mlp": 1.01409805, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.854425502558874, + "language_loss": 0.79550916, + "learning_rate": 9.0329095351302e-07, + "loss": 0.81674153, + "num_input_tokens_seen": 249192450, + "step": 11546, + "time_per_iteration": 2.4652113914489746 + }, + { + "auxiliary_loss_clip": 0.0107836, + "auxiliary_loss_mlp": 0.01030064, + "balance_loss_clip": 1.03549504, + "balance_loss_mlp": 1.01814866, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 1.3459687531905191, + "language_loss": 0.78708291, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80816716, + "num_input_tokens_seen": 249214320, + "step": 11547, + "time_per_iteration": 2.5726683139801025 + }, + { + "auxiliary_loss_clip": 0.01086403, + "auxiliary_loss_mlp": 0.00778269, + "balance_loss_clip": 1.03790402, + "balance_loss_mlp": 1.00055885, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 2.146835966814812, + "language_loss": 0.80958235, + "learning_rate": 9.026396651834834e-07, + "loss": 0.82822901, + "num_input_tokens_seen": 249230925, + "step": 11548, + "time_per_iteration": 2.51826810836792 + }, + { + "auxiliary_loss_clip": 0.01031397, + "auxiliary_loss_mlp": 0.00753259, + "balance_loss_clip": 1.00733614, + "balance_loss_mlp": 1.00023866, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.6924513736995099, + "language_loss": 0.53723073, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55507731, + "num_input_tokens_seen": 249293975, + "step": 11549, + "time_per_iteration": 3.013474941253662 + }, + { + "auxiliary_loss_clip": 0.01094049, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.03379679, + "balance_loss_mlp": 1.01948762, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.7004858155860962, + "language_loss": 0.73947668, + "learning_rate": 9.01988543302e-07, + "loss": 0.76074326, + "num_input_tokens_seen": 249315285, + "step": 11550, + "time_per_iteration": 2.5382895469665527 + }, + { + "auxiliary_loss_clip": 0.01088388, + "auxiliary_loss_mlp": 0.010376, + "balance_loss_clip": 1.04048824, + "balance_loss_mlp": 1.02486801, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 10.372840211144092, + "language_loss": 0.74160117, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76286101, + "num_input_tokens_seen": 249333505, + "step": 11551, + "time_per_iteration": 2.4758460521698 + }, + { + "auxiliary_loss_clip": 0.0111164, + "auxiliary_loss_mlp": 0.01038213, + "balance_loss_clip": 1.03884006, + "balance_loss_mlp": 1.02590489, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 1.6733240559243365, + "language_loss": 0.84916466, + "learning_rate": 9.01337587967333e-07, + "loss": 0.87066317, + "num_input_tokens_seen": 249354180, + "step": 11552, + "time_per_iteration": 2.4483611583709717 + }, + { + "auxiliary_loss_clip": 0.01111033, + "auxiliary_loss_mlp": 0.01034364, + "balance_loss_clip": 1.03840065, + "balance_loss_mlp": 1.02152503, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 1.654626451798713, + "language_loss": 0.67452973, + "learning_rate": 9.010121727859117e-07, + "loss": 0.69598365, + "num_input_tokens_seen": 249377035, + "step": 11553, + "time_per_iteration": 2.513355255126953 + }, + { + "auxiliary_loss_clip": 0.01097048, + "auxiliary_loss_mlp": 0.01028729, + "balance_loss_clip": 1.04115593, + "balance_loss_mlp": 1.01571715, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 1.592808116976314, + "language_loss": 0.79637265, + "learning_rate": 9.006867992782195e-07, + "loss": 0.81763047, + "num_input_tokens_seen": 249396155, + "step": 11554, + "time_per_iteration": 2.502797842025757 + }, + { + "auxiliary_loss_clip": 0.01102021, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.03677583, + "balance_loss_mlp": 1.01715088, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 1.683770166253637, + "language_loss": 0.72611642, + "learning_rate": 9.003614674565934e-07, + "loss": 0.74743629, + "num_input_tokens_seen": 249414555, + "step": 11555, + "time_per_iteration": 2.441182851791382 + }, + { + "auxiliary_loss_clip": 0.01072909, + "auxiliary_loss_mlp": 0.01027533, + "balance_loss_clip": 1.03387785, + "balance_loss_mlp": 1.01558828, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 1.7160259996824565, + "language_loss": 0.7789008, + "learning_rate": 9.000361773333705e-07, + "loss": 0.79990524, + "num_input_tokens_seen": 249433570, + "step": 11556, + "time_per_iteration": 3.973144054412842 + }, + { + "auxiliary_loss_clip": 0.0105212, + "auxiliary_loss_mlp": 0.01039662, + "balance_loss_clip": 1.03323495, + "balance_loss_mlp": 1.02733541, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 2.9896449995085717, + "language_loss": 0.60622072, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62713856, + "num_input_tokens_seen": 249453735, + "step": 11557, + "time_per_iteration": 2.6440722942352295 + }, + { + "auxiliary_loss_clip": 0.01089302, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.04237413, + "balance_loss_mlp": 1.02150702, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 1.8103153119245512, + "language_loss": 0.8549425, + "learning_rate": 8.993857222314752e-07, + "loss": 0.87616944, + "num_input_tokens_seen": 249470805, + "step": 11558, + "time_per_iteration": 2.5360751152038574 + }, + { + "auxiliary_loss_clip": 0.01103164, + "auxiliary_loss_mlp": 0.01034455, + "balance_loss_clip": 1.0366807, + "balance_loss_mlp": 1.02044761, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 2.022402657215457, + "language_loss": 0.70097411, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72235024, + "num_input_tokens_seen": 249491150, + "step": 11559, + "time_per_iteration": 2.534464120864868 + }, + { + "auxiliary_loss_clip": 0.01077653, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.03789091, + "balance_loss_mlp": 1.01816773, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 1.4864190095522558, + "language_loss": 0.78830791, + "learning_rate": 8.987354340711921e-07, + "loss": 0.80937976, + "num_input_tokens_seen": 249511560, + "step": 11560, + "time_per_iteration": 2.5572850704193115 + }, + { + "auxiliary_loss_clip": 0.01087314, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.03824854, + "balance_loss_mlp": 1.02194095, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 2.2761057466555985, + "language_loss": 0.76673198, + "learning_rate": 8.9841035262498e-07, + "loss": 0.78793991, + "num_input_tokens_seen": 249531910, + "step": 11561, + "time_per_iteration": 2.541372537612915 + }, + { + "auxiliary_loss_clip": 0.01108726, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.03604364, + "balance_loss_mlp": 1.01891708, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 2.540417440736725, + "language_loss": 0.784145, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80555713, + "num_input_tokens_seen": 249550300, + "step": 11562, + "time_per_iteration": 2.469512701034546 + }, + { + "auxiliary_loss_clip": 0.01103131, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.03728986, + "balance_loss_mlp": 1.01936293, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 3.1759803655274363, + "language_loss": 0.69267386, + "learning_rate": 8.977603150620515e-07, + "loss": 0.71402383, + "num_input_tokens_seen": 249567740, + "step": 11563, + "time_per_iteration": 2.5633010864257812 + }, + { + "auxiliary_loss_clip": 0.01089255, + "auxiliary_loss_mlp": 0.01025753, + "balance_loss_clip": 1.03550637, + "balance_loss_mlp": 1.01404119, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 2.0278244243910897, + "language_loss": 0.73552758, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75667769, + "num_input_tokens_seen": 249582700, + "step": 11564, + "time_per_iteration": 2.4169602394104004 + }, + { + "auxiliary_loss_clip": 0.01087043, + "auxiliary_loss_mlp": 0.01036327, + "balance_loss_clip": 1.04409802, + "balance_loss_mlp": 1.02119923, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 1.8091552528153911, + "language_loss": 0.71827263, + "learning_rate": 8.971104446872785e-07, + "loss": 0.7395063, + "num_input_tokens_seen": 249602920, + "step": 11565, + "time_per_iteration": 2.6770846843719482 + }, + { + "auxiliary_loss_clip": 0.01016001, + "auxiliary_loss_mlp": 0.01001754, + "balance_loss_clip": 1.00947022, + "balance_loss_mlp": 1.00023365, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9163685992599081, + "language_loss": 0.58490872, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60508627, + "num_input_tokens_seen": 249660400, + "step": 11566, + "time_per_iteration": 2.97255802154541 + }, + { + "auxiliary_loss_clip": 0.01078347, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.0380379, + "balance_loss_mlp": 1.01722407, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 2.2010235136092486, + "language_loss": 0.74119091, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76227498, + "num_input_tokens_seen": 249679335, + "step": 11567, + "time_per_iteration": 4.106271505355835 + }, + { + "auxiliary_loss_clip": 0.01080389, + "auxiliary_loss_mlp": 0.01038237, + "balance_loss_clip": 1.03312039, + "balance_loss_mlp": 1.02460575, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.4734806337174735, + "language_loss": 0.76795161, + "learning_rate": 8.961359528185313e-07, + "loss": 0.78913784, + "num_input_tokens_seen": 249701805, + "step": 11568, + "time_per_iteration": 4.004446744918823 + }, + { + "auxiliary_loss_clip": 0.01096551, + "auxiliary_loss_mlp": 0.01035991, + "balance_loss_clip": 1.03969789, + "balance_loss_mlp": 1.02426124, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 1.7670169473164552, + "language_loss": 0.7236129, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74493837, + "num_input_tokens_seen": 249720550, + "step": 11569, + "time_per_iteration": 2.4487850666046143 + }, + { + "auxiliary_loss_clip": 0.01088718, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.04007673, + "balance_loss_mlp": 1.01693892, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 1.5750161569462975, + "language_loss": 0.76963693, + "learning_rate": 8.954865008453471e-07, + "loss": 0.79082209, + "num_input_tokens_seen": 249740325, + "step": 11570, + "time_per_iteration": 2.5318121910095215 + }, + { + "auxiliary_loss_clip": 0.01100822, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.03614044, + "balance_loss_mlp": 1.02052665, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 2.4718836498271304, + "language_loss": 0.74152339, + "learning_rate": 8.95161837677493e-07, + "loss": 0.76286215, + "num_input_tokens_seen": 249760570, + "step": 11571, + "time_per_iteration": 2.5209851264953613 + }, + { + "auxiliary_loss_clip": 0.01094895, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.03495026, + "balance_loss_mlp": 1.01791811, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 1.761741300530647, + "language_loss": 0.74467087, + "learning_rate": 8.948372164052118e-07, + "loss": 0.76592088, + "num_input_tokens_seen": 249778290, + "step": 11572, + "time_per_iteration": 2.4238624572753906 + }, + { + "auxiliary_loss_clip": 0.01087168, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.03379405, + "balance_loss_mlp": 1.01691139, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 2.128724227261278, + "language_loss": 0.69884193, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72000873, + "num_input_tokens_seen": 249800925, + "step": 11573, + "time_per_iteration": 2.5980260372161865 + }, + { + "auxiliary_loss_clip": 0.01089386, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.03814197, + "balance_loss_mlp": 1.0219804, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 1.8873035024388611, + "language_loss": 0.74771547, + "learning_rate": 8.941880995966095e-07, + "loss": 0.76896071, + "num_input_tokens_seen": 249820500, + "step": 11574, + "time_per_iteration": 2.495624542236328 + }, + { + "auxiliary_loss_clip": 0.01079881, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.03421116, + "balance_loss_mlp": 1.01765752, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 1.6540388328224607, + "language_loss": 0.74639869, + "learning_rate": 8.938636040849014e-07, + "loss": 0.7674951, + "num_input_tokens_seen": 249839845, + "step": 11575, + "time_per_iteration": 2.5299861431121826 + }, + { + "auxiliary_loss_clip": 0.01101398, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.03695548, + "balance_loss_mlp": 1.01749444, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 2.00447402667576, + "language_loss": 0.79068971, + "learning_rate": 8.935391505179966e-07, + "loss": 0.81201327, + "num_input_tokens_seen": 249857400, + "step": 11576, + "time_per_iteration": 2.4854495525360107 + }, + { + "auxiliary_loss_clip": 0.01069287, + "auxiliary_loss_mlp": 0.01028017, + "balance_loss_clip": 1.03656292, + "balance_loss_mlp": 1.0162096, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.2784413660969918, + "language_loss": 0.56911981, + "learning_rate": 8.932147389081985e-07, + "loss": 0.5900929, + "num_input_tokens_seen": 249871645, + "step": 11577, + "time_per_iteration": 2.5456767082214355 + }, + { + "auxiliary_loss_clip": 0.01035608, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.03307033, + "balance_loss_mlp": 1.01966667, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.3588589045292474, + "language_loss": 0.76701498, + "learning_rate": 8.928903692678081e-07, + "loss": 0.78767931, + "num_input_tokens_seen": 249894215, + "step": 11578, + "time_per_iteration": 2.776095390319824 + }, + { + "auxiliary_loss_clip": 0.0107838, + "auxiliary_loss_mlp": 0.01035242, + "balance_loss_clip": 1.03628242, + "balance_loss_mlp": 1.02298141, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 1.8705397591419481, + "language_loss": 0.79855549, + "learning_rate": 8.925660416091254e-07, + "loss": 0.81969178, + "num_input_tokens_seen": 249912850, + "step": 11579, + "time_per_iteration": 2.5276923179626465 + }, + { + "auxiliary_loss_clip": 0.01075042, + "auxiliary_loss_mlp": 0.01028962, + "balance_loss_clip": 1.03759289, + "balance_loss_mlp": 1.01622486, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 1.9477757781611953, + "language_loss": 0.72727829, + "learning_rate": 8.922417559444502e-07, + "loss": 0.74831831, + "num_input_tokens_seen": 249932650, + "step": 11580, + "time_per_iteration": 4.047279596328735 + }, + { + "auxiliary_loss_clip": 0.01092889, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.03779888, + "balance_loss_mlp": 1.01620722, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 1.9053697817740178, + "language_loss": 0.6571694, + "learning_rate": 8.919175122860787e-07, + "loss": 0.67839384, + "num_input_tokens_seen": 249951205, + "step": 11581, + "time_per_iteration": 2.5169053077697754 + }, + { + "auxiliary_loss_clip": 0.01111329, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.03771281, + "balance_loss_mlp": 1.0181911, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 2.308896976291666, + "language_loss": 0.76789188, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78930867, + "num_input_tokens_seen": 249967045, + "step": 11582, + "time_per_iteration": 2.4444525241851807 + }, + { + "auxiliary_loss_clip": 0.01086907, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.03467679, + "balance_loss_mlp": 1.01918221, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 2.002649982461307, + "language_loss": 0.69773471, + "learning_rate": 8.91269151037425e-07, + "loss": 0.71890962, + "num_input_tokens_seen": 249984565, + "step": 11583, + "time_per_iteration": 2.471349000930786 + }, + { + "auxiliary_loss_clip": 0.01085899, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.04280901, + "balance_loss_mlp": 1.02086973, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 2.2644669179922845, + "language_loss": 0.82276505, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84395629, + "num_input_tokens_seen": 250004235, + "step": 11584, + "time_per_iteration": 2.5539474487304688 + }, + { + "auxiliary_loss_clip": 0.01067629, + "auxiliary_loss_mlp": 0.01036403, + "balance_loss_clip": 1.04238725, + "balance_loss_mlp": 1.02249098, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 2.352006669772486, + "language_loss": 0.79943931, + "learning_rate": 8.906209579615107e-07, + "loss": 0.82047963, + "num_input_tokens_seen": 250017645, + "step": 11585, + "time_per_iteration": 2.723280668258667 + }, + { + "auxiliary_loss_clip": 0.01106793, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.03578389, + "balance_loss_mlp": 1.01966751, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.5795293396466752, + "language_loss": 0.77572459, + "learning_rate": 8.90296924519055e-07, + "loss": 0.7971065, + "num_input_tokens_seen": 250037640, + "step": 11586, + "time_per_iteration": 2.473461866378784 + }, + { + "auxiliary_loss_clip": 0.01096087, + "auxiliary_loss_mlp": 0.01030546, + "balance_loss_clip": 1.03664088, + "balance_loss_mlp": 1.01914406, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 2.3657370246618616, + "language_loss": 0.78742099, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80868733, + "num_input_tokens_seen": 250056490, + "step": 11587, + "time_per_iteration": 2.490121841430664 + }, + { + "auxiliary_loss_clip": 0.01085522, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.03705776, + "balance_loss_mlp": 1.02144396, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 2.047962257560019, + "language_loss": 0.72665524, + "learning_rate": 8.896489838865857e-07, + "loss": 0.74784684, + "num_input_tokens_seen": 250074285, + "step": 11588, + "time_per_iteration": 2.5940473079681396 + }, + { + "auxiliary_loss_clip": 0.01089415, + "auxiliary_loss_mlp": 0.01025559, + "balance_loss_clip": 1.0415597, + "balance_loss_mlp": 1.01431191, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 2.3142693675247337, + "language_loss": 0.75251049, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77366018, + "num_input_tokens_seen": 250093350, + "step": 11589, + "time_per_iteration": 2.577009916305542 + }, + { + "auxiliary_loss_clip": 0.010903, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.03625369, + "balance_loss_mlp": 1.02238679, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 1.8696442293298918, + "language_loss": 0.64114463, + "learning_rate": 8.890012116726012e-07, + "loss": 0.66238832, + "num_input_tokens_seen": 250114170, + "step": 11590, + "time_per_iteration": 2.5983994007110596 + }, + { + "auxiliary_loss_clip": 0.01005585, + "auxiliary_loss_mlp": 0.01002829, + "balance_loss_clip": 1.02522254, + "balance_loss_mlp": 1.00152338, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.7614232202734135, + "language_loss": 0.61212206, + "learning_rate": 8.88677388753248e-07, + "loss": 0.6322062, + "num_input_tokens_seen": 250178250, + "step": 11591, + "time_per_iteration": 3.3934051990509033 + }, + { + "auxiliary_loss_clip": 0.01072988, + "auxiliary_loss_mlp": 0.00777746, + "balance_loss_clip": 1.0494436, + "balance_loss_mlp": 1.00052786, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 1.7751875563506443, + "language_loss": 0.69397748, + "learning_rate": 8.883536079753582e-07, + "loss": 0.71248484, + "num_input_tokens_seen": 250198420, + "step": 11592, + "time_per_iteration": 3.0315723419189453 + }, + { + "auxiliary_loss_clip": 0.01076146, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.03459275, + "balance_loss_mlp": 1.01869917, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 1.5085072367212566, + "language_loss": 0.62479198, + "learning_rate": 8.880298693512109e-07, + "loss": 0.6458596, + "num_input_tokens_seen": 250220650, + "step": 11593, + "time_per_iteration": 2.6202733516693115 + }, + { + "auxiliary_loss_clip": 0.01084602, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.03601575, + "balance_loss_mlp": 1.01804173, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 2.686155860986982, + "language_loss": 0.54532373, + "learning_rate": 8.877061728930832e-07, + "loss": 0.5664621, + "num_input_tokens_seen": 250241750, + "step": 11594, + "time_per_iteration": 2.618335485458374 + }, + { + "auxiliary_loss_clip": 0.01098402, + "auxiliary_loss_mlp": 0.01026928, + "balance_loss_clip": 1.03666615, + "balance_loss_mlp": 1.01527548, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 2.3149018502884227, + "language_loss": 0.76864254, + "learning_rate": 8.87382518613248e-07, + "loss": 0.78989583, + "num_input_tokens_seen": 250259445, + "step": 11595, + "time_per_iteration": 4.152539014816284 + }, + { + "auxiliary_loss_clip": 0.01089711, + "auxiliary_loss_mlp": 0.00778574, + "balance_loss_clip": 1.03874648, + "balance_loss_mlp": 1.00051475, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 2.2254933604428144, + "language_loss": 0.71751982, + "learning_rate": 8.870589065239793e-07, + "loss": 0.7362026, + "num_input_tokens_seen": 250275640, + "step": 11596, + "time_per_iteration": 2.550100803375244 + }, + { + "auxiliary_loss_clip": 0.01112622, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.03953934, + "balance_loss_mlp": 1.01999569, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.7961054514792296, + "language_loss": 0.76110488, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78255683, + "num_input_tokens_seen": 250296435, + "step": 11597, + "time_per_iteration": 2.4536750316619873 + }, + { + "auxiliary_loss_clip": 0.01100556, + "auxiliary_loss_mlp": 0.01033181, + "balance_loss_clip": 1.03738439, + "balance_loss_mlp": 1.02099156, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.9686374640144848, + "language_loss": 0.74909168, + "learning_rate": 8.864118089662267e-07, + "loss": 0.77042907, + "num_input_tokens_seen": 250314035, + "step": 11598, + "time_per_iteration": 2.474306106567383 + }, + { + "auxiliary_loss_clip": 0.01095062, + "auxiliary_loss_mlp": 0.01036525, + "balance_loss_clip": 1.03826332, + "balance_loss_mlp": 1.02348924, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 1.7901828011873642, + "language_loss": 0.89753044, + "learning_rate": 8.860883235222791e-07, + "loss": 0.91884625, + "num_input_tokens_seen": 250332995, + "step": 11599, + "time_per_iteration": 2.5578770637512207 + }, + { + "auxiliary_loss_clip": 0.01107664, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_clip": 1.03952742, + "balance_loss_mlp": 1.02717924, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.319301949909786, + "language_loss": 0.69742608, + "learning_rate": 8.85764880317974e-07, + "loss": 0.71890974, + "num_input_tokens_seen": 250352120, + "step": 11600, + "time_per_iteration": 2.5153234004974365 + }, + { + "auxiliary_loss_clip": 0.01073201, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.03587091, + "balance_loss_mlp": 1.02009392, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 1.6732781904433267, + "language_loss": 0.7675612, + "learning_rate": 8.854414793655771e-07, + "loss": 0.78861058, + "num_input_tokens_seen": 250371705, + "step": 11601, + "time_per_iteration": 2.5788865089416504 + }, + { + "auxiliary_loss_clip": 0.01092429, + "auxiliary_loss_mlp": 0.00775964, + "balance_loss_clip": 1.03447747, + "balance_loss_mlp": 1.0005095, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.7802274336349577, + "language_loss": 0.7192229, + "learning_rate": 8.851181206773508e-07, + "loss": 0.73790681, + "num_input_tokens_seen": 250390485, + "step": 11602, + "time_per_iteration": 2.436586618423462 + }, + { + "auxiliary_loss_clip": 0.01089586, + "auxiliary_loss_mlp": 0.00776765, + "balance_loss_clip": 1.0362947, + "balance_loss_mlp": 1.00058126, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 2.083348115359369, + "language_loss": 0.76732993, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78599346, + "num_input_tokens_seen": 250407020, + "step": 11603, + "time_per_iteration": 2.512373685836792 + }, + { + "auxiliary_loss_clip": 0.01063934, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.03411078, + "balance_loss_mlp": 1.02097106, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 1.5870050521868793, + "language_loss": 0.62277281, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64374197, + "num_input_tokens_seen": 250425880, + "step": 11604, + "time_per_iteration": 2.5884175300598145 + }, + { + "auxiliary_loss_clip": 0.01094969, + "auxiliary_loss_mlp": 0.01035363, + "balance_loss_clip": 1.0369184, + "balance_loss_mlp": 1.02197576, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.357382402828444, + "language_loss": 0.80907381, + "learning_rate": 8.841482983203057e-07, + "loss": 0.83037716, + "num_input_tokens_seen": 250442925, + "step": 11605, + "time_per_iteration": 2.5238263607025146 + }, + { + "auxiliary_loss_clip": 0.0110168, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.03915155, + "balance_loss_mlp": 1.0215714, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 1.6359534445443111, + "language_loss": 0.70321524, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72456563, + "num_input_tokens_seen": 250461220, + "step": 11606, + "time_per_iteration": 4.088248252868652 + }, + { + "auxiliary_loss_clip": 0.01092524, + "auxiliary_loss_mlp": 0.01030319, + "balance_loss_clip": 1.03882432, + "balance_loss_mlp": 1.01808763, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 2.0256536639478586, + "language_loss": 0.82683778, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84806621, + "num_input_tokens_seen": 250480975, + "step": 11607, + "time_per_iteration": 2.5077672004699707 + }, + { + "auxiliary_loss_clip": 0.0108843, + "auxiliary_loss_mlp": 0.01031142, + "balance_loss_clip": 1.03596067, + "balance_loss_mlp": 1.01819623, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 1.968768997698839, + "language_loss": 0.79151571, + "learning_rate": 8.831788567821265e-07, + "loss": 0.81271142, + "num_input_tokens_seen": 250497980, + "step": 11608, + "time_per_iteration": 3.8862884044647217 + }, + { + "auxiliary_loss_clip": 0.01092588, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.03568172, + "balance_loss_mlp": 1.01553583, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 2.0305708725789504, + "language_loss": 0.90228313, + "learning_rate": 8.828557942863357e-07, + "loss": 0.92348725, + "num_input_tokens_seen": 250511910, + "step": 11609, + "time_per_iteration": 2.4608211517333984 + }, + { + "auxiliary_loss_clip": 0.01083386, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.04187739, + "balance_loss_mlp": 1.01772332, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 1.8358159063565458, + "language_loss": 0.64183909, + "learning_rate": 8.82532774152765e-07, + "loss": 0.66297758, + "num_input_tokens_seen": 250531090, + "step": 11610, + "time_per_iteration": 2.553774356842041 + }, + { + "auxiliary_loss_clip": 0.01075868, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.03399611, + "balance_loss_mlp": 1.01894069, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 1.8709465111072567, + "language_loss": 0.84615284, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86721385, + "num_input_tokens_seen": 250551565, + "step": 11611, + "time_per_iteration": 2.6244945526123047 + }, + { + "auxiliary_loss_clip": 0.0110054, + "auxiliary_loss_mlp": 0.01034209, + "balance_loss_clip": 1.0367136, + "balance_loss_mlp": 1.02150118, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 2.022215955577379, + "language_loss": 0.70930672, + "learning_rate": 8.818868610212793e-07, + "loss": 0.73065424, + "num_input_tokens_seen": 250569625, + "step": 11612, + "time_per_iteration": 2.4195854663848877 + }, + { + "auxiliary_loss_clip": 0.01092566, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.03402376, + "balance_loss_mlp": 1.02107513, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 1.5248097786372623, + "language_loss": 0.80838454, + "learning_rate": 8.815639680478573e-07, + "loss": 0.82964671, + "num_input_tokens_seen": 250586960, + "step": 11613, + "time_per_iteration": 2.427046060562134 + }, + { + "auxiliary_loss_clip": 0.01098912, + "auxiliary_loss_mlp": 0.01033611, + "balance_loss_clip": 1.03597021, + "balance_loss_mlp": 1.02207708, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 2.02128752855708, + "language_loss": 0.75513774, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77646297, + "num_input_tokens_seen": 250605080, + "step": 11614, + "time_per_iteration": 2.481281280517578 + }, + { + "auxiliary_loss_clip": 0.01055361, + "auxiliary_loss_mlp": 0.01026946, + "balance_loss_clip": 1.04381096, + "balance_loss_mlp": 1.01490021, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 2.817136518925074, + "language_loss": 0.77548146, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79630452, + "num_input_tokens_seen": 250623965, + "step": 11615, + "time_per_iteration": 2.6532881259918213 + }, + { + "auxiliary_loss_clip": 0.01085151, + "auxiliary_loss_mlp": 0.01031257, + "balance_loss_clip": 1.04039514, + "balance_loss_mlp": 1.01919901, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 1.952389163635907, + "language_loss": 0.72740489, + "learning_rate": 8.80595543643797e-07, + "loss": 0.74856895, + "num_input_tokens_seen": 250640675, + "step": 11616, + "time_per_iteration": 2.4956812858581543 + }, + { + "auxiliary_loss_clip": 0.01111491, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.03984845, + "balance_loss_mlp": 1.02566838, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 2.359448920201206, + "language_loss": 0.84260929, + "learning_rate": 8.802728203886487e-07, + "loss": 0.8641026, + "num_input_tokens_seen": 250660295, + "step": 11617, + "time_per_iteration": 2.4679088592529297 + }, + { + "auxiliary_loss_clip": 0.01076433, + "auxiliary_loss_mlp": 0.01044547, + "balance_loss_clip": 1.0355227, + "balance_loss_mlp": 1.03085601, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 3.107996070203094, + "language_loss": 0.59835708, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61956692, + "num_input_tokens_seen": 250678155, + "step": 11618, + "time_per_iteration": 2.534926652908325 + }, + { + "auxiliary_loss_clip": 0.01086773, + "auxiliary_loss_mlp": 0.01035288, + "balance_loss_clip": 1.03673899, + "balance_loss_mlp": 1.02348053, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 2.1264206375805617, + "language_loss": 0.82990819, + "learning_rate": 8.796275012710903e-07, + "loss": 0.85112876, + "num_input_tokens_seen": 250697230, + "step": 11619, + "time_per_iteration": 4.0169525146484375 + }, + { + "auxiliary_loss_clip": 0.01095631, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.03688049, + "balance_loss_mlp": 1.02007663, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 1.75962727833151, + "language_loss": 0.67491686, + "learning_rate": 8.793049054331494e-07, + "loss": 0.696181, + "num_input_tokens_seen": 250719865, + "step": 11620, + "time_per_iteration": 2.6165313720703125 + }, + { + "auxiliary_loss_clip": 0.01065513, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.03846073, + "balance_loss_mlp": 1.01701093, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 3.1616312169447163, + "language_loss": 0.7283904, + "learning_rate": 8.789823520920794e-07, + "loss": 0.74933755, + "num_input_tokens_seen": 250736565, + "step": 11621, + "time_per_iteration": 2.589367151260376 + }, + { + "auxiliary_loss_clip": 0.01061623, + "auxiliary_loss_mlp": 0.01038152, + "balance_loss_clip": 1.03525972, + "balance_loss_mlp": 1.02542639, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 1.7257961847033956, + "language_loss": 0.6837464, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70474422, + "num_input_tokens_seen": 250757235, + "step": 11622, + "time_per_iteration": 2.6078100204467773 + }, + { + "auxiliary_loss_clip": 0.01050996, + "auxiliary_loss_mlp": 0.01026201, + "balance_loss_clip": 1.04267335, + "balance_loss_mlp": 1.01488233, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 1.5428868690585915, + "language_loss": 0.62964404, + "learning_rate": 8.783373729494721e-07, + "loss": 0.65041602, + "num_input_tokens_seen": 250775585, + "step": 11623, + "time_per_iteration": 2.605917453765869 + }, + { + "auxiliary_loss_clip": 0.01113716, + "auxiliary_loss_mlp": 0.01028156, + "balance_loss_clip": 1.03685915, + "balance_loss_mlp": 1.01500702, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 2.0645922652013002, + "language_loss": 0.60967726, + "learning_rate": 8.780149471723932e-07, + "loss": 0.63109595, + "num_input_tokens_seen": 250795725, + "step": 11624, + "time_per_iteration": 2.6210060119628906 + }, + { + "auxiliary_loss_clip": 0.01100164, + "auxiliary_loss_mlp": 0.01043088, + "balance_loss_clip": 1.03518391, + "balance_loss_mlp": 1.02982044, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 2.0129036027574805, + "language_loss": 0.7819308, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80336332, + "num_input_tokens_seen": 250814555, + "step": 11625, + "time_per_iteration": 2.524259567260742 + }, + { + "auxiliary_loss_clip": 0.01076802, + "auxiliary_loss_mlp": 0.0103155, + "balance_loss_clip": 1.03734088, + "balance_loss_mlp": 1.02000475, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 4.327385892915869, + "language_loss": 0.6600889, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68117243, + "num_input_tokens_seen": 250833105, + "step": 11626, + "time_per_iteration": 2.5577335357666016 + }, + { + "auxiliary_loss_clip": 0.01090317, + "auxiliary_loss_mlp": 0.00777398, + "balance_loss_clip": 1.03770924, + "balance_loss_mlp": 1.00063634, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 1.8095778759738192, + "language_loss": 0.70765877, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72633588, + "num_input_tokens_seen": 250852570, + "step": 11627, + "time_per_iteration": 2.5586040019989014 + }, + { + "auxiliary_loss_clip": 0.01107288, + "auxiliary_loss_mlp": 0.01028982, + "balance_loss_clip": 1.03798437, + "balance_loss_mlp": 1.01833701, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 1.7072116592907822, + "language_loss": 0.62626916, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64763188, + "num_input_tokens_seen": 250870500, + "step": 11628, + "time_per_iteration": 2.465705394744873 + }, + { + "auxiliary_loss_clip": 0.011031, + "auxiliary_loss_mlp": 0.01033606, + "balance_loss_clip": 1.03732371, + "balance_loss_mlp": 1.02080262, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.207212262487834, + "language_loss": 0.67953408, + "learning_rate": 8.764034567182581e-07, + "loss": 0.70090115, + "num_input_tokens_seen": 250892745, + "step": 11629, + "time_per_iteration": 2.5732216835021973 + }, + { + "auxiliary_loss_clip": 0.01109857, + "auxiliary_loss_mlp": 0.01038263, + "balance_loss_clip": 1.03805971, + "balance_loss_mlp": 1.0256145, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.7057740602144051, + "language_loss": 0.72691077, + "learning_rate": 8.760812863992337e-07, + "loss": 0.74839199, + "num_input_tokens_seen": 250910225, + "step": 11630, + "time_per_iteration": 2.3986244201660156 + }, + { + "auxiliary_loss_clip": 0.01110338, + "auxiliary_loss_mlp": 0.01037226, + "balance_loss_clip": 1.03911209, + "balance_loss_mlp": 1.02538872, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 1.7214498172291197, + "language_loss": 0.74161512, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76309079, + "num_input_tokens_seen": 250929715, + "step": 11631, + "time_per_iteration": 2.4288344383239746 + }, + { + "auxiliary_loss_clip": 0.01104104, + "auxiliary_loss_mlp": 0.01033395, + "balance_loss_clip": 1.03935993, + "balance_loss_mlp": 1.02032387, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 2.26581526809344, + "language_loss": 0.89142489, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91279995, + "num_input_tokens_seen": 250944230, + "step": 11632, + "time_per_iteration": 2.4158146381378174 + }, + { + "auxiliary_loss_clip": 0.01090964, + "auxiliary_loss_mlp": 0.01036222, + "balance_loss_clip": 1.03865087, + "balance_loss_mlp": 1.02455699, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 1.500091257947911, + "language_loss": 0.80078465, + "learning_rate": 8.751150312056792e-07, + "loss": 0.82205647, + "num_input_tokens_seen": 250961865, + "step": 11633, + "time_per_iteration": 2.4973509311676025 + }, + { + "auxiliary_loss_clip": 0.01114162, + "auxiliary_loss_mlp": 0.01033151, + "balance_loss_clip": 1.03866076, + "balance_loss_mlp": 1.02001452, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 1.8680766285441506, + "language_loss": 0.67280799, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69428116, + "num_input_tokens_seen": 250982025, + "step": 11634, + "time_per_iteration": 3.864320993423462 + }, + { + "auxiliary_loss_clip": 0.01009385, + "auxiliary_loss_mlp": 0.01001773, + "balance_loss_clip": 1.01206446, + "balance_loss_mlp": 1.0001992, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.6908490241730637, + "language_loss": 0.53225625, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55236781, + "num_input_tokens_seen": 251046900, + "step": 11635, + "time_per_iteration": 3.213982582092285 + }, + { + "auxiliary_loss_clip": 0.01089312, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.0420078, + "balance_loss_mlp": 1.01827884, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 2.317616388466827, + "language_loss": 0.82277393, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84397507, + "num_input_tokens_seen": 251065050, + "step": 11636, + "time_per_iteration": 2.4892547130584717 + }, + { + "auxiliary_loss_clip": 0.01112073, + "auxiliary_loss_mlp": 0.01029712, + "balance_loss_clip": 1.03812957, + "balance_loss_mlp": 1.0178268, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 2.796174358298677, + "language_loss": 0.82838345, + "learning_rate": 8.738272881850801e-07, + "loss": 0.84980124, + "num_input_tokens_seen": 251083355, + "step": 11637, + "time_per_iteration": 2.4100334644317627 + }, + { + "auxiliary_loss_clip": 0.01059808, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.03308988, + "balance_loss_mlp": 1.02174926, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 2.100222204325451, + "language_loss": 0.67787623, + "learning_rate": 8.735054591608704e-07, + "loss": 0.69881105, + "num_input_tokens_seen": 251096420, + "step": 11638, + "time_per_iteration": 2.5166175365448 + }, + { + "auxiliary_loss_clip": 0.01105151, + "auxiliary_loss_mlp": 0.01034572, + "balance_loss_clip": 1.03770983, + "balance_loss_mlp": 1.02102363, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 2.399159320722933, + "language_loss": 0.78161585, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80301309, + "num_input_tokens_seen": 251115410, + "step": 11639, + "time_per_iteration": 2.5214850902557373 + }, + { + "auxiliary_loss_clip": 0.01092065, + "auxiliary_loss_mlp": 0.01041705, + "balance_loss_clip": 1.04106247, + "balance_loss_mlp": 1.02848518, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 2.5038944034567177, + "language_loss": 0.82156217, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84289986, + "num_input_tokens_seen": 251133530, + "step": 11640, + "time_per_iteration": 2.5139405727386475 + }, + { + "auxiliary_loss_clip": 0.01076404, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.03497934, + "balance_loss_mlp": 1.01544762, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 1.6605142087938956, + "language_loss": 0.75902283, + "learning_rate": 8.725402284377619e-07, + "loss": 0.78005391, + "num_input_tokens_seen": 251153985, + "step": 11641, + "time_per_iteration": 2.585996627807617 + }, + { + "auxiliary_loss_clip": 0.01088955, + "auxiliary_loss_mlp": 0.01025593, + "balance_loss_clip": 1.03696454, + "balance_loss_mlp": 1.01222956, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 1.9993726879553675, + "language_loss": 0.77706891, + "learning_rate": 8.722185703539022e-07, + "loss": 0.79821444, + "num_input_tokens_seen": 251173225, + "step": 11642, + "time_per_iteration": 2.5146071910858154 + }, + { + "auxiliary_loss_clip": 0.01109399, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.04097414, + "balance_loss_mlp": 1.02058804, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 2.4911433249606945, + "language_loss": 0.74638259, + "learning_rate": 8.718969550356266e-07, + "loss": 0.76782537, + "num_input_tokens_seen": 251192485, + "step": 11643, + "time_per_iteration": 2.519815444946289 + }, + { + "auxiliary_loss_clip": 0.01076966, + "auxiliary_loss_mlp": 0.01024748, + "balance_loss_clip": 1.036057, + "balance_loss_mlp": 1.01230907, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 2.0076948772289525, + "language_loss": 0.60233611, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62335324, + "num_input_tokens_seen": 251214965, + "step": 11644, + "time_per_iteration": 2.595547676086426 + }, + { + "auxiliary_loss_clip": 0.01097842, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.03575945, + "balance_loss_mlp": 1.01770627, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 1.7909322856966345, + "language_loss": 0.82249618, + "learning_rate": 8.712538527446119e-07, + "loss": 0.84376872, + "num_input_tokens_seen": 251234500, + "step": 11645, + "time_per_iteration": 2.469792127609253 + }, + { + "auxiliary_loss_clip": 0.01100225, + "auxiliary_loss_mlp": 0.01031913, + "balance_loss_clip": 1.0367434, + "balance_loss_mlp": 1.01980162, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 1.7715148940031225, + "language_loss": 0.6850754, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70639682, + "num_input_tokens_seen": 251254360, + "step": 11646, + "time_per_iteration": 4.056995868682861 + }, + { + "auxiliary_loss_clip": 0.01094966, + "auxiliary_loss_mlp": 0.01043391, + "balance_loss_clip": 1.03473973, + "balance_loss_mlp": 1.03028989, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 1.487405225906481, + "language_loss": 0.70814389, + "learning_rate": 8.706109216622635e-07, + "loss": 0.72952747, + "num_input_tokens_seen": 251274790, + "step": 11647, + "time_per_iteration": 3.893533229827881 + }, + { + "auxiliary_loss_clip": 0.01102967, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.03907692, + "balance_loss_mlp": 1.02162361, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 1.6988134653955995, + "language_loss": 0.71580362, + "learning_rate": 8.702895203548155e-07, + "loss": 0.7371732, + "num_input_tokens_seen": 251296275, + "step": 11648, + "time_per_iteration": 2.6174871921539307 + }, + { + "auxiliary_loss_clip": 0.01061619, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.03472233, + "balance_loss_mlp": 1.0229249, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 1.8311357640020685, + "language_loss": 0.77183688, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79280686, + "num_input_tokens_seen": 251317375, + "step": 11649, + "time_per_iteration": 2.616408109664917 + }, + { + "auxiliary_loss_clip": 0.01088079, + "auxiliary_loss_mlp": 0.0103007, + "balance_loss_clip": 1.03506708, + "balance_loss_mlp": 1.01809525, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 2.0928875956367072, + "language_loss": 0.78762829, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80880976, + "num_input_tokens_seen": 251333570, + "step": 11650, + "time_per_iteration": 2.4707045555114746 + }, + { + "auxiliary_loss_clip": 0.01087525, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_clip": 1.03648281, + "balance_loss_mlp": 1.01578665, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 2.2181225409219594, + "language_loss": 0.78795409, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80910289, + "num_input_tokens_seen": 251351070, + "step": 11651, + "time_per_iteration": 2.483658790588379 + }, + { + "auxiliary_loss_clip": 0.01077067, + "auxiliary_loss_mlp": 0.01039216, + "balance_loss_clip": 1.03553545, + "balance_loss_mlp": 1.02616298, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.6168535131356612, + "language_loss": 0.69394058, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71510339, + "num_input_tokens_seen": 251370005, + "step": 11652, + "time_per_iteration": 2.5170071125030518 + }, + { + "auxiliary_loss_clip": 0.01101649, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.03802145, + "balance_loss_mlp": 1.01820707, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.3421127978050509, + "language_loss": 0.74306834, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76438653, + "num_input_tokens_seen": 251391210, + "step": 11653, + "time_per_iteration": 2.5132741928100586 + }, + { + "auxiliary_loss_clip": 0.01084965, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.03743601, + "balance_loss_mlp": 1.01883578, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 3.111551609808647, + "language_loss": 0.7037223, + "learning_rate": 8.68362012550003e-07, + "loss": 0.72489816, + "num_input_tokens_seen": 251411505, + "step": 11654, + "time_per_iteration": 2.4889464378356934 + }, + { + "auxiliary_loss_clip": 0.01071184, + "auxiliary_loss_mlp": 0.01029483, + "balance_loss_clip": 1.0405817, + "balance_loss_mlp": 1.01602459, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 5.458924097633404, + "language_loss": 0.72911441, + "learning_rate": 8.680409113695453e-07, + "loss": 0.75012112, + "num_input_tokens_seen": 251428975, + "step": 11655, + "time_per_iteration": 2.5571749210357666 + }, + { + "auxiliary_loss_clip": 0.01110431, + "auxiliary_loss_mlp": 0.0104064, + "balance_loss_clip": 1.04124737, + "balance_loss_mlp": 1.02687144, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 2.440955546122827, + "language_loss": 0.70471591, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72622663, + "num_input_tokens_seen": 251446940, + "step": 11656, + "time_per_iteration": 2.4761500358581543 + }, + { + "auxiliary_loss_clip": 0.01066582, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.03495169, + "balance_loss_mlp": 1.01774287, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.6974501585722648, + "language_loss": 0.78005463, + "learning_rate": 8.673988377928092e-07, + "loss": 0.80101353, + "num_input_tokens_seen": 251466205, + "step": 11657, + "time_per_iteration": 2.6312596797943115 + }, + { + "auxiliary_loss_clip": 0.01116582, + "auxiliary_loss_mlp": 0.01035989, + "balance_loss_clip": 1.03890288, + "balance_loss_mlp": 1.02220297, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 2.0236467299123095, + "language_loss": 0.78080893, + "learning_rate": 8.670778654208797e-07, + "loss": 0.80233467, + "num_input_tokens_seen": 251484820, + "step": 11658, + "time_per_iteration": 3.934671640396118 + }, + { + "auxiliary_loss_clip": 0.01085132, + "auxiliary_loss_mlp": 0.01026989, + "balance_loss_clip": 1.03408504, + "balance_loss_mlp": 1.0146687, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 1.8570421366072944, + "language_loss": 0.83105832, + "learning_rate": 8.667569360094713e-07, + "loss": 0.85217953, + "num_input_tokens_seen": 251502670, + "step": 11659, + "time_per_iteration": 2.4924213886260986 + }, + { + "auxiliary_loss_clip": 0.01069803, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.03465331, + "balance_loss_mlp": 1.01611495, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 1.8982707009009574, + "language_loss": 0.6937502, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71473002, + "num_input_tokens_seen": 251521630, + "step": 11660, + "time_per_iteration": 2.519771099090576 + }, + { + "auxiliary_loss_clip": 0.01111974, + "auxiliary_loss_mlp": 0.01035457, + "balance_loss_clip": 1.03610611, + "balance_loss_mlp": 1.0221231, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 3.1629669416381003, + "language_loss": 0.81106544, + "learning_rate": 8.661152061168924e-07, + "loss": 0.83253968, + "num_input_tokens_seen": 251540105, + "step": 11661, + "time_per_iteration": 2.42181658744812 + }, + { + "auxiliary_loss_clip": 0.01098307, + "auxiliary_loss_mlp": 0.01034199, + "balance_loss_clip": 1.03533292, + "balance_loss_mlp": 1.02203941, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 1.7900611276703713, + "language_loss": 0.79283327, + "learning_rate": 8.657944056600579e-07, + "loss": 0.81415832, + "num_input_tokens_seen": 251560530, + "step": 11662, + "time_per_iteration": 2.5919570922851562 + }, + { + "auxiliary_loss_clip": 0.01099719, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.03627574, + "balance_loss_mlp": 1.01941299, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 2.056325662110884, + "language_loss": 0.8338089, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85514402, + "num_input_tokens_seen": 251577930, + "step": 11663, + "time_per_iteration": 2.4343936443328857 + }, + { + "auxiliary_loss_clip": 0.01024327, + "auxiliary_loss_mlp": 0.01000573, + "balance_loss_clip": 1.01117301, + "balance_loss_mlp": 0.99938697, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8609331822557627, + "language_loss": 0.53749251, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55774152, + "num_input_tokens_seen": 251638820, + "step": 11664, + "time_per_iteration": 3.0363171100616455 + }, + { + "auxiliary_loss_clip": 0.01090358, + "auxiliary_loss_mlp": 0.0103232, + "balance_loss_clip": 1.03458416, + "balance_loss_mlp": 1.01915908, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 2.7463811796416167, + "language_loss": 0.78984034, + "learning_rate": 8.64832262393344e-07, + "loss": 0.8110671, + "num_input_tokens_seen": 251658070, + "step": 11665, + "time_per_iteration": 2.568904399871826 + }, + { + "auxiliary_loss_clip": 0.01096778, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.03429294, + "balance_loss_mlp": 1.01845765, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 2.127396307263423, + "language_loss": 0.76933759, + "learning_rate": 8.645116340462404e-07, + "loss": 0.790622, + "num_input_tokens_seen": 251671575, + "step": 11666, + "time_per_iteration": 2.4690849781036377 + }, + { + "auxiliary_loss_clip": 0.01097177, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.03570652, + "balance_loss_mlp": 1.02127707, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 2.0254518838732385, + "language_loss": 0.81010926, + "learning_rate": 8.641910487569695e-07, + "loss": 0.83141738, + "num_input_tokens_seen": 251689350, + "step": 11667, + "time_per_iteration": 2.466310739517212 + }, + { + "auxiliary_loss_clip": 0.01078888, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.03588247, + "balance_loss_mlp": 1.02395809, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 2.281382502625684, + "language_loss": 0.65341324, + "learning_rate": 8.638705065376879e-07, + "loss": 0.67456424, + "num_input_tokens_seen": 251704635, + "step": 11668, + "time_per_iteration": 2.5688095092773438 + }, + { + "auxiliary_loss_clip": 0.01092253, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.03568721, + "balance_loss_mlp": 1.01790524, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 2.0217102214236653, + "language_loss": 0.76679826, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78802794, + "num_input_tokens_seen": 251723035, + "step": 11669, + "time_per_iteration": 2.5415730476379395 + }, + { + "auxiliary_loss_clip": 0.01021127, + "auxiliary_loss_mlp": 0.01004057, + "balance_loss_clip": 1.01577234, + "balance_loss_mlp": 1.00275803, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.6941504981075698, + "language_loss": 0.54496139, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56521326, + "num_input_tokens_seen": 251791630, + "step": 11670, + "time_per_iteration": 3.2009010314941406 + }, + { + "auxiliary_loss_clip": 0.01091607, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.04387641, + "balance_loss_mlp": 1.02502775, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.6258088298518767, + "language_loss": 0.81760466, + "learning_rate": 8.629091384213218e-07, + "loss": 0.83889836, + "num_input_tokens_seen": 251809840, + "step": 11671, + "time_per_iteration": 2.5026869773864746 + }, + { + "auxiliary_loss_clip": 0.01105808, + "auxiliary_loss_mlp": 0.01032521, + "balance_loss_clip": 1.03949428, + "balance_loss_mlp": 1.01972961, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 3.0992320745218103, + "language_loss": 0.75370979, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77509308, + "num_input_tokens_seen": 251827550, + "step": 11672, + "time_per_iteration": 2.4370498657226562 + }, + { + "auxiliary_loss_clip": 0.01096361, + "auxiliary_loss_mlp": 0.01035915, + "balance_loss_clip": 1.03503084, + "balance_loss_mlp": 1.022856, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 2.2452680465690356, + "language_loss": 0.8699652, + "learning_rate": 8.622684419164883e-07, + "loss": 0.89128786, + "num_input_tokens_seen": 251844880, + "step": 11673, + "time_per_iteration": 2.4614601135253906 + }, + { + "auxiliary_loss_clip": 0.01096094, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.03411412, + "balance_loss_mlp": 1.01950133, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 1.902015698707555, + "language_loss": 0.7303772, + "learning_rate": 8.619481583723399e-07, + "loss": 0.75165808, + "num_input_tokens_seen": 251861025, + "step": 11674, + "time_per_iteration": 3.8689568042755127 + }, + { + "auxiliary_loss_clip": 0.01093969, + "auxiliary_loss_mlp": 0.00775848, + "balance_loss_clip": 1.03909278, + "balance_loss_mlp": 1.00051737, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 1.8577399866890068, + "language_loss": 0.72090513, + "learning_rate": 8.616279179832329e-07, + "loss": 0.73960328, + "num_input_tokens_seen": 251880175, + "step": 11675, + "time_per_iteration": 2.47851300239563 + }, + { + "auxiliary_loss_clip": 0.01075233, + "auxiliary_loss_mlp": 0.01028773, + "balance_loss_clip": 1.03584385, + "balance_loss_mlp": 1.01594591, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 2.478212259452434, + "language_loss": 0.50965953, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53069955, + "num_input_tokens_seen": 251899005, + "step": 11676, + "time_per_iteration": 2.5148894786834717 + }, + { + "auxiliary_loss_clip": 0.01018263, + "auxiliary_loss_mlp": 0.00754302, + "balance_loss_clip": 1.01505637, + "balance_loss_mlp": 1.00047302, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7295322239093925, + "language_loss": 0.59166378, + "learning_rate": 8.609875667187079e-07, + "loss": 0.60938942, + "num_input_tokens_seen": 251966790, + "step": 11677, + "time_per_iteration": 3.124403715133667 + }, + { + "auxiliary_loss_clip": 0.01099431, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.03506827, + "balance_loss_mlp": 1.02120018, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 2.050863011132652, + "language_loss": 0.62664914, + "learning_rate": 8.606674558675737e-07, + "loss": 0.64799905, + "num_input_tokens_seen": 251989315, + "step": 11678, + "time_per_iteration": 2.513248920440674 + }, + { + "auxiliary_loss_clip": 0.0111, + "auxiliary_loss_mlp": 0.01032263, + "balance_loss_clip": 1.0369668, + "balance_loss_mlp": 1.01956749, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 1.6588087507903289, + "language_loss": 0.79530489, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81672752, + "num_input_tokens_seen": 252006620, + "step": 11679, + "time_per_iteration": 2.40425181388855 + }, + { + "auxiliary_loss_clip": 0.01086133, + "auxiliary_loss_mlp": 0.01047217, + "balance_loss_clip": 1.03599274, + "balance_loss_mlp": 1.03416371, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.1675181816536946, + "language_loss": 0.71138406, + "learning_rate": 8.600273637882567e-07, + "loss": 0.73271751, + "num_input_tokens_seen": 252024570, + "step": 11680, + "time_per_iteration": 2.46807861328125 + }, + { + "auxiliary_loss_clip": 0.0107578, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.0340662, + "balance_loss_mlp": 1.02070594, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 1.8530272272070447, + "language_loss": 0.7491594, + "learning_rate": 8.597073825843446e-07, + "loss": 0.77026051, + "num_input_tokens_seen": 252042775, + "step": 11681, + "time_per_iteration": 2.5062367916107178 + }, + { + "auxiliary_loss_clip": 0.01093709, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.03806829, + "balance_loss_mlp": 1.01909971, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 1.5010155618985697, + "language_loss": 0.76688838, + "learning_rate": 8.593874446204434e-07, + "loss": 0.7881335, + "num_input_tokens_seen": 252063690, + "step": 11682, + "time_per_iteration": 2.5825023651123047 + }, + { + "auxiliary_loss_clip": 0.01080601, + "auxiliary_loss_mlp": 0.00779126, + "balance_loss_clip": 1.03648162, + "balance_loss_mlp": 1.00057793, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 2.131642529131338, + "language_loss": 0.73775816, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75635552, + "num_input_tokens_seen": 252080335, + "step": 11683, + "time_per_iteration": 2.5034985542297363 + }, + { + "auxiliary_loss_clip": 0.01078254, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.03804553, + "balance_loss_mlp": 1.01875794, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 1.8238720357285299, + "language_loss": 0.7130748, + "learning_rate": 8.587476984611976e-07, + "loss": 0.73417628, + "num_input_tokens_seen": 252101075, + "step": 11684, + "time_per_iteration": 4.04178524017334 + }, + { + "auxiliary_loss_clip": 0.01102758, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.03823924, + "balance_loss_mlp": 1.01878929, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 1.8756279484793428, + "language_loss": 0.7210682, + "learning_rate": 8.584278902901128e-07, + "loss": 0.74241924, + "num_input_tokens_seen": 252120510, + "step": 11685, + "time_per_iteration": 2.490433931350708 + }, + { + "auxiliary_loss_clip": 0.01099767, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.03466392, + "balance_loss_mlp": 1.01858652, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 2.5054921344818175, + "language_loss": 0.84403956, + "learning_rate": 8.581081254075582e-07, + "loss": 0.8653447, + "num_input_tokens_seen": 252137590, + "step": 11686, + "time_per_iteration": 3.9111037254333496 + }, + { + "auxiliary_loss_clip": 0.01027016, + "auxiliary_loss_mlp": 0.01003885, + "balance_loss_clip": 1.01775992, + "balance_loss_mlp": 1.0027703, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.9775368839820341, + "language_loss": 0.69891578, + "learning_rate": 8.577884038256566e-07, + "loss": 0.71922481, + "num_input_tokens_seen": 252199830, + "step": 11687, + "time_per_iteration": 3.187086820602417 + }, + { + "auxiliary_loss_clip": 0.01076808, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.03351903, + "balance_loss_mlp": 1.01592159, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 2.088829558668863, + "language_loss": 0.77314395, + "learning_rate": 8.574687255565329e-07, + "loss": 0.79420209, + "num_input_tokens_seen": 252217200, + "step": 11688, + "time_per_iteration": 2.5361263751983643 + }, + { + "auxiliary_loss_clip": 0.0111133, + "auxiliary_loss_mlp": 0.01033173, + "balance_loss_clip": 1.03715134, + "balance_loss_mlp": 1.0205251, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.019385973518352, + "language_loss": 0.68506289, + "learning_rate": 8.571490906123107e-07, + "loss": 0.70650792, + "num_input_tokens_seen": 252236105, + "step": 11689, + "time_per_iteration": 2.466540813446045 + }, + { + "auxiliary_loss_clip": 0.01095766, + "auxiliary_loss_mlp": 0.01038624, + "balance_loss_clip": 1.04076397, + "balance_loss_mlp": 1.02477741, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 3.167427583823211, + "language_loss": 0.79920155, + "learning_rate": 8.568294990051086e-07, + "loss": 0.82054543, + "num_input_tokens_seen": 252253315, + "step": 11690, + "time_per_iteration": 2.4716269969940186 + }, + { + "auxiliary_loss_clip": 0.01112508, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.038427, + "balance_loss_mlp": 1.02065659, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 1.717758125518239, + "language_loss": 0.75847495, + "learning_rate": 8.56509950747047e-07, + "loss": 0.7799328, + "num_input_tokens_seen": 252272765, + "step": 11691, + "time_per_iteration": 2.4167563915252686 + }, + { + "auxiliary_loss_clip": 0.01087638, + "auxiliary_loss_mlp": 0.0102745, + "balance_loss_clip": 1.03646362, + "balance_loss_mlp": 1.01549315, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 2.146779808066257, + "language_loss": 0.81472635, + "learning_rate": 8.561904458502429e-07, + "loss": 0.83587724, + "num_input_tokens_seen": 252290510, + "step": 11692, + "time_per_iteration": 2.4996814727783203 + }, + { + "auxiliary_loss_clip": 0.01084114, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.03506374, + "balance_loss_mlp": 1.01745677, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 1.694600390442854, + "language_loss": 0.7634123, + "learning_rate": 8.558709843268111e-07, + "loss": 0.78455549, + "num_input_tokens_seen": 252309365, + "step": 11693, + "time_per_iteration": 2.5030198097229004 + }, + { + "auxiliary_loss_clip": 0.01086016, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.03857136, + "balance_loss_mlp": 1.02009785, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 1.5689589583885626, + "language_loss": 0.68292803, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70411426, + "num_input_tokens_seen": 252333010, + "step": 11694, + "time_per_iteration": 2.6582651138305664 + }, + { + "auxiliary_loss_clip": 0.01111284, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.0367589, + "balance_loss_mlp": 1.02077401, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 2.259721989098259, + "language_loss": 0.75917143, + "learning_rate": 8.552321914485203e-07, + "loss": 0.7806164, + "num_input_tokens_seen": 252351330, + "step": 11695, + "time_per_iteration": 2.389888286590576 + }, + { + "auxiliary_loss_clip": 0.01090938, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.04062915, + "balance_loss_mlp": 1.02693057, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 1.9947853536625757, + "language_loss": 0.73901862, + "learning_rate": 8.549128601178852e-07, + "loss": 0.76032937, + "num_input_tokens_seen": 252369580, + "step": 11696, + "time_per_iteration": 2.4929399490356445 + }, + { + "auxiliary_loss_clip": 0.0109655, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.0386529, + "balance_loss_mlp": 1.01709795, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.5703044913820317, + "language_loss": 0.7558161, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77708423, + "num_input_tokens_seen": 252390525, + "step": 11697, + "time_per_iteration": 2.546316385269165 + }, + { + "auxiliary_loss_clip": 0.01065205, + "auxiliary_loss_mlp": 0.01039637, + "balance_loss_clip": 1.03889966, + "balance_loss_mlp": 1.02434254, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 1.8754346772826935, + "language_loss": 0.80857098, + "learning_rate": 8.542743277341793e-07, + "loss": 0.82961941, + "num_input_tokens_seen": 252407470, + "step": 11698, + "time_per_iteration": 4.1074957847595215 + }, + { + "auxiliary_loss_clip": 0.01083372, + "auxiliary_loss_mlp": 0.01039878, + "balance_loss_clip": 1.03354943, + "balance_loss_mlp": 1.02516794, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 1.3902386766434596, + "language_loss": 0.84477812, + "learning_rate": 8.539551267053222e-07, + "loss": 0.86601067, + "num_input_tokens_seen": 252427025, + "step": 11699, + "time_per_iteration": 2.47949481010437 + }, + { + "auxiliary_loss_clip": 0.01095815, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.03579664, + "balance_loss_mlp": 1.0172255, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 1.7817135570572726, + "language_loss": 0.789505, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81077659, + "num_input_tokens_seen": 252445410, + "step": 11700, + "time_per_iteration": 2.462397575378418 + }, + { + "auxiliary_loss_clip": 0.01100883, + "auxiliary_loss_mlp": 0.01026037, + "balance_loss_clip": 1.03696287, + "balance_loss_mlp": 1.01282287, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 2.057998483837266, + "language_loss": 0.74769044, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76895964, + "num_input_tokens_seen": 252463905, + "step": 11701, + "time_per_iteration": 2.564664363861084 + }, + { + "auxiliary_loss_clip": 0.01105389, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.038975, + "balance_loss_mlp": 1.01440239, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 2.196566825528594, + "language_loss": 0.84278357, + "learning_rate": 8.529977844159769e-07, + "loss": 0.86412156, + "num_input_tokens_seen": 252478655, + "step": 11702, + "time_per_iteration": 2.4041295051574707 + }, + { + "auxiliary_loss_clip": 0.01111321, + "auxiliary_loss_mlp": 0.0103966, + "balance_loss_clip": 1.03650784, + "balance_loss_mlp": 1.02590883, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 1.712936250173259, + "language_loss": 0.61121976, + "learning_rate": 8.526787572922738e-07, + "loss": 0.63272959, + "num_input_tokens_seen": 252498740, + "step": 11703, + "time_per_iteration": 2.4318740367889404 + }, + { + "auxiliary_loss_clip": 0.01109725, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.03497636, + "balance_loss_mlp": 1.01504767, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 2.161233992349614, + "language_loss": 0.61292648, + "learning_rate": 8.523597736751067e-07, + "loss": 0.63430995, + "num_input_tokens_seen": 252517800, + "step": 11704, + "time_per_iteration": 2.476705312728882 + }, + { + "auxiliary_loss_clip": 0.01096195, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.03957558, + "balance_loss_mlp": 1.02062201, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 1.6607012136248012, + "language_loss": 0.70652425, + "learning_rate": 8.520408335765719e-07, + "loss": 0.72780442, + "num_input_tokens_seen": 252539620, + "step": 11705, + "time_per_iteration": 2.5454349517822266 + }, + { + "auxiliary_loss_clip": 0.01098994, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.03672814, + "balance_loss_mlp": 1.01864696, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 2.483215676698111, + "language_loss": 0.61974651, + "learning_rate": 8.517219370087645e-07, + "loss": 0.64104605, + "num_input_tokens_seen": 252557300, + "step": 11706, + "time_per_iteration": 2.444932699203491 + }, + { + "auxiliary_loss_clip": 0.01102483, + "auxiliary_loss_mlp": 0.01027201, + "balance_loss_clip": 1.03788495, + "balance_loss_mlp": 1.01498818, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 2.028220785260117, + "language_loss": 0.68074811, + "learning_rate": 8.514030839837756e-07, + "loss": 0.70204496, + "num_input_tokens_seen": 252576715, + "step": 11707, + "time_per_iteration": 2.4838242530822754 + }, + { + "auxiliary_loss_clip": 0.01108786, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.03636789, + "balance_loss_mlp": 1.01871061, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 1.9554449012851347, + "language_loss": 0.76842809, + "learning_rate": 8.510842745136974e-07, + "loss": 0.7898221, + "num_input_tokens_seen": 252596190, + "step": 11708, + "time_per_iteration": 2.458420753479004 + }, + { + "auxiliary_loss_clip": 0.01089037, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.03576624, + "balance_loss_mlp": 1.0167824, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 1.8885588415368968, + "language_loss": 0.72152227, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74270201, + "num_input_tokens_seen": 252613410, + "step": 11709, + "time_per_iteration": 2.4712488651275635 + }, + { + "auxiliary_loss_clip": 0.01096725, + "auxiliary_loss_mlp": 0.01027612, + "balance_loss_clip": 1.0344187, + "balance_loss_mlp": 1.0156076, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.4026128081669067, + "language_loss": 0.79068381, + "learning_rate": 8.504467862866267e-07, + "loss": 0.8119272, + "num_input_tokens_seen": 252629150, + "step": 11710, + "time_per_iteration": 2.4231672286987305 + }, + { + "auxiliary_loss_clip": 0.01103189, + "auxiliary_loss_mlp": 0.0103197, + "balance_loss_clip": 1.03783035, + "balance_loss_mlp": 1.01881528, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.5886918218368709, + "language_loss": 0.77107787, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79242945, + "num_input_tokens_seen": 252648225, + "step": 11711, + "time_per_iteration": 2.4646897315979004 + }, + { + "auxiliary_loss_clip": 0.01075427, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.03429842, + "balance_loss_mlp": 1.01763511, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 2.5613912416437636, + "language_loss": 0.74528933, + "learning_rate": 8.498094724242457e-07, + "loss": 0.76633286, + "num_input_tokens_seen": 252665380, + "step": 11712, + "time_per_iteration": 2.484421730041504 + }, + { + "auxiliary_loss_clip": 0.00994675, + "auxiliary_loss_mlp": 0.01006133, + "balance_loss_clip": 1.00923645, + "balance_loss_mlp": 1.00475621, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.8897010566205292, + "language_loss": 0.64733183, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66733992, + "num_input_tokens_seen": 252727950, + "step": 11713, + "time_per_iteration": 3.1365067958831787 + }, + { + "auxiliary_loss_clip": 0.01095863, + "auxiliary_loss_mlp": 0.01027833, + "balance_loss_clip": 1.03374052, + "balance_loss_mlp": 1.01587033, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 2.0042352143847335, + "language_loss": 0.72885394, + "learning_rate": 8.49172333023225e-07, + "loss": 0.75009084, + "num_input_tokens_seen": 252746770, + "step": 11714, + "time_per_iteration": 3.9336512088775635 + }, + { + "auxiliary_loss_clip": 0.01083529, + "auxiliary_loss_mlp": 0.00778059, + "balance_loss_clip": 1.03508556, + "balance_loss_mlp": 1.00053346, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 1.7916523876465356, + "language_loss": 0.79425561, + "learning_rate": 8.488538287759248e-07, + "loss": 0.81287146, + "num_input_tokens_seen": 252765610, + "step": 11715, + "time_per_iteration": 2.48049259185791 + }, + { + "auxiliary_loss_clip": 0.01084652, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.03667116, + "balance_loss_mlp": 1.02485776, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 2.469067907522702, + "language_loss": 0.71390212, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73513114, + "num_input_tokens_seen": 252781610, + "step": 11716, + "time_per_iteration": 2.4554312229156494 + }, + { + "auxiliary_loss_clip": 0.01080468, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.04076946, + "balance_loss_mlp": 1.0186516, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 2.0425201742728984, + "language_loss": 0.66389352, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68501073, + "num_input_tokens_seen": 252800600, + "step": 11717, + "time_per_iteration": 2.6826696395874023 + }, + { + "auxiliary_loss_clip": 0.01111647, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.0372088, + "balance_loss_mlp": 1.01895237, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 1.4983195909114544, + "language_loss": 0.74202323, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76345062, + "num_input_tokens_seen": 252822310, + "step": 11718, + "time_per_iteration": 2.472723960876465 + }, + { + "auxiliary_loss_clip": 0.01097757, + "auxiliary_loss_mlp": 0.01028289, + "balance_loss_clip": 1.03522372, + "balance_loss_mlp": 1.01657081, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 1.9057454472365318, + "language_loss": 0.80058289, + "learning_rate": 8.475802484232606e-07, + "loss": 0.82184339, + "num_input_tokens_seen": 252842355, + "step": 11719, + "time_per_iteration": 2.4954006671905518 + }, + { + "auxiliary_loss_clip": 0.01099491, + "auxiliary_loss_mlp": 0.01037983, + "balance_loss_clip": 1.03688097, + "balance_loss_mlp": 1.02506077, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 2.1066777416330833, + "language_loss": 0.65710121, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67847592, + "num_input_tokens_seen": 252866785, + "step": 11720, + "time_per_iteration": 2.647274971008301 + }, + { + "auxiliary_loss_clip": 0.01094997, + "auxiliary_loss_mlp": 0.01029936, + "balance_loss_clip": 1.0397141, + "balance_loss_mlp": 1.01718104, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 1.965717549877096, + "language_loss": 0.79866427, + "learning_rate": 8.46943720397872e-07, + "loss": 0.81991363, + "num_input_tokens_seen": 252881870, + "step": 11721, + "time_per_iteration": 2.4685440063476562 + }, + { + "auxiliary_loss_clip": 0.01001291, + "auxiliary_loss_mlp": 0.01002627, + "balance_loss_clip": 1.00941598, + "balance_loss_mlp": 1.00146472, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7687937282861531, + "language_loss": 0.6481573, + "learning_rate": 8.466255219651582e-07, + "loss": 0.6681965, + "num_input_tokens_seen": 252951300, + "step": 11722, + "time_per_iteration": 3.2647030353546143 + }, + { + "auxiliary_loss_clip": 0.01093557, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.03860998, + "balance_loss_mlp": 1.01797724, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 2.8775526150902406, + "language_loss": 0.65609074, + "learning_rate": 8.463073672685211e-07, + "loss": 0.67732775, + "num_input_tokens_seen": 252971400, + "step": 11723, + "time_per_iteration": 2.550457000732422 + }, + { + "auxiliary_loss_clip": 0.01083716, + "auxiliary_loss_mlp": 0.01030158, + "balance_loss_clip": 1.03728807, + "balance_loss_mlp": 1.01740289, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.9783177342979144, + "language_loss": 0.81301039, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83414912, + "num_input_tokens_seen": 252989475, + "step": 11724, + "time_per_iteration": 4.013768911361694 + }, + { + "auxiliary_loss_clip": 0.01101012, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.03662682, + "balance_loss_mlp": 1.02109265, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 2.36904976361577, + "language_loss": 0.7340771, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75542414, + "num_input_tokens_seen": 253007220, + "step": 11725, + "time_per_iteration": 3.870364189147949 + }, + { + "auxiliary_loss_clip": 0.0106903, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.03325224, + "balance_loss_mlp": 1.01593852, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 2.3949274000862792, + "language_loss": 0.78004974, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80103898, + "num_input_tokens_seen": 253025410, + "step": 11726, + "time_per_iteration": 2.555119514465332 + }, + { + "auxiliary_loss_clip": 0.01091809, + "auxiliary_loss_mlp": 0.01032896, + "balance_loss_clip": 1.03606272, + "balance_loss_mlp": 1.0207305, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 2.0907804013422933, + "language_loss": 0.70520627, + "learning_rate": 8.450351860839931e-07, + "loss": 0.7264533, + "num_input_tokens_seen": 253043305, + "step": 11727, + "time_per_iteration": 2.486119270324707 + }, + { + "auxiliary_loss_clip": 0.01102211, + "auxiliary_loss_mlp": 0.00776392, + "balance_loss_clip": 1.03358209, + "balance_loss_mlp": 1.00041056, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 1.4485093914726457, + "language_loss": 0.69186425, + "learning_rate": 8.44717250248668e-07, + "loss": 0.71065032, + "num_input_tokens_seen": 253062790, + "step": 11728, + "time_per_iteration": 2.483318328857422 + }, + { + "auxiliary_loss_clip": 0.01079667, + "auxiliary_loss_mlp": 0.00777185, + "balance_loss_clip": 1.03680205, + "balance_loss_mlp": 1.00051987, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 1.8592527966020622, + "language_loss": 0.73377204, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75234056, + "num_input_tokens_seen": 253082055, + "step": 11729, + "time_per_iteration": 2.589953660964966 + }, + { + "auxiliary_loss_clip": 0.01101429, + "auxiliary_loss_mlp": 0.01030042, + "balance_loss_clip": 1.04136264, + "balance_loss_mlp": 1.01687574, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.5464880152156275, + "language_loss": 0.779405, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80071974, + "num_input_tokens_seen": 253102575, + "step": 11730, + "time_per_iteration": 2.542104482650757 + }, + { + "auxiliary_loss_clip": 0.01112059, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.03663957, + "balance_loss_mlp": 1.02052927, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 2.018569442182835, + "language_loss": 0.63542747, + "learning_rate": 8.437637056415359e-07, + "loss": 0.65687883, + "num_input_tokens_seen": 253121290, + "step": 11731, + "time_per_iteration": 2.426828145980835 + }, + { + "auxiliary_loss_clip": 0.01062279, + "auxiliary_loss_mlp": 0.01030378, + "balance_loss_clip": 1.03766179, + "balance_loss_mlp": 1.01687741, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 2.014065806714385, + "language_loss": 0.74622715, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76715374, + "num_input_tokens_seen": 253139720, + "step": 11732, + "time_per_iteration": 2.5454330444335938 + }, + { + "auxiliary_loss_clip": 0.01100382, + "auxiliary_loss_mlp": 0.01031032, + "balance_loss_clip": 1.03765059, + "balance_loss_mlp": 1.01900423, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 2.8854307995051256, + "language_loss": 0.71271777, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73403192, + "num_input_tokens_seen": 253160250, + "step": 11733, + "time_per_iteration": 2.4692130088806152 + }, + { + "auxiliary_loss_clip": 0.01073802, + "auxiliary_loss_mlp": 0.01030883, + "balance_loss_clip": 1.03513384, + "balance_loss_mlp": 1.01878905, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 2.1871377656278654, + "language_loss": 0.73895031, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75999713, + "num_input_tokens_seen": 253178710, + "step": 11734, + "time_per_iteration": 2.4990506172180176 + }, + { + "auxiliary_loss_clip": 0.0108065, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.03728116, + "balance_loss_mlp": 1.0305264, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.763510263605912, + "language_loss": 0.69108701, + "learning_rate": 8.424929267125829e-07, + "loss": 0.71235025, + "num_input_tokens_seen": 253194805, + "step": 11735, + "time_per_iteration": 2.4755215644836426 + }, + { + "auxiliary_loss_clip": 0.01085741, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.03489876, + "balance_loss_mlp": 1.02306557, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 1.8303831276901152, + "language_loss": 0.72621322, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74745041, + "num_input_tokens_seen": 253213895, + "step": 11736, + "time_per_iteration": 2.5019795894622803 + }, + { + "auxiliary_loss_clip": 0.01089904, + "auxiliary_loss_mlp": 0.01027587, + "balance_loss_clip": 1.03564143, + "balance_loss_mlp": 1.01583874, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 2.46081434839755, + "language_loss": 0.69012493, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71129978, + "num_input_tokens_seen": 253231620, + "step": 11737, + "time_per_iteration": 2.5062503814697266 + }, + { + "auxiliary_loss_clip": 0.01078125, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.03807271, + "balance_loss_mlp": 1.0259043, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 2.4469336359077203, + "language_loss": 0.67978489, + "learning_rate": 8.415403033479332e-07, + "loss": 0.70095122, + "num_input_tokens_seen": 253249590, + "step": 11738, + "time_per_iteration": 3.976874351501465 + }, + { + "auxiliary_loss_clip": 0.01112327, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.03811574, + "balance_loss_mlp": 1.02030325, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 1.8155978011020093, + "language_loss": 0.74759328, + "learning_rate": 8.41222850068145e-07, + "loss": 0.76905155, + "num_input_tokens_seen": 253273870, + "step": 11739, + "time_per_iteration": 2.675320863723755 + }, + { + "auxiliary_loss_clip": 0.01083425, + "auxiliary_loss_mlp": 0.00778293, + "balance_loss_clip": 1.03431737, + "balance_loss_mlp": 1.00047934, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 1.6846288366110926, + "language_loss": 0.713027, + "learning_rate": 8.409054407293032e-07, + "loss": 0.73164421, + "num_input_tokens_seen": 253293720, + "step": 11740, + "time_per_iteration": 2.529585599899292 + }, + { + "auxiliary_loss_clip": 0.01079212, + "auxiliary_loss_mlp": 0.01029426, + "balance_loss_clip": 1.03693271, + "balance_loss_mlp": 1.01842272, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.9389411242838903, + "language_loss": 0.82262117, + "learning_rate": 8.405880753434434e-07, + "loss": 0.84370756, + "num_input_tokens_seen": 253313700, + "step": 11741, + "time_per_iteration": 2.5290486812591553 + }, + { + "auxiliary_loss_clip": 0.01088967, + "auxiliary_loss_mlp": 0.01028478, + "balance_loss_clip": 1.0366447, + "balance_loss_mlp": 1.01630116, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 2.2101686833033733, + "language_loss": 0.7806167, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80179119, + "num_input_tokens_seen": 253332425, + "step": 11742, + "time_per_iteration": 2.5316569805145264 + }, + { + "auxiliary_loss_clip": 0.01114356, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.0376507, + "balance_loss_mlp": 1.01882887, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 1.9296743273327537, + "language_loss": 0.64352, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66498345, + "num_input_tokens_seen": 253353620, + "step": 11743, + "time_per_iteration": 2.4865236282348633 + }, + { + "auxiliary_loss_clip": 0.01086425, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.0329895, + "balance_loss_mlp": 1.02120066, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 1.929340576777945, + "language_loss": 0.65832543, + "learning_rate": 8.396362430240902e-07, + "loss": 0.67954028, + "num_input_tokens_seen": 253370930, + "step": 11744, + "time_per_iteration": 2.500223159790039 + }, + { + "auxiliary_loss_clip": 0.01099642, + "auxiliary_loss_mlp": 0.01033151, + "balance_loss_clip": 1.03617072, + "balance_loss_mlp": 1.02036023, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 2.437864138231368, + "language_loss": 0.63830793, + "learning_rate": 8.393190535704857e-07, + "loss": 0.6596359, + "num_input_tokens_seen": 253389810, + "step": 11745, + "time_per_iteration": 2.4549448490142822 + }, + { + "auxiliary_loss_clip": 0.01080386, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.03500962, + "balance_loss_mlp": 1.02178288, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 1.6796667074761726, + "language_loss": 0.72024506, + "learning_rate": 8.390019081300188e-07, + "loss": 0.74138993, + "num_input_tokens_seen": 253408685, + "step": 11746, + "time_per_iteration": 2.5888922214508057 + }, + { + "auxiliary_loss_clip": 0.01061127, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.04214966, + "balance_loss_mlp": 1.02508736, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 2.200405592453935, + "language_loss": 0.79214281, + "learning_rate": 8.386848067147175e-07, + "loss": 0.8131305, + "num_input_tokens_seen": 253429685, + "step": 11747, + "time_per_iteration": 2.6551477909088135 + }, + { + "auxiliary_loss_clip": 0.01099492, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.03670812, + "balance_loss_mlp": 1.02336252, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 1.8530728541648387, + "language_loss": 0.65783823, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67918313, + "num_input_tokens_seen": 253448260, + "step": 11748, + "time_per_iteration": 2.4759902954101562 + }, + { + "auxiliary_loss_clip": 0.01070762, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.03757524, + "balance_loss_mlp": 1.02205992, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 2.244538691733073, + "language_loss": 0.79851365, + "learning_rate": 8.380507360077003e-07, + "loss": 0.81956661, + "num_input_tokens_seen": 253467725, + "step": 11749, + "time_per_iteration": 2.5758352279663086 + }, + { + "auxiliary_loss_clip": 0.01028912, + "auxiliary_loss_mlp": 0.01001625, + "balance_loss_clip": 1.00502706, + "balance_loss_mlp": 1.00043869, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.8175458845867998, + "language_loss": 0.54038078, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56068611, + "num_input_tokens_seen": 253526940, + "step": 11750, + "time_per_iteration": 2.9634459018707275 + }, + { + "auxiliary_loss_clip": 0.01091037, + "auxiliary_loss_mlp": 0.01034862, + "balance_loss_clip": 1.03714418, + "balance_loss_mlp": 1.02236843, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 1.7075560072262015, + "language_loss": 0.78974402, + "learning_rate": 8.37416841545612e-07, + "loss": 0.81100309, + "num_input_tokens_seen": 253546160, + "step": 11751, + "time_per_iteration": 2.5465426445007324 + }, + { + "auxiliary_loss_clip": 0.01074264, + "auxiliary_loss_mlp": 0.01030132, + "balance_loss_clip": 1.03581786, + "balance_loss_mlp": 1.01856852, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 4.808762873253703, + "language_loss": 0.6787827, + "learning_rate": 8.370999604364634e-07, + "loss": 0.6998266, + "num_input_tokens_seen": 253565505, + "step": 11752, + "time_per_iteration": 2.527852773666382 + }, + { + "auxiliary_loss_clip": 0.01061492, + "auxiliary_loss_mlp": 0.00779284, + "balance_loss_clip": 1.03950536, + "balance_loss_mlp": 1.00058651, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 2.0428139207437326, + "language_loss": 0.76584804, + "learning_rate": 8.367831234246025e-07, + "loss": 0.7842558, + "num_input_tokens_seen": 253585125, + "step": 11753, + "time_per_iteration": 4.065051555633545 + }, + { + "auxiliary_loss_clip": 0.01081563, + "auxiliary_loss_mlp": 0.00776273, + "balance_loss_clip": 1.03759038, + "balance_loss_mlp": 1.00054109, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.614559425467177, + "language_loss": 0.70888764, + "learning_rate": 8.364663305220405e-07, + "loss": 0.72746605, + "num_input_tokens_seen": 253604815, + "step": 11754, + "time_per_iteration": 2.554068088531494 + }, + { + "auxiliary_loss_clip": 0.01073284, + "auxiliary_loss_mlp": 0.01044057, + "balance_loss_clip": 1.03446448, + "balance_loss_mlp": 1.02978802, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 1.562164204469044, + "language_loss": 0.89078432, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91195774, + "num_input_tokens_seen": 253622855, + "step": 11755, + "time_per_iteration": 2.5271613597869873 + }, + { + "auxiliary_loss_clip": 0.01085783, + "auxiliary_loss_mlp": 0.00777536, + "balance_loss_clip": 1.03469062, + "balance_loss_mlp": 1.00054038, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 1.636245347141311, + "language_loss": 0.79465795, + "learning_rate": 8.358328770928678e-07, + "loss": 0.81329119, + "num_input_tokens_seen": 253642760, + "step": 11756, + "time_per_iteration": 2.50343918800354 + }, + { + "auxiliary_loss_clip": 0.00994864, + "auxiliary_loss_mlp": 0.01002953, + "balance_loss_clip": 1.00702369, + "balance_loss_mlp": 1.00167739, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 1.0145528314764178, + "language_loss": 0.60414666, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62412488, + "num_input_tokens_seen": 253695685, + "step": 11757, + "time_per_iteration": 2.9031786918640137 + }, + { + "auxiliary_loss_clip": 0.01081531, + "auxiliary_loss_mlp": 0.01032141, + "balance_loss_clip": 1.0401721, + "balance_loss_mlp": 1.01935542, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 1.8506271383423594, + "language_loss": 0.80277741, + "learning_rate": 8.351996002450307e-07, + "loss": 0.82391417, + "num_input_tokens_seen": 253713305, + "step": 11758, + "time_per_iteration": 2.5249886512756348 + }, + { + "auxiliary_loss_clip": 0.01068291, + "auxiliary_loss_mlp": 0.00778043, + "balance_loss_clip": 1.0329864, + "balance_loss_mlp": 1.00045419, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 1.7187975364208858, + "language_loss": 0.77561992, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79408324, + "num_input_tokens_seen": 253736100, + "step": 11759, + "time_per_iteration": 2.7094404697418213 + }, + { + "auxiliary_loss_clip": 0.0110299, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.03723311, + "balance_loss_mlp": 1.02030849, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 1.5885791458660783, + "language_loss": 0.67883134, + "learning_rate": 8.34566500074583e-07, + "loss": 0.70019424, + "num_input_tokens_seen": 253757350, + "step": 11760, + "time_per_iteration": 2.486348867416382 + }, + { + "auxiliary_loss_clip": 0.01076244, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.03698397, + "balance_loss_mlp": 1.018677, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 2.5044905070694647, + "language_loss": 0.79941761, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82048523, + "num_input_tokens_seen": 253772855, + "step": 11761, + "time_per_iteration": 2.483809232711792 + }, + { + "auxiliary_loss_clip": 0.01084892, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_clip": 1.03206062, + "balance_loss_mlp": 1.02610874, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.5540941628021487, + "language_loss": 0.74855191, + "learning_rate": 8.33933576677553e-07, + "loss": 0.76981413, + "num_input_tokens_seen": 253790360, + "step": 11762, + "time_per_iteration": 2.47413969039917 + }, + { + "auxiliary_loss_clip": 0.01087936, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.03554738, + "balance_loss_mlp": 1.022259, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 1.735500263647391, + "language_loss": 0.76913124, + "learning_rate": 8.336171812990724e-07, + "loss": 0.79035306, + "num_input_tokens_seen": 253810585, + "step": 11763, + "time_per_iteration": 3.9718618392944336 + }, + { + "auxiliary_loss_clip": 0.01085275, + "auxiliary_loss_mlp": 0.00778038, + "balance_loss_clip": 1.03936982, + "balance_loss_mlp": 1.00049329, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 2.708074986411051, + "language_loss": 0.79411805, + "learning_rate": 8.333008301499453e-07, + "loss": 0.81275117, + "num_input_tokens_seen": 253829080, + "step": 11764, + "time_per_iteration": 2.609278917312622 + }, + { + "auxiliary_loss_clip": 0.01072183, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.04060459, + "balance_loss_mlp": 1.02353477, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.56850672687889, + "language_loss": 0.79681504, + "learning_rate": 8.32984523242167e-07, + "loss": 0.81790376, + "num_input_tokens_seen": 253846780, + "step": 11765, + "time_per_iteration": 4.016761779785156 + }, + { + "auxiliary_loss_clip": 0.01107578, + "auxiliary_loss_mlp": 0.01027136, + "balance_loss_clip": 1.0369513, + "balance_loss_mlp": 1.01588833, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 1.6188399150495267, + "language_loss": 0.6831888, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70453596, + "num_input_tokens_seen": 253867075, + "step": 11766, + "time_per_iteration": 2.484861373901367 + }, + { + "auxiliary_loss_clip": 0.01089744, + "auxiliary_loss_mlp": 0.01038128, + "balance_loss_clip": 1.03593135, + "balance_loss_mlp": 1.02473474, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 2.8014518272472566, + "language_loss": 0.63743716, + "learning_rate": 8.323520421986352e-07, + "loss": 0.6587159, + "num_input_tokens_seen": 253885790, + "step": 11767, + "time_per_iteration": 2.5165510177612305 + }, + { + "auxiliary_loss_clip": 0.0109884, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.0359534, + "balance_loss_mlp": 1.01620054, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 4.355114882732623, + "language_loss": 0.52698207, + "learning_rate": 8.320358680868646e-07, + "loss": 0.54826236, + "num_input_tokens_seen": 253907070, + "step": 11768, + "time_per_iteration": 2.52779221534729 + }, + { + "auxiliary_loss_clip": 0.01087684, + "auxiliary_loss_mlp": 0.0077751, + "balance_loss_clip": 1.036973, + "balance_loss_mlp": 1.00046849, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 1.7600250769349182, + "language_loss": 0.75931495, + "learning_rate": 8.317197382644119e-07, + "loss": 0.77796686, + "num_input_tokens_seen": 253927290, + "step": 11769, + "time_per_iteration": 2.496389150619507 + }, + { + "auxiliary_loss_clip": 0.01012548, + "auxiliary_loss_mlp": 0.01016633, + "balance_loss_clip": 1.00573397, + "balance_loss_mlp": 1.01503551, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8515212306178561, + "language_loss": 0.61983931, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64013112, + "num_input_tokens_seen": 253983440, + "step": 11770, + "time_per_iteration": 3.02351450920105 + }, + { + "auxiliary_loss_clip": 0.01080173, + "auxiliary_loss_mlp": 0.01036196, + "balance_loss_clip": 1.03566766, + "balance_loss_mlp": 1.02375031, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 1.6918972629599829, + "language_loss": 0.76325703, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78442073, + "num_input_tokens_seen": 254003825, + "step": 11771, + "time_per_iteration": 2.5841140747070312 + }, + { + "auxiliary_loss_clip": 0.01097016, + "auxiliary_loss_mlp": 0.01030861, + "balance_loss_clip": 1.03622639, + "balance_loss_mlp": 1.01947618, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.6529592789006888, + "language_loss": 0.7114687, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73274744, + "num_input_tokens_seen": 254023345, + "step": 11772, + "time_per_iteration": 2.478694200515747 + }, + { + "auxiliary_loss_clip": 0.01067707, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.03772342, + "balance_loss_mlp": 1.01734841, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 1.7496169071413568, + "language_loss": 0.69718325, + "learning_rate": 8.30455662107496e-07, + "loss": 0.71816397, + "num_input_tokens_seen": 254041815, + "step": 11773, + "time_per_iteration": 2.5695619583129883 + }, + { + "auxiliary_loss_clip": 0.01101559, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.03763008, + "balance_loss_mlp": 1.02154899, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 1.4498307763017166, + "language_loss": 0.70130575, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72265422, + "num_input_tokens_seen": 254062065, + "step": 11774, + "time_per_iteration": 2.4788620471954346 + }, + { + "auxiliary_loss_clip": 0.01079643, + "auxiliary_loss_mlp": 0.01026917, + "balance_loss_clip": 1.03735042, + "balance_loss_mlp": 1.01550245, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.562206094697094, + "language_loss": 0.74266225, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76372784, + "num_input_tokens_seen": 254080605, + "step": 11775, + "time_per_iteration": 2.480696201324463 + }, + { + "auxiliary_loss_clip": 0.01076704, + "auxiliary_loss_mlp": 0.00777506, + "balance_loss_clip": 1.03767228, + "balance_loss_mlp": 1.00059152, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 1.7751016867675815, + "language_loss": 0.86795771, + "learning_rate": 8.295080706148665e-07, + "loss": 0.88649982, + "num_input_tokens_seen": 254098710, + "step": 11776, + "time_per_iteration": 2.5100972652435303 + }, + { + "auxiliary_loss_clip": 0.01093359, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.03415167, + "balance_loss_mlp": 1.02053928, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.5999924289138356, + "language_loss": 0.7532357, + "learning_rate": 8.291922955383641e-07, + "loss": 0.77449596, + "num_input_tokens_seen": 254117200, + "step": 11777, + "time_per_iteration": 3.9057817459106445 + }, + { + "auxiliary_loss_clip": 0.01093899, + "auxiliary_loss_mlp": 0.01030781, + "balance_loss_clip": 1.03926587, + "balance_loss_mlp": 1.01813924, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 2.168940659482236, + "language_loss": 0.8178103, + "learning_rate": 8.288765648590066e-07, + "loss": 0.83905709, + "num_input_tokens_seen": 254132115, + "step": 11778, + "time_per_iteration": 2.4484703540802 + }, + { + "auxiliary_loss_clip": 0.01083877, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.03457594, + "balance_loss_mlp": 1.0198977, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.8353613154435318, + "language_loss": 0.84926814, + "learning_rate": 8.285608785887673e-07, + "loss": 0.87041444, + "num_input_tokens_seen": 254152285, + "step": 11779, + "time_per_iteration": 2.487502098083496 + }, + { + "auxiliary_loss_clip": 0.01092541, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.04159868, + "balance_loss_mlp": 1.02181101, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 2.8347262584754747, + "language_loss": 0.71782786, + "learning_rate": 8.28245236739618e-07, + "loss": 0.73909187, + "num_input_tokens_seen": 254172805, + "step": 11780, + "time_per_iteration": 2.6535470485687256 + }, + { + "auxiliary_loss_clip": 0.01061815, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.03437519, + "balance_loss_mlp": 1.01724553, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 2.2548033063667874, + "language_loss": 0.73085749, + "learning_rate": 8.279296393235256e-07, + "loss": 0.75177002, + "num_input_tokens_seen": 254191890, + "step": 11781, + "time_per_iteration": 2.5598740577697754 + }, + { + "auxiliary_loss_clip": 0.01099628, + "auxiliary_loss_mlp": 0.01033954, + "balance_loss_clip": 1.03819513, + "balance_loss_mlp": 1.02259326, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 1.6030451736817488, + "language_loss": 0.77200639, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79334223, + "num_input_tokens_seen": 254210150, + "step": 11782, + "time_per_iteration": 2.4441049098968506 + }, + { + "auxiliary_loss_clip": 0.01086308, + "auxiliary_loss_mlp": 0.01028211, + "balance_loss_clip": 1.03664923, + "balance_loss_mlp": 1.01733887, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 1.5227899467216324, + "language_loss": 0.69953728, + "learning_rate": 8.272985778383828e-07, + "loss": 0.7206825, + "num_input_tokens_seen": 254233015, + "step": 11783, + "time_per_iteration": 2.5589993000030518 + }, + { + "auxiliary_loss_clip": 0.01075376, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.03763223, + "balance_loss_mlp": 1.01984477, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 1.7688810226941811, + "language_loss": 0.79090917, + "learning_rate": 8.269831137932632e-07, + "loss": 0.8119818, + "num_input_tokens_seen": 254251345, + "step": 11784, + "time_per_iteration": 2.5753893852233887 + }, + { + "auxiliary_loss_clip": 0.01110589, + "auxiliary_loss_mlp": 0.01030073, + "balance_loss_clip": 1.03781438, + "balance_loss_mlp": 1.01805055, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 1.896216504458928, + "language_loss": 0.76993859, + "learning_rate": 8.266676942290609e-07, + "loss": 0.79134518, + "num_input_tokens_seen": 254269905, + "step": 11785, + "time_per_iteration": 2.451550245285034 + }, + { + "auxiliary_loss_clip": 0.01085985, + "auxiliary_loss_mlp": 0.01033901, + "balance_loss_clip": 1.03575373, + "balance_loss_mlp": 1.02112746, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 1.5713245894571788, + "language_loss": 0.77989972, + "learning_rate": 8.26352319157738e-07, + "loss": 0.80109859, + "num_input_tokens_seen": 254289990, + "step": 11786, + "time_per_iteration": 2.5334975719451904 + }, + { + "auxiliary_loss_clip": 0.01113282, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.03812814, + "balance_loss_mlp": 1.01896524, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 2.3492336634130955, + "language_loss": 0.79085487, + "learning_rate": 8.260369885912526e-07, + "loss": 0.81230015, + "num_input_tokens_seen": 254309085, + "step": 11787, + "time_per_iteration": 2.4735655784606934 + }, + { + "auxiliary_loss_clip": 0.01100402, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.03764796, + "balance_loss_mlp": 1.01863718, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 1.9451726811344987, + "language_loss": 0.76325321, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78456295, + "num_input_tokens_seen": 254327045, + "step": 11788, + "time_per_iteration": 2.485262632369995 + }, + { + "auxiliary_loss_clip": 0.01076025, + "auxiliary_loss_mlp": 0.01043525, + "balance_loss_clip": 1.03322744, + "balance_loss_mlp": 1.02801585, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 1.8727478817858698, + "language_loss": 0.68293709, + "learning_rate": 8.254064610206212e-07, + "loss": 0.70413262, + "num_input_tokens_seen": 254344585, + "step": 11789, + "time_per_iteration": 2.5029611587524414 + }, + { + "auxiliary_loss_clip": 0.01059584, + "auxiliary_loss_mlp": 0.01031887, + "balance_loss_clip": 1.03992844, + "balance_loss_mlp": 1.01931071, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 1.6077145324372317, + "language_loss": 0.77876431, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79967904, + "num_input_tokens_seen": 254362470, + "step": 11790, + "time_per_iteration": 2.595331907272339 + }, + { + "auxiliary_loss_clip": 0.01091829, + "auxiliary_loss_mlp": 0.01031006, + "balance_loss_clip": 1.03540993, + "balance_loss_mlp": 1.01777971, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 13.222750142557144, + "language_loss": 0.71365952, + "learning_rate": 8.247761116128085e-07, + "loss": 0.7348879, + "num_input_tokens_seen": 254383190, + "step": 11791, + "time_per_iteration": 2.561521291732788 + }, + { + "auxiliary_loss_clip": 0.01100465, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.03687716, + "balance_loss_mlp": 1.02067804, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 1.6503114405612476, + "language_loss": 0.81967914, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84101713, + "num_input_tokens_seen": 254403115, + "step": 11792, + "time_per_iteration": 3.93424654006958 + }, + { + "auxiliary_loss_clip": 0.01073507, + "auxiliary_loss_mlp": 0.01032164, + "balance_loss_clip": 1.03889179, + "balance_loss_mlp": 1.01872945, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 2.185338465966153, + "language_loss": 0.64584374, + "learning_rate": 8.241459404634232e-07, + "loss": 0.66690052, + "num_input_tokens_seen": 254421875, + "step": 11793, + "time_per_iteration": 2.5706634521484375 + }, + { + "auxiliary_loss_clip": 0.0109693, + "auxiliary_loss_mlp": 0.01035154, + "balance_loss_clip": 1.03609216, + "balance_loss_mlp": 1.02241087, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 3.0334215786410734, + "language_loss": 0.7042951, + "learning_rate": 8.238309217655133e-07, + "loss": 0.72561598, + "num_input_tokens_seen": 254440765, + "step": 11794, + "time_per_iteration": 2.4872775077819824 + }, + { + "auxiliary_loss_clip": 0.01091693, + "auxiliary_loss_mlp": 0.01033714, + "balance_loss_clip": 1.03884554, + "balance_loss_mlp": 1.02225828, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 1.9093187867045018, + "language_loss": 0.75755513, + "learning_rate": 8.23515947668052e-07, + "loss": 0.77880913, + "num_input_tokens_seen": 254459480, + "step": 11795, + "time_per_iteration": 2.4888558387756348 + }, + { + "auxiliary_loss_clip": 0.01079274, + "auxiliary_loss_mlp": 0.01036156, + "balance_loss_clip": 1.0418241, + "balance_loss_mlp": 1.02422297, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 2.2787748186508945, + "language_loss": 0.75189507, + "learning_rate": 8.232010181829838e-07, + "loss": 0.77304947, + "num_input_tokens_seen": 254473985, + "step": 11796, + "time_per_iteration": 2.498728036880493 + }, + { + "auxiliary_loss_clip": 0.01104458, + "auxiliary_loss_mlp": 0.01050453, + "balance_loss_clip": 1.0384717, + "balance_loss_mlp": 1.03435326, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 1.7097814561170657, + "language_loss": 0.74167329, + "learning_rate": 8.228861333222523e-07, + "loss": 0.7632224, + "num_input_tokens_seen": 254492135, + "step": 11797, + "time_per_iteration": 2.469754695892334 + }, + { + "auxiliary_loss_clip": 0.01066611, + "auxiliary_loss_mlp": 0.01031487, + "balance_loss_clip": 1.0374552, + "balance_loss_mlp": 1.01963782, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.4714300298428191, + "language_loss": 0.7935366, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81451762, + "num_input_tokens_seen": 254512865, + "step": 11798, + "time_per_iteration": 2.603407144546509 + }, + { + "auxiliary_loss_clip": 0.01081557, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.03283978, + "balance_loss_mlp": 1.02292824, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 1.7939105082456999, + "language_loss": 0.66465873, + "learning_rate": 8.222564975215529e-07, + "loss": 0.68584049, + "num_input_tokens_seen": 254532605, + "step": 11799, + "time_per_iteration": 2.4920716285705566 + }, + { + "auxiliary_loss_clip": 0.01112548, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.03858483, + "balance_loss_mlp": 1.0159955, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 2.760806772741612, + "language_loss": 0.81509686, + "learning_rate": 8.219417466054622e-07, + "loss": 0.83651161, + "num_input_tokens_seen": 254553780, + "step": 11800, + "time_per_iteration": 2.519860029220581 + }, + { + "auxiliary_loss_clip": 0.01086697, + "auxiliary_loss_mlp": 0.01029824, + "balance_loss_clip": 1.03567839, + "balance_loss_mlp": 1.01842189, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 3.4232860033007895, + "language_loss": 0.86220694, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88337219, + "num_input_tokens_seen": 254567510, + "step": 11801, + "time_per_iteration": 2.448633909225464 + }, + { + "auxiliary_loss_clip": 0.01111936, + "auxiliary_loss_mlp": 0.01036165, + "balance_loss_clip": 1.03786349, + "balance_loss_mlp": 1.02435744, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 1.7063066088693024, + "language_loss": 0.76064765, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78212863, + "num_input_tokens_seen": 254585565, + "step": 11802, + "time_per_iteration": 3.8725342750549316 + }, + { + "auxiliary_loss_clip": 0.01098583, + "auxiliary_loss_mlp": 0.0104452, + "balance_loss_clip": 1.03695536, + "balance_loss_mlp": 1.0313834, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 1.647608623034729, + "language_loss": 0.81425291, + "learning_rate": 8.209977619374462e-07, + "loss": 0.83568394, + "num_input_tokens_seen": 254603465, + "step": 11803, + "time_per_iteration": 2.4651358127593994 + }, + { + "auxiliary_loss_clip": 0.01112577, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.0363636, + "balance_loss_mlp": 1.01846707, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 2.4408005582735983, + "language_loss": 0.67173827, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69317853, + "num_input_tokens_seen": 254620500, + "step": 11804, + "time_per_iteration": 3.8243141174316406 + }, + { + "auxiliary_loss_clip": 0.0109626, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.03530335, + "balance_loss_mlp": 1.01765394, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 1.780263079528648, + "language_loss": 0.77736771, + "learning_rate": 8.203686623449637e-07, + "loss": 0.79861963, + "num_input_tokens_seen": 254638565, + "step": 11805, + "time_per_iteration": 2.518906593322754 + }, + { + "auxiliary_loss_clip": 0.01090286, + "auxiliary_loss_mlp": 0.00777901, + "balance_loss_clip": 1.036008, + "balance_loss_mlp": 1.00050116, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 1.97051888161027, + "language_loss": 0.79130197, + "learning_rate": 8.200541796403667e-07, + "loss": 0.80998385, + "num_input_tokens_seen": 254657505, + "step": 11806, + "time_per_iteration": 2.477315664291382 + }, + { + "auxiliary_loss_clip": 0.01080809, + "auxiliary_loss_mlp": 0.01045214, + "balance_loss_clip": 1.03407466, + "balance_loss_mlp": 1.03226244, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 1.8657047853644748, + "language_loss": 0.56226391, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58352411, + "num_input_tokens_seen": 254674730, + "step": 11807, + "time_per_iteration": 2.5070600509643555 + }, + { + "auxiliary_loss_clip": 0.0111574, + "auxiliary_loss_mlp": 0.01042447, + "balance_loss_clip": 1.03725553, + "balance_loss_mlp": 1.02984643, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 2.0845317306047995, + "language_loss": 0.68298447, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70456636, + "num_input_tokens_seen": 254691665, + "step": 11808, + "time_per_iteration": 2.3927855491638184 + }, + { + "auxiliary_loss_clip": 0.01101611, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.04061496, + "balance_loss_mlp": 1.02014756, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 1.8623458433147448, + "language_loss": 0.71399873, + "learning_rate": 8.191110000362513e-07, + "loss": 0.73533338, + "num_input_tokens_seen": 254711610, + "step": 11809, + "time_per_iteration": 2.483017921447754 + }, + { + "auxiliary_loss_clip": 0.01032479, + "auxiliary_loss_mlp": 0.0100749, + "balance_loss_clip": 1.00860023, + "balance_loss_mlp": 1.00605905, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.7551856190192182, + "language_loss": 0.59448385, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61488354, + "num_input_tokens_seen": 254772615, + "step": 11810, + "time_per_iteration": 3.102682113647461 + }, + { + "auxiliary_loss_clip": 0.01037735, + "auxiliary_loss_mlp": 0.01045199, + "balance_loss_clip": 1.03160822, + "balance_loss_mlp": 1.03225911, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 1.6298693795315196, + "language_loss": 0.74219084, + "learning_rate": 8.18482437510784e-07, + "loss": 0.76302016, + "num_input_tokens_seen": 254791375, + "step": 11811, + "time_per_iteration": 2.672398328781128 + }, + { + "auxiliary_loss_clip": 0.01073058, + "auxiliary_loss_mlp": 0.01028699, + "balance_loss_clip": 1.03739798, + "balance_loss_mlp": 1.01708841, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 1.8155380405727426, + "language_loss": 0.83555877, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85657626, + "num_input_tokens_seen": 254809300, + "step": 11812, + "time_per_iteration": 2.539778709411621 + }, + { + "auxiliary_loss_clip": 0.01114127, + "auxiliary_loss_mlp": 0.01031916, + "balance_loss_clip": 1.03885984, + "balance_loss_mlp": 1.01890457, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 1.5677167204207212, + "language_loss": 0.70186418, + "learning_rate": 8.178540541983716e-07, + "loss": 0.72332454, + "num_input_tokens_seen": 254829325, + "step": 11813, + "time_per_iteration": 2.4548521041870117 + }, + { + "auxiliary_loss_clip": 0.01107728, + "auxiliary_loss_mlp": 0.01026778, + "balance_loss_clip": 1.03629029, + "balance_loss_mlp": 1.01529765, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 1.813141826208394, + "language_loss": 0.81539994, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83674502, + "num_input_tokens_seen": 254847690, + "step": 11814, + "time_per_iteration": 2.4234960079193115 + }, + { + "auxiliary_loss_clip": 0.01110696, + "auxiliary_loss_mlp": 0.01029443, + "balance_loss_clip": 1.03800607, + "balance_loss_mlp": 1.01674712, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 1.8846628465406543, + "language_loss": 0.7611233, + "learning_rate": 8.172258501943301e-07, + "loss": 0.78252465, + "num_input_tokens_seen": 254865960, + "step": 11815, + "time_per_iteration": 2.4239227771759033 + }, + { + "auxiliary_loss_clip": 0.01067927, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.03611588, + "balance_loss_mlp": 1.01855624, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 1.9000448296817483, + "language_loss": 0.78473985, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80572331, + "num_input_tokens_seen": 254882815, + "step": 11816, + "time_per_iteration": 2.536864757537842 + }, + { + "auxiliary_loss_clip": 0.01089565, + "auxiliary_loss_mlp": 0.01039017, + "balance_loss_clip": 1.04055083, + "balance_loss_mlp": 1.02666128, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 1.882870457660323, + "language_loss": 0.86055028, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88183612, + "num_input_tokens_seen": 254898705, + "step": 11817, + "time_per_iteration": 3.977057456970215 + }, + { + "auxiliary_loss_clip": 0.01065416, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.03989041, + "balance_loss_mlp": 1.01880646, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 2.0763497913867837, + "language_loss": 0.84542239, + "learning_rate": 8.162838805998897e-07, + "loss": 0.8663758, + "num_input_tokens_seen": 254913665, + "step": 11818, + "time_per_iteration": 2.529162645339966 + }, + { + "auxiliary_loss_clip": 0.01109695, + "auxiliary_loss_mlp": 0.01035299, + "balance_loss_clip": 1.03584814, + "balance_loss_mlp": 1.02250814, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 2.1032628416847823, + "language_loss": 0.76037973, + "learning_rate": 8.159699804924709e-07, + "loss": 0.78182971, + "num_input_tokens_seen": 254932140, + "step": 11819, + "time_per_iteration": 2.396634101867676 + }, + { + "auxiliary_loss_clip": 0.01072684, + "auxiliary_loss_mlp": 0.01031274, + "balance_loss_clip": 1.0380652, + "balance_loss_mlp": 1.01754069, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.542317828148232, + "language_loss": 0.71032876, + "learning_rate": 8.156561252835883e-07, + "loss": 0.73136836, + "num_input_tokens_seen": 254951580, + "step": 11820, + "time_per_iteration": 2.6062278747558594 + }, + { + "auxiliary_loss_clip": 0.01099153, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.03751802, + "balance_loss_mlp": 1.01688409, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 2.139257829966819, + "language_loss": 0.75439012, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77567381, + "num_input_tokens_seen": 254969425, + "step": 11821, + "time_per_iteration": 2.455502510070801 + }, + { + "auxiliary_loss_clip": 0.00988064, + "auxiliary_loss_mlp": 0.01001038, + "balance_loss_clip": 1.01384234, + "balance_loss_mlp": 0.99979848, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.7665040280853131, + "language_loss": 0.55136526, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57125628, + "num_input_tokens_seen": 255032680, + "step": 11822, + "time_per_iteration": 3.284703016281128 + }, + { + "auxiliary_loss_clip": 0.01092463, + "auxiliary_loss_mlp": 0.01031573, + "balance_loss_clip": 1.03531563, + "balance_loss_mlp": 1.01918769, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 1.8772593401832744, + "language_loss": 0.60407817, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62531853, + "num_input_tokens_seen": 255054400, + "step": 11823, + "time_per_iteration": 2.734445810317993 + }, + { + "auxiliary_loss_clip": 0.01100587, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.03862584, + "balance_loss_mlp": 1.01851344, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 1.9606120407617424, + "language_loss": 0.71191132, + "learning_rate": 8.144011536714322e-07, + "loss": 0.73321855, + "num_input_tokens_seen": 255072785, + "step": 11824, + "time_per_iteration": 2.5800528526306152 + }, + { + "auxiliary_loss_clip": 0.01077991, + "auxiliary_loss_mlp": 0.007803, + "balance_loss_clip": 1.03155589, + "balance_loss_mlp": 1.00052655, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 1.6991943428653047, + "language_loss": 0.72628909, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74487197, + "num_input_tokens_seen": 255091820, + "step": 11825, + "time_per_iteration": 2.4959521293640137 + }, + { + "auxiliary_loss_clip": 0.01081467, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.03584313, + "balance_loss_mlp": 1.02082038, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 1.6572440022197088, + "language_loss": 0.79497695, + "learning_rate": 8.137739375659321e-07, + "loss": 0.81612265, + "num_input_tokens_seen": 255111720, + "step": 11826, + "time_per_iteration": 2.551039934158325 + }, + { + "auxiliary_loss_clip": 0.01096933, + "auxiliary_loss_mlp": 0.01033659, + "balance_loss_clip": 1.03604865, + "balance_loss_mlp": 1.0218631, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 1.744535217225179, + "language_loss": 0.83190429, + "learning_rate": 8.134603969799527e-07, + "loss": 0.85321021, + "num_input_tokens_seen": 255133495, + "step": 11827, + "time_per_iteration": 2.5113561153411865 + }, + { + "auxiliary_loss_clip": 0.01081497, + "auxiliary_loss_mlp": 0.01037919, + "balance_loss_clip": 1.03703666, + "balance_loss_mlp": 1.02398956, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 1.6320660967620242, + "language_loss": 0.62409616, + "learning_rate": 8.131469013876748e-07, + "loss": 0.64529032, + "num_input_tokens_seen": 255156880, + "step": 11828, + "time_per_iteration": 2.5891668796539307 + }, + { + "auxiliary_loss_clip": 0.0110919, + "auxiliary_loss_mlp": 0.01030293, + "balance_loss_clip": 1.03672171, + "balance_loss_mlp": 1.01774025, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 1.6317313068283499, + "language_loss": 0.72123736, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74263215, + "num_input_tokens_seen": 255178920, + "step": 11829, + "time_per_iteration": 2.4721479415893555 + }, + { + "auxiliary_loss_clip": 0.01109628, + "auxiliary_loss_mlp": 0.010286, + "balance_loss_clip": 1.03734708, + "balance_loss_mlp": 1.0168817, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 1.7221577709132596, + "language_loss": 0.80354971, + "learning_rate": 8.125200452317697e-07, + "loss": 0.82493198, + "num_input_tokens_seen": 255198095, + "step": 11830, + "time_per_iteration": 2.4723904132843018 + }, + { + "auxiliary_loss_clip": 0.01099728, + "auxiliary_loss_mlp": 0.0103463, + "balance_loss_clip": 1.0365175, + "balance_loss_mlp": 1.02223849, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 1.8138808297563842, + "language_loss": 0.84305477, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86439836, + "num_input_tokens_seen": 255215860, + "step": 11831, + "time_per_iteration": 2.4579131603240967 + }, + { + "auxiliary_loss_clip": 0.01090286, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.03523922, + "balance_loss_mlp": 1.01788712, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 2.097090215337076, + "language_loss": 0.77188098, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79308748, + "num_input_tokens_seen": 255235425, + "step": 11832, + "time_per_iteration": 3.985790252685547 + }, + { + "auxiliary_loss_clip": 0.0102371, + "auxiliary_loss_mlp": 0.00998174, + "balance_loss_clip": 1.00980592, + "balance_loss_mlp": 0.99689811, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7466838291760199, + "language_loss": 0.56568539, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58590424, + "num_input_tokens_seen": 255291680, + "step": 11833, + "time_per_iteration": 2.9735794067382812 + }, + { + "auxiliary_loss_clip": 0.01065749, + "auxiliary_loss_mlp": 0.01032807, + "balance_loss_clip": 1.03781581, + "balance_loss_mlp": 1.02132106, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 1.6719536086167364, + "language_loss": 0.7054292, + "learning_rate": 8.11266873367315e-07, + "loss": 0.7264148, + "num_input_tokens_seen": 255313880, + "step": 11834, + "time_per_iteration": 2.6564691066741943 + }, + { + "auxiliary_loss_clip": 0.01113445, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.03884387, + "balance_loss_mlp": 1.01997685, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 1.84372152501153, + "language_loss": 0.79257989, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81403804, + "num_input_tokens_seen": 255332390, + "step": 11835, + "time_per_iteration": 2.436101198196411 + }, + { + "auxiliary_loss_clip": 0.01098999, + "auxiliary_loss_mlp": 0.01031943, + "balance_loss_clip": 1.03795683, + "balance_loss_mlp": 1.02013564, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.5569958987074701, + "language_loss": 0.76401472, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78532422, + "num_input_tokens_seen": 255354025, + "step": 11836, + "time_per_iteration": 2.5401296615600586 + }, + { + "auxiliary_loss_clip": 0.01043954, + "auxiliary_loss_mlp": 0.01031234, + "balance_loss_clip": 1.03611839, + "balance_loss_mlp": 1.01925945, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.9621773600075196, + "language_loss": 0.70372021, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72447205, + "num_input_tokens_seen": 255371400, + "step": 11837, + "time_per_iteration": 2.662991523742676 + }, + { + "auxiliary_loss_clip": 0.01103643, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.03828168, + "balance_loss_mlp": 1.02036774, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 2.2008717076679827, + "language_loss": 0.6241734, + "learning_rate": 8.100144227328958e-07, + "loss": 0.64555085, + "num_input_tokens_seen": 255390710, + "step": 11838, + "time_per_iteration": 2.5029773712158203 + }, + { + "auxiliary_loss_clip": 0.01101419, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.03886604, + "balance_loss_mlp": 1.01880908, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.165430537052388, + "language_loss": 0.67395175, + "learning_rate": 8.097014228555426e-07, + "loss": 0.6952759, + "num_input_tokens_seen": 255408790, + "step": 11839, + "time_per_iteration": 2.5079734325408936 + }, + { + "auxiliary_loss_clip": 0.01112335, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.03892362, + "balance_loss_mlp": 1.02050459, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 2.3912481949889153, + "language_loss": 0.84391344, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86536145, + "num_input_tokens_seen": 255426280, + "step": 11840, + "time_per_iteration": 2.4187984466552734 + }, + { + "auxiliary_loss_clip": 0.01089856, + "auxiliary_loss_mlp": 0.01035031, + "balance_loss_clip": 1.0357554, + "balance_loss_mlp": 1.02252603, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 3.209976963339771, + "language_loss": 0.76767504, + "learning_rate": 8.090755585214277e-07, + "loss": 0.78892398, + "num_input_tokens_seen": 255442935, + "step": 11841, + "time_per_iteration": 4.1975343227386475 + }, + { + "auxiliary_loss_clip": 0.01095182, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.03858924, + "balance_loss_mlp": 1.02119386, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 2.773247473821631, + "language_loss": 0.75252444, + "learning_rate": 8.087626940883994e-07, + "loss": 0.77381188, + "num_input_tokens_seen": 255460925, + "step": 11842, + "time_per_iteration": 2.540527820587158 + }, + { + "auxiliary_loss_clip": 0.01033434, + "auxiliary_loss_mlp": 0.01008214, + "balance_loss_clip": 1.02491534, + "balance_loss_mlp": 1.00701606, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.7874821090540364, + "language_loss": 0.61608326, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63649976, + "num_input_tokens_seen": 255521360, + "step": 11843, + "time_per_iteration": 3.0419793128967285 + }, + { + "auxiliary_loss_clip": 0.01108752, + "auxiliary_loss_mlp": 0.01027287, + "balance_loss_clip": 1.03710473, + "balance_loss_mlp": 1.01549089, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 1.7244639940522077, + "language_loss": 0.80270612, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82406652, + "num_input_tokens_seen": 255541435, + "step": 11844, + "time_per_iteration": 3.751542806625366 + }, + { + "auxiliary_loss_clip": 0.01059313, + "auxiliary_loss_mlp": 0.01031522, + "balance_loss_clip": 1.03084779, + "balance_loss_mlp": 1.0184505, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.4476711487673835, + "language_loss": 0.78812408, + "learning_rate": 8.078243718677873e-07, + "loss": 0.80903244, + "num_input_tokens_seen": 255558505, + "step": 11845, + "time_per_iteration": 2.554741621017456 + }, + { + "auxiliary_loss_clip": 0.01094387, + "auxiliary_loss_mlp": 0.0103471, + "balance_loss_clip": 1.03699923, + "balance_loss_mlp": 1.02225244, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 3.615060688984417, + "language_loss": 0.77485168, + "learning_rate": 8.075116881932762e-07, + "loss": 0.7961427, + "num_input_tokens_seen": 255577815, + "step": 11846, + "time_per_iteration": 2.5135529041290283 + }, + { + "auxiliary_loss_clip": 0.01102464, + "auxiliary_loss_mlp": 0.01033311, + "balance_loss_clip": 1.037992, + "balance_loss_mlp": 1.02065682, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.8821454213951243, + "language_loss": 0.58499885, + "learning_rate": 8.071990497380421e-07, + "loss": 0.60635662, + "num_input_tokens_seen": 255595885, + "step": 11847, + "time_per_iteration": 2.4362282752990723 + }, + { + "auxiliary_loss_clip": 0.01096348, + "auxiliary_loss_mlp": 0.00777366, + "balance_loss_clip": 1.0375644, + "balance_loss_mlp": 1.00053334, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 1.3584302374022772, + "language_loss": 0.71701837, + "learning_rate": 8.068864565139395e-07, + "loss": 0.7357555, + "num_input_tokens_seen": 255616750, + "step": 11848, + "time_per_iteration": 2.510056972503662 + }, + { + "auxiliary_loss_clip": 0.01022654, + "auxiliary_loss_mlp": 0.01002622, + "balance_loss_clip": 1.00769985, + "balance_loss_mlp": 1.00138235, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8393989761624036, + "language_loss": 0.62997818, + "learning_rate": 8.065739085328211e-07, + "loss": 0.650231, + "num_input_tokens_seen": 255677900, + "step": 11849, + "time_per_iteration": 3.023348808288574 + }, + { + "auxiliary_loss_clip": 0.01087379, + "auxiliary_loss_mlp": 0.01036967, + "balance_loss_clip": 1.03792977, + "balance_loss_mlp": 1.02433062, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 1.642480639240929, + "language_loss": 0.64039195, + "learning_rate": 8.0626140580654e-07, + "loss": 0.66163546, + "num_input_tokens_seen": 255699140, + "step": 11850, + "time_per_iteration": 2.6805806159973145 + }, + { + "auxiliary_loss_clip": 0.01101272, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.0370748, + "balance_loss_mlp": 1.01843166, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.6131159178459016, + "language_loss": 0.69852519, + "learning_rate": 8.05948948346946e-07, + "loss": 0.71984649, + "num_input_tokens_seen": 255719640, + "step": 11851, + "time_per_iteration": 2.522926092147827 + }, + { + "auxiliary_loss_clip": 0.01100102, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.03781283, + "balance_loss_mlp": 1.02247691, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 2.1013978345560744, + "language_loss": 0.8334316, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85476959, + "num_input_tokens_seen": 255740450, + "step": 11852, + "time_per_iteration": 2.528515338897705 + }, + { + "auxiliary_loss_clip": 0.01101409, + "auxiliary_loss_mlp": 0.00780206, + "balance_loss_clip": 1.03590405, + "balance_loss_mlp": 1.00063944, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.1942291551524558, + "language_loss": 0.72842002, + "learning_rate": 8.053241692752126e-07, + "loss": 0.74723619, + "num_input_tokens_seen": 255758070, + "step": 11853, + "time_per_iteration": 2.435331344604492 + }, + { + "auxiliary_loss_clip": 0.0107132, + "auxiliary_loss_mlp": 0.01033489, + "balance_loss_clip": 1.03507113, + "balance_loss_mlp": 1.02110291, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 3.8143414499556516, + "language_loss": 0.92427659, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94532478, + "num_input_tokens_seen": 255775685, + "step": 11854, + "time_per_iteration": 2.5038344860076904 + }, + { + "auxiliary_loss_clip": 0.01098721, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.03813529, + "balance_loss_mlp": 1.01892352, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 1.8342650378078977, + "language_loss": 0.79483044, + "learning_rate": 8.046995714123856e-07, + "loss": 0.8161211, + "num_input_tokens_seen": 255794750, + "step": 11855, + "time_per_iteration": 2.4917988777160645 + }, + { + "auxiliary_loss_clip": 0.01065844, + "auxiliary_loss_mlp": 0.0103874, + "balance_loss_clip": 1.03437889, + "balance_loss_mlp": 1.0259186, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 1.869852204188039, + "language_loss": 0.73092973, + "learning_rate": 8.043873404639192e-07, + "loss": 0.75197554, + "num_input_tokens_seen": 255813325, + "step": 11856, + "time_per_iteration": 4.149824619293213 + }, + { + "auxiliary_loss_clip": 0.01102581, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.03881168, + "balance_loss_mlp": 1.02261543, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 1.5386078701026968, + "language_loss": 0.69916654, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72054034, + "num_input_tokens_seen": 255832470, + "step": 11857, + "time_per_iteration": 2.4849703311920166 + }, + { + "auxiliary_loss_clip": 0.01098986, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03628635, + "balance_loss_mlp": 1.01831436, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.151753387877818, + "language_loss": 0.85239869, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87369883, + "num_input_tokens_seen": 255849740, + "step": 11858, + "time_per_iteration": 2.442762613296509 + }, + { + "auxiliary_loss_clip": 0.01116114, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.04009891, + "balance_loss_mlp": 1.01960289, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 1.822084980420229, + "language_loss": 0.80109048, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82257664, + "num_input_tokens_seen": 255866975, + "step": 11859, + "time_per_iteration": 2.4020397663116455 + }, + { + "auxiliary_loss_clip": 0.01087216, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.03703547, + "balance_loss_mlp": 1.02409375, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 1.2763941082970642, + "language_loss": 0.68917847, + "learning_rate": 8.031388701659456e-07, + "loss": 0.71041471, + "num_input_tokens_seen": 255892915, + "step": 11860, + "time_per_iteration": 2.819298267364502 + }, + { + "auxiliary_loss_clip": 0.01100366, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.03807127, + "balance_loss_mlp": 1.0192647, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 2.7972159501488, + "language_loss": 0.6422596, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66358662, + "num_input_tokens_seen": 255911480, + "step": 11861, + "time_per_iteration": 2.4717750549316406 + }, + { + "auxiliary_loss_clip": 0.01095261, + "auxiliary_loss_mlp": 0.01027841, + "balance_loss_clip": 1.03927052, + "balance_loss_mlp": 1.01508558, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 1.8829152399010562, + "language_loss": 0.67148662, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69271761, + "num_input_tokens_seen": 255931140, + "step": 11862, + "time_per_iteration": 2.529834270477295 + }, + { + "auxiliary_loss_clip": 0.01084468, + "auxiliary_loss_mlp": 0.01035637, + "balance_loss_clip": 1.03608453, + "balance_loss_mlp": 1.02424693, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 1.9626932084048683, + "language_loss": 0.66375721, + "learning_rate": 8.022029939445214e-07, + "loss": 0.68495828, + "num_input_tokens_seen": 255951665, + "step": 11863, + "time_per_iteration": 2.5615570545196533 + }, + { + "auxiliary_loss_clip": 0.01073517, + "auxiliary_loss_mlp": 0.01047611, + "balance_loss_clip": 1.03995943, + "balance_loss_mlp": 1.03329396, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 1.8983755603050037, + "language_loss": 0.65762013, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67883146, + "num_input_tokens_seen": 255970055, + "step": 11864, + "time_per_iteration": 2.589965581893921 + }, + { + "auxiliary_loss_clip": 0.01105714, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.03910732, + "balance_loss_mlp": 1.0204916, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 1.8660210335202798, + "language_loss": 0.855811, + "learning_rate": 8.015793035467697e-07, + "loss": 0.87720513, + "num_input_tokens_seen": 255987720, + "step": 11865, + "time_per_iteration": 2.448333740234375 + }, + { + "auxiliary_loss_clip": 0.01074464, + "auxiliary_loss_mlp": 0.01032472, + "balance_loss_clip": 1.03405404, + "balance_loss_mlp": 1.01870978, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 1.7959750542109367, + "language_loss": 0.74947327, + "learning_rate": 8.012675265083304e-07, + "loss": 0.77054262, + "num_input_tokens_seen": 256005490, + "step": 11866, + "time_per_iteration": 2.5351462364196777 + }, + { + "auxiliary_loss_clip": 0.01078757, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.03875732, + "balance_loss_mlp": 1.02053952, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 2.103588901225047, + "language_loss": 0.70748258, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72860575, + "num_input_tokens_seen": 256026030, + "step": 11867, + "time_per_iteration": 2.5706374645233154 + }, + { + "auxiliary_loss_clip": 0.01099064, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.03832865, + "balance_loss_mlp": 1.01440692, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 5.287083377742422, + "language_loss": 0.71540087, + "learning_rate": 8.006441088114397e-07, + "loss": 0.73664975, + "num_input_tokens_seen": 256043680, + "step": 11868, + "time_per_iteration": 2.4808831214904785 + }, + { + "auxiliary_loss_clip": 0.01064347, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.03638554, + "balance_loss_mlp": 1.01824081, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.2638119544767985, + "language_loss": 0.6578868, + "learning_rate": 8.003324681766286e-07, + "loss": 0.67886543, + "num_input_tokens_seen": 256059705, + "step": 11869, + "time_per_iteration": 2.535893201828003 + }, + { + "auxiliary_loss_clip": 0.01084945, + "auxiliary_loss_mlp": 0.01029118, + "balance_loss_clip": 1.03242683, + "balance_loss_mlp": 1.01645756, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 1.5774291517396382, + "language_loss": 0.77665162, + "learning_rate": 8.000208730333298e-07, + "loss": 0.79779226, + "num_input_tokens_seen": 256079785, + "step": 11870, + "time_per_iteration": 2.571667432785034 + }, + { + "auxiliary_loss_clip": 0.01061935, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.03446102, + "balance_loss_mlp": 1.0204488, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.74411724078012, + "language_loss": 0.80990005, + "learning_rate": 7.997093233933597e-07, + "loss": 0.83085763, + "num_input_tokens_seen": 256099000, + "step": 11871, + "time_per_iteration": 4.03711462020874 + }, + { + "auxiliary_loss_clip": 0.01080733, + "auxiliary_loss_mlp": 0.01039815, + "balance_loss_clip": 1.03604984, + "balance_loss_mlp": 1.02688122, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 1.6526029702860254, + "language_loss": 0.79072714, + "learning_rate": 7.993978192685331e-07, + "loss": 0.81193262, + "num_input_tokens_seen": 256117985, + "step": 11872, + "time_per_iteration": 2.543868064880371 + }, + { + "auxiliary_loss_clip": 0.01102252, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.03802621, + "balance_loss_mlp": 1.01569796, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 2.36589338116436, + "language_loss": 0.83907086, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86038089, + "num_input_tokens_seen": 256134350, + "step": 11873, + "time_per_iteration": 2.462777614593506 + }, + { + "auxiliary_loss_clip": 0.01075033, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.03490841, + "balance_loss_mlp": 1.02040637, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 2.034720657110695, + "language_loss": 0.86212814, + "learning_rate": 7.987749476115539e-07, + "loss": 0.883196, + "num_input_tokens_seen": 256150610, + "step": 11874, + "time_per_iteration": 2.5193748474121094 + }, + { + "auxiliary_loss_clip": 0.01103548, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.03752279, + "balance_loss_mlp": 1.01930678, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 2.0918394040368034, + "language_loss": 0.83476222, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85611421, + "num_input_tokens_seen": 256168620, + "step": 11875, + "time_per_iteration": 2.438324213027954 + }, + { + "auxiliary_loss_clip": 0.01093233, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.0375005, + "balance_loss_mlp": 1.02635574, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 2.0252141851080903, + "language_loss": 0.6922583, + "learning_rate": 7.981522581568721e-07, + "loss": 0.71361393, + "num_input_tokens_seen": 256186700, + "step": 11876, + "time_per_iteration": 2.504262685775757 + }, + { + "auxiliary_loss_clip": 0.01114989, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.03943169, + "balance_loss_mlp": 1.02065182, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 2.7664275530176075, + "language_loss": 0.78183889, + "learning_rate": 7.978409817849079e-07, + "loss": 0.80332869, + "num_input_tokens_seen": 256205390, + "step": 11877, + "time_per_iteration": 2.39587140083313 + }, + { + "auxiliary_loss_clip": 0.01100845, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.03853953, + "balance_loss_mlp": 1.02084589, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 1.8687410242864666, + "language_loss": 0.69643748, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71777123, + "num_input_tokens_seen": 256224575, + "step": 11878, + "time_per_iteration": 2.4444496631622314 + }, + { + "auxiliary_loss_clip": 0.01074228, + "auxiliary_loss_mlp": 0.01032042, + "balance_loss_clip": 1.03824961, + "balance_loss_mlp": 1.0207231, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 2.6911223631723393, + "language_loss": 0.67592156, + "learning_rate": 7.972185658107535e-07, + "loss": 0.69698429, + "num_input_tokens_seen": 256242130, + "step": 11879, + "time_per_iteration": 2.553189992904663 + }, + { + "auxiliary_loss_clip": 0.01057291, + "auxiliary_loss_mlp": 0.01043878, + "balance_loss_clip": 1.03671479, + "balance_loss_mlp": 1.02957892, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 1.838128584089996, + "language_loss": 0.6928249, + "learning_rate": 7.969074262321646e-07, + "loss": 0.71383655, + "num_input_tokens_seen": 256261920, + "step": 11880, + "time_per_iteration": 2.595174551010132 + }, + { + "auxiliary_loss_clip": 0.01086347, + "auxiliary_loss_mlp": 0.01034709, + "balance_loss_clip": 1.03592324, + "balance_loss_mlp": 1.02228165, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 2.5804627136875635, + "language_loss": 0.81065476, + "learning_rate": 7.965963322749674e-07, + "loss": 0.83186531, + "num_input_tokens_seen": 256277970, + "step": 11881, + "time_per_iteration": 3.9865176677703857 + }, + { + "auxiliary_loss_clip": 0.0107395, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.0354501, + "balance_loss_mlp": 1.02296376, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 1.847072647785972, + "language_loss": 0.63788933, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65897232, + "num_input_tokens_seen": 256298205, + "step": 11882, + "time_per_iteration": 2.566450357437134 + }, + { + "auxiliary_loss_clip": 0.01113991, + "auxiliary_loss_mlp": 0.0102961, + "balance_loss_clip": 1.0390228, + "balance_loss_mlp": 1.01731384, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 1.7367726640833796, + "language_loss": 0.68601638, + "learning_rate": 7.959742812719304e-07, + "loss": 0.70745236, + "num_input_tokens_seen": 256316685, + "step": 11883, + "time_per_iteration": 2.396766185760498 + }, + { + "auxiliary_loss_clip": 0.0110199, + "auxiliary_loss_mlp": 0.01037694, + "balance_loss_clip": 1.03825176, + "balance_loss_mlp": 1.02434278, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 1.7543778964077525, + "language_loss": 0.77659345, + "learning_rate": 7.956633242496788e-07, + "loss": 0.79799032, + "num_input_tokens_seen": 256334205, + "step": 11884, + "time_per_iteration": 3.7862870693206787 + }, + { + "auxiliary_loss_clip": 0.01108016, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.0380106, + "balance_loss_mlp": 1.01815605, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 2.4631943418524442, + "language_loss": 0.73904967, + "learning_rate": 7.953524128959954e-07, + "loss": 0.76045263, + "num_input_tokens_seen": 256353340, + "step": 11885, + "time_per_iteration": 2.461412191390991 + }, + { + "auxiliary_loss_clip": 0.01014257, + "auxiliary_loss_mlp": 0.01003206, + "balance_loss_clip": 1.008883, + "balance_loss_mlp": 1.00186491, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8885935543320851, + "language_loss": 0.66303128, + "learning_rate": 7.95041547222669e-07, + "loss": 0.6832059, + "num_input_tokens_seen": 256411550, + "step": 11886, + "time_per_iteration": 3.053929567337036 + }, + { + "auxiliary_loss_clip": 0.0106957, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.03819585, + "balance_loss_mlp": 1.01723456, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 1.9886334913582366, + "language_loss": 0.7496314, + "learning_rate": 7.947307272414874e-07, + "loss": 0.7706244, + "num_input_tokens_seen": 256430360, + "step": 11887, + "time_per_iteration": 2.5550224781036377 + }, + { + "auxiliary_loss_clip": 0.01099606, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_clip": 1.03679919, + "balance_loss_mlp": 1.01483047, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.5387215895700805, + "language_loss": 0.71712482, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73838896, + "num_input_tokens_seen": 256449750, + "step": 11888, + "time_per_iteration": 2.4615862369537354 + }, + { + "auxiliary_loss_clip": 0.01097154, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.03469539, + "balance_loss_mlp": 1.02357328, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 2.169538619499874, + "language_loss": 0.84118104, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86252189, + "num_input_tokens_seen": 256467330, + "step": 11889, + "time_per_iteration": 2.543673515319824 + }, + { + "auxiliary_loss_clip": 0.01065519, + "auxiliary_loss_mlp": 0.01028813, + "balance_loss_clip": 1.03858554, + "balance_loss_mlp": 1.01629567, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 1.868161829117787, + "language_loss": 0.76114333, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78208667, + "num_input_tokens_seen": 256485705, + "step": 11890, + "time_per_iteration": 2.6173903942108154 + }, + { + "auxiliary_loss_clip": 0.01071476, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.03288782, + "balance_loss_mlp": 1.02602935, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 1.451202009182958, + "language_loss": 0.73679274, + "learning_rate": 7.934879044739147e-07, + "loss": 0.75789344, + "num_input_tokens_seen": 256504755, + "step": 11891, + "time_per_iteration": 2.5715548992156982 + }, + { + "auxiliary_loss_clip": 0.01067833, + "auxiliary_loss_mlp": 0.01034388, + "balance_loss_clip": 1.03560412, + "balance_loss_mlp": 1.02124548, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 6.6247057118232435, + "language_loss": 0.6757766, + "learning_rate": 7.931773131302211e-07, + "loss": 0.6967988, + "num_input_tokens_seen": 256523670, + "step": 11892, + "time_per_iteration": 2.547236204147339 + }, + { + "auxiliary_loss_clip": 0.01078654, + "auxiliary_loss_mlp": 0.01031028, + "balance_loss_clip": 1.03934598, + "balance_loss_mlp": 1.01714587, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 3.870088547347329, + "language_loss": 0.73815227, + "learning_rate": 7.928667675493632e-07, + "loss": 0.75924909, + "num_input_tokens_seen": 256542225, + "step": 11893, + "time_per_iteration": 2.5453100204467773 + }, + { + "auxiliary_loss_clip": 0.01115622, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.03894567, + "balance_loss_mlp": 1.01856017, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 2.0571657031993054, + "language_loss": 0.66489065, + "learning_rate": 7.925562677431185e-07, + "loss": 0.68636602, + "num_input_tokens_seen": 256560730, + "step": 11894, + "time_per_iteration": 2.412656784057617 + }, + { + "auxiliary_loss_clip": 0.01078709, + "auxiliary_loss_mlp": 0.0103298, + "balance_loss_clip": 1.03847098, + "balance_loss_mlp": 1.02031994, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 1.6900446831963596, + "language_loss": 0.7758013, + "learning_rate": 7.922458137232613e-07, + "loss": 0.79691827, + "num_input_tokens_seen": 256580505, + "step": 11895, + "time_per_iteration": 2.5729105472564697 + }, + { + "auxiliary_loss_clip": 0.01103844, + "auxiliary_loss_mlp": 0.0103381, + "balance_loss_clip": 1.03932786, + "balance_loss_mlp": 1.02019644, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 1.9236870293036075, + "language_loss": 0.69479084, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71616745, + "num_input_tokens_seen": 256597330, + "step": 11896, + "time_per_iteration": 4.008653402328491 + }, + { + "auxiliary_loss_clip": 0.01091589, + "auxiliary_loss_mlp": 0.01041491, + "balance_loss_clip": 1.03759992, + "balance_loss_mlp": 1.02764523, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 2.5636922293042894, + "language_loss": 0.86693513, + "learning_rate": 7.91625043089798e-07, + "loss": 0.88826585, + "num_input_tokens_seen": 256616030, + "step": 11897, + "time_per_iteration": 2.490389347076416 + }, + { + "auxiliary_loss_clip": 0.01087872, + "auxiliary_loss_mlp": 0.01036505, + "balance_loss_clip": 1.03603673, + "balance_loss_mlp": 1.02433395, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 2.786743728277686, + "language_loss": 0.77997541, + "learning_rate": 7.913147264997304e-07, + "loss": 0.80121922, + "num_input_tokens_seen": 256635570, + "step": 11898, + "time_per_iteration": 2.489943027496338 + }, + { + "auxiliary_loss_clip": 0.01090768, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.03599715, + "balance_loss_mlp": 1.01609647, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 1.6127944338413789, + "language_loss": 0.72714305, + "learning_rate": 7.910044557431302e-07, + "loss": 0.74835151, + "num_input_tokens_seen": 256655290, + "step": 11899, + "time_per_iteration": 2.517857789993286 + }, + { + "auxiliary_loss_clip": 0.01100661, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.03590226, + "balance_loss_mlp": 1.02165651, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 2.6170524176695715, + "language_loss": 0.76253712, + "learning_rate": 7.906942308317614e-07, + "loss": 0.78389418, + "num_input_tokens_seen": 256671605, + "step": 11900, + "time_per_iteration": 2.4741530418395996 + }, + { + "auxiliary_loss_clip": 0.01102833, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.03916967, + "balance_loss_mlp": 1.01610899, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 1.9207368988853895, + "language_loss": 0.80879068, + "learning_rate": 7.903840517773886e-07, + "loss": 0.83010399, + "num_input_tokens_seen": 256689680, + "step": 11901, + "time_per_iteration": 2.441649913787842 + }, + { + "auxiliary_loss_clip": 0.010807, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.03548014, + "balance_loss_mlp": 1.023242, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 1.7793440198949675, + "language_loss": 0.81451857, + "learning_rate": 7.900739185917744e-07, + "loss": 0.83568931, + "num_input_tokens_seen": 256707760, + "step": 11902, + "time_per_iteration": 2.5301592350006104 + }, + { + "auxiliary_loss_clip": 0.01070447, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.03513765, + "balance_loss_mlp": 1.01881468, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 2.047044204022513, + "language_loss": 0.68109226, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70211357, + "num_input_tokens_seen": 256724150, + "step": 11903, + "time_per_iteration": 2.477769374847412 + }, + { + "auxiliary_loss_clip": 0.01075614, + "auxiliary_loss_mlp": 0.01031799, + "balance_loss_clip": 1.03870738, + "balance_loss_mlp": 1.01996112, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 1.6648614900970462, + "language_loss": 0.75945663, + "learning_rate": 7.894537898738589e-07, + "loss": 0.78053069, + "num_input_tokens_seen": 256742780, + "step": 11904, + "time_per_iteration": 2.5341672897338867 + }, + { + "auxiliary_loss_clip": 0.01091199, + "auxiliary_loss_mlp": 0.01036938, + "balance_loss_clip": 1.03805828, + "balance_loss_mlp": 1.02315164, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 2.478005966754301, + "language_loss": 0.72339511, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74467647, + "num_input_tokens_seen": 256761355, + "step": 11905, + "time_per_iteration": 2.464578151702881 + }, + { + "auxiliary_loss_clip": 0.01075508, + "auxiliary_loss_mlp": 0.01033649, + "balance_loss_clip": 1.03639245, + "balance_loss_mlp": 1.02159095, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.508701223178089, + "language_loss": 0.77682924, + "learning_rate": 7.88833844772076e-07, + "loss": 0.79792082, + "num_input_tokens_seen": 256781335, + "step": 11906, + "time_per_iteration": 2.5343737602233887 + }, + { + "auxiliary_loss_clip": 0.01014042, + "auxiliary_loss_mlp": 0.01002696, + "balance_loss_clip": 1.01004934, + "balance_loss_mlp": 1.00161672, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.7350624996998165, + "language_loss": 0.5547691, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57493645, + "num_input_tokens_seen": 256838890, + "step": 11907, + "time_per_iteration": 3.001647472381592 + }, + { + "auxiliary_loss_clip": 0.01096725, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.03509545, + "balance_loss_mlp": 1.02619684, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 1.8239708682255265, + "language_loss": 0.69769561, + "learning_rate": 7.882140833804593e-07, + "loss": 0.71905744, + "num_input_tokens_seen": 256858145, + "step": 11908, + "time_per_iteration": 2.448072910308838 + }, + { + "auxiliary_loss_clip": 0.01064695, + "auxiliary_loss_mlp": 0.01039057, + "balance_loss_clip": 1.0324986, + "balance_loss_mlp": 1.02504992, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 2.5301471553400483, + "language_loss": 0.71589446, + "learning_rate": 7.879042716053415e-07, + "loss": 0.73693198, + "num_input_tokens_seen": 256878545, + "step": 11909, + "time_per_iteration": 2.590439558029175 + }, + { + "auxiliary_loss_clip": 0.01099665, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.03833175, + "balance_loss_mlp": 1.02220392, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 1.9703795124683303, + "language_loss": 0.7512548, + "learning_rate": 7.875945057930144e-07, + "loss": 0.77259952, + "num_input_tokens_seen": 256899920, + "step": 11910, + "time_per_iteration": 4.015405893325806 + }, + { + "auxiliary_loss_clip": 0.01088329, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.03979731, + "balance_loss_mlp": 1.02207851, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 2.5301861442700626, + "language_loss": 0.76733637, + "learning_rate": 7.872847859552251e-07, + "loss": 0.78855354, + "num_input_tokens_seen": 256918460, + "step": 11911, + "time_per_iteration": 2.511751651763916 + }, + { + "auxiliary_loss_clip": 0.01070902, + "auxiliary_loss_mlp": 0.01042163, + "balance_loss_clip": 1.03509712, + "balance_loss_mlp": 1.02736926, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 1.7448064750365193, + "language_loss": 0.5860126, + "learning_rate": 7.869751121037192e-07, + "loss": 0.60714322, + "num_input_tokens_seen": 256942015, + "step": 11912, + "time_per_iteration": 2.898501396179199 + }, + { + "auxiliary_loss_clip": 0.01101285, + "auxiliary_loss_mlp": 0.01032885, + "balance_loss_clip": 1.03920209, + "balance_loss_mlp": 1.01957488, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 3.6938841691792113, + "language_loss": 0.78240472, + "learning_rate": 7.866654842502376e-07, + "loss": 0.80374646, + "num_input_tokens_seen": 256961065, + "step": 11913, + "time_per_iteration": 2.484692335128784 + }, + { + "auxiliary_loss_clip": 0.01086365, + "auxiliary_loss_mlp": 0.0102671, + "balance_loss_clip": 1.03624189, + "balance_loss_mlp": 1.01559329, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.5103620843277066, + "language_loss": 0.73986685, + "learning_rate": 7.863559024065234e-07, + "loss": 0.76099765, + "num_input_tokens_seen": 256982165, + "step": 11914, + "time_per_iteration": 2.523397445678711 + }, + { + "auxiliary_loss_clip": 0.01075392, + "auxiliary_loss_mlp": 0.01033449, + "balance_loss_clip": 1.0360713, + "balance_loss_mlp": 1.02152157, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 1.814179011365894, + "language_loss": 0.74146473, + "learning_rate": 7.860463665843143e-07, + "loss": 0.7625531, + "num_input_tokens_seen": 256999825, + "step": 11915, + "time_per_iteration": 2.502432346343994 + }, + { + "auxiliary_loss_clip": 0.01111905, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.03692436, + "balance_loss_mlp": 1.02082634, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 1.7661056653218596, + "language_loss": 0.81296396, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83441448, + "num_input_tokens_seen": 257017450, + "step": 11916, + "time_per_iteration": 2.397294521331787 + }, + { + "auxiliary_loss_clip": 0.01037846, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.03456509, + "balance_loss_mlp": 1.0225147, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 2.147832487820179, + "language_loss": 0.68479633, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70552874, + "num_input_tokens_seen": 257035465, + "step": 11917, + "time_per_iteration": 2.6082065105438232 + }, + { + "auxiliary_loss_clip": 0.0108933, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.0357182, + "balance_loss_mlp": 1.01839137, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 1.572129084123218, + "language_loss": 0.75990909, + "learning_rate": 7.851180353640896e-07, + "loss": 0.78111315, + "num_input_tokens_seen": 257053750, + "step": 11918, + "time_per_iteration": 2.493542194366455 + }, + { + "auxiliary_loss_clip": 0.01014798, + "auxiliary_loss_mlp": 0.01002861, + "balance_loss_clip": 1.01038885, + "balance_loss_mlp": 1.00163937, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6257729174781608, + "language_loss": 0.53902125, + "learning_rate": 7.848086837452639e-07, + "loss": 0.55919784, + "num_input_tokens_seen": 257121215, + "step": 11919, + "time_per_iteration": 3.1084508895874023 + }, + { + "auxiliary_loss_clip": 0.01095408, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.04267621, + "balance_loss_mlp": 1.0181911, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 3.284511522417224, + "language_loss": 0.69051039, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71176636, + "num_input_tokens_seen": 257143370, + "step": 11920, + "time_per_iteration": 4.098114252090454 + }, + { + "auxiliary_loss_clip": 0.01092324, + "auxiliary_loss_mlp": 0.01041896, + "balance_loss_clip": 1.03563464, + "balance_loss_mlp": 1.02839589, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 2.224411000621471, + "language_loss": 0.74924213, + "learning_rate": 7.841901187598678e-07, + "loss": 0.77058429, + "num_input_tokens_seen": 257162160, + "step": 11921, + "time_per_iteration": 2.5568275451660156 + }, + { + "auxiliary_loss_clip": 0.0108213, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.03967929, + "balance_loss_mlp": 1.02018905, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 3.1836894350083615, + "language_loss": 0.75859249, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77976763, + "num_input_tokens_seen": 257179300, + "step": 11922, + "time_per_iteration": 2.5036449432373047 + }, + { + "auxiliary_loss_clip": 0.01013876, + "auxiliary_loss_mlp": 0.01004834, + "balance_loss_clip": 1.00769579, + "balance_loss_mlp": 1.00367188, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.7816958626594314, + "language_loss": 0.5516175, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57180464, + "num_input_tokens_seen": 257235470, + "step": 11923, + "time_per_iteration": 4.143768548965454 + }, + { + "auxiliary_loss_clip": 0.01080041, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.03529656, + "balance_loss_mlp": 1.02283907, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.5561383339043018, + "language_loss": 0.77069515, + "learning_rate": 7.832626170883279e-07, + "loss": 0.7918511, + "num_input_tokens_seen": 257255850, + "step": 11924, + "time_per_iteration": 2.5843451023101807 + }, + { + "auxiliary_loss_clip": 0.01078254, + "auxiliary_loss_mlp": 0.01031255, + "balance_loss_clip": 1.03724229, + "balance_loss_mlp": 1.0200851, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 1.9216894103808053, + "language_loss": 0.68646932, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70756441, + "num_input_tokens_seen": 257275425, + "step": 11925, + "time_per_iteration": 2.5559914112091064 + }, + { + "auxiliary_loss_clip": 0.01079728, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.03636789, + "balance_loss_mlp": 1.01949883, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 4.141376572511832, + "language_loss": 0.77658963, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79769611, + "num_input_tokens_seen": 257295740, + "step": 11926, + "time_per_iteration": 2.546434164047241 + }, + { + "auxiliary_loss_clip": 0.01103119, + "auxiliary_loss_mlp": 0.00778802, + "balance_loss_clip": 1.03991103, + "balance_loss_mlp": 1.00059807, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 1.9528190860614127, + "language_loss": 0.77255094, + "learning_rate": 7.823355306660093e-07, + "loss": 0.79137015, + "num_input_tokens_seen": 257315970, + "step": 11927, + "time_per_iteration": 2.50309157371521 + }, + { + "auxiliary_loss_clip": 0.01100508, + "auxiliary_loss_mlp": 0.01031753, + "balance_loss_clip": 1.03796864, + "balance_loss_mlp": 1.01881909, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.6501409130190672, + "language_loss": 0.68934649, + "learning_rate": 7.820265941908642e-07, + "loss": 0.7106691, + "num_input_tokens_seen": 257334230, + "step": 11928, + "time_per_iteration": 2.4701125621795654 + }, + { + "auxiliary_loss_clip": 0.01064414, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.03612089, + "balance_loss_mlp": 1.01901257, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 2.3867085004868605, + "language_loss": 0.65376538, + "learning_rate": 7.817177039013931e-07, + "loss": 0.67471462, + "num_input_tokens_seen": 257352145, + "step": 11929, + "time_per_iteration": 2.611354351043701 + }, + { + "auxiliary_loss_clip": 0.01087101, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.03466344, + "balance_loss_mlp": 1.01973104, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 2.0132478345301323, + "language_loss": 0.6943956, + "learning_rate": 7.81408859809308e-07, + "loss": 0.71559131, + "num_input_tokens_seen": 257371460, + "step": 11930, + "time_per_iteration": 2.5032005310058594 + }, + { + "auxiliary_loss_clip": 0.01077633, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.03364849, + "balance_loss_mlp": 1.01717854, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 1.9017580106438043, + "language_loss": 0.80687469, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82794809, + "num_input_tokens_seen": 257390800, + "step": 11931, + "time_per_iteration": 2.5337226390838623 + }, + { + "auxiliary_loss_clip": 0.01099654, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.03825855, + "balance_loss_mlp": 1.02049518, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 1.9197934351131511, + "language_loss": 0.78220898, + "learning_rate": 7.80791310264143e-07, + "loss": 0.80352724, + "num_input_tokens_seen": 257407495, + "step": 11932, + "time_per_iteration": 2.4305880069732666 + }, + { + "auxiliary_loss_clip": 0.01097967, + "auxiliary_loss_mlp": 0.0103119, + "balance_loss_clip": 1.04007697, + "balance_loss_mlp": 1.01921523, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 1.4666241539830431, + "language_loss": 0.75072837, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77201998, + "num_input_tokens_seen": 257429675, + "step": 11933, + "time_per_iteration": 2.5164272785186768 + }, + { + "auxiliary_loss_clip": 0.01118548, + "auxiliary_loss_mlp": 0.01035925, + "balance_loss_clip": 1.03999996, + "balance_loss_mlp": 1.02055907, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 2.842050960730028, + "language_loss": 0.69955671, + "learning_rate": 7.801739456490388e-07, + "loss": 0.7211014, + "num_input_tokens_seen": 257442765, + "step": 11934, + "time_per_iteration": 2.371309757232666 + }, + { + "auxiliary_loss_clip": 0.01102183, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.03697634, + "balance_loss_mlp": 1.0203948, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 2.0895175770540364, + "language_loss": 0.86845195, + "learning_rate": 7.798653327195237e-07, + "loss": 0.88980627, + "num_input_tokens_seen": 257459310, + "step": 11935, + "time_per_iteration": 3.963916063308716 + }, + { + "auxiliary_loss_clip": 0.01070921, + "auxiliary_loss_mlp": 0.0103439, + "balance_loss_clip": 1.03225505, + "balance_loss_mlp": 1.02150917, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 1.4988882725402624, + "language_loss": 0.73487222, + "learning_rate": 7.795567660576388e-07, + "loss": 0.7559253, + "num_input_tokens_seen": 257484750, + "step": 11936, + "time_per_iteration": 2.7004897594451904 + }, + { + "auxiliary_loss_clip": 0.01029889, + "auxiliary_loss_mlp": 0.01001185, + "balance_loss_clip": 1.00607967, + "balance_loss_mlp": 1.00005865, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7547215184880507, + "language_loss": 0.55867225, + "learning_rate": 7.79248245675082e-07, + "loss": 0.57898301, + "num_input_tokens_seen": 257543110, + "step": 11937, + "time_per_iteration": 2.995461940765381 + }, + { + "auxiliary_loss_clip": 0.0110377, + "auxiliary_loss_mlp": 0.01035817, + "balance_loss_clip": 1.0386126, + "balance_loss_mlp": 1.02188742, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 1.9995699316177278, + "language_loss": 0.54446459, + "learning_rate": 7.789397715835542e-07, + "loss": 0.56586045, + "num_input_tokens_seen": 257567410, + "step": 11938, + "time_per_iteration": 2.6006484031677246 + }, + { + "auxiliary_loss_clip": 0.01095524, + "auxiliary_loss_mlp": 0.01029944, + "balance_loss_clip": 1.03458357, + "balance_loss_mlp": 1.01810646, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.5913044950255653, + "language_loss": 0.7665211, + "learning_rate": 7.786313437947527e-07, + "loss": 0.7877757, + "num_input_tokens_seen": 257586270, + "step": 11939, + "time_per_iteration": 2.459486961364746 + }, + { + "auxiliary_loss_clip": 0.01012027, + "auxiliary_loss_mlp": 0.00999454, + "balance_loss_clip": 1.00689197, + "balance_loss_mlp": 0.99811906, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.7517313599973958, + "language_loss": 0.61352199, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63363677, + "num_input_tokens_seen": 257647415, + "step": 11940, + "time_per_iteration": 3.0367484092712402 + }, + { + "auxiliary_loss_clip": 0.01072575, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.0346067, + "balance_loss_mlp": 1.02063847, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 1.7018415573470511, + "language_loss": 0.59226859, + "learning_rate": 7.780146271721097e-07, + "loss": 0.6133213, + "num_input_tokens_seen": 257669795, + "step": 11941, + "time_per_iteration": 2.5698435306549072 + }, + { + "auxiliary_loss_clip": 0.01087828, + "auxiliary_loss_mlp": 0.01027466, + "balance_loss_clip": 1.0364213, + "balance_loss_mlp": 1.01569986, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 2.235826451871709, + "language_loss": 0.79317373, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81432664, + "num_input_tokens_seen": 257687415, + "step": 11942, + "time_per_iteration": 2.5069096088409424 + }, + { + "auxiliary_loss_clip": 0.01102011, + "auxiliary_loss_mlp": 0.01043192, + "balance_loss_clip": 1.03877056, + "balance_loss_mlp": 1.03059745, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 2.118873819598508, + "language_loss": 0.66063696, + "learning_rate": 7.773980959006968e-07, + "loss": 0.68208897, + "num_input_tokens_seen": 257706215, + "step": 11943, + "time_per_iteration": 2.4381160736083984 + }, + { + "auxiliary_loss_clip": 0.01108283, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.03638268, + "balance_loss_mlp": 1.01910138, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1.8192220616014358, + "language_loss": 0.79097915, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81237769, + "num_input_tokens_seen": 257724740, + "step": 11944, + "time_per_iteration": 2.3990814685821533 + }, + { + "auxiliary_loss_clip": 0.01084989, + "auxiliary_loss_mlp": 0.00777957, + "balance_loss_clip": 1.0356071, + "balance_loss_mlp": 1.00060534, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 2.551751090068564, + "language_loss": 0.62710476, + "learning_rate": 7.767817500740277e-07, + "loss": 0.64573419, + "num_input_tokens_seen": 257742060, + "step": 11945, + "time_per_iteration": 2.4534809589385986 + }, + { + "auxiliary_loss_clip": 0.01027621, + "auxiliary_loss_mlp": 0.01002008, + "balance_loss_clip": 1.01844215, + "balance_loss_mlp": 1.00079811, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 1.0675809955333542, + "language_loss": 0.51034915, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53064549, + "num_input_tokens_seen": 257802250, + "step": 11946, + "time_per_iteration": 2.976539373397827 + }, + { + "auxiliary_loss_clip": 0.01079088, + "auxiliary_loss_mlp": 0.01033797, + "balance_loss_clip": 1.03548348, + "balance_loss_mlp": 1.01983738, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 1.7081223374552503, + "language_loss": 0.74511504, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76624388, + "num_input_tokens_seen": 257821155, + "step": 11947, + "time_per_iteration": 2.538278579711914 + }, + { + "auxiliary_loss_clip": 0.01064327, + "auxiliary_loss_mlp": 0.00777751, + "balance_loss_clip": 1.03197992, + "balance_loss_mlp": 1.00049114, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.7434028342501395, + "language_loss": 0.72489715, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74331796, + "num_input_tokens_seen": 257839905, + "step": 11948, + "time_per_iteration": 2.5741374492645264 + }, + { + "auxiliary_loss_clip": 0.01090629, + "auxiliary_loss_mlp": 0.01038821, + "balance_loss_clip": 1.03446341, + "balance_loss_mlp": 1.02579761, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 1.704350909704034, + "language_loss": 0.71473479, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73602927, + "num_input_tokens_seen": 257860055, + "step": 11949, + "time_per_iteration": 3.9757180213928223 + }, + { + "auxiliary_loss_clip": 0.01108612, + "auxiliary_loss_mlp": 0.00777963, + "balance_loss_clip": 1.03685725, + "balance_loss_mlp": 1.00053406, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 2.8398854218544396, + "language_loss": 0.76204145, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78090715, + "num_input_tokens_seen": 257879315, + "step": 11950, + "time_per_iteration": 2.471921682357788 + }, + { + "auxiliary_loss_clip": 0.01115467, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.04012024, + "balance_loss_mlp": 1.02207422, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 2.648807991662104, + "language_loss": 0.67584538, + "learning_rate": 7.749338261972282e-07, + "loss": 0.69735169, + "num_input_tokens_seen": 257896570, + "step": 11951, + "time_per_iteration": 2.4092748165130615 + }, + { + "auxiliary_loss_clip": 0.01091553, + "auxiliary_loss_mlp": 0.01032177, + "balance_loss_clip": 1.03649819, + "balance_loss_mlp": 1.01816964, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.744957928808669, + "language_loss": 0.78123099, + "learning_rate": 7.746260014075286e-07, + "loss": 0.8024683, + "num_input_tokens_seen": 257916855, + "step": 11952, + "time_per_iteration": 2.510828733444214 + }, + { + "auxiliary_loss_clip": 0.0110546, + "auxiliary_loss_mlp": 0.01033813, + "balance_loss_clip": 1.0384202, + "balance_loss_mlp": 1.0209384, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 1.9692273016548023, + "language_loss": 0.75144619, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77283889, + "num_input_tokens_seen": 257937140, + "step": 11953, + "time_per_iteration": 2.5007808208465576 + }, + { + "auxiliary_loss_clip": 0.0110205, + "auxiliary_loss_mlp": 0.01031246, + "balance_loss_clip": 1.03638375, + "balance_loss_mlp": 1.01850259, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 1.75656602307128, + "language_loss": 0.72989464, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75122762, + "num_input_tokens_seen": 257956785, + "step": 11954, + "time_per_iteration": 2.446626663208008 + }, + { + "auxiliary_loss_clip": 0.01092539, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.04066801, + "balance_loss_mlp": 1.02398646, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.8962221537614068, + "language_loss": 0.74308002, + "learning_rate": 7.737028058829425e-07, + "loss": 0.7643764, + "num_input_tokens_seen": 257975455, + "step": 11955, + "time_per_iteration": 2.4950032234191895 + }, + { + "auxiliary_loss_clip": 0.01082673, + "auxiliary_loss_mlp": 0.01033896, + "balance_loss_clip": 1.04005754, + "balance_loss_mlp": 1.02152777, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.7148046382310436, + "language_loss": 0.73187542, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75304115, + "num_input_tokens_seen": 257996850, + "step": 11956, + "time_per_iteration": 2.6311497688293457 + }, + { + "auxiliary_loss_clip": 0.01029366, + "auxiliary_loss_mlp": 0.0104007, + "balance_loss_clip": 1.02859533, + "balance_loss_mlp": 1.02508569, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 1.7070113822565744, + "language_loss": 0.712506, + "learning_rate": 7.730875746869987e-07, + "loss": 0.73320037, + "num_input_tokens_seen": 258016145, + "step": 11957, + "time_per_iteration": 2.6530001163482666 + }, + { + "auxiliary_loss_clip": 0.01065587, + "auxiliary_loss_mlp": 0.01040892, + "balance_loss_clip": 1.03525782, + "balance_loss_mlp": 1.02742183, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 1.8545985916705197, + "language_loss": 0.7376886, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75875342, + "num_input_tokens_seen": 258035420, + "step": 11958, + "time_per_iteration": 2.6840388774871826 + }, + { + "auxiliary_loss_clip": 0.0109599, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.03405344, + "balance_loss_mlp": 1.02436805, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 1.6769705584496408, + "language_loss": 0.84240377, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86373758, + "num_input_tokens_seen": 258053520, + "step": 11959, + "time_per_iteration": 3.963736057281494 + }, + { + "auxiliary_loss_clip": 0.01117715, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.0414722, + "balance_loss_mlp": 1.02007723, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 1.4965456342509287, + "language_loss": 0.81934685, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84085834, + "num_input_tokens_seen": 258073020, + "step": 11960, + "time_per_iteration": 2.4643075466156006 + }, + { + "auxiliary_loss_clip": 0.01085038, + "auxiliary_loss_mlp": 0.01041293, + "balance_loss_clip": 1.03566873, + "balance_loss_mlp": 1.02742875, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 1.7479150775383663, + "language_loss": 0.78031659, + "learning_rate": 7.718576706841013e-07, + "loss": 0.80157995, + "num_input_tokens_seen": 258093155, + "step": 11961, + "time_per_iteration": 2.526754379272461 + }, + { + "auxiliary_loss_clip": 0.01091942, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.03647506, + "balance_loss_mlp": 1.01943791, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 1.6270266280703822, + "language_loss": 0.75113124, + "learning_rate": 7.715503110824326e-07, + "loss": 0.77235961, + "num_input_tokens_seen": 258113905, + "step": 11962, + "time_per_iteration": 2.479374408721924 + }, + { + "auxiliary_loss_clip": 0.01100854, + "auxiliary_loss_mlp": 0.01034791, + "balance_loss_clip": 1.0373559, + "balance_loss_mlp": 1.02067041, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 1.6182670440067806, + "language_loss": 0.75183403, + "learning_rate": 7.712429980637001e-07, + "loss": 0.7731905, + "num_input_tokens_seen": 258132820, + "step": 11963, + "time_per_iteration": 3.776919364929199 + }, + { + "auxiliary_loss_clip": 0.01076474, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.037503, + "balance_loss_mlp": 1.02154493, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 2.766546516882408, + "language_loss": 0.80696166, + "learning_rate": 7.709357316395564e-07, + "loss": 0.82808197, + "num_input_tokens_seen": 258148055, + "step": 11964, + "time_per_iteration": 2.478213310241699 + }, + { + "auxiliary_loss_clip": 0.01098664, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.03722489, + "balance_loss_mlp": 1.01851809, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 1.7310987159930553, + "language_loss": 0.75058508, + "learning_rate": 7.70628511821652e-07, + "loss": 0.77187985, + "num_input_tokens_seen": 258165995, + "step": 11965, + "time_per_iteration": 2.4493513107299805 + }, + { + "auxiliary_loss_clip": 0.01085888, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.03819537, + "balance_loss_mlp": 1.01973581, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.5646028721657368, + "language_loss": 0.77476764, + "learning_rate": 7.703213386216377e-07, + "loss": 0.79595423, + "num_input_tokens_seen": 258186165, + "step": 11966, + "time_per_iteration": 2.53588604927063 + }, + { + "auxiliary_loss_clip": 0.01087957, + "auxiliary_loss_mlp": 0.01034708, + "balance_loss_clip": 1.03513706, + "balance_loss_mlp": 1.02118957, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 2.053512547500261, + "language_loss": 0.73103797, + "learning_rate": 7.700142120511619e-07, + "loss": 0.75226462, + "num_input_tokens_seen": 258204595, + "step": 11967, + "time_per_iteration": 2.4799156188964844 + }, + { + "auxiliary_loss_clip": 0.01082353, + "auxiliary_loss_mlp": 0.01030885, + "balance_loss_clip": 1.0365746, + "balance_loss_mlp": 1.01980448, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 1.7578333908613655, + "language_loss": 0.81625509, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83738744, + "num_input_tokens_seen": 258223110, + "step": 11968, + "time_per_iteration": 2.4837605953216553 + }, + { + "auxiliary_loss_clip": 0.01087008, + "auxiliary_loss_mlp": 0.01024215, + "balance_loss_clip": 1.0362258, + "balance_loss_mlp": 1.01253283, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 1.867496515247873, + "language_loss": 0.76132321, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78243542, + "num_input_tokens_seen": 258242660, + "step": 11969, + "time_per_iteration": 2.50846004486084 + }, + { + "auxiliary_loss_clip": 0.01070784, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_clip": 1.03044248, + "balance_loss_mlp": 1.01833177, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 1.643888428839624, + "language_loss": 0.70987886, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73090434, + "num_input_tokens_seen": 258261850, + "step": 11970, + "time_per_iteration": 2.5101239681243896 + }, + { + "auxiliary_loss_clip": 0.01013193, + "auxiliary_loss_mlp": 0.01000204, + "balance_loss_clip": 1.00810099, + "balance_loss_mlp": 0.99891037, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9263917079942814, + "language_loss": 0.60807705, + "learning_rate": 7.68786172297538e-07, + "loss": 0.62821102, + "num_input_tokens_seen": 258312570, + "step": 11971, + "time_per_iteration": 2.988412618637085 + }, + { + "auxiliary_loss_clip": 0.0111817, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.03907371, + "balance_loss_mlp": 1.02039516, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 2.500458521150614, + "language_loss": 0.79996008, + "learning_rate": 7.684792790494105e-07, + "loss": 0.82148135, + "num_input_tokens_seen": 258331600, + "step": 11972, + "time_per_iteration": 2.4321482181549072 + }, + { + "auxiliary_loss_clip": 0.01092733, + "auxiliary_loss_mlp": 0.01035526, + "balance_loss_clip": 1.0362426, + "balance_loss_mlp": 1.02219868, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 1.5384459414827838, + "language_loss": 0.75579083, + "learning_rate": 7.681724325006733e-07, + "loss": 0.7770735, + "num_input_tokens_seen": 258351785, + "step": 11973, + "time_per_iteration": 2.5384509563446045 + }, + { + "auxiliary_loss_clip": 0.00997824, + "auxiliary_loss_mlp": 0.01000446, + "balance_loss_clip": 1.01091743, + "balance_loss_mlp": 0.99920636, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8563360983663675, + "language_loss": 0.57170326, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59168595, + "num_input_tokens_seen": 258404035, + "step": 11974, + "time_per_iteration": 4.565528869628906 + }, + { + "auxiliary_loss_clip": 0.01085813, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.03582144, + "balance_loss_mlp": 1.01923621, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 1.9342678090908578, + "language_loss": 0.61741006, + "learning_rate": 7.675588795479062e-07, + "loss": 0.63859034, + "num_input_tokens_seen": 258424850, + "step": 11975, + "time_per_iteration": 2.541072130203247 + }, + { + "auxiliary_loss_clip": 0.0109864, + "auxiliary_loss_mlp": 0.01033469, + "balance_loss_clip": 1.03440654, + "balance_loss_mlp": 1.02115464, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 1.9052085365492493, + "language_loss": 0.67699838, + "learning_rate": 7.672521731671425e-07, + "loss": 0.69831955, + "num_input_tokens_seen": 258445485, + "step": 11976, + "time_per_iteration": 2.499866008758545 + }, + { + "auxiliary_loss_clip": 0.0108486, + "auxiliary_loss_mlp": 0.01029502, + "balance_loss_clip": 1.03986442, + "balance_loss_mlp": 1.01777148, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 5.15683298119589, + "language_loss": 0.67476571, + "learning_rate": 7.669455135323004e-07, + "loss": 0.69590932, + "num_input_tokens_seen": 258464505, + "step": 11977, + "time_per_iteration": 2.5285983085632324 + }, + { + "auxiliary_loss_clip": 0.01090661, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.03598285, + "balance_loss_mlp": 1.01992214, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.7415867115435366, + "language_loss": 0.75515079, + "learning_rate": 7.666389006550074e-07, + "loss": 0.77638197, + "num_input_tokens_seen": 258487190, + "step": 11978, + "time_per_iteration": 2.562349796295166 + }, + { + "auxiliary_loss_clip": 0.01109245, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.03630495, + "balance_loss_mlp": 1.01829481, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 1.7918450440042049, + "language_loss": 0.78960085, + "learning_rate": 7.663323345468908e-07, + "loss": 0.81100762, + "num_input_tokens_seen": 258503790, + "step": 11979, + "time_per_iteration": 2.4517531394958496 + }, + { + "auxiliary_loss_clip": 0.01100751, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.03640151, + "balance_loss_mlp": 1.01799655, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 1.5975995468409014, + "language_loss": 0.6496762, + "learning_rate": 7.660258152195767e-07, + "loss": 0.67099506, + "num_input_tokens_seen": 258527335, + "step": 11980, + "time_per_iteration": 2.518158435821533 + }, + { + "auxiliary_loss_clip": 0.01105123, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.0392108, + "balance_loss_mlp": 1.02460861, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 2.0567236724405062, + "language_loss": 0.67089057, + "learning_rate": 7.657193426846871e-07, + "loss": 0.6923337, + "num_input_tokens_seen": 258546690, + "step": 11981, + "time_per_iteration": 2.503544807434082 + }, + { + "auxiliary_loss_clip": 0.01081479, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.03431535, + "balance_loss_mlp": 1.02105379, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.7007230746423208, + "language_loss": 0.73613751, + "learning_rate": 7.65412916953843e-07, + "loss": 0.75730014, + "num_input_tokens_seen": 258566340, + "step": 11982, + "time_per_iteration": 2.4837937355041504 + }, + { + "auxiliary_loss_clip": 0.01082629, + "auxiliary_loss_mlp": 0.00779117, + "balance_loss_clip": 1.03484678, + "balance_loss_mlp": 1.00047374, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 1.7528367854296714, + "language_loss": 0.65964127, + "learning_rate": 7.65106538038665e-07, + "loss": 0.67825878, + "num_input_tokens_seen": 258584455, + "step": 11983, + "time_per_iteration": 2.4703054428100586 + }, + { + "auxiliary_loss_clip": 0.01088529, + "auxiliary_loss_mlp": 0.01033515, + "balance_loss_clip": 1.04571009, + "balance_loss_mlp": 1.02096236, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.560512316627697, + "language_loss": 0.66723764, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68845809, + "num_input_tokens_seen": 258604725, + "step": 11984, + "time_per_iteration": 2.5224039554595947 + }, + { + "auxiliary_loss_clip": 0.01101982, + "auxiliary_loss_mlp": 0.01034113, + "balance_loss_clip": 1.03684461, + "balance_loss_mlp": 1.02029037, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 3.014960353271223, + "language_loss": 0.73984635, + "learning_rate": 7.644939207017771e-07, + "loss": 0.76120734, + "num_input_tokens_seen": 258622885, + "step": 11985, + "time_per_iteration": 2.447453737258911 + }, + { + "auxiliary_loss_clip": 0.01099728, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.03773451, + "balance_loss_mlp": 1.01817393, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 1.7949263356196818, + "language_loss": 0.62995028, + "learning_rate": 7.641876823032977e-07, + "loss": 0.65124917, + "num_input_tokens_seen": 258644305, + "step": 11986, + "time_per_iteration": 2.5045037269592285 + }, + { + "auxiliary_loss_clip": 0.01093547, + "auxiliary_loss_mlp": 0.01035682, + "balance_loss_clip": 1.03911328, + "balance_loss_mlp": 1.02071524, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 1.7617998779235804, + "language_loss": 0.72908151, + "learning_rate": 7.638814907669455e-07, + "loss": 0.75037378, + "num_input_tokens_seen": 258661775, + "step": 11987, + "time_per_iteration": 2.5342249870300293 + }, + { + "auxiliary_loss_clip": 0.01091357, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.03718877, + "balance_loss_mlp": 1.02106094, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 2.4136718541763704, + "language_loss": 0.78974897, + "learning_rate": 7.635753461043301e-07, + "loss": 0.81100142, + "num_input_tokens_seen": 258679830, + "step": 11988, + "time_per_iteration": 2.5164690017700195 + }, + { + "auxiliary_loss_clip": 0.01110343, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.03713393, + "balance_loss_mlp": 1.01784182, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 1.8527441607514272, + "language_loss": 0.78817725, + "learning_rate": 7.632692483270618e-07, + "loss": 0.80958599, + "num_input_tokens_seen": 258697415, + "step": 11989, + "time_per_iteration": 3.841334819793701 + }, + { + "auxiliary_loss_clip": 0.01108992, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.03728354, + "balance_loss_mlp": 1.02191663, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 2.6471555535231026, + "language_loss": 0.82354867, + "learning_rate": 7.629631974467481e-07, + "loss": 0.84498578, + "num_input_tokens_seen": 258716755, + "step": 11990, + "time_per_iteration": 2.4187426567077637 + }, + { + "auxiliary_loss_clip": 0.01083652, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.0371052, + "balance_loss_mlp": 1.02873349, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 3.747518818286728, + "language_loss": 0.76750505, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78875113, + "num_input_tokens_seen": 258733270, + "step": 11991, + "time_per_iteration": 2.4518802165985107 + }, + { + "auxiliary_loss_clip": 0.01073348, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.03947127, + "balance_loss_mlp": 1.01995742, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 2.075964733630156, + "language_loss": 0.72688007, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74794424, + "num_input_tokens_seen": 258755270, + "step": 11992, + "time_per_iteration": 2.6093363761901855 + }, + { + "auxiliary_loss_clip": 0.01101343, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.03541172, + "balance_loss_mlp": 1.01788819, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 1.4335118175591794, + "language_loss": 0.66086674, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68218124, + "num_input_tokens_seen": 258775340, + "step": 11993, + "time_per_iteration": 2.4865190982818604 + }, + { + "auxiliary_loss_clip": 0.01100369, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.03597999, + "balance_loss_mlp": 1.01850283, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 2.1138979957737543, + "language_loss": 0.65282071, + "learning_rate": 7.61739463127115e-07, + "loss": 0.67413026, + "num_input_tokens_seen": 258794580, + "step": 11994, + "time_per_iteration": 2.5284485816955566 + }, + { + "auxiliary_loss_clip": 0.01101964, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.03643537, + "balance_loss_mlp": 1.0192095, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 2.1951444639613307, + "language_loss": 0.66903937, + "learning_rate": 7.614336469056172e-07, + "loss": 0.69038928, + "num_input_tokens_seen": 258812330, + "step": 11995, + "time_per_iteration": 2.433382272720337 + }, + { + "auxiliary_loss_clip": 0.01083807, + "auxiliary_loss_mlp": 0.01030461, + "balance_loss_clip": 1.03415871, + "balance_loss_mlp": 1.01725268, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 1.7151575295985102, + "language_loss": 0.7937603, + "learning_rate": 7.6112787765068e-07, + "loss": 0.8149029, + "num_input_tokens_seen": 258831770, + "step": 11996, + "time_per_iteration": 2.533997058868408 + }, + { + "auxiliary_loss_clip": 0.01113092, + "auxiliary_loss_mlp": 0.01035328, + "balance_loss_clip": 1.03915691, + "balance_loss_mlp": 1.02279294, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 2.1915544128464215, + "language_loss": 0.81389099, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83537519, + "num_input_tokens_seen": 258849090, + "step": 11997, + "time_per_iteration": 2.4713990688323975 + }, + { + "auxiliary_loss_clip": 0.01113301, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.03751969, + "balance_loss_mlp": 1.02070451, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 2.3591604517166784, + "language_loss": 0.66778731, + "learning_rate": 7.605164800868646e-07, + "loss": 0.6892662, + "num_input_tokens_seen": 258868230, + "step": 11998, + "time_per_iteration": 3.8797051906585693 + }, + { + "auxiliary_loss_clip": 0.01110292, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.03773403, + "balance_loss_mlp": 1.02188492, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 1.9523455941562626, + "language_loss": 0.72460949, + "learning_rate": 7.602108518011696e-07, + "loss": 0.7460466, + "num_input_tokens_seen": 258885525, + "step": 11999, + "time_per_iteration": 2.378086805343628 + }, + { + "auxiliary_loss_clip": 0.01093748, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.03836405, + "balance_loss_mlp": 1.01675367, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.9969818533215897, + "language_loss": 0.82941175, + "learning_rate": 7.599052705284039e-07, + "loss": 0.85064483, + "num_input_tokens_seen": 258903245, + "step": 12000, + "time_per_iteration": 2.488973617553711 + }, + { + "auxiliary_loss_clip": 0.01103353, + "auxiliary_loss_mlp": 0.01035329, + "balance_loss_clip": 1.03900313, + "balance_loss_mlp": 1.02255023, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 1.705583235111523, + "language_loss": 0.77149385, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79288065, + "num_input_tokens_seen": 258921245, + "step": 12001, + "time_per_iteration": 2.418215274810791 + }, + { + "auxiliary_loss_clip": 0.01095183, + "auxiliary_loss_mlp": 0.01042734, + "balance_loss_clip": 1.03725815, + "balance_loss_mlp": 1.02979994, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.8755390803752985, + "language_loss": 0.81722265, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83860183, + "num_input_tokens_seen": 258939425, + "step": 12002, + "time_per_iteration": 3.839355230331421 + }, + { + "auxiliary_loss_clip": 0.01102748, + "auxiliary_loss_mlp": 0.01028433, + "balance_loss_clip": 1.0387485, + "balance_loss_mlp": 1.01574349, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 1.9867353410486905, + "language_loss": 0.62021768, + "learning_rate": 7.589888089035462e-07, + "loss": 0.6415295, + "num_input_tokens_seen": 258960710, + "step": 12003, + "time_per_iteration": 2.6167514324188232 + }, + { + "auxiliary_loss_clip": 0.01111134, + "auxiliary_loss_mlp": 0.01034333, + "balance_loss_clip": 1.03689396, + "balance_loss_mlp": 1.02130318, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 2.1678130974148098, + "language_loss": 0.68494767, + "learning_rate": 7.586834157983544e-07, + "loss": 0.7064023, + "num_input_tokens_seen": 258978475, + "step": 12004, + "time_per_iteration": 2.38987398147583 + }, + { + "auxiliary_loss_clip": 0.01021949, + "auxiliary_loss_mlp": 0.01005279, + "balance_loss_clip": 1.02255893, + "balance_loss_mlp": 1.00415885, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.8525723614540751, + "language_loss": 0.54167044, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56194276, + "num_input_tokens_seen": 259037520, + "step": 12005, + "time_per_iteration": 3.003601551055908 + }, + { + "auxiliary_loss_clip": 0.01082112, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.03988278, + "balance_loss_mlp": 1.01878405, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.559170765670762, + "language_loss": 0.63439262, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65552926, + "num_input_tokens_seen": 259061325, + "step": 12006, + "time_per_iteration": 2.6917648315429688 + }, + { + "auxiliary_loss_clip": 0.0108487, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.03464699, + "balance_loss_mlp": 1.0245657, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 2.298114917492425, + "language_loss": 0.91759855, + "learning_rate": 7.577675189541865e-07, + "loss": 0.93881756, + "num_input_tokens_seen": 259078135, + "step": 12007, + "time_per_iteration": 2.5105793476104736 + }, + { + "auxiliary_loss_clip": 0.01075892, + "auxiliary_loss_mlp": 0.01039238, + "balance_loss_clip": 1.03294373, + "balance_loss_mlp": 1.02453959, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 1.8032102332546547, + "language_loss": 0.63718867, + "learning_rate": 7.574623142018568e-07, + "loss": 0.65833998, + "num_input_tokens_seen": 259095910, + "step": 12008, + "time_per_iteration": 2.509486675262451 + }, + { + "auxiliary_loss_clip": 0.01103001, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.0384264, + "balance_loss_mlp": 1.01819181, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 2.748785547263873, + "language_loss": 0.78539479, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80673766, + "num_input_tokens_seen": 259114225, + "step": 12009, + "time_per_iteration": 2.47892165184021 + }, + { + "auxiliary_loss_clip": 0.01101662, + "auxiliary_loss_mlp": 0.01039215, + "balance_loss_clip": 1.03796768, + "balance_loss_mlp": 1.02471364, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 1.7403380781982687, + "language_loss": 0.63876867, + "learning_rate": 7.568520460602297e-07, + "loss": 0.66017741, + "num_input_tokens_seen": 259134660, + "step": 12010, + "time_per_iteration": 2.496929168701172 + }, + { + "auxiliary_loss_clip": 0.01112727, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.03815246, + "balance_loss_mlp": 1.01893163, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 2.212024697915125, + "language_loss": 0.77641034, + "learning_rate": 7.565469826940742e-07, + "loss": 0.7978549, + "num_input_tokens_seen": 259153300, + "step": 12011, + "time_per_iteration": 2.450453996658325 + }, + { + "auxiliary_loss_clip": 0.01095998, + "auxiliary_loss_mlp": 0.0103459, + "balance_loss_clip": 1.03685391, + "balance_loss_mlp": 1.02258539, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 1.5622139850636574, + "language_loss": 0.79327989, + "learning_rate": 7.56241966479781e-07, + "loss": 0.81458575, + "num_input_tokens_seen": 259172115, + "step": 12012, + "time_per_iteration": 2.489746332168579 + }, + { + "auxiliary_loss_clip": 0.01091547, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.03886819, + "balance_loss_mlp": 1.01541936, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 1.7131748771675501, + "language_loss": 0.75707847, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77827048, + "num_input_tokens_seen": 259191345, + "step": 12013, + "time_per_iteration": 2.521735906600952 + }, + { + "auxiliary_loss_clip": 0.01110934, + "auxiliary_loss_mlp": 0.01026113, + "balance_loss_clip": 1.03844702, + "balance_loss_mlp": 1.01441836, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 1.4154771068070893, + "language_loss": 0.75856423, + "learning_rate": 7.556320755530484e-07, + "loss": 0.7799347, + "num_input_tokens_seen": 259211700, + "step": 12014, + "time_per_iteration": 4.010887861251831 + }, + { + "auxiliary_loss_clip": 0.01102953, + "auxiliary_loss_mlp": 0.01030709, + "balance_loss_clip": 1.03665495, + "balance_loss_mlp": 1.01832879, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.5783329357372042, + "language_loss": 0.86800659, + "learning_rate": 7.553272008637346e-07, + "loss": 0.8893432, + "num_input_tokens_seen": 259233825, + "step": 12015, + "time_per_iteration": 2.5473544597625732 + }, + { + "auxiliary_loss_clip": 0.01099678, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.03869414, + "balance_loss_mlp": 1.02122808, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.9846345503946603, + "language_loss": 0.78674793, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80807996, + "num_input_tokens_seen": 259253055, + "step": 12016, + "time_per_iteration": 2.536423444747925 + }, + { + "auxiliary_loss_clip": 0.01069227, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_clip": 1.03317821, + "balance_loss_mlp": 1.02945864, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.6479853414843135, + "language_loss": 0.77915782, + "learning_rate": 7.547175930910186e-07, + "loss": 0.8002907, + "num_input_tokens_seen": 259273420, + "step": 12017, + "time_per_iteration": 2.6119203567504883 + }, + { + "auxiliary_loss_clip": 0.01107371, + "auxiliary_loss_mlp": 0.01028631, + "balance_loss_clip": 1.03677344, + "balance_loss_mlp": 1.01710343, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 1.8375815644180185, + "language_loss": 0.74315482, + "learning_rate": 7.54412860030732e-07, + "loss": 0.76451486, + "num_input_tokens_seen": 259291000, + "step": 12018, + "time_per_iteration": 2.474483013153076 + }, + { + "auxiliary_loss_clip": 0.0108371, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.04443383, + "balance_loss_mlp": 1.01766706, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 1.5748896585439547, + "language_loss": 0.77848375, + "learning_rate": 7.541081742032347e-07, + "loss": 0.79960954, + "num_input_tokens_seen": 259312390, + "step": 12019, + "time_per_iteration": 2.5657918453216553 + }, + { + "auxiliary_loss_clip": 0.01083942, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.03760099, + "balance_loss_mlp": 1.01650143, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 1.9299649901666251, + "language_loss": 0.7396332, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76075983, + "num_input_tokens_seen": 259332645, + "step": 12020, + "time_per_iteration": 2.6136386394500732 + }, + { + "auxiliary_loss_clip": 0.01096246, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.03618622, + "balance_loss_mlp": 1.01892829, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 2.0452627354921673, + "language_loss": 0.77303255, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79430342, + "num_input_tokens_seen": 259353810, + "step": 12021, + "time_per_iteration": 2.464303970336914 + }, + { + "auxiliary_loss_clip": 0.01076042, + "auxiliary_loss_mlp": 0.01033571, + "balance_loss_clip": 1.03571951, + "balance_loss_mlp": 1.02092862, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 2.517653487752107, + "language_loss": 0.6809957, + "learning_rate": 7.531944002330073e-07, + "loss": 0.70209181, + "num_input_tokens_seen": 259372460, + "step": 12022, + "time_per_iteration": 2.5640974044799805 + }, + { + "auxiliary_loss_clip": 0.01101112, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.03676021, + "balance_loss_mlp": 1.01652312, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.7398680397884423, + "language_loss": 0.69351315, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71481681, + "num_input_tokens_seen": 259393275, + "step": 12023, + "time_per_iteration": 2.5204105377197266 + }, + { + "auxiliary_loss_clip": 0.01082036, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.03127074, + "balance_loss_mlp": 1.01730251, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 1.7708705094835706, + "language_loss": 0.71094728, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73206556, + "num_input_tokens_seen": 259416205, + "step": 12024, + "time_per_iteration": 2.55907940864563 + }, + { + "auxiliary_loss_clip": 0.01078365, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.03757226, + "balance_loss_mlp": 1.02064657, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 1.8872836675926936, + "language_loss": 0.75663328, + "learning_rate": 7.522810517737089e-07, + "loss": 0.77773976, + "num_input_tokens_seen": 259433115, + "step": 12025, + "time_per_iteration": 2.5006895065307617 + }, + { + "auxiliary_loss_clip": 0.01097885, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.03534293, + "balance_loss_mlp": 1.01824903, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 1.9763938529967489, + "language_loss": 0.76595169, + "learning_rate": 7.519766968991395e-07, + "loss": 0.78723073, + "num_input_tokens_seen": 259450475, + "step": 12026, + "time_per_iteration": 2.457569122314453 + }, + { + "auxiliary_loss_clip": 0.01100365, + "auxiliary_loss_mlp": 0.0104069, + "balance_loss_clip": 1.03698683, + "balance_loss_mlp": 1.02874565, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 2.094250738868948, + "language_loss": 0.67373097, + "learning_rate": 7.516723893497388e-07, + "loss": 0.69514149, + "num_input_tokens_seen": 259469355, + "step": 12027, + "time_per_iteration": 2.4868810176849365 + }, + { + "auxiliary_loss_clip": 0.010669, + "auxiliary_loss_mlp": 0.01029684, + "balance_loss_clip": 1.04463351, + "balance_loss_mlp": 1.01722646, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 1.9803130025469655, + "language_loss": 0.79087591, + "learning_rate": 7.513681291370469e-07, + "loss": 0.81184173, + "num_input_tokens_seen": 259486565, + "step": 12028, + "time_per_iteration": 4.190173625946045 + }, + { + "auxiliary_loss_clip": 0.0107128, + "auxiliary_loss_mlp": 0.01029619, + "balance_loss_clip": 1.03352165, + "balance_loss_mlp": 1.01656556, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 1.6667449014048092, + "language_loss": 0.82329798, + "learning_rate": 7.510639162726e-07, + "loss": 0.84430695, + "num_input_tokens_seen": 259505070, + "step": 12029, + "time_per_iteration": 2.5570085048675537 + }, + { + "auxiliary_loss_clip": 0.0101259, + "auxiliary_loss_mlp": 0.01006224, + "balance_loss_clip": 1.01067591, + "balance_loss_mlp": 1.0049063, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8127553593332563, + "language_loss": 0.61791182, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63809991, + "num_input_tokens_seen": 259569135, + "step": 12030, + "time_per_iteration": 3.14798641204834 + }, + { + "auxiliary_loss_clip": 0.01093708, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.03363609, + "balance_loss_mlp": 1.01957321, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.6591653965541568, + "language_loss": 0.77774519, + "learning_rate": 7.504556326345859e-07, + "loss": 0.79901218, + "num_input_tokens_seen": 259587035, + "step": 12031, + "time_per_iteration": 2.4474081993103027 + }, + { + "auxiliary_loss_clip": 0.0110209, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.0386492, + "balance_loss_mlp": 1.01626325, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 2.1325814399974306, + "language_loss": 0.81599075, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83729804, + "num_input_tokens_seen": 259606140, + "step": 12032, + "time_per_iteration": 2.4944870471954346 + }, + { + "auxiliary_loss_clip": 0.01075038, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.03364646, + "balance_loss_mlp": 1.02233672, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 2.184641147127928, + "language_loss": 0.75041759, + "learning_rate": 7.498475385279592e-07, + "loss": 0.77152061, + "num_input_tokens_seen": 259624275, + "step": 12033, + "time_per_iteration": 2.5355727672576904 + }, + { + "auxiliary_loss_clip": 0.0107746, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.03663957, + "balance_loss_mlp": 1.01935697, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 1.5887378598692337, + "language_loss": 0.74898672, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77006829, + "num_input_tokens_seen": 259643465, + "step": 12034, + "time_per_iteration": 2.5397093296051025 + }, + { + "auxiliary_loss_clip": 0.01087175, + "auxiliary_loss_mlp": 0.01027204, + "balance_loss_clip": 1.03639591, + "balance_loss_mlp": 1.01614094, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 1.752677110440404, + "language_loss": 0.80423796, + "learning_rate": 7.492396340449578e-07, + "loss": 0.8253817, + "num_input_tokens_seen": 259662500, + "step": 12035, + "time_per_iteration": 2.5767886638641357 + }, + { + "auxiliary_loss_clip": 0.01051898, + "auxiliary_loss_mlp": 0.01030769, + "balance_loss_clip": 1.03492415, + "balance_loss_mlp": 1.01838279, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 2.465287805912195, + "language_loss": 0.61072534, + "learning_rate": 7.489357529411326e-07, + "loss": 0.63155204, + "num_input_tokens_seen": 259680140, + "step": 12036, + "time_per_iteration": 2.6330502033233643 + }, + { + "auxiliary_loss_clip": 0.01096682, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.03691697, + "balance_loss_mlp": 1.02114868, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 2.0184473373627334, + "language_loss": 0.67686313, + "learning_rate": 7.486319192777883e-07, + "loss": 0.69815505, + "num_input_tokens_seen": 259700160, + "step": 12037, + "time_per_iteration": 2.482593536376953 + }, + { + "auxiliary_loss_clip": 0.01111644, + "auxiliary_loss_mlp": 0.01035675, + "balance_loss_clip": 1.03866756, + "balance_loss_mlp": 1.02282476, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 2.066775110041631, + "language_loss": 0.7238667, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74533987, + "num_input_tokens_seen": 259720525, + "step": 12038, + "time_per_iteration": 3.9275262355804443 + }, + { + "auxiliary_loss_clip": 0.01111384, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.03864825, + "balance_loss_mlp": 1.01904976, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 1.6585683870973926, + "language_loss": 0.72185075, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74329233, + "num_input_tokens_seen": 259738680, + "step": 12039, + "time_per_iteration": 2.4172685146331787 + }, + { + "auxiliary_loss_clip": 0.01112308, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.03787065, + "balance_loss_mlp": 1.02000129, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 1.8350869523484052, + "language_loss": 0.76139456, + "learning_rate": 7.477207030458513e-07, + "loss": 0.78283697, + "num_input_tokens_seen": 259758790, + "step": 12040, + "time_per_iteration": 2.4381818771362305 + }, + { + "auxiliary_loss_clip": 0.0107896, + "auxiliary_loss_mlp": 0.01031006, + "balance_loss_clip": 1.03554893, + "balance_loss_mlp": 1.01862645, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 2.018956957406577, + "language_loss": 0.76516879, + "learning_rate": 7.474170592596301e-07, + "loss": 0.78626847, + "num_input_tokens_seen": 259777370, + "step": 12041, + "time_per_iteration": 2.514573097229004 + }, + { + "auxiliary_loss_clip": 0.01100863, + "auxiliary_loss_mlp": 0.01029237, + "balance_loss_clip": 1.03541267, + "balance_loss_mlp": 1.01741743, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 7.347255077170793, + "language_loss": 0.63650739, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65780842, + "num_input_tokens_seen": 259794665, + "step": 12042, + "time_per_iteration": 3.9359426498413086 + }, + { + "auxiliary_loss_clip": 0.01076225, + "auxiliary_loss_mlp": 0.01034024, + "balance_loss_clip": 1.03856778, + "balance_loss_mlp": 1.02086282, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 1.7928929231716697, + "language_loss": 0.83366698, + "learning_rate": 7.468099141929116e-07, + "loss": 0.85476947, + "num_input_tokens_seen": 259811110, + "step": 12043, + "time_per_iteration": 2.5572900772094727 + }, + { + "auxiliary_loss_clip": 0.01082653, + "auxiliary_loss_mlp": 0.01026787, + "balance_loss_clip": 1.04044795, + "balance_loss_mlp": 1.01344132, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 1.6162316538766752, + "language_loss": 0.64131546, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66240984, + "num_input_tokens_seen": 259831080, + "step": 12044, + "time_per_iteration": 2.5595614910125732 + }, + { + "auxiliary_loss_clip": 0.01112425, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.03927147, + "balance_loss_mlp": 1.01901066, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.5759409308293097, + "language_loss": 0.81458896, + "learning_rate": 7.462029592105658e-07, + "loss": 0.83603311, + "num_input_tokens_seen": 259850135, + "step": 12045, + "time_per_iteration": 2.397080421447754 + }, + { + "auxiliary_loss_clip": 0.01107476, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.03626609, + "balance_loss_mlp": 1.01953888, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.8246874169592369, + "language_loss": 0.71904314, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74043238, + "num_input_tokens_seen": 259868185, + "step": 12046, + "time_per_iteration": 2.399549961090088 + }, + { + "auxiliary_loss_clip": 0.01074196, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.03109026, + "balance_loss_mlp": 1.0172534, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 1.7467543451021996, + "language_loss": 0.70799756, + "learning_rate": 7.455961944046553e-07, + "loss": 0.72904527, + "num_input_tokens_seen": 259887055, + "step": 12047, + "time_per_iteration": 2.5158944129943848 + }, + { + "auxiliary_loss_clip": 0.01086715, + "auxiliary_loss_mlp": 0.01033515, + "balance_loss_clip": 1.0368036, + "balance_loss_mlp": 1.02074158, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 1.7179488108131353, + "language_loss": 0.7015419, + "learning_rate": 7.45292883346627e-07, + "loss": 0.72274423, + "num_input_tokens_seen": 259908295, + "step": 12048, + "time_per_iteration": 2.544297456741333 + }, + { + "auxiliary_loss_clip": 0.01011866, + "auxiliary_loss_mlp": 0.01013134, + "balance_loss_clip": 1.00482297, + "balance_loss_mlp": 1.01160848, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.8404616320262683, + "language_loss": 0.53745246, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55770254, + "num_input_tokens_seen": 259968475, + "step": 12049, + "time_per_iteration": 3.0662307739257812 + }, + { + "auxiliary_loss_clip": 0.01089125, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.03584111, + "balance_loss_mlp": 1.01636624, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 2.721854946204266, + "language_loss": 0.59809989, + "learning_rate": 7.446864039779258e-07, + "loss": 0.61929989, + "num_input_tokens_seen": 259984865, + "step": 12050, + "time_per_iteration": 2.454623222351074 + }, + { + "auxiliary_loss_clip": 0.00997733, + "auxiliary_loss_mlp": 0.010021, + "balance_loss_clip": 1.01102114, + "balance_loss_mlp": 1.00084829, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7161891375705877, + "language_loss": 0.53270006, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55269837, + "num_input_tokens_seen": 260046735, + "step": 12051, + "time_per_iteration": 3.141409158706665 + }, + { + "auxiliary_loss_clip": 0.01097266, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_clip": 1.03503156, + "balance_loss_mlp": 1.0191288, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.6679688087098052, + "language_loss": 0.72280991, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74409205, + "num_input_tokens_seen": 260067950, + "step": 12052, + "time_per_iteration": 2.5046215057373047 + }, + { + "auxiliary_loss_clip": 0.01097428, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.03484011, + "balance_loss_mlp": 1.01915741, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 1.843532905216305, + "language_loss": 0.74510729, + "learning_rate": 7.437770419657415e-07, + "loss": 0.7664122, + "num_input_tokens_seen": 260087730, + "step": 12053, + "time_per_iteration": 4.0500078201293945 + }, + { + "auxiliary_loss_clip": 0.01076327, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.03923178, + "balance_loss_mlp": 1.01900709, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 5.649393380032038, + "language_loss": 0.78133821, + "learning_rate": 7.434740165518898e-07, + "loss": 0.80242091, + "num_input_tokens_seen": 260107760, + "step": 12054, + "time_per_iteration": 2.5638461112976074 + }, + { + "auxiliary_loss_clip": 0.01076356, + "auxiliary_loss_mlp": 0.01038241, + "balance_loss_clip": 1.03545558, + "balance_loss_mlp": 1.02562857, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.5388486899577876, + "language_loss": 0.68351394, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70465994, + "num_input_tokens_seen": 260123660, + "step": 12055, + "time_per_iteration": 2.5104055404663086 + }, + { + "auxiliary_loss_clip": 0.01081038, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.03877258, + "balance_loss_mlp": 1.02158523, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.6667882758662604, + "language_loss": 0.74020088, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76133835, + "num_input_tokens_seen": 260142690, + "step": 12056, + "time_per_iteration": 2.546657085418701 + }, + { + "auxiliary_loss_clip": 0.01106643, + "auxiliary_loss_mlp": 0.01027362, + "balance_loss_clip": 1.03643847, + "balance_loss_mlp": 1.01520252, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.518667823163834, + "language_loss": 0.70860654, + "learning_rate": 7.425652262418368e-07, + "loss": 0.72994661, + "num_input_tokens_seen": 260162590, + "step": 12057, + "time_per_iteration": 2.465285539627075 + }, + { + "auxiliary_loss_clip": 0.01061787, + "auxiliary_loss_mlp": 0.01046188, + "balance_loss_clip": 1.03627682, + "balance_loss_mlp": 1.03258061, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 2.5674893177154154, + "language_loss": 0.62719494, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64827472, + "num_input_tokens_seen": 260181065, + "step": 12058, + "time_per_iteration": 2.51954984664917 + }, + { + "auxiliary_loss_clip": 0.01072178, + "auxiliary_loss_mlp": 0.01029792, + "balance_loss_clip": 1.03508818, + "balance_loss_mlp": 1.01669717, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 1.8109880437805963, + "language_loss": 0.74845934, + "learning_rate": 7.419596044262535e-07, + "loss": 0.76947904, + "num_input_tokens_seen": 260200330, + "step": 12059, + "time_per_iteration": 2.567629337310791 + }, + { + "auxiliary_loss_clip": 0.0109712, + "auxiliary_loss_mlp": 0.01033889, + "balance_loss_clip": 1.03651774, + "balance_loss_mlp": 1.02258205, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.7718159866107772, + "language_loss": 0.79361767, + "learning_rate": 7.416568650702472e-07, + "loss": 0.8149277, + "num_input_tokens_seen": 260219975, + "step": 12060, + "time_per_iteration": 2.485661029815674 + }, + { + "auxiliary_loss_clip": 0.01101584, + "auxiliary_loss_mlp": 0.01026708, + "balance_loss_clip": 1.0392257, + "balance_loss_mlp": 1.01378584, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 2.361788869139955, + "language_loss": 0.76515043, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78643334, + "num_input_tokens_seen": 260242025, + "step": 12061, + "time_per_iteration": 2.5146749019622803 + }, + { + "auxiliary_loss_clip": 0.01107345, + "auxiliary_loss_mlp": 0.00775883, + "balance_loss_clip": 1.03721428, + "balance_loss_mlp": 1.00042617, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 1.8852454326186228, + "language_loss": 0.81255615, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83138847, + "num_input_tokens_seen": 260260015, + "step": 12062, + "time_per_iteration": 2.4042470455169678 + }, + { + "auxiliary_loss_clip": 0.01065575, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.0349648, + "balance_loss_mlp": 1.01805031, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 2.1669275540241175, + "language_loss": 0.69286478, + "learning_rate": 7.407489333471262e-07, + "loss": 0.71385062, + "num_input_tokens_seen": 260278635, + "step": 12063, + "time_per_iteration": 2.61460542678833 + }, + { + "auxiliary_loss_clip": 0.01074613, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.03460765, + "balance_loss_mlp": 1.01824081, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.75979449424551, + "language_loss": 0.7008577, + "learning_rate": 7.40446384925973e-07, + "loss": 0.72191256, + "num_input_tokens_seen": 260298510, + "step": 12064, + "time_per_iteration": 2.5271694660186768 + }, + { + "auxiliary_loss_clip": 0.01091161, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.03847039, + "balance_loss_mlp": 1.01687503, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 2.061078509871581, + "language_loss": 0.90750748, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92871165, + "num_input_tokens_seen": 260317405, + "step": 12065, + "time_per_iteration": 2.49285888671875 + }, + { + "auxiliary_loss_clip": 0.01020844, + "auxiliary_loss_mlp": 0.01003568, + "balance_loss_clip": 1.00622559, + "balance_loss_mlp": 1.00217915, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 1.152900915638554, + "language_loss": 0.56056392, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58080804, + "num_input_tokens_seen": 260388085, + "step": 12066, + "time_per_iteration": 3.1782994270324707 + }, + { + "auxiliary_loss_clip": 0.01060845, + "auxiliary_loss_mlp": 0.01026203, + "balance_loss_clip": 1.03531194, + "balance_loss_mlp": 1.01488447, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 1.7929977181710273, + "language_loss": 0.76695913, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78782964, + "num_input_tokens_seen": 260406165, + "step": 12067, + "time_per_iteration": 2.6139070987701416 + }, + { + "auxiliary_loss_clip": 0.01016421, + "auxiliary_loss_mlp": 0.01001239, + "balance_loss_clip": 1.01202512, + "balance_loss_mlp": 0.99998087, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.7733248406348477, + "language_loss": 0.57029885, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59047544, + "num_input_tokens_seen": 260461365, + "step": 12068, + "time_per_iteration": 4.438831806182861 + }, + { + "auxiliary_loss_clip": 0.00995765, + "auxiliary_loss_mlp": 0.01002141, + "balance_loss_clip": 1.00826395, + "balance_loss_mlp": 1.00054348, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6620256128038412, + "language_loss": 0.55419195, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57417095, + "num_input_tokens_seen": 260523795, + "step": 12069, + "time_per_iteration": 3.146378993988037 + }, + { + "auxiliary_loss_clip": 0.01076389, + "auxiliary_loss_mlp": 0.01025644, + "balance_loss_clip": 1.03740597, + "balance_loss_mlp": 1.01427186, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 1.6927856740650016, + "language_loss": 0.79936624, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82038665, + "num_input_tokens_seen": 260544765, + "step": 12070, + "time_per_iteration": 2.5850276947021484 + }, + { + "auxiliary_loss_clip": 0.01082892, + "auxiliary_loss_mlp": 0.01033474, + "balance_loss_clip": 1.03441191, + "balance_loss_mlp": 1.02138662, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 2.0751989292906923, + "language_loss": 0.7170943, + "learning_rate": 7.383298839673197e-07, + "loss": 0.738258, + "num_input_tokens_seen": 260564340, + "step": 12071, + "time_per_iteration": 2.5382843017578125 + }, + { + "auxiliary_loss_clip": 0.01108808, + "auxiliary_loss_mlp": 0.01035257, + "balance_loss_clip": 1.03849173, + "balance_loss_mlp": 1.02359867, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 1.8333596891155446, + "language_loss": 0.69943631, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72087699, + "num_input_tokens_seen": 260582565, + "step": 12072, + "time_per_iteration": 2.4369566440582275 + }, + { + "auxiliary_loss_clip": 0.0107433, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.0323652, + "balance_loss_mlp": 1.01807368, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 1.8837865513600713, + "language_loss": 0.78575796, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80681044, + "num_input_tokens_seen": 260601700, + "step": 12073, + "time_per_iteration": 2.5624964237213135 + }, + { + "auxiliary_loss_clip": 0.01088292, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.03669751, + "balance_loss_mlp": 1.01710355, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 1.384748227721984, + "language_loss": 0.70335972, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72453594, + "num_input_tokens_seen": 260623040, + "step": 12074, + "time_per_iteration": 2.6233983039855957 + }, + { + "auxiliary_loss_clip": 0.0109229, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.03599143, + "balance_loss_mlp": 1.01796746, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 1.7379301105684217, + "language_loss": 0.74379015, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76502508, + "num_input_tokens_seen": 260642735, + "step": 12075, + "time_per_iteration": 2.5441513061523438 + }, + { + "auxiliary_loss_clip": 0.01100805, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.03841901, + "balance_loss_mlp": 1.01941788, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 1.674806045973539, + "language_loss": 0.63890457, + "learning_rate": 7.368195326186458e-07, + "loss": 0.66023469, + "num_input_tokens_seen": 260669935, + "step": 12076, + "time_per_iteration": 2.860239028930664 + }, + { + "auxiliary_loss_clip": 0.01074781, + "auxiliary_loss_mlp": 0.01029279, + "balance_loss_clip": 1.03418303, + "balance_loss_mlp": 1.01659513, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 3.257365830528947, + "language_loss": 0.78769171, + "learning_rate": 7.365176060028912e-07, + "loss": 0.80873227, + "num_input_tokens_seen": 260689605, + "step": 12077, + "time_per_iteration": 4.049619913101196 + }, + { + "auxiliary_loss_clip": 0.01030476, + "auxiliary_loss_mlp": 0.00752903, + "balance_loss_clip": 1.00658154, + "balance_loss_mlp": 1.00006914, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.8814710780349819, + "language_loss": 0.65008694, + "learning_rate": 7.362157272985163e-07, + "loss": 0.66792071, + "num_input_tokens_seen": 260748265, + "step": 12078, + "time_per_iteration": 3.0200250148773193 + }, + { + "auxiliary_loss_clip": 0.01021654, + "auxiliary_loss_mlp": 0.01004109, + "balance_loss_clip": 1.00740814, + "balance_loss_mlp": 1.00286281, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.7205595283694989, + "language_loss": 0.59330153, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61355913, + "num_input_tokens_seen": 260816715, + "step": 12079, + "time_per_iteration": 3.1678614616394043 + }, + { + "auxiliary_loss_clip": 0.01068419, + "auxiliary_loss_mlp": 0.01027229, + "balance_loss_clip": 1.03855598, + "balance_loss_mlp": 1.01500988, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 1.8732050620236822, + "language_loss": 0.64653492, + "learning_rate": 7.356121136696895e-07, + "loss": 0.66749138, + "num_input_tokens_seen": 260836765, + "step": 12080, + "time_per_iteration": 2.6090493202209473 + }, + { + "auxiliary_loss_clip": 0.01067141, + "auxiliary_loss_mlp": 0.01026748, + "balance_loss_clip": 1.03394532, + "balance_loss_mlp": 1.01370692, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 2.6521119083277096, + "language_loss": 0.69611281, + "learning_rate": 7.35310378768128e-07, + "loss": 0.71705174, + "num_input_tokens_seen": 260854610, + "step": 12081, + "time_per_iteration": 4.011984586715698 + }, + { + "auxiliary_loss_clip": 0.01113524, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.03897834, + "balance_loss_mlp": 1.0205729, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 1.6911832433692835, + "language_loss": 0.81265646, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83412206, + "num_input_tokens_seen": 260871620, + "step": 12082, + "time_per_iteration": 2.422858238220215 + }, + { + "auxiliary_loss_clip": 0.01104895, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.03630686, + "balance_loss_mlp": 1.02348995, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.6464361506366239, + "language_loss": 0.77303565, + "learning_rate": 7.347070528479158e-07, + "loss": 0.79446042, + "num_input_tokens_seen": 260890490, + "step": 12083, + "time_per_iteration": 2.589012622833252 + }, + { + "auxiliary_loss_clip": 0.01115286, + "auxiliary_loss_mlp": 0.0103282, + "balance_loss_clip": 1.04090154, + "balance_loss_mlp": 1.02016616, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.663126112644259, + "language_loss": 0.72736037, + "learning_rate": 7.344054618521433e-07, + "loss": 0.7488414, + "num_input_tokens_seen": 260909700, + "step": 12084, + "time_per_iteration": 2.5306456089019775 + }, + { + "auxiliary_loss_clip": 0.01114476, + "auxiliary_loss_mlp": 0.0103559, + "balance_loss_clip": 1.03969634, + "balance_loss_mlp": 1.0226562, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 1.7666968340059317, + "language_loss": 0.77675176, + "learning_rate": 7.34103918847843e-07, + "loss": 0.7982524, + "num_input_tokens_seen": 260929090, + "step": 12085, + "time_per_iteration": 2.4398043155670166 + }, + { + "auxiliary_loss_clip": 0.01101845, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.03867877, + "balance_loss_mlp": 1.01899302, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.657562031468055, + "language_loss": 0.72344345, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74477553, + "num_input_tokens_seen": 260946615, + "step": 12086, + "time_per_iteration": 2.4646565914154053 + }, + { + "auxiliary_loss_clip": 0.01074896, + "auxiliary_loss_mlp": 0.01037173, + "balance_loss_clip": 1.03451562, + "balance_loss_mlp": 1.02320147, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 2.0518732245377884, + "language_loss": 0.69484019, + "learning_rate": 7.335009768593938e-07, + "loss": 0.71596086, + "num_input_tokens_seen": 260968515, + "step": 12087, + "time_per_iteration": 2.5935747623443604 + }, + { + "auxiliary_loss_clip": 0.01113087, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.03864324, + "balance_loss_mlp": 1.02140856, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 1.8245522561351137, + "language_loss": 0.79197454, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81344837, + "num_input_tokens_seen": 260986790, + "step": 12088, + "time_per_iteration": 2.442667007446289 + }, + { + "auxiliary_loss_clip": 0.01102178, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.04167008, + "balance_loss_mlp": 1.02678847, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 1.7039410063381624, + "language_loss": 0.73830748, + "learning_rate": 7.328982269740221e-07, + "loss": 0.759718, + "num_input_tokens_seen": 261004925, + "step": 12089, + "time_per_iteration": 2.445082426071167 + }, + { + "auxiliary_loss_clip": 0.01090979, + "auxiliary_loss_mlp": 0.01037806, + "balance_loss_clip": 1.03581035, + "balance_loss_mlp": 1.02515221, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.703207743221305, + "language_loss": 0.71448827, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73577619, + "num_input_tokens_seen": 261023895, + "step": 12090, + "time_per_iteration": 2.519423484802246 + }, + { + "auxiliary_loss_clip": 0.01062874, + "auxiliary_loss_mlp": 0.0103198, + "balance_loss_clip": 1.03744745, + "balance_loss_mlp": 1.01899803, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 1.8629823635167733, + "language_loss": 0.77364779, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79459631, + "num_input_tokens_seen": 261045445, + "step": 12091, + "time_per_iteration": 2.7086896896362305 + }, + { + "auxiliary_loss_clip": 0.01093256, + "auxiliary_loss_mlp": 0.00779319, + "balance_loss_clip": 1.03313291, + "balance_loss_mlp": 1.00059474, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 1.8614123144249721, + "language_loss": 0.71244299, + "learning_rate": 7.319944625392205e-07, + "loss": 0.73116875, + "num_input_tokens_seen": 261064275, + "step": 12092, + "time_per_iteration": 3.954916000366211 + }, + { + "auxiliary_loss_clip": 0.01099612, + "auxiliary_loss_mlp": 0.01028807, + "balance_loss_clip": 1.03652573, + "balance_loss_mlp": 1.01594424, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 2.1628150208567525, + "language_loss": 0.60890627, + "learning_rate": 7.31693303878184e-07, + "loss": 0.63019043, + "num_input_tokens_seen": 261083310, + "step": 12093, + "time_per_iteration": 2.570277214050293 + }, + { + "auxiliary_loss_clip": 0.0108889, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.0383929, + "balance_loss_mlp": 1.02056086, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.6069123975045672, + "language_loss": 0.75580347, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77702129, + "num_input_tokens_seen": 261103460, + "step": 12094, + "time_per_iteration": 2.5147182941436768 + }, + { + "auxiliary_loss_clip": 0.01076426, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.03355956, + "balance_loss_mlp": 1.01939631, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 1.8866963302969582, + "language_loss": 0.84977579, + "learning_rate": 7.310911308504808e-07, + "loss": 0.87085247, + "num_input_tokens_seen": 261121375, + "step": 12095, + "time_per_iteration": 2.5580127239227295 + }, + { + "auxiliary_loss_clip": 0.01098907, + "auxiliary_loss_mlp": 0.01034716, + "balance_loss_clip": 1.03663862, + "balance_loss_mlp": 1.02187085, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 1.81531321616963, + "language_loss": 0.77547193, + "learning_rate": 7.307901165066479e-07, + "loss": 0.79680812, + "num_input_tokens_seen": 261141105, + "step": 12096, + "time_per_iteration": 2.4797933101654053 + }, + { + "auxiliary_loss_clip": 0.0111309, + "auxiliary_loss_mlp": 0.01033812, + "balance_loss_clip": 1.03981352, + "balance_loss_mlp": 1.02121735, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 2.7744810964770417, + "language_loss": 0.72454512, + "learning_rate": 7.30489150291381e-07, + "loss": 0.74601412, + "num_input_tokens_seen": 261159255, + "step": 12097, + "time_per_iteration": 2.3931312561035156 + }, + { + "auxiliary_loss_clip": 0.01102737, + "auxiliary_loss_mlp": 0.00778736, + "balance_loss_clip": 1.03941846, + "balance_loss_mlp": 1.00057328, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 1.7755212531116238, + "language_loss": 0.76964962, + "learning_rate": 7.301882322160935e-07, + "loss": 0.78846437, + "num_input_tokens_seen": 261177960, + "step": 12098, + "time_per_iteration": 2.5006723403930664 + }, + { + "auxiliary_loss_clip": 0.01091199, + "auxiliary_loss_mlp": 0.01032661, + "balance_loss_clip": 1.03502607, + "balance_loss_mlp": 1.01981068, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 3.8278477795722714, + "language_loss": 0.67933869, + "learning_rate": 7.298873622921952e-07, + "loss": 0.70057726, + "num_input_tokens_seen": 261205660, + "step": 12099, + "time_per_iteration": 2.9055023193359375 + }, + { + "auxiliary_loss_clip": 0.01101002, + "auxiliary_loss_mlp": 0.01042028, + "balance_loss_clip": 1.03481412, + "balance_loss_mlp": 1.02638745, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 1.6900304666380053, + "language_loss": 0.72852635, + "learning_rate": 7.29586540531095e-07, + "loss": 0.74995661, + "num_input_tokens_seen": 261225185, + "step": 12100, + "time_per_iteration": 2.468693494796753 + }, + { + "auxiliary_loss_clip": 0.0110428, + "auxiliary_loss_mlp": 0.01033584, + "balance_loss_clip": 1.03955913, + "balance_loss_mlp": 1.02171707, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 1.554524949370921, + "language_loss": 0.74855238, + "learning_rate": 7.292857669442005e-07, + "loss": 0.76993108, + "num_input_tokens_seen": 261247965, + "step": 12101, + "time_per_iteration": 2.50169038772583 + }, + { + "auxiliary_loss_clip": 0.01069592, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.03555274, + "balance_loss_mlp": 1.01885808, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 1.6341745286132592, + "language_loss": 0.82561505, + "learning_rate": 7.289850415429177e-07, + "loss": 0.84661859, + "num_input_tokens_seen": 261267585, + "step": 12102, + "time_per_iteration": 2.5427229404449463 + }, + { + "auxiliary_loss_clip": 0.01099865, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.03820264, + "balance_loss_mlp": 1.02060103, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 2.126086810500964, + "language_loss": 0.81228268, + "learning_rate": 7.286843643386495e-07, + "loss": 0.83360624, + "num_input_tokens_seen": 261285200, + "step": 12103, + "time_per_iteration": 2.477356433868408 + }, + { + "auxiliary_loss_clip": 0.01089974, + "auxiliary_loss_mlp": 0.01026258, + "balance_loss_clip": 1.03815639, + "balance_loss_mlp": 1.01346648, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 1.6716748523515097, + "language_loss": 0.66254115, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68370342, + "num_input_tokens_seen": 261303645, + "step": 12104, + "time_per_iteration": 2.4758872985839844 + }, + { + "auxiliary_loss_clip": 0.01082417, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.04016268, + "balance_loss_mlp": 1.01760519, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 2.301031179055801, + "language_loss": 0.66101813, + "learning_rate": 7.280831545667611e-07, + "loss": 0.68213868, + "num_input_tokens_seen": 261323265, + "step": 12105, + "time_per_iteration": 2.6506588459014893 + }, + { + "auxiliary_loss_clip": 0.0111243, + "auxiliary_loss_mlp": 0.01037381, + "balance_loss_clip": 1.03933239, + "balance_loss_mlp": 1.02501273, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 2.009808769674244, + "language_loss": 0.7575618, + "learning_rate": 7.27782622021939e-07, + "loss": 0.77905995, + "num_input_tokens_seen": 261339745, + "step": 12106, + "time_per_iteration": 2.4214582443237305 + }, + { + "auxiliary_loss_clip": 0.01102419, + "auxiliary_loss_mlp": 0.01030154, + "balance_loss_clip": 1.03595686, + "balance_loss_mlp": 1.01710045, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 2.0026149268522118, + "language_loss": 0.70397294, + "learning_rate": 7.274821377197273e-07, + "loss": 0.72529876, + "num_input_tokens_seen": 261359310, + "step": 12107, + "time_per_iteration": 2.5781476497650146 + }, + { + "auxiliary_loss_clip": 0.01093742, + "auxiliary_loss_mlp": 0.01036638, + "balance_loss_clip": 1.03421342, + "balance_loss_mlp": 1.02372754, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 1.507226664181734, + "language_loss": 0.75750136, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77880514, + "num_input_tokens_seen": 261384640, + "step": 12108, + "time_per_iteration": 4.330064296722412 + }, + { + "auxiliary_loss_clip": 0.01109686, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.03599787, + "balance_loss_mlp": 1.02150154, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 1.8217397875816896, + "language_loss": 0.67150366, + "learning_rate": 7.268813138887124e-07, + "loss": 0.69293833, + "num_input_tokens_seen": 261405290, + "step": 12109, + "time_per_iteration": 2.548931121826172 + }, + { + "auxiliary_loss_clip": 0.0107394, + "auxiliary_loss_mlp": 0.01034481, + "balance_loss_clip": 1.0360167, + "balance_loss_mlp": 1.02068257, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 2.3196977155451206, + "language_loss": 0.63671976, + "learning_rate": 7.265809743826912e-07, + "loss": 0.65780395, + "num_input_tokens_seen": 261419710, + "step": 12110, + "time_per_iteration": 2.5287258625030518 + }, + { + "auxiliary_loss_clip": 0.01079487, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.03472877, + "balance_loss_mlp": 1.0136466, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 2.079424957214441, + "language_loss": 0.58283532, + "learning_rate": 7.26280683164847e-07, + "loss": 0.60390198, + "num_input_tokens_seen": 261442385, + "step": 12111, + "time_per_iteration": 2.6451218128204346 + }, + { + "auxiliary_loss_clip": 0.01064656, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.04060316, + "balance_loss_mlp": 1.01934361, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 2.1468100544756163, + "language_loss": 0.73991638, + "learning_rate": 7.259804402465677e-07, + "loss": 0.76088136, + "num_input_tokens_seen": 261459805, + "step": 12112, + "time_per_iteration": 2.5891993045806885 + }, + { + "auxiliary_loss_clip": 0.01098801, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.03526616, + "balance_loss_mlp": 1.01949513, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 2.8741731288983816, + "language_loss": 0.66655695, + "learning_rate": 7.25680245639237e-07, + "loss": 0.68786156, + "num_input_tokens_seen": 261477175, + "step": 12113, + "time_per_iteration": 2.4522197246551514 + }, + { + "auxiliary_loss_clip": 0.01074217, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.03696251, + "balance_loss_mlp": 1.01623106, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 2.9178856399120456, + "language_loss": 0.73521751, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75624883, + "num_input_tokens_seen": 261494990, + "step": 12114, + "time_per_iteration": 2.5056004524230957 + }, + { + "auxiliary_loss_clip": 0.01083366, + "auxiliary_loss_mlp": 0.01033947, + "balance_loss_clip": 1.03776395, + "balance_loss_mlp": 1.02088189, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 2.3342724998856648, + "language_loss": 0.68177372, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70294684, + "num_input_tokens_seen": 261514445, + "step": 12115, + "time_per_iteration": 2.5410447120666504 + }, + { + "auxiliary_loss_clip": 0.01112212, + "auxiliary_loss_mlp": 0.01033466, + "balance_loss_clip": 1.03671551, + "balance_loss_mlp": 1.02044845, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 1.7258631378904583, + "language_loss": 0.59829199, + "learning_rate": 7.247799517967674e-07, + "loss": 0.61974871, + "num_input_tokens_seen": 261533565, + "step": 12116, + "time_per_iteration": 2.4292173385620117 + }, + { + "auxiliary_loss_clip": 0.01099035, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.03748083, + "balance_loss_mlp": 1.01840782, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 1.7750989503244579, + "language_loss": 0.72704554, + "learning_rate": 7.2447995054705e-07, + "loss": 0.74834466, + "num_input_tokens_seen": 261553795, + "step": 12117, + "time_per_iteration": 4.01696252822876 + }, + { + "auxiliary_loss_clip": 0.01096494, + "auxiliary_loss_mlp": 0.01029736, + "balance_loss_clip": 1.03466177, + "balance_loss_mlp": 1.01702189, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 1.9033015173702474, + "language_loss": 0.69644701, + "learning_rate": 7.241799976651807e-07, + "loss": 0.7177093, + "num_input_tokens_seen": 261572565, + "step": 12118, + "time_per_iteration": 2.456627368927002 + }, + { + "auxiliary_loss_clip": 0.01061155, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.03800344, + "balance_loss_mlp": 1.01974583, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 2.8565649999195286, + "language_loss": 0.84659976, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86752939, + "num_input_tokens_seen": 261590910, + "step": 12119, + "time_per_iteration": 2.542330026626587 + }, + { + "auxiliary_loss_clip": 0.01112672, + "auxiliary_loss_mlp": 0.01028942, + "balance_loss_clip": 1.03870821, + "balance_loss_mlp": 1.01702118, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 2.220230965988069, + "language_loss": 0.82034206, + "learning_rate": 7.235802370504831e-07, + "loss": 0.84175819, + "num_input_tokens_seen": 261606005, + "step": 12120, + "time_per_iteration": 2.383955955505371 + }, + { + "auxiliary_loss_clip": 0.01072875, + "auxiliary_loss_mlp": 0.01038728, + "balance_loss_clip": 1.03497088, + "balance_loss_mlp": 1.02644992, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 2.2757853148287595, + "language_loss": 0.78946835, + "learning_rate": 7.232804293403963e-07, + "loss": 0.81058443, + "num_input_tokens_seen": 261622305, + "step": 12121, + "time_per_iteration": 3.9457764625549316 + }, + { + "auxiliary_loss_clip": 0.01112042, + "auxiliary_loss_mlp": 0.01036273, + "balance_loss_clip": 1.03610897, + "balance_loss_mlp": 1.02323198, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 1.6984576180396345, + "language_loss": 0.6869905, + "learning_rate": 7.229806700436441e-07, + "loss": 0.70847368, + "num_input_tokens_seen": 261642465, + "step": 12122, + "time_per_iteration": 2.5033695697784424 + }, + { + "auxiliary_loss_clip": 0.01066658, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.03472257, + "balance_loss_mlp": 1.0195446, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 2.7850548171479987, + "language_loss": 0.87074864, + "learning_rate": 7.226809591715923e-07, + "loss": 0.89172566, + "num_input_tokens_seen": 261661420, + "step": 12123, + "time_per_iteration": 2.5764458179473877 + }, + { + "auxiliary_loss_clip": 0.0107698, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.0341475, + "balance_loss_mlp": 1.02012837, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 2.384410302254201, + "language_loss": 0.83254385, + "learning_rate": 7.223812967356065e-07, + "loss": 0.85363668, + "num_input_tokens_seen": 261680865, + "step": 12124, + "time_per_iteration": 2.5528740882873535 + }, + { + "auxiliary_loss_clip": 0.01081446, + "auxiliary_loss_mlp": 0.01029602, + "balance_loss_clip": 1.03538465, + "balance_loss_mlp": 1.01803863, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 1.8116346101185894, + "language_loss": 0.67441082, + "learning_rate": 7.220816827470499e-07, + "loss": 0.6955213, + "num_input_tokens_seen": 261701455, + "step": 12125, + "time_per_iteration": 2.53005051612854 + }, + { + "auxiliary_loss_clip": 0.01105445, + "auxiliary_loss_mlp": 0.0103566, + "balance_loss_clip": 1.03743875, + "balance_loss_mlp": 1.02226067, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 2.03341258517947, + "language_loss": 0.75226694, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77367795, + "num_input_tokens_seen": 261721260, + "step": 12126, + "time_per_iteration": 2.4688560962677 + }, + { + "auxiliary_loss_clip": 0.01012539, + "auxiliary_loss_mlp": 0.01000772, + "balance_loss_clip": 1.0081985, + "balance_loss_mlp": 0.99949664, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.8241835831366828, + "language_loss": 0.58731681, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60744995, + "num_input_tokens_seen": 261779370, + "step": 12127, + "time_per_iteration": 3.012533187866211 + }, + { + "auxiliary_loss_clip": 0.01078758, + "auxiliary_loss_mlp": 0.01027613, + "balance_loss_clip": 1.03640413, + "balance_loss_mlp": 1.01593065, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 1.9635768109494736, + "language_loss": 0.68533468, + "learning_rate": 7.21183131579562e-07, + "loss": 0.70639837, + "num_input_tokens_seen": 261798050, + "step": 12128, + "time_per_iteration": 2.5528347492218018 + }, + { + "auxiliary_loss_clip": 0.0108714, + "auxiliary_loss_mlp": 0.01033775, + "balance_loss_clip": 1.03694701, + "balance_loss_mlp": 1.02013147, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 2.308874426676239, + "language_loss": 0.65707517, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67828429, + "num_input_tokens_seen": 261817660, + "step": 12129, + "time_per_iteration": 2.55332612991333 + }, + { + "auxiliary_loss_clip": 0.01108835, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.03772378, + "balance_loss_mlp": 1.01717544, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 2.5504663619675174, + "language_loss": 0.74222821, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76361573, + "num_input_tokens_seen": 261837935, + "step": 12130, + "time_per_iteration": 2.4713354110717773 + }, + { + "auxiliary_loss_clip": 0.01084802, + "auxiliary_loss_mlp": 0.01030861, + "balance_loss_clip": 1.03269529, + "balance_loss_mlp": 1.01806951, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.747941882269077, + "language_loss": 0.69572371, + "learning_rate": 7.202850168478374e-07, + "loss": 0.71688032, + "num_input_tokens_seen": 261857575, + "step": 12131, + "time_per_iteration": 2.496823310852051 + }, + { + "auxiliary_loss_clip": 0.0107668, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.03825188, + "balance_loss_mlp": 1.01905966, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 1.606483969667208, + "language_loss": 0.77350008, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79457343, + "num_input_tokens_seen": 261877265, + "step": 12132, + "time_per_iteration": 4.038409233093262 + }, + { + "auxiliary_loss_clip": 0.01099823, + "auxiliary_loss_mlp": 0.01039314, + "balance_loss_clip": 1.0372858, + "balance_loss_mlp": 1.02728617, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 3.264029287057859, + "language_loss": 0.79324853, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81463993, + "num_input_tokens_seen": 261893695, + "step": 12133, + "time_per_iteration": 2.431817054748535 + }, + { + "auxiliary_loss_clip": 0.01063345, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.03304601, + "balance_loss_mlp": 1.01911867, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 1.9345041926796105, + "language_loss": 0.72189307, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74284589, + "num_input_tokens_seen": 261911825, + "step": 12134, + "time_per_iteration": 2.5576772689819336 + }, + { + "auxiliary_loss_clip": 0.01092082, + "auxiliary_loss_mlp": 0.01038827, + "balance_loss_clip": 1.03891277, + "balance_loss_mlp": 1.02638125, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 1.86949751017363, + "language_loss": 0.716699, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73800814, + "num_input_tokens_seen": 261931190, + "step": 12135, + "time_per_iteration": 2.510495901107788 + }, + { + "auxiliary_loss_clip": 0.01078221, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.03469336, + "balance_loss_mlp": 1.02610803, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 2.4901594418204356, + "language_loss": 0.61964059, + "learning_rate": 7.187891296513075e-07, + "loss": 0.64080828, + "num_input_tokens_seen": 261951240, + "step": 12136, + "time_per_iteration": 2.618975877761841 + }, + { + "auxiliary_loss_clip": 0.01097575, + "auxiliary_loss_mlp": 0.00777445, + "balance_loss_clip": 1.03972697, + "balance_loss_mlp": 1.00057435, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 3.3839026458943304, + "language_loss": 0.74718392, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76593411, + "num_input_tokens_seen": 261971605, + "step": 12137, + "time_per_iteration": 2.515211343765259 + }, + { + "auxiliary_loss_clip": 0.01104545, + "auxiliary_loss_mlp": 0.00777941, + "balance_loss_clip": 1.04091191, + "balance_loss_mlp": 1.00058436, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 2.488708843051849, + "language_loss": 0.74406332, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76288819, + "num_input_tokens_seen": 261990830, + "step": 12138, + "time_per_iteration": 2.4864706993103027 + }, + { + "auxiliary_loss_clip": 0.01073041, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.03454709, + "balance_loss_mlp": 1.01765037, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 4.369401905991865, + "language_loss": 0.71848559, + "learning_rate": 7.178921802463702e-07, + "loss": 0.73951113, + "num_input_tokens_seen": 262008190, + "step": 12139, + "time_per_iteration": 2.493468999862671 + }, + { + "auxiliary_loss_clip": 0.01097777, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.03899801, + "balance_loss_mlp": 1.02011847, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.4340290630838484, + "language_loss": 0.73519123, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75648582, + "num_input_tokens_seen": 262030460, + "step": 12140, + "time_per_iteration": 2.5241992473602295 + }, + { + "auxiliary_loss_clip": 0.01085382, + "auxiliary_loss_mlp": 0.01035048, + "balance_loss_clip": 1.03662276, + "balance_loss_mlp": 1.02158296, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 1.764969803494674, + "language_loss": 0.55376029, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57496452, + "num_input_tokens_seen": 262050830, + "step": 12141, + "time_per_iteration": 2.584984540939331 + }, + { + "auxiliary_loss_clip": 0.0107319, + "auxiliary_loss_mlp": 0.01027879, + "balance_loss_clip": 1.03294039, + "balance_loss_mlp": 1.01651216, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 1.5378331707457606, + "language_loss": 0.72515416, + "learning_rate": 7.169956684003342e-07, + "loss": 0.7461648, + "num_input_tokens_seen": 262071245, + "step": 12142, + "time_per_iteration": 2.53969144821167 + }, + { + "auxiliary_loss_clip": 0.01110033, + "auxiliary_loss_mlp": 0.01035616, + "balance_loss_clip": 1.03784192, + "balance_loss_mlp": 1.02353418, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.9028919331834488, + "language_loss": 0.73685217, + "learning_rate": 7.16696928406521e-07, + "loss": 0.75830871, + "num_input_tokens_seen": 262087525, + "step": 12143, + "time_per_iteration": 2.416889190673828 + }, + { + "auxiliary_loss_clip": 0.01076959, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.0385617, + "balance_loss_mlp": 1.02193069, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 2.1610209805640603, + "language_loss": 0.66805845, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68917501, + "num_input_tokens_seen": 262107355, + "step": 12144, + "time_per_iteration": 2.5541601181030273 + }, + { + "auxiliary_loss_clip": 0.0108507, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.03607905, + "balance_loss_mlp": 1.01815593, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 1.7819490724080005, + "language_loss": 0.79112267, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81227899, + "num_input_tokens_seen": 262125645, + "step": 12145, + "time_per_iteration": 2.4628095626831055 + }, + { + "auxiliary_loss_clip": 0.01072103, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.03564298, + "balance_loss_mlp": 1.01762474, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 1.8490904036751654, + "language_loss": 0.91387582, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93490022, + "num_input_tokens_seen": 262144075, + "step": 12146, + "time_per_iteration": 2.5483694076538086 + }, + { + "auxiliary_loss_clip": 0.01108526, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.03830314, + "balance_loss_mlp": 1.01668394, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 3.0220547915214837, + "language_loss": 0.62374586, + "learning_rate": 7.155024551743316e-07, + "loss": 0.6451149, + "num_input_tokens_seen": 262165940, + "step": 12147, + "time_per_iteration": 3.922821283340454 + }, + { + "auxiliary_loss_clip": 0.01114241, + "auxiliary_loss_mlp": 0.01039262, + "balance_loss_clip": 1.04068398, + "balance_loss_mlp": 1.02640533, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 1.9005959933118712, + "language_loss": 0.75126314, + "learning_rate": 7.152039586086693e-07, + "loss": 0.77279824, + "num_input_tokens_seen": 262184520, + "step": 12148, + "time_per_iteration": 2.4067559242248535 + }, + { + "auxiliary_loss_clip": 0.01014971, + "auxiliary_loss_mlp": 0.00753315, + "balance_loss_clip": 1.01070094, + "balance_loss_mlp": 1.00019228, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 0.685895648308116, + "language_loss": 0.5667823, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58446515, + "num_input_tokens_seen": 262247070, + "step": 12149, + "time_per_iteration": 3.055173397064209 + }, + { + "auxiliary_loss_clip": 0.01090557, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.03607786, + "balance_loss_mlp": 1.01757169, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 1.6532185441465488, + "language_loss": 0.73826206, + "learning_rate": 7.146071116474451e-07, + "loss": 0.75946629, + "num_input_tokens_seen": 262266605, + "step": 12150, + "time_per_iteration": 2.4956464767456055 + }, + { + "auxiliary_loss_clip": 0.01113206, + "auxiliary_loss_mlp": 0.01032472, + "balance_loss_clip": 1.03768635, + "balance_loss_mlp": 1.01937652, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 2.0333173260220665, + "language_loss": 0.83950019, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86095697, + "num_input_tokens_seen": 262283880, + "step": 12151, + "time_per_iteration": 2.3962106704711914 + }, + { + "auxiliary_loss_clip": 0.0107647, + "auxiliary_loss_mlp": 0.01037219, + "balance_loss_clip": 1.03467286, + "balance_loss_mlp": 1.02414727, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 2.3547455169485008, + "language_loss": 0.78053689, + "learning_rate": 7.14010459655127e-07, + "loss": 0.80167377, + "num_input_tokens_seen": 262304155, + "step": 12152, + "time_per_iteration": 2.5463595390319824 + }, + { + "auxiliary_loss_clip": 0.01076464, + "auxiliary_loss_mlp": 0.01031174, + "balance_loss_clip": 1.03834105, + "balance_loss_mlp": 1.01893103, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 1.5806889170669542, + "language_loss": 0.79636276, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81743908, + "num_input_tokens_seen": 262325660, + "step": 12153, + "time_per_iteration": 2.5902109146118164 + }, + { + "auxiliary_loss_clip": 0.01103481, + "auxiliary_loss_mlp": 0.01035703, + "balance_loss_clip": 1.03946733, + "balance_loss_mlp": 1.0235194, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.8121499756050614, + "language_loss": 0.67613947, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69753128, + "num_input_tokens_seen": 262344075, + "step": 12154, + "time_per_iteration": 2.440109968185425 + }, + { + "auxiliary_loss_clip": 0.01065629, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.03749585, + "balance_loss_mlp": 1.01861668, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 1.9536255348098999, + "language_loss": 0.65793562, + "learning_rate": 7.131158474313128e-07, + "loss": 0.67890322, + "num_input_tokens_seen": 262363305, + "step": 12155, + "time_per_iteration": 2.569826126098633 + }, + { + "auxiliary_loss_clip": 0.01088462, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.03733516, + "balance_loss_mlp": 1.01890743, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 1.9120552536949824, + "language_loss": 0.81646943, + "learning_rate": 7.128177409391851e-07, + "loss": 0.83766139, + "num_input_tokens_seen": 262380730, + "step": 12156, + "time_per_iteration": 2.522575616836548 + }, + { + "auxiliary_loss_clip": 0.01073273, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.03501177, + "balance_loss_mlp": 1.02036238, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 2.0705724123911486, + "language_loss": 0.75179493, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77284861, + "num_input_tokens_seen": 262395480, + "step": 12157, + "time_per_iteration": 4.113892555236816 + }, + { + "auxiliary_loss_clip": 0.01096362, + "auxiliary_loss_mlp": 0.01028655, + "balance_loss_clip": 1.0373503, + "balance_loss_mlp": 1.01773572, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 2.024145468210365, + "language_loss": 0.72982192, + "learning_rate": 7.122216743964713e-07, + "loss": 0.75107211, + "num_input_tokens_seen": 262413340, + "step": 12158, + "time_per_iteration": 2.44679594039917 + }, + { + "auxiliary_loss_clip": 0.01090312, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.03759217, + "balance_loss_mlp": 1.02104688, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 1.7052259006697104, + "language_loss": 0.86014009, + "learning_rate": 7.119237143684896e-07, + "loss": 0.88138074, + "num_input_tokens_seen": 262433455, + "step": 12159, + "time_per_iteration": 2.5502750873565674 + }, + { + "auxiliary_loss_clip": 0.01093746, + "auxiliary_loss_mlp": 0.0102964, + "balance_loss_clip": 1.03697741, + "balance_loss_mlp": 1.0160557, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 2.756076847016121, + "language_loss": 0.7357378, + "learning_rate": 7.116258031844895e-07, + "loss": 0.75697172, + "num_input_tokens_seen": 262450335, + "step": 12160, + "time_per_iteration": 3.8414595127105713 + }, + { + "auxiliary_loss_clip": 0.01104224, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.03912115, + "balance_loss_mlp": 1.01941121, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 2.231084934475389, + "language_loss": 0.73125637, + "learning_rate": 7.113279408557675e-07, + "loss": 0.75262797, + "num_input_tokens_seen": 262468240, + "step": 12161, + "time_per_iteration": 2.44280743598938 + }, + { + "auxiliary_loss_clip": 0.01085261, + "auxiliary_loss_mlp": 0.00779955, + "balance_loss_clip": 1.03625619, + "balance_loss_mlp": 1.00058627, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 2.2177884875406386, + "language_loss": 0.69710612, + "learning_rate": 7.110301273936192e-07, + "loss": 0.7157582, + "num_input_tokens_seen": 262487045, + "step": 12162, + "time_per_iteration": 2.583211660385132 + }, + { + "auxiliary_loss_clip": 0.01104506, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.04007888, + "balance_loss_mlp": 1.01579571, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.7826356241148718, + "language_loss": 0.66673613, + "learning_rate": 7.107323628093382e-07, + "loss": 0.68806791, + "num_input_tokens_seen": 262504855, + "step": 12163, + "time_per_iteration": 2.4956107139587402 + }, + { + "auxiliary_loss_clip": 0.01089543, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.0363127, + "balance_loss_mlp": 1.01679659, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 1.5471569925778212, + "language_loss": 0.6862694, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70746011, + "num_input_tokens_seen": 262524920, + "step": 12164, + "time_per_iteration": 2.51489520072937 + }, + { + "auxiliary_loss_clip": 0.01064681, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.03720868, + "balance_loss_mlp": 1.01879001, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 1.459283305422684, + "language_loss": 0.73114145, + "learning_rate": 7.101369803195391e-07, + "loss": 0.75208795, + "num_input_tokens_seen": 262545725, + "step": 12165, + "time_per_iteration": 2.5860538482666016 + }, + { + "auxiliary_loss_clip": 0.01104388, + "auxiliary_loss_mlp": 0.01035848, + "balance_loss_clip": 1.03847599, + "balance_loss_mlp": 1.0224613, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 1.9678484740346707, + "language_loss": 0.76883543, + "learning_rate": 7.098393624365988e-07, + "loss": 0.79023778, + "num_input_tokens_seen": 262565480, + "step": 12166, + "time_per_iteration": 2.4818506240844727 + }, + { + "auxiliary_loss_clip": 0.01084435, + "auxiliary_loss_mlp": 0.01030525, + "balance_loss_clip": 1.03707993, + "balance_loss_mlp": 1.01809168, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 1.8344477950390787, + "language_loss": 0.79466343, + "learning_rate": 7.095417934766781e-07, + "loss": 0.81581306, + "num_input_tokens_seen": 262584145, + "step": 12167, + "time_per_iteration": 2.485635995864868 + }, + { + "auxiliary_loss_clip": 0.0109946, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_clip": 1.03793502, + "balance_loss_mlp": 1.03013825, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 1.8627409429087076, + "language_loss": 0.77133882, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79275823, + "num_input_tokens_seen": 262604045, + "step": 12168, + "time_per_iteration": 2.494579315185547 + }, + { + "auxiliary_loss_clip": 0.01098699, + "auxiliary_loss_mlp": 0.01046894, + "balance_loss_clip": 1.03602231, + "balance_loss_mlp": 1.03084838, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 1.4327043388075702, + "language_loss": 0.81885338, + "learning_rate": 7.089468023710326e-07, + "loss": 0.84030938, + "num_input_tokens_seen": 262624540, + "step": 12169, + "time_per_iteration": 2.46950364112854 + }, + { + "auxiliary_loss_clip": 0.01103097, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.04307747, + "balance_loss_mlp": 1.02379632, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.7897077292338621, + "language_loss": 0.70113146, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72253036, + "num_input_tokens_seen": 262644545, + "step": 12170, + "time_per_iteration": 2.548053026199341 + }, + { + "auxiliary_loss_clip": 0.01111588, + "auxiliary_loss_mlp": 0.01032926, + "balance_loss_clip": 1.0388931, + "balance_loss_mlp": 1.01889563, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 2.0628230503512537, + "language_loss": 0.69620246, + "learning_rate": 7.083520070928533e-07, + "loss": 0.71764761, + "num_input_tokens_seen": 262662570, + "step": 12171, + "time_per_iteration": 4.009016513824463 + }, + { + "auxiliary_loss_clip": 0.01111404, + "auxiliary_loss_mlp": 0.01041388, + "balance_loss_clip": 1.03918111, + "balance_loss_mlp": 1.02884126, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 1.6295853538802605, + "language_loss": 0.65676129, + "learning_rate": 7.080546829172564e-07, + "loss": 0.67828918, + "num_input_tokens_seen": 262683245, + "step": 12172, + "time_per_iteration": 2.5152759552001953 + }, + { + "auxiliary_loss_clip": 0.0111496, + "auxiliary_loss_mlp": 0.01031397, + "balance_loss_clip": 1.04105306, + "balance_loss_mlp": 1.01883793, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 2.5222106380712983, + "language_loss": 0.61178958, + "learning_rate": 7.077574077323564e-07, + "loss": 0.63325316, + "num_input_tokens_seen": 262701585, + "step": 12173, + "time_per_iteration": 2.4062070846557617 + }, + { + "auxiliary_loss_clip": 0.01062921, + "auxiliary_loss_mlp": 0.01029546, + "balance_loss_clip": 1.03896403, + "balance_loss_mlp": 1.01728547, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 2.424850296359256, + "language_loss": 0.74300957, + "learning_rate": 7.074601815494243e-07, + "loss": 0.76393431, + "num_input_tokens_seen": 262719295, + "step": 12174, + "time_per_iteration": 2.573369264602661 + }, + { + "auxiliary_loss_clip": 0.01108838, + "auxiliary_loss_mlp": 0.01027207, + "balance_loss_clip": 1.03821361, + "balance_loss_mlp": 1.01563215, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.5385474477475776, + "language_loss": 0.8106901, + "learning_rate": 7.071630043797317e-07, + "loss": 0.83205056, + "num_input_tokens_seen": 262739995, + "step": 12175, + "time_per_iteration": 2.4785373210906982 + }, + { + "auxiliary_loss_clip": 0.01090326, + "auxiliary_loss_mlp": 0.01030975, + "balance_loss_clip": 1.0372057, + "balance_loss_mlp": 1.01883316, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 2.064726161999955, + "language_loss": 0.76854265, + "learning_rate": 7.068658762345488e-07, + "loss": 0.7897557, + "num_input_tokens_seen": 262757680, + "step": 12176, + "time_per_iteration": 2.5345335006713867 + }, + { + "auxiliary_loss_clip": 0.01099257, + "auxiliary_loss_mlp": 0.01034674, + "balance_loss_clip": 1.03908598, + "balance_loss_mlp": 1.0224669, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 1.5461707112971366, + "language_loss": 0.76394224, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78528154, + "num_input_tokens_seen": 262776990, + "step": 12177, + "time_per_iteration": 2.472975254058838 + }, + { + "auxiliary_loss_clip": 0.01078059, + "auxiliary_loss_mlp": 0.0103499, + "balance_loss_clip": 1.03622675, + "balance_loss_mlp": 1.02346849, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 2.3523940983247558, + "language_loss": 0.74230838, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76343894, + "num_input_tokens_seen": 262795440, + "step": 12178, + "time_per_iteration": 2.495683193206787 + }, + { + "auxiliary_loss_clip": 0.01092966, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.03563654, + "balance_loss_mlp": 1.01848078, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 2.0082822077930906, + "language_loss": 0.82525045, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84648883, + "num_input_tokens_seen": 262816385, + "step": 12179, + "time_per_iteration": 2.530553102493286 + }, + { + "auxiliary_loss_clip": 0.01076101, + "auxiliary_loss_mlp": 0.01038661, + "balance_loss_clip": 1.03547168, + "balance_loss_mlp": 1.02539873, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 1.5802500366743517, + "language_loss": 0.7450999, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76624751, + "num_input_tokens_seen": 262834955, + "step": 12180, + "time_per_iteration": 2.456041097640991 + }, + { + "auxiliary_loss_clip": 0.01101899, + "auxiliary_loss_mlp": 0.00778206, + "balance_loss_clip": 1.03517604, + "balance_loss_mlp": 1.00067878, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 1.8596070553749, + "language_loss": 0.79644299, + "learning_rate": 7.053809712705396e-07, + "loss": 0.81524408, + "num_input_tokens_seen": 262853555, + "step": 12181, + "time_per_iteration": 2.5891873836517334 + }, + { + "auxiliary_loss_clip": 0.01103569, + "auxiliary_loss_mlp": 0.00778588, + "balance_loss_clip": 1.03818071, + "balance_loss_mlp": 1.00067055, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 1.9915024946905233, + "language_loss": 0.72299814, + "learning_rate": 7.050841375089506e-07, + "loss": 0.74181974, + "num_input_tokens_seen": 262870975, + "step": 12182, + "time_per_iteration": 2.4848737716674805 + }, + { + "auxiliary_loss_clip": 0.01115431, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.04074323, + "balance_loss_mlp": 1.02155328, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.712762011065059, + "language_loss": 0.70888644, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73037899, + "num_input_tokens_seen": 262892635, + "step": 12183, + "time_per_iteration": 2.5233750343322754 + }, + { + "auxiliary_loss_clip": 0.01106428, + "auxiliary_loss_mlp": 0.01038253, + "balance_loss_clip": 1.04563594, + "balance_loss_mlp": 1.02465725, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 1.8703605824003955, + "language_loss": 0.72821319, + "learning_rate": 7.04490617307045e-07, + "loss": 0.74966002, + "num_input_tokens_seen": 262910725, + "step": 12184, + "time_per_iteration": 2.475703001022339 + }, + { + "auxiliary_loss_clip": 0.01012332, + "auxiliary_loss_mlp": 0.01000346, + "balance_loss_clip": 1.0068258, + "balance_loss_mlp": 0.99903458, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.7602138897138898, + "language_loss": 0.65220797, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67233467, + "num_input_tokens_seen": 262974150, + "step": 12185, + "time_per_iteration": 3.052725076675415 + }, + { + "auxiliary_loss_clip": 0.01111792, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.03687012, + "balance_loss_mlp": 1.016294, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 2.297708080163748, + "language_loss": 0.80204481, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82345891, + "num_input_tokens_seen": 262993370, + "step": 12186, + "time_per_iteration": 2.4386825561523438 + }, + { + "auxiliary_loss_clip": 0.01095205, + "auxiliary_loss_mlp": 0.01037392, + "balance_loss_clip": 1.03634262, + "balance_loss_mlp": 1.02387404, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 1.702876320986116, + "language_loss": 0.73336267, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75468862, + "num_input_tokens_seen": 263012665, + "step": 12187, + "time_per_iteration": 3.9266815185546875 + }, + { + "auxiliary_loss_clip": 0.01115732, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.04055715, + "balance_loss_mlp": 1.02507079, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 1.7665698670263283, + "language_loss": 0.88824832, + "learning_rate": 7.033041665033716e-07, + "loss": 0.90978301, + "num_input_tokens_seen": 263031475, + "step": 12188, + "time_per_iteration": 2.4275622367858887 + }, + { + "auxiliary_loss_clip": 0.01070085, + "auxiliary_loss_mlp": 0.01036594, + "balance_loss_clip": 1.03305626, + "balance_loss_mlp": 1.02347493, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 2.537954508145259, + "language_loss": 0.74820352, + "learning_rate": 7.030076767014284e-07, + "loss": 0.7692703, + "num_input_tokens_seen": 263051445, + "step": 12189, + "time_per_iteration": 2.585320472717285 + }, + { + "auxiliary_loss_clip": 0.01081492, + "auxiliary_loss_mlp": 0.01029931, + "balance_loss_clip": 1.0395937, + "balance_loss_mlp": 1.01724744, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 1.8317886688020535, + "language_loss": 0.82588267, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84699684, + "num_input_tokens_seen": 263070835, + "step": 12190, + "time_per_iteration": 2.5348923206329346 + }, + { + "auxiliary_loss_clip": 0.01074244, + "auxiliary_loss_mlp": 0.01040371, + "balance_loss_clip": 1.03705478, + "balance_loss_mlp": 1.02707291, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 1.9235580778767183, + "language_loss": 0.71857619, + "learning_rate": 7.024148446550204e-07, + "loss": 0.73972237, + "num_input_tokens_seen": 263090070, + "step": 12191, + "time_per_iteration": 2.547715902328491 + }, + { + "auxiliary_loss_clip": 0.01114551, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.04028726, + "balance_loss_mlp": 1.02186894, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.721526705511116, + "language_loss": 0.68913913, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71063089, + "num_input_tokens_seen": 263110030, + "step": 12192, + "time_per_iteration": 2.500276565551758 + }, + { + "auxiliary_loss_clip": 0.010971, + "auxiliary_loss_mlp": 0.01034197, + "balance_loss_clip": 1.03625953, + "balance_loss_mlp": 1.02196062, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.604740786464439, + "language_loss": 0.73036277, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75167572, + "num_input_tokens_seen": 263129735, + "step": 12193, + "time_per_iteration": 2.475243091583252 + }, + { + "auxiliary_loss_clip": 0.01095788, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.03637075, + "balance_loss_mlp": 1.02277517, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 1.7554733763900972, + "language_loss": 0.77020705, + "learning_rate": 7.015259656476911e-07, + "loss": 0.79152566, + "num_input_tokens_seen": 263149100, + "step": 12194, + "time_per_iteration": 2.4518558979034424 + }, + { + "auxiliary_loss_clip": 0.01100875, + "auxiliary_loss_mlp": 0.01034849, + "balance_loss_clip": 1.03968263, + "balance_loss_mlp": 1.02175415, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 4.42399633560323, + "language_loss": 0.70654869, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72790599, + "num_input_tokens_seen": 263166620, + "step": 12195, + "time_per_iteration": 2.577010154724121 + }, + { + "auxiliary_loss_clip": 0.01112744, + "auxiliary_loss_mlp": 0.01044642, + "balance_loss_clip": 1.0381465, + "balance_loss_mlp": 1.03279877, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 1.9836617277147608, + "language_loss": 0.72706437, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74863827, + "num_input_tokens_seen": 263184780, + "step": 12196, + "time_per_iteration": 3.923858165740967 + }, + { + "auxiliary_loss_clip": 0.01110566, + "auxiliary_loss_mlp": 0.01034382, + "balance_loss_clip": 1.03822172, + "balance_loss_mlp": 1.02125144, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.9453926409769944, + "language_loss": 0.71894521, + "learning_rate": 7.006375297847394e-07, + "loss": 0.74039459, + "num_input_tokens_seen": 263204625, + "step": 12197, + "time_per_iteration": 2.4770748615264893 + }, + { + "auxiliary_loss_clip": 0.01062309, + "auxiliary_loss_mlp": 0.00778975, + "balance_loss_clip": 1.03514135, + "balance_loss_mlp": 1.00071669, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 2.0951464137144824, + "language_loss": 0.78417301, + "learning_rate": 7.003414830260282e-07, + "loss": 0.80258584, + "num_input_tokens_seen": 263221565, + "step": 12198, + "time_per_iteration": 2.5847902297973633 + }, + { + "auxiliary_loss_clip": 0.01057295, + "auxiliary_loss_mlp": 0.01032988, + "balance_loss_clip": 1.03489017, + "balance_loss_mlp": 1.02098393, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 2.197193211732214, + "language_loss": 0.74691737, + "learning_rate": 7.000454855504974e-07, + "loss": 0.76782018, + "num_input_tokens_seen": 263240620, + "step": 12199, + "time_per_iteration": 4.010565996170044 + }, + { + "auxiliary_loss_clip": 0.0109688, + "auxiliary_loss_mlp": 0.0104037, + "balance_loss_clip": 1.04077363, + "balance_loss_mlp": 1.02741158, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 2.3399365529437985, + "language_loss": 0.77443141, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79580396, + "num_input_tokens_seen": 263254365, + "step": 12200, + "time_per_iteration": 2.4654173851013184 + }, + { + "auxiliary_loss_clip": 0.01073787, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_clip": 1.03651059, + "balance_loss_mlp": 1.01903296, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 2.357162175951035, + "language_loss": 0.61360174, + "learning_rate": 6.994536384938754e-07, + "loss": 0.6346491, + "num_input_tokens_seen": 263275880, + "step": 12201, + "time_per_iteration": 2.5581510066986084 + }, + { + "auxiliary_loss_clip": 0.01073874, + "auxiliary_loss_mlp": 0.00777912, + "balance_loss_clip": 1.03335071, + "balance_loss_mlp": 1.00059044, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 1.7025147710705528, + "language_loss": 0.52382797, + "learning_rate": 6.991577889352264e-07, + "loss": 0.54234582, + "num_input_tokens_seen": 263298315, + "step": 12202, + "time_per_iteration": 2.67288875579834 + }, + { + "auxiliary_loss_clip": 0.01085709, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.03371716, + "balance_loss_mlp": 1.01784873, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.6900964847629196, + "language_loss": 0.68776017, + "learning_rate": 6.98861988704645e-07, + "loss": 0.7089169, + "num_input_tokens_seen": 263318615, + "step": 12203, + "time_per_iteration": 2.500871181488037 + }, + { + "auxiliary_loss_clip": 0.01090459, + "auxiliary_loss_mlp": 0.01042205, + "balance_loss_clip": 1.04069018, + "balance_loss_mlp": 1.02900267, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 2.0280169370209977, + "language_loss": 0.65669262, + "learning_rate": 6.985662378133474e-07, + "loss": 0.67801929, + "num_input_tokens_seen": 263336705, + "step": 12204, + "time_per_iteration": 2.528352975845337 + }, + { + "auxiliary_loss_clip": 0.01086903, + "auxiliary_loss_mlp": 0.01036674, + "balance_loss_clip": 1.03868222, + "balance_loss_mlp": 1.02478266, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 2.0139198076922726, + "language_loss": 0.77443296, + "learning_rate": 6.982705362725479e-07, + "loss": 0.79566872, + "num_input_tokens_seen": 263355065, + "step": 12205, + "time_per_iteration": 2.5147743225097656 + }, + { + "auxiliary_loss_clip": 0.01059842, + "auxiliary_loss_mlp": 0.01033661, + "balance_loss_clip": 1.03583002, + "balance_loss_mlp": 1.02235365, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 1.701871386937382, + "language_loss": 0.79831958, + "learning_rate": 6.979748840934601e-07, + "loss": 0.81925464, + "num_input_tokens_seen": 263374460, + "step": 12206, + "time_per_iteration": 2.5576670169830322 + }, + { + "auxiliary_loss_clip": 0.01072032, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.03332472, + "balance_loss_mlp": 1.02093577, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 1.9586738327416504, + "language_loss": 0.71916926, + "learning_rate": 6.976792812872958e-07, + "loss": 0.74022424, + "num_input_tokens_seen": 263393610, + "step": 12207, + "time_per_iteration": 2.609811782836914 + }, + { + "auxiliary_loss_clip": 0.01012957, + "auxiliary_loss_mlp": 0.01010283, + "balance_loss_clip": 1.00769293, + "balance_loss_mlp": 1.00907886, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.7801831029938657, + "language_loss": 0.54764694, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56787932, + "num_input_tokens_seen": 263450340, + "step": 12208, + "time_per_iteration": 3.1708714962005615 + }, + { + "auxiliary_loss_clip": 0.01111668, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.0397284, + "balance_loss_mlp": 1.02362442, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.6506090975063512, + "language_loss": 0.80180931, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82327366, + "num_input_tokens_seen": 263471735, + "step": 12209, + "time_per_iteration": 2.452336311340332 + }, + { + "auxiliary_loss_clip": 0.01106629, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.03498125, + "balance_loss_mlp": 1.01889777, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 1.5011575315480319, + "language_loss": 0.78721273, + "learning_rate": 6.96792769218423e-07, + "loss": 0.80858344, + "num_input_tokens_seen": 263493245, + "step": 12210, + "time_per_iteration": 2.470334529876709 + }, + { + "auxiliary_loss_clip": 0.01109935, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.03815889, + "balance_loss_mlp": 1.02159929, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 1.8197538563134743, + "language_loss": 0.75979364, + "learning_rate": 6.964973640160236e-07, + "loss": 0.7812351, + "num_input_tokens_seen": 263511660, + "step": 12211, + "time_per_iteration": 3.9150795936584473 + }, + { + "auxiliary_loss_clip": 0.0108865, + "auxiliary_loss_mlp": 0.01028565, + "balance_loss_clip": 1.0394367, + "balance_loss_mlp": 1.01611376, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 1.8781358131427113, + "language_loss": 0.72428811, + "learning_rate": 6.962020082425748e-07, + "loss": 0.74546027, + "num_input_tokens_seen": 263530875, + "step": 12212, + "time_per_iteration": 2.5091962814331055 + }, + { + "auxiliary_loss_clip": 0.01112884, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.04014206, + "balance_loss_mlp": 1.02283883, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.4434423049992604, + "language_loss": 0.68882066, + "learning_rate": 6.959067019092766e-07, + "loss": 0.71030271, + "num_input_tokens_seen": 263551585, + "step": 12213, + "time_per_iteration": 2.435012102127075 + }, + { + "auxiliary_loss_clip": 0.01030486, + "auxiliary_loss_mlp": 0.01004578, + "balance_loss_clip": 1.00681782, + "balance_loss_mlp": 1.00324905, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7309414280362063, + "language_loss": 0.54318488, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56353557, + "num_input_tokens_seen": 263609545, + "step": 12214, + "time_per_iteration": 2.89292573928833 + }, + { + "auxiliary_loss_clip": 0.01114396, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.03849244, + "balance_loss_mlp": 1.01955676, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 4.982557056279636, + "language_loss": 0.70401216, + "learning_rate": 6.953162376079233e-07, + "loss": 0.72547746, + "num_input_tokens_seen": 263627880, + "step": 12215, + "time_per_iteration": 2.4064879417419434 + }, + { + "auxiliary_loss_clip": 0.01082982, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.03575993, + "balance_loss_mlp": 1.02237976, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 1.5374540621219228, + "language_loss": 0.7274974, + "learning_rate": 6.950210796622573e-07, + "loss": 0.74866825, + "num_input_tokens_seen": 263645665, + "step": 12216, + "time_per_iteration": 2.4858617782592773 + }, + { + "auxiliary_loss_clip": 0.01117563, + "auxiliary_loss_mlp": 0.01042683, + "balance_loss_clip": 1.03884375, + "balance_loss_mlp": 1.02821732, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 1.827945196552067, + "language_loss": 0.78538942, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80699188, + "num_input_tokens_seen": 263668170, + "step": 12217, + "time_per_iteration": 2.4647138118743896 + }, + { + "auxiliary_loss_clip": 0.01077848, + "auxiliary_loss_mlp": 0.01027197, + "balance_loss_clip": 1.04240632, + "balance_loss_mlp": 1.01635504, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 1.9691053273818346, + "language_loss": 0.78422701, + "learning_rate": 6.94430912236911e-07, + "loss": 0.80527747, + "num_input_tokens_seen": 263684190, + "step": 12218, + "time_per_iteration": 2.553283214569092 + }, + { + "auxiliary_loss_clip": 0.01063983, + "auxiliary_loss_mlp": 0.0103971, + "balance_loss_clip": 1.0332675, + "balance_loss_mlp": 1.02612567, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 1.9309445506143819, + "language_loss": 0.72100246, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74203932, + "num_input_tokens_seen": 263702095, + "step": 12219, + "time_per_iteration": 2.57808518409729 + }, + { + "auxiliary_loss_clip": 0.01085574, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.03374505, + "balance_loss_mlp": 1.02119112, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 1.6477963496141623, + "language_loss": 0.74908692, + "learning_rate": 6.938409428408061e-07, + "loss": 0.77027822, + "num_input_tokens_seen": 263721385, + "step": 12220, + "time_per_iteration": 2.5154030323028564 + }, + { + "auxiliary_loss_clip": 0.01102098, + "auxiliary_loss_mlp": 0.01034846, + "balance_loss_clip": 1.03599191, + "balance_loss_mlp": 1.02250195, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 1.745554760994927, + "language_loss": 0.65677631, + "learning_rate": 6.93546032431684e-07, + "loss": 0.67814577, + "num_input_tokens_seen": 263737835, + "step": 12221, + "time_per_iteration": 2.430534601211548 + }, + { + "auxiliary_loss_clip": 0.01085992, + "auxiliary_loss_mlp": 0.01038807, + "balance_loss_clip": 1.03426123, + "balance_loss_mlp": 1.02624893, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 1.75880937164506, + "language_loss": 0.69307011, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71431816, + "num_input_tokens_seen": 263756480, + "step": 12222, + "time_per_iteration": 2.543576240539551 + }, + { + "auxiliary_loss_clip": 0.0106504, + "auxiliary_loss_mlp": 0.01028685, + "balance_loss_clip": 1.03477502, + "balance_loss_mlp": 1.01765871, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 1.8687717275265567, + "language_loss": 0.6583727, + "learning_rate": 6.92956360247217e-07, + "loss": 0.6793099, + "num_input_tokens_seen": 263776440, + "step": 12223, + "time_per_iteration": 2.610868453979492 + }, + { + "auxiliary_loss_clip": 0.01094874, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.03539968, + "balance_loss_mlp": 1.01644576, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 1.9478744301943636, + "language_loss": 0.72478884, + "learning_rate": 6.926615984942332e-07, + "loss": 0.74602747, + "num_input_tokens_seen": 263793700, + "step": 12224, + "time_per_iteration": 2.444000005722046 + }, + { + "auxiliary_loss_clip": 0.01085359, + "auxiliary_loss_mlp": 0.01029356, + "balance_loss_clip": 1.04119658, + "balance_loss_mlp": 1.01751268, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.6411584188747406, + "language_loss": 0.72689003, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74803722, + "num_input_tokens_seen": 263814620, + "step": 12225, + "time_per_iteration": 2.60440993309021 + }, + { + "auxiliary_loss_clip": 0.01114715, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.03849518, + "balance_loss_mlp": 1.02141345, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 1.7008078481875317, + "language_loss": 0.76316839, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78466582, + "num_input_tokens_seen": 263832725, + "step": 12226, + "time_per_iteration": 3.948624610900879 + }, + { + "auxiliary_loss_clip": 0.01083929, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.0335772, + "balance_loss_mlp": 1.01749575, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 1.5310200133585055, + "language_loss": 0.66810256, + "learning_rate": 6.917776107264008e-07, + "loss": 0.68925536, + "num_input_tokens_seen": 263853850, + "step": 12227, + "time_per_iteration": 2.506094455718994 + }, + { + "auxiliary_loss_clip": 0.01101139, + "auxiliary_loss_mlp": 0.01033417, + "balance_loss_clip": 1.03652394, + "balance_loss_mlp": 1.0215739, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 1.8822505610372762, + "language_loss": 0.6376183, + "learning_rate": 6.914830473380749e-07, + "loss": 0.65896386, + "num_input_tokens_seen": 263874760, + "step": 12228, + "time_per_iteration": 2.5011444091796875 + }, + { + "auxiliary_loss_clip": 0.010912, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.03763223, + "balance_loss_mlp": 1.02143335, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 2.894880803304608, + "language_loss": 0.6384871, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65972728, + "num_input_tokens_seen": 263893390, + "step": 12229, + "time_per_iteration": 2.506486415863037 + }, + { + "auxiliary_loss_clip": 0.01087476, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.03920507, + "balance_loss_mlp": 1.02690959, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 2.8470473081254126, + "language_loss": 0.73497617, + "learning_rate": 6.908940694298726e-07, + "loss": 0.75625527, + "num_input_tokens_seen": 263911180, + "step": 12230, + "time_per_iteration": 2.617867946624756 + }, + { + "auxiliary_loss_clip": 0.01059349, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.03509498, + "balance_loss_mlp": 1.01886296, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 2.282048762109965, + "language_loss": 0.72279602, + "learning_rate": 6.90599654932332e-07, + "loss": 0.74370039, + "num_input_tokens_seen": 263928975, + "step": 12231, + "time_per_iteration": 2.603830575942993 + }, + { + "auxiliary_loss_clip": 0.01104069, + "auxiliary_loss_mlp": 0.01041479, + "balance_loss_clip": 1.03941011, + "balance_loss_mlp": 1.02748358, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 2.18244905906904, + "language_loss": 0.63984066, + "learning_rate": 6.903052900873823e-07, + "loss": 0.66129613, + "num_input_tokens_seen": 263944495, + "step": 12232, + "time_per_iteration": 2.4596009254455566 + }, + { + "auxiliary_loss_clip": 0.01092711, + "auxiliary_loss_mlp": 0.0102855, + "balance_loss_clip": 1.03773105, + "balance_loss_mlp": 1.01607466, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.8524539678150167, + "language_loss": 0.75401348, + "learning_rate": 6.900109749061874e-07, + "loss": 0.77522606, + "num_input_tokens_seen": 263961325, + "step": 12233, + "time_per_iteration": 2.4757657051086426 + }, + { + "auxiliary_loss_clip": 0.01111687, + "auxiliary_loss_mlp": 0.0102726, + "balance_loss_clip": 1.03800094, + "balance_loss_mlp": 1.01477849, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 1.6877439922042103, + "language_loss": 0.73257583, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75396532, + "num_input_tokens_seen": 263980445, + "step": 12234, + "time_per_iteration": 2.432912588119507 + }, + { + "auxiliary_loss_clip": 0.01100036, + "auxiliary_loss_mlp": 0.01030444, + "balance_loss_clip": 1.03639269, + "balance_loss_mlp": 1.01761699, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 2.142286093201241, + "language_loss": 0.5962826, + "learning_rate": 6.894224935797017e-07, + "loss": 0.61758739, + "num_input_tokens_seen": 263999330, + "step": 12235, + "time_per_iteration": 2.512843370437622 + }, + { + "auxiliary_loss_clip": 0.01087753, + "auxiliary_loss_mlp": 0.01026292, + "balance_loss_clip": 1.03641248, + "balance_loss_mlp": 1.01454425, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 2.580134822630165, + "language_loss": 0.85835546, + "learning_rate": 6.891283274567259e-07, + "loss": 0.87949586, + "num_input_tokens_seen": 264014150, + "step": 12236, + "time_per_iteration": 3.9290261268615723 + }, + { + "auxiliary_loss_clip": 0.01099651, + "auxiliary_loss_mlp": 0.00776875, + "balance_loss_clip": 1.03644323, + "balance_loss_mlp": 1.00069213, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 1.8119260805242556, + "language_loss": 0.69348711, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71225238, + "num_input_tokens_seen": 264033140, + "step": 12237, + "time_per_iteration": 2.4656758308410645 + }, + { + "auxiliary_loss_clip": 0.0102704, + "auxiliary_loss_mlp": 0.01026958, + "balance_loss_clip": 1.03059602, + "balance_loss_mlp": 1.01487017, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 1.71872205784553, + "language_loss": 0.72236443, + "learning_rate": 6.885401443470839e-07, + "loss": 0.74290442, + "num_input_tokens_seen": 264052105, + "step": 12238, + "time_per_iteration": 2.7851338386535645 + }, + { + "auxiliary_loss_clip": 0.0108254, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.03376341, + "balance_loss_mlp": 1.01809335, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 1.8953363633290103, + "language_loss": 0.72422957, + "learning_rate": 6.882461273827205e-07, + "loss": 0.74536884, + "num_input_tokens_seen": 264070690, + "step": 12239, + "time_per_iteration": 4.398822784423828 + }, + { + "auxiliary_loss_clip": 0.01078842, + "auxiliary_loss_mlp": 0.0103099, + "balance_loss_clip": 1.03612685, + "balance_loss_mlp": 1.01921225, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.3179463122308934, + "language_loss": 0.78816545, + "learning_rate": 6.879521601601954e-07, + "loss": 0.80926383, + "num_input_tokens_seen": 264094225, + "step": 12240, + "time_per_iteration": 2.559985399246216 + }, + { + "auxiliary_loss_clip": 0.01100046, + "auxiliary_loss_mlp": 0.0103483, + "balance_loss_clip": 1.03697777, + "balance_loss_mlp": 1.02243865, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 1.9356888045872498, + "language_loss": 0.83174574, + "learning_rate": 6.876582426906565e-07, + "loss": 0.85309452, + "num_input_tokens_seen": 264113190, + "step": 12241, + "time_per_iteration": 2.4832212924957275 + }, + { + "auxiliary_loss_clip": 0.01097536, + "auxiliary_loss_mlp": 0.01026037, + "balance_loss_clip": 1.03422856, + "balance_loss_mlp": 1.01389527, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 1.9964119791535568, + "language_loss": 0.78941137, + "learning_rate": 6.873643749852484e-07, + "loss": 0.81064713, + "num_input_tokens_seen": 264132050, + "step": 12242, + "time_per_iteration": 2.4617722034454346 + }, + { + "auxiliary_loss_clip": 0.01061855, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.03225017, + "balance_loss_mlp": 1.01823509, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 1.746224714794938, + "language_loss": 0.79210657, + "learning_rate": 6.870705570551145e-07, + "loss": 0.8130331, + "num_input_tokens_seen": 264152800, + "step": 12243, + "time_per_iteration": 2.6115503311157227 + }, + { + "auxiliary_loss_clip": 0.01101178, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.03529668, + "balance_loss_mlp": 1.0224545, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 2.336655878233538, + "language_loss": 0.74552786, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76689529, + "num_input_tokens_seen": 264169650, + "step": 12244, + "time_per_iteration": 2.4299535751342773 + }, + { + "auxiliary_loss_clip": 0.01094668, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.0339756, + "balance_loss_mlp": 1.01892531, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 1.5698817190993695, + "language_loss": 0.69362146, + "learning_rate": 6.864830705652347e-07, + "loss": 0.71488643, + "num_input_tokens_seen": 264190530, + "step": 12245, + "time_per_iteration": 2.4992666244506836 + }, + { + "auxiliary_loss_clip": 0.01072321, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.03528559, + "balance_loss_mlp": 1.02074921, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 1.5673237552720534, + "language_loss": 0.73398411, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75504422, + "num_input_tokens_seen": 264210820, + "step": 12246, + "time_per_iteration": 2.5697672367095947 + }, + { + "auxiliary_loss_clip": 0.01084878, + "auxiliary_loss_mlp": 0.01023799, + "balance_loss_clip": 1.03505731, + "balance_loss_mlp": 1.01226008, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 2.1026345631104313, + "language_loss": 0.73204947, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75313628, + "num_input_tokens_seen": 264227430, + "step": 12247, + "time_per_iteration": 2.459578275680542 + }, + { + "auxiliary_loss_clip": 0.01099965, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.04031217, + "balance_loss_mlp": 1.01957011, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 1.7271484170796039, + "language_loss": 0.74489653, + "learning_rate": 6.856022144234526e-07, + "loss": 0.76620764, + "num_input_tokens_seen": 264245230, + "step": 12248, + "time_per_iteration": 2.4398176670074463 + }, + { + "auxiliary_loss_clip": 0.01090684, + "auxiliary_loss_mlp": 0.01035982, + "balance_loss_clip": 1.0365411, + "balance_loss_mlp": 1.02316666, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 2.277594111554981, + "language_loss": 0.72813827, + "learning_rate": 6.853086953788727e-07, + "loss": 0.74940497, + "num_input_tokens_seen": 264263945, + "step": 12249, + "time_per_iteration": 2.4992001056671143 + }, + { + "auxiliary_loss_clip": 0.01089172, + "auxiliary_loss_mlp": 0.01027798, + "balance_loss_clip": 1.03611517, + "balance_loss_mlp": 1.01508474, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 1.8466975709947162, + "language_loss": 0.76988846, + "learning_rate": 6.850152261875189e-07, + "loss": 0.79105818, + "num_input_tokens_seen": 264281500, + "step": 12250, + "time_per_iteration": 4.118049383163452 + }, + { + "auxiliary_loss_clip": 0.01070187, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.037714, + "balance_loss_mlp": 1.01586223, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 1.7039227435209134, + "language_loss": 0.71661693, + "learning_rate": 6.8472180686052e-07, + "loss": 0.73760617, + "num_input_tokens_seen": 264301625, + "step": 12251, + "time_per_iteration": 2.6281299591064453 + }, + { + "auxiliary_loss_clip": 0.01099607, + "auxiliary_loss_mlp": 0.01030577, + "balance_loss_clip": 1.04170728, + "balance_loss_mlp": 1.01833987, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.4928222872335166, + "language_loss": 0.65533048, + "learning_rate": 6.844284374090015e-07, + "loss": 0.67663229, + "num_input_tokens_seen": 264323975, + "step": 12252, + "time_per_iteration": 2.821864366531372 + }, + { + "auxiliary_loss_clip": 0.0106525, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.03432727, + "balance_loss_mlp": 1.02016044, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 1.7171879741460558, + "language_loss": 0.7868824, + "learning_rate": 6.841351178440884e-07, + "loss": 0.80786741, + "num_input_tokens_seen": 264343785, + "step": 12253, + "time_per_iteration": 2.5707385540008545 + }, + { + "auxiliary_loss_clip": 0.01105539, + "auxiliary_loss_mlp": 0.00776531, + "balance_loss_clip": 1.03530383, + "balance_loss_mlp": 1.0007906, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 2.248155036612371, + "language_loss": 0.76261938, + "learning_rate": 6.83841848176905e-07, + "loss": 0.78144002, + "num_input_tokens_seen": 264361130, + "step": 12254, + "time_per_iteration": 2.426176071166992 + }, + { + "auxiliary_loss_clip": 0.01085245, + "auxiliary_loss_mlp": 0.01041804, + "balance_loss_clip": 1.0346508, + "balance_loss_mlp": 1.02767193, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 2.443500750395434, + "language_loss": 0.69598949, + "learning_rate": 6.835486284185692e-07, + "loss": 0.71726, + "num_input_tokens_seen": 264376965, + "step": 12255, + "time_per_iteration": 2.46958589553833 + }, + { + "auxiliary_loss_clip": 0.01102196, + "auxiliary_loss_mlp": 0.01032376, + "balance_loss_clip": 1.03960562, + "balance_loss_mlp": 1.01919794, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 1.596188048057917, + "language_loss": 0.75197387, + "learning_rate": 6.832554585802012e-07, + "loss": 0.7733196, + "num_input_tokens_seen": 264396310, + "step": 12256, + "time_per_iteration": 2.525458335876465 + }, + { + "auxiliary_loss_clip": 0.01101691, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.03794193, + "balance_loss_mlp": 1.01747322, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.6127991612857329, + "language_loss": 0.73562425, + "learning_rate": 6.829623386729182e-07, + "loss": 0.75694609, + "num_input_tokens_seen": 264418085, + "step": 12257, + "time_per_iteration": 2.5974345207214355 + }, + { + "auxiliary_loss_clip": 0.01094743, + "auxiliary_loss_mlp": 0.01038104, + "balance_loss_clip": 1.03473485, + "balance_loss_mlp": 1.02542591, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.5342206614211644, + "language_loss": 0.78083944, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80216789, + "num_input_tokens_seen": 264437595, + "step": 12258, + "time_per_iteration": 2.491715431213379 + }, + { + "auxiliary_loss_clip": 0.01101577, + "auxiliary_loss_mlp": 0.01031526, + "balance_loss_clip": 1.03748083, + "balance_loss_mlp": 1.01946759, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.4056087422240306, + "language_loss": 0.6628077, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68413872, + "num_input_tokens_seen": 264457385, + "step": 12259, + "time_per_iteration": 2.480818510055542 + }, + { + "auxiliary_loss_clip": 0.01103794, + "auxiliary_loss_mlp": 0.01037457, + "balance_loss_clip": 1.04111552, + "balance_loss_mlp": 1.02392673, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 4.666898842597084, + "language_loss": 0.73031485, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75172734, + "num_input_tokens_seen": 264477205, + "step": 12260, + "time_per_iteration": 2.517019748687744 + }, + { + "auxiliary_loss_clip": 0.01097532, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.03610253, + "balance_loss_mlp": 1.02171791, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 1.7707007100262482, + "language_loss": 0.73504138, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75636458, + "num_input_tokens_seen": 264497195, + "step": 12261, + "time_per_iteration": 2.4623525142669678 + }, + { + "auxiliary_loss_clip": 0.0109493, + "auxiliary_loss_mlp": 0.01038274, + "balance_loss_clip": 1.03824341, + "balance_loss_mlp": 1.02420759, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 1.8811075407760203, + "language_loss": 0.67200339, + "learning_rate": 6.814974884917438e-07, + "loss": 0.69333541, + "num_input_tokens_seen": 264516950, + "step": 12262, + "time_per_iteration": 2.5359199047088623 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.03751349, + "balance_loss_mlp": 1.01727819, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 2.206070460210825, + "language_loss": 0.88783079, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90925246, + "num_input_tokens_seen": 264532675, + "step": 12263, + "time_per_iteration": 2.4127111434936523 + }, + { + "auxiliary_loss_clip": 0.01104744, + "auxiliary_loss_mlp": 0.01025049, + "balance_loss_clip": 1.03636432, + "balance_loss_mlp": 1.01475501, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 1.5562838326573498, + "language_loss": 0.6722101, + "learning_rate": 6.809118983257522e-07, + "loss": 0.69350803, + "num_input_tokens_seen": 264555635, + "step": 12264, + "time_per_iteration": 2.486668348312378 + }, + { + "auxiliary_loss_clip": 0.01106694, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.03610325, + "balance_loss_mlp": 1.0194726, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 1.7199165594823593, + "language_loss": 0.8020159, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82339656, + "num_input_tokens_seen": 264573140, + "step": 12265, + "time_per_iteration": 2.497079372406006 + }, + { + "auxiliary_loss_clip": 0.01106704, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.03810012, + "balance_loss_mlp": 1.02018261, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 2.389683909708835, + "language_loss": 0.74365926, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76505458, + "num_input_tokens_seen": 264591610, + "step": 12266, + "time_per_iteration": 3.92482328414917 + }, + { + "auxiliary_loss_clip": 0.01101781, + "auxiliary_loss_mlp": 0.01036562, + "balance_loss_clip": 1.03868937, + "balance_loss_mlp": 1.02346683, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.6960682286881092, + "language_loss": 0.73171341, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75309682, + "num_input_tokens_seen": 264611170, + "step": 12267, + "time_per_iteration": 2.526191473007202 + }, + { + "auxiliary_loss_clip": 0.01076114, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.03670609, + "balance_loss_mlp": 1.02225399, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 1.9708134255689524, + "language_loss": 0.83789438, + "learning_rate": 6.797413183219923e-07, + "loss": 0.85899425, + "num_input_tokens_seen": 264629365, + "step": 12268, + "time_per_iteration": 2.537682294845581 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01040052, + "balance_loss_clip": 1.0378561, + "balance_loss_mlp": 1.02784467, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.971741204227826, + "language_loss": 0.73195976, + "learning_rate": 6.794487984541677e-07, + "loss": 0.75345623, + "num_input_tokens_seen": 264647915, + "step": 12269, + "time_per_iteration": 2.408676862716675 + }, + { + "auxiliary_loss_clip": 0.01089231, + "auxiliary_loss_mlp": 0.01036826, + "balance_loss_clip": 1.034904, + "balance_loss_mlp": 1.02302194, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 2.8672841778518103, + "language_loss": 0.70063567, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72189623, + "num_input_tokens_seen": 264669620, + "step": 12270, + "time_per_iteration": 2.635307550430298 + }, + { + "auxiliary_loss_clip": 0.01097628, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.03542781, + "balance_loss_mlp": 1.0200789, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 1.9167711615890126, + "language_loss": 0.69434798, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71563935, + "num_input_tokens_seen": 264689345, + "step": 12271, + "time_per_iteration": 2.4983246326446533 + }, + { + "auxiliary_loss_clip": 0.01081058, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.03825045, + "balance_loss_mlp": 1.0200541, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 2.338193528394988, + "language_loss": 0.67832541, + "learning_rate": 6.785715393476586e-07, + "loss": 0.69946963, + "num_input_tokens_seen": 264707625, + "step": 12272, + "time_per_iteration": 2.5253169536590576 + }, + { + "auxiliary_loss_clip": 0.01086876, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.03859615, + "balance_loss_mlp": 1.01937783, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 1.6657367642819516, + "language_loss": 0.77995896, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80113816, + "num_input_tokens_seen": 264725575, + "step": 12273, + "time_per_iteration": 2.4721860885620117 + }, + { + "auxiliary_loss_clip": 0.01108911, + "auxiliary_loss_mlp": 0.01033921, + "balance_loss_clip": 1.03637648, + "balance_loss_mlp": 1.02135611, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 1.999255474191347, + "language_loss": 0.8350184, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85644674, + "num_input_tokens_seen": 264742855, + "step": 12274, + "time_per_iteration": 2.4092583656311035 + }, + { + "auxiliary_loss_clip": 0.01096038, + "auxiliary_loss_mlp": 0.00779845, + "balance_loss_clip": 1.04348433, + "balance_loss_mlp": 1.00086939, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 2.3570893798732313, + "language_loss": 0.7356329, + "learning_rate": 6.776947312194341e-07, + "loss": 0.75439173, + "num_input_tokens_seen": 264761155, + "step": 12275, + "time_per_iteration": 3.993830919265747 + }, + { + "auxiliary_loss_clip": 0.0107775, + "auxiliary_loss_mlp": 0.01048465, + "balance_loss_clip": 1.03464699, + "balance_loss_mlp": 1.03339696, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 1.7769176510754463, + "language_loss": 0.73757696, + "learning_rate": 6.774025621124813e-07, + "loss": 0.75883913, + "num_input_tokens_seen": 264780660, + "step": 12276, + "time_per_iteration": 2.5346529483795166 + }, + { + "auxiliary_loss_clip": 0.01112317, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.03754795, + "balance_loss_mlp": 1.02022862, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 2.0641879393890283, + "language_loss": 0.77681845, + "learning_rate": 6.771104431585551e-07, + "loss": 0.7982657, + "num_input_tokens_seen": 264798850, + "step": 12277, + "time_per_iteration": 2.4200024604797363 + }, + { + "auxiliary_loss_clip": 0.01110754, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.03881288, + "balance_loss_mlp": 1.02925754, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 2.0849025378163257, + "language_loss": 0.78773165, + "learning_rate": 6.768183743687338e-07, + "loss": 0.8092612, + "num_input_tokens_seen": 264816795, + "step": 12278, + "time_per_iteration": 3.9633588790893555 + }, + { + "auxiliary_loss_clip": 0.01100759, + "auxiliary_loss_mlp": 0.00778101, + "balance_loss_clip": 1.0362736, + "balance_loss_mlp": 1.00070369, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 2.7403580543333352, + "language_loss": 0.72118556, + "learning_rate": 6.765263557540921e-07, + "loss": 0.73997414, + "num_input_tokens_seen": 264834105, + "step": 12279, + "time_per_iteration": 2.453568458557129 + }, + { + "auxiliary_loss_clip": 0.01102211, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.03572297, + "balance_loss_mlp": 1.02173543, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 2.481812809072162, + "language_loss": 0.86058486, + "learning_rate": 6.762343873257034e-07, + "loss": 0.88196039, + "num_input_tokens_seen": 264850895, + "step": 12280, + "time_per_iteration": 2.46024227142334 + }, + { + "auxiliary_loss_clip": 0.01075744, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.03676105, + "balance_loss_mlp": 1.01877332, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 1.9529416377118012, + "language_loss": 0.72621679, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74729776, + "num_input_tokens_seen": 264869505, + "step": 12281, + "time_per_iteration": 2.548612594604492 + }, + { + "auxiliary_loss_clip": 0.01072856, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.03754985, + "balance_loss_mlp": 1.01898623, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.7540862574246752, + "language_loss": 0.60770798, + "learning_rate": 6.756506010719711e-07, + "loss": 0.62875676, + "num_input_tokens_seen": 264886915, + "step": 12282, + "time_per_iteration": 2.571624517440796 + }, + { + "auxiliary_loss_clip": 0.01079272, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.03813469, + "balance_loss_mlp": 1.01792884, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 1.9084041504623022, + "language_loss": 0.67787826, + "learning_rate": 6.753587832687632e-07, + "loss": 0.698982, + "num_input_tokens_seen": 264910350, + "step": 12283, + "time_per_iteration": 2.607501745223999 + }, + { + "auxiliary_loss_clip": 0.0111227, + "auxiliary_loss_mlp": 0.00779085, + "balance_loss_clip": 1.03926039, + "balance_loss_mlp": 1.00070345, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.6505988623015078, + "language_loss": 0.76401651, + "learning_rate": 6.750670156960832e-07, + "loss": 0.78293014, + "num_input_tokens_seen": 264930705, + "step": 12284, + "time_per_iteration": 2.5505623817443848 + }, + { + "auxiliary_loss_clip": 0.01097617, + "auxiliary_loss_mlp": 0.01036985, + "balance_loss_clip": 1.03555119, + "balance_loss_mlp": 1.02244139, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 2.2770778811167847, + "language_loss": 0.69183338, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71317947, + "num_input_tokens_seen": 264946975, + "step": 12285, + "time_per_iteration": 2.473127603530884 + }, + { + "auxiliary_loss_clip": 0.01095966, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.03811932, + "balance_loss_mlp": 1.01910365, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 1.9837104018031662, + "language_loss": 0.79820204, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81948477, + "num_input_tokens_seen": 264967665, + "step": 12286, + "time_per_iteration": 2.546895742416382 + }, + { + "auxiliary_loss_clip": 0.01070657, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.04359674, + "balance_loss_mlp": 1.01651597, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 2.3925210717857577, + "language_loss": 0.65588856, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67688811, + "num_input_tokens_seen": 264985480, + "step": 12287, + "time_per_iteration": 2.542191505432129 + }, + { + "auxiliary_loss_clip": 0.0108849, + "auxiliary_loss_mlp": 0.01028933, + "balance_loss_clip": 1.03725851, + "balance_loss_mlp": 1.01679182, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 1.8432163604201237, + "language_loss": 0.76497638, + "learning_rate": 6.739004479318903e-07, + "loss": 0.78615063, + "num_input_tokens_seen": 265004790, + "step": 12288, + "time_per_iteration": 2.543658494949341 + }, + { + "auxiliary_loss_clip": 0.01105107, + "auxiliary_loss_mlp": 0.0078003, + "balance_loss_clip": 1.0398376, + "balance_loss_mlp": 1.00078237, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 1.6564256894992913, + "language_loss": 0.58146012, + "learning_rate": 6.736089316777684e-07, + "loss": 0.60031146, + "num_input_tokens_seen": 265028790, + "step": 12289, + "time_per_iteration": 4.274630308151245 + }, + { + "auxiliary_loss_clip": 0.01029276, + "auxiliary_loss_mlp": 0.00753319, + "balance_loss_clip": 1.0054903, + "balance_loss_mlp": 1.00031841, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.6462106989061122, + "language_loss": 0.49264354, + "learning_rate": 6.733174657205287e-07, + "loss": 0.5104695, + "num_input_tokens_seen": 265096660, + "step": 12290, + "time_per_iteration": 3.141381025314331 + }, + { + "auxiliary_loss_clip": 0.01098307, + "auxiliary_loss_mlp": 0.01034273, + "balance_loss_clip": 1.03850222, + "balance_loss_mlp": 1.02061164, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 2.9380395154976786, + "language_loss": 0.67180669, + "learning_rate": 6.730260500712237e-07, + "loss": 0.69313246, + "num_input_tokens_seen": 265116375, + "step": 12291, + "time_per_iteration": 2.4989826679229736 + }, + { + "auxiliary_loss_clip": 0.01003153, + "auxiliary_loss_mlp": 0.01004812, + "balance_loss_clip": 1.01811898, + "balance_loss_mlp": 1.0035727, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9959786990690906, + "language_loss": 0.60810703, + "learning_rate": 6.727346847409052e-07, + "loss": 0.6281867, + "num_input_tokens_seen": 265161230, + "step": 12292, + "time_per_iteration": 2.7831907272338867 + }, + { + "auxiliary_loss_clip": 0.01066015, + "auxiliary_loss_mlp": 0.01034237, + "balance_loss_clip": 1.03517437, + "balance_loss_mlp": 1.02236378, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 2.031626465820408, + "language_loss": 0.67232716, + "learning_rate": 6.724433697406191e-07, + "loss": 0.69332969, + "num_input_tokens_seen": 265182515, + "step": 12293, + "time_per_iteration": 2.6863386631011963 + }, + { + "auxiliary_loss_clip": 0.01101401, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.03692174, + "balance_loss_mlp": 1.01585829, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 1.9418811712799435, + "language_loss": 0.83985221, + "learning_rate": 6.721521050814134e-07, + "loss": 0.86115408, + "num_input_tokens_seen": 265198160, + "step": 12294, + "time_per_iteration": 2.430220365524292 + }, + { + "auxiliary_loss_clip": 0.01076298, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.03553593, + "balance_loss_mlp": 1.02073884, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 1.5903173043416332, + "language_loss": 0.7322849, + "learning_rate": 6.718608907743337e-07, + "loss": 0.75338817, + "num_input_tokens_seen": 265218480, + "step": 12295, + "time_per_iteration": 2.629918336868286 + }, + { + "auxiliary_loss_clip": 0.01100639, + "auxiliary_loss_mlp": 0.01040557, + "balance_loss_clip": 1.03925669, + "balance_loss_mlp": 1.02851701, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 1.6582362708249334, + "language_loss": 0.78773177, + "learning_rate": 6.715697268304215e-07, + "loss": 0.80914378, + "num_input_tokens_seen": 265240165, + "step": 12296, + "time_per_iteration": 2.56729793548584 + }, + { + "auxiliary_loss_clip": 0.01112225, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.03836882, + "balance_loss_mlp": 1.01840508, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 1.99117538339588, + "language_loss": 0.66907769, + "learning_rate": 6.712786132607182e-07, + "loss": 0.69051981, + "num_input_tokens_seen": 265263295, + "step": 12297, + "time_per_iteration": 2.5791029930114746 + }, + { + "auxiliary_loss_clip": 0.01091862, + "auxiliary_loss_mlp": 0.01038771, + "balance_loss_clip": 1.04116631, + "balance_loss_mlp": 1.02535379, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 1.6109275885584315, + "language_loss": 0.68604362, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70734996, + "num_input_tokens_seen": 265282740, + "step": 12298, + "time_per_iteration": 2.578235149383545 + }, + { + "auxiliary_loss_clip": 0.01085722, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.03615785, + "balance_loss_mlp": 1.02134728, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 1.9023659506966168, + "language_loss": 0.74842769, + "learning_rate": 6.706965372880946e-07, + "loss": 0.76962256, + "num_input_tokens_seen": 265300175, + "step": 12299, + "time_per_iteration": 2.492908239364624 + }, + { + "auxiliary_loss_clip": 0.01018734, + "auxiliary_loss_mlp": 0.01004511, + "balance_loss_clip": 1.01857889, + "balance_loss_mlp": 1.00324178, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7251994716068694, + "language_loss": 0.60792804, + "learning_rate": 6.704055749072455e-07, + "loss": 0.62816048, + "num_input_tokens_seen": 265363275, + "step": 12300, + "time_per_iteration": 3.1543400287628174 + }, + { + "auxiliary_loss_clip": 0.01084792, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.03812766, + "balance_loss_mlp": 1.02203226, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 4.799245647340685, + "language_loss": 0.80040073, + "learning_rate": 6.7011466294475e-07, + "loss": 0.8215965, + "num_input_tokens_seen": 265382935, + "step": 12301, + "time_per_iteration": 2.4877922534942627 + }, + { + "auxiliary_loss_clip": 0.01110626, + "auxiliary_loss_mlp": 0.01027664, + "balance_loss_clip": 1.03793311, + "balance_loss_mlp": 1.01576698, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.6263159892085757, + "language_loss": 0.7311734, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75255626, + "num_input_tokens_seen": 265403245, + "step": 12302, + "time_per_iteration": 2.4637699127197266 + }, + { + "auxiliary_loss_clip": 0.01112128, + "auxiliary_loss_mlp": 0.0104017, + "balance_loss_clip": 1.03738713, + "balance_loss_mlp": 1.02757597, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 2.1548066458741286, + "language_loss": 0.74029827, + "learning_rate": 6.695329903189451e-07, + "loss": 0.76182127, + "num_input_tokens_seen": 265423105, + "step": 12303, + "time_per_iteration": 2.4748520851135254 + }, + { + "auxiliary_loss_clip": 0.01110125, + "auxiliary_loss_mlp": 0.01029825, + "balance_loss_clip": 1.03811502, + "balance_loss_mlp": 1.01784468, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.7121509770854826, + "language_loss": 0.54102737, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56242687, + "num_input_tokens_seen": 265443445, + "step": 12304, + "time_per_iteration": 2.463301181793213 + }, + { + "auxiliary_loss_clip": 0.01090172, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.03627086, + "balance_loss_mlp": 1.02202582, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 2.515669310827455, + "language_loss": 0.84470499, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86595368, + "num_input_tokens_seen": 265462085, + "step": 12305, + "time_per_iteration": 4.055265665054321 + }, + { + "auxiliary_loss_clip": 0.01021357, + "auxiliary_loss_mlp": 0.01001936, + "balance_loss_clip": 1.02131724, + "balance_loss_mlp": 1.00064266, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.872203006989861, + "language_loss": 0.57695121, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59718418, + "num_input_tokens_seen": 265521190, + "step": 12306, + "time_per_iteration": 3.101294755935669 + }, + { + "auxiliary_loss_clip": 0.01094093, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.04006433, + "balance_loss_mlp": 1.02071452, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 1.9871411331795206, + "language_loss": 0.81646562, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83774257, + "num_input_tokens_seen": 265539705, + "step": 12307, + "time_per_iteration": 2.496908187866211 + }, + { + "auxiliary_loss_clip": 0.01099511, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.04422855, + "balance_loss_mlp": 1.01874924, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 2.1667335261313787, + "language_loss": 0.69883156, + "learning_rate": 6.680796918475893e-07, + "loss": 0.72013414, + "num_input_tokens_seen": 265555855, + "step": 12308, + "time_per_iteration": 2.4970972537994385 + }, + { + "auxiliary_loss_clip": 0.01079426, + "auxiliary_loss_mlp": 0.01027805, + "balance_loss_clip": 1.03553975, + "balance_loss_mlp": 1.01583064, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 3.354159602495616, + "language_loss": 0.8130216, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83409393, + "num_input_tokens_seen": 265575455, + "step": 12309, + "time_per_iteration": 2.5288002490997314 + }, + { + "auxiliary_loss_clip": 0.0109999, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.03732538, + "balance_loss_mlp": 1.02073598, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 1.82547083820188, + "language_loss": 0.73109847, + "learning_rate": 6.674987259277692e-07, + "loss": 0.75243831, + "num_input_tokens_seen": 265595250, + "step": 12310, + "time_per_iteration": 2.494011163711548 + }, + { + "auxiliary_loss_clip": 0.01076434, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.03583741, + "balance_loss_mlp": 1.02520895, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.3788710718374455, + "language_loss": 0.88369787, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90485001, + "num_input_tokens_seen": 265606945, + "step": 12311, + "time_per_iteration": 2.515963315963745 + }, + { + "auxiliary_loss_clip": 0.0105269, + "auxiliary_loss_mlp": 0.01027738, + "balance_loss_clip": 1.03575838, + "balance_loss_mlp": 1.01610911, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 1.789335330467914, + "language_loss": 0.8033973, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82420158, + "num_input_tokens_seen": 265626115, + "step": 12312, + "time_per_iteration": 2.63482666015625 + }, + { + "auxiliary_loss_clip": 0.0106466, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.03210545, + "balance_loss_mlp": 1.02170849, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 2.0539606982916485, + "language_loss": 0.7842797, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80526769, + "num_input_tokens_seen": 265646520, + "step": 12313, + "time_per_iteration": 2.5952208042144775 + }, + { + "auxiliary_loss_clip": 0.01064563, + "auxiliary_loss_mlp": 0.0103874, + "balance_loss_clip": 1.03649402, + "balance_loss_mlp": 1.02506697, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 2.745401656449902, + "language_loss": 0.78559643, + "learning_rate": 6.663374005191937e-07, + "loss": 0.80662948, + "num_input_tokens_seen": 265661875, + "step": 12314, + "time_per_iteration": 2.541896343231201 + }, + { + "auxiliary_loss_clip": 0.01018399, + "auxiliary_loss_mlp": 0.01000034, + "balance_loss_clip": 1.00490594, + "balance_loss_mlp": 0.99892503, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.8240045686263685, + "language_loss": 0.55171341, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57189769, + "num_input_tokens_seen": 265721255, + "step": 12315, + "time_per_iteration": 4.6738505363464355 + }, + { + "auxiliary_loss_clip": 0.01094612, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.03804445, + "balance_loss_mlp": 1.02288198, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 1.5533798520997344, + "language_loss": 0.79767859, + "learning_rate": 6.65757041206591e-07, + "loss": 0.81897599, + "num_input_tokens_seen": 265743970, + "step": 12316, + "time_per_iteration": 2.571213960647583 + }, + { + "auxiliary_loss_clip": 0.01098816, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.03638375, + "balance_loss_mlp": 1.01907039, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 1.7459088564281522, + "language_loss": 0.74816811, + "learning_rate": 6.654669374367275e-07, + "loss": 0.76947343, + "num_input_tokens_seen": 265760890, + "step": 12317, + "time_per_iteration": 2.431234836578369 + }, + { + "auxiliary_loss_clip": 0.01078663, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.03546977, + "balance_loss_mlp": 1.02189231, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 1.5949539047477306, + "language_loss": 0.81699228, + "learning_rate": 6.651768842724917e-07, + "loss": 0.83812034, + "num_input_tokens_seen": 265779600, + "step": 12318, + "time_per_iteration": 3.953974723815918 + }, + { + "auxiliary_loss_clip": 0.01087107, + "auxiliary_loss_mlp": 0.01034373, + "balance_loss_clip": 1.03386831, + "balance_loss_mlp": 1.02122402, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 1.7813735320693114, + "language_loss": 0.76512241, + "learning_rate": 6.648868817248827e-07, + "loss": 0.78633714, + "num_input_tokens_seen": 265797030, + "step": 12319, + "time_per_iteration": 2.470170497894287 + }, + { + "auxiliary_loss_clip": 0.01082193, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.0352813, + "balance_loss_mlp": 1.02388108, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 1.8446151173914547, + "language_loss": 0.63579559, + "learning_rate": 6.64596929804897e-07, + "loss": 0.65697515, + "num_input_tokens_seen": 265815055, + "step": 12320, + "time_per_iteration": 2.4593424797058105 + }, + { + "auxiliary_loss_clip": 0.01103786, + "auxiliary_loss_mlp": 0.0103966, + "balance_loss_clip": 1.03891969, + "balance_loss_mlp": 1.02661228, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 2.698000281807166, + "language_loss": 0.82928395, + "learning_rate": 6.643070285235288e-07, + "loss": 0.85071838, + "num_input_tokens_seen": 265828480, + "step": 12321, + "time_per_iteration": 2.436619758605957 + }, + { + "auxiliary_loss_clip": 0.01096626, + "auxiliary_loss_mlp": 0.01048672, + "balance_loss_clip": 1.03936434, + "balance_loss_mlp": 1.0345695, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 3.4690971156677772, + "language_loss": 0.71587002, + "learning_rate": 6.640171778917727e-07, + "loss": 0.73732305, + "num_input_tokens_seen": 265845825, + "step": 12322, + "time_per_iteration": 2.493717670440674 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.00778507, + "balance_loss_clip": 1.03702545, + "balance_loss_mlp": 1.00089204, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 1.6815286590964411, + "language_loss": 0.64106578, + "learning_rate": 6.637273779206183e-07, + "loss": 0.65985978, + "num_input_tokens_seen": 265866335, + "step": 12323, + "time_per_iteration": 2.5169360637664795 + }, + { + "auxiliary_loss_clip": 0.01075946, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.03418911, + "balance_loss_mlp": 1.02007055, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.393098138627791, + "language_loss": 0.75937688, + "learning_rate": 6.634376286210559e-07, + "loss": 0.78046501, + "num_input_tokens_seen": 265888945, + "step": 12324, + "time_per_iteration": 2.634493112564087 + }, + { + "auxiliary_loss_clip": 0.01087082, + "auxiliary_loss_mlp": 0.01028453, + "balance_loss_clip": 1.04372334, + "balance_loss_mlp": 1.01568604, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 1.7572352047010409, + "language_loss": 0.74898505, + "learning_rate": 6.63147930004073e-07, + "loss": 0.77014041, + "num_input_tokens_seen": 265908030, + "step": 12325, + "time_per_iteration": 2.48297381401062 + }, + { + "auxiliary_loss_clip": 0.01073605, + "auxiliary_loss_mlp": 0.01036345, + "balance_loss_clip": 1.03747368, + "balance_loss_mlp": 1.02305293, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 2.972997964571181, + "language_loss": 0.68695462, + "learning_rate": 6.628582820806545e-07, + "loss": 0.70805413, + "num_input_tokens_seen": 265927030, + "step": 12326, + "time_per_iteration": 2.6045382022857666 + }, + { + "auxiliary_loss_clip": 0.01073021, + "auxiliary_loss_mlp": 0.01028723, + "balance_loss_clip": 1.03725874, + "balance_loss_mlp": 1.01647961, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 2.0076192320687127, + "language_loss": 0.89354837, + "learning_rate": 6.625686848617835e-07, + "loss": 0.9145658, + "num_input_tokens_seen": 265945490, + "step": 12327, + "time_per_iteration": 2.581613302230835 + }, + { + "auxiliary_loss_clip": 0.01112038, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.03856325, + "balance_loss_mlp": 1.0203979, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 1.7801491746716118, + "language_loss": 0.85440964, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87586153, + "num_input_tokens_seen": 265963265, + "step": 12328, + "time_per_iteration": 3.987492322921753 + }, + { + "auxiliary_loss_clip": 0.01099844, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.03832924, + "balance_loss_mlp": 1.01686513, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.8011737719178371, + "language_loss": 0.66971588, + "learning_rate": 6.619896425816103e-07, + "loss": 0.69102049, + "num_input_tokens_seen": 265982270, + "step": 12329, + "time_per_iteration": 2.479280471801758 + }, + { + "auxiliary_loss_clip": 0.01084056, + "auxiliary_loss_mlp": 0.01044832, + "balance_loss_clip": 1.03954291, + "balance_loss_mlp": 1.03039575, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.683986404841615, + "language_loss": 0.66762638, + "learning_rate": 6.617001975422647e-07, + "loss": 0.68891525, + "num_input_tokens_seen": 266003835, + "step": 12330, + "time_per_iteration": 2.609487771987915 + }, + { + "auxiliary_loss_clip": 0.01086391, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.04637671, + "balance_loss_mlp": 1.01818132, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 2.2884774413996323, + "language_loss": 0.85331357, + "learning_rate": 6.614108032513823e-07, + "loss": 0.8745029, + "num_input_tokens_seen": 266021595, + "step": 12331, + "time_per_iteration": 2.5391428470611572 + }, + { + "auxiliary_loss_clip": 0.01052219, + "auxiliary_loss_mlp": 0.01029177, + "balance_loss_clip": 1.0378902, + "balance_loss_mlp": 1.01588535, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 1.9527750139762623, + "language_loss": 0.69225377, + "learning_rate": 6.611214597199364e-07, + "loss": 0.71306765, + "num_input_tokens_seen": 266039860, + "step": 12332, + "time_per_iteration": 2.7513020038604736 + }, + { + "auxiliary_loss_clip": 0.01113713, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.03945434, + "balance_loss_mlp": 1.02564454, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 1.8100899170974654, + "language_loss": 0.63044095, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65197265, + "num_input_tokens_seen": 266058050, + "step": 12333, + "time_per_iteration": 2.8101954460144043 + }, + { + "auxiliary_loss_clip": 0.01089301, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.04030764, + "balance_loss_mlp": 1.02196765, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 2.094722865436014, + "language_loss": 0.71287096, + "learning_rate": 6.605429249792387e-07, + "loss": 0.7341013, + "num_input_tokens_seen": 266078060, + "step": 12334, + "time_per_iteration": 2.5717265605926514 + }, + { + "auxiliary_loss_clip": 0.01067148, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.0368309, + "balance_loss_mlp": 1.01624143, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.7661083705866762, + "language_loss": 0.8218441, + "learning_rate": 6.602537337919257e-07, + "loss": 0.8427999, + "num_input_tokens_seen": 266097110, + "step": 12335, + "time_per_iteration": 2.5838565826416016 + }, + { + "auxiliary_loss_clip": 0.01112996, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.03856659, + "balance_loss_mlp": 1.02106047, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 2.5696492863089415, + "language_loss": 0.74444705, + "learning_rate": 6.599645934079259e-07, + "loss": 0.76593208, + "num_input_tokens_seen": 266110870, + "step": 12336, + "time_per_iteration": 2.388240098953247 + }, + { + "auxiliary_loss_clip": 0.01070683, + "auxiliary_loss_mlp": 0.01030946, + "balance_loss_clip": 1.03595257, + "balance_loss_mlp": 1.01779139, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 1.8856561037227406, + "language_loss": 0.73807687, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75909317, + "num_input_tokens_seen": 266127845, + "step": 12337, + "time_per_iteration": 2.550819158554077 + }, + { + "auxiliary_loss_clip": 0.01089064, + "auxiliary_loss_mlp": 0.01035501, + "balance_loss_clip": 1.04097974, + "balance_loss_mlp": 1.02319312, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 1.5984610468362213, + "language_loss": 0.76202536, + "learning_rate": 6.593864650937186e-07, + "loss": 0.78327101, + "num_input_tokens_seen": 266145400, + "step": 12338, + "time_per_iteration": 2.518570899963379 + }, + { + "auxiliary_loss_clip": 0.01100109, + "auxiliary_loss_mlp": 0.01032177, + "balance_loss_clip": 1.03704238, + "balance_loss_mlp": 1.02066112, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.7210521146805486, + "language_loss": 0.73057842, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75190133, + "num_input_tokens_seen": 266164430, + "step": 12339, + "time_per_iteration": 2.4890317916870117 + }, + { + "auxiliary_loss_clip": 0.01092207, + "auxiliary_loss_mlp": 0.01027795, + "balance_loss_clip": 1.03739905, + "balance_loss_mlp": 1.01508176, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 1.7692497203386264, + "language_loss": 0.79743701, + "learning_rate": 6.588085401243077e-07, + "loss": 0.81863701, + "num_input_tokens_seen": 266183855, + "step": 12340, + "time_per_iteration": 2.5147736072540283 + }, + { + "auxiliary_loss_clip": 0.01070952, + "auxiliary_loss_mlp": 0.01036284, + "balance_loss_clip": 1.03301823, + "balance_loss_mlp": 1.02342701, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 1.739135865652589, + "language_loss": 0.7569232, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77799559, + "num_input_tokens_seen": 266202085, + "step": 12341, + "time_per_iteration": 2.4833412170410156 + }, + { + "auxiliary_loss_clip": 0.01082338, + "auxiliary_loss_mlp": 0.01037964, + "balance_loss_clip": 1.03706753, + "balance_loss_mlp": 1.02403474, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.4308052597391865, + "language_loss": 0.7993027, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82050574, + "num_input_tokens_seen": 266223445, + "step": 12342, + "time_per_iteration": 2.542905330657959 + }, + { + "auxiliary_loss_clip": 0.01077567, + "auxiliary_loss_mlp": 0.01031005, + "balance_loss_clip": 1.03535795, + "balance_loss_mlp": 1.01845217, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 1.6622143266095324, + "language_loss": 0.77202678, + "learning_rate": 6.57942034133433e-07, + "loss": 0.7931124, + "num_input_tokens_seen": 266246575, + "step": 12343, + "time_per_iteration": 2.928952693939209 + }, + { + "auxiliary_loss_clip": 0.01084727, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.03277493, + "balance_loss_mlp": 1.01894593, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 1.8536699577790043, + "language_loss": 0.67486453, + "learning_rate": 6.576533005704843e-07, + "loss": 0.69603109, + "num_input_tokens_seen": 266266055, + "step": 12344, + "time_per_iteration": 3.9667184352874756 + }, + { + "auxiliary_loss_clip": 0.01068789, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.03935564, + "balance_loss_mlp": 1.01952457, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.639585638805019, + "language_loss": 0.81334257, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83435768, + "num_input_tokens_seen": 266282240, + "step": 12345, + "time_per_iteration": 2.5225536823272705 + }, + { + "auxiliary_loss_clip": 0.01074484, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.03377581, + "balance_loss_mlp": 1.0222249, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 1.9803199581148632, + "language_loss": 0.71235681, + "learning_rate": 6.570759861612988e-07, + "loss": 0.73345792, + "num_input_tokens_seen": 266300980, + "step": 12346, + "time_per_iteration": 2.5140483379364014 + }, + { + "auxiliary_loss_clip": 0.01102267, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.03906131, + "balance_loss_mlp": 1.01773298, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 1.574804853082285, + "language_loss": 0.73606825, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75739729, + "num_input_tokens_seen": 266322215, + "step": 12347, + "time_per_iteration": 2.571449041366577 + }, + { + "auxiliary_loss_clip": 0.01091249, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.03454208, + "balance_loss_mlp": 1.01779008, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 1.7564356220089432, + "language_loss": 0.81104863, + "learning_rate": 6.564988754473642e-07, + "loss": 0.832268, + "num_input_tokens_seen": 266341600, + "step": 12348, + "time_per_iteration": 2.4785633087158203 + }, + { + "auxiliary_loss_clip": 0.01108761, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.03633285, + "balance_loss_mlp": 1.02304399, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 1.6540656988885716, + "language_loss": 0.72393537, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74537683, + "num_input_tokens_seen": 266362895, + "step": 12349, + "time_per_iteration": 2.57285737991333 + }, + { + "auxiliary_loss_clip": 0.01094395, + "auxiliary_loss_mlp": 0.01032042, + "balance_loss_clip": 1.03638339, + "balance_loss_mlp": 1.01786208, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 2.393362233890942, + "language_loss": 0.79015756, + "learning_rate": 6.559219685162165e-07, + "loss": 0.81142193, + "num_input_tokens_seen": 266384015, + "step": 12350, + "time_per_iteration": 2.55086350440979 + }, + { + "auxiliary_loss_clip": 0.01068745, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.03860855, + "balance_loss_mlp": 1.02245927, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 4.186000388288125, + "language_loss": 0.75625914, + "learning_rate": 6.556335914965343e-07, + "loss": 0.77728951, + "num_input_tokens_seen": 266405990, + "step": 12351, + "time_per_iteration": 2.6848208904266357 + }, + { + "auxiliary_loss_clip": 0.01056973, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.03742003, + "balance_loss_mlp": 1.01457858, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 2.5438680514561476, + "language_loss": 0.81624568, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83708441, + "num_input_tokens_seen": 266424260, + "step": 12352, + "time_per_iteration": 2.612901449203491 + }, + { + "auxiliary_loss_clip": 0.01103048, + "auxiliary_loss_mlp": 0.01037842, + "balance_loss_clip": 1.03965843, + "balance_loss_mlp": 1.02556312, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 1.9511152831909602, + "language_loss": 0.72061175, + "learning_rate": 6.550569904036307e-07, + "loss": 0.74202067, + "num_input_tokens_seen": 266444580, + "step": 12353, + "time_per_iteration": 2.502446174621582 + }, + { + "auxiliary_loss_clip": 0.01102388, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.03926563, + "balance_loss_mlp": 1.02091742, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 1.6136155634149694, + "language_loss": 0.71982592, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74117595, + "num_input_tokens_seen": 266465640, + "step": 12354, + "time_per_iteration": 4.062608480453491 + }, + { + "auxiliary_loss_clip": 0.01019303, + "auxiliary_loss_mlp": 0.01000581, + "balance_loss_clip": 1.00526237, + "balance_loss_mlp": 0.99928743, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.6909068472234501, + "language_loss": 0.59474897, + "learning_rate": 6.544805933122199e-07, + "loss": 0.6149478, + "num_input_tokens_seen": 266531950, + "step": 12355, + "time_per_iteration": 3.1580779552459717 + }, + { + "auxiliary_loss_clip": 0.01113082, + "auxiliary_loss_mlp": 0.0103054, + "balance_loss_clip": 1.03875041, + "balance_loss_mlp": 1.01761174, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 1.78233946412079, + "language_loss": 0.67342234, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69485861, + "num_input_tokens_seen": 266550665, + "step": 12356, + "time_per_iteration": 2.4234976768493652 + }, + { + "auxiliary_loss_clip": 0.01102786, + "auxiliary_loss_mlp": 0.00780151, + "balance_loss_clip": 1.03454471, + "balance_loss_mlp": 1.0006845, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 1.6927221465308986, + "language_loss": 0.7202636, + "learning_rate": 6.539044003097301e-07, + "loss": 0.73909295, + "num_input_tokens_seen": 266572455, + "step": 12357, + "time_per_iteration": 4.136470794677734 + }, + { + "auxiliary_loss_clip": 0.01087437, + "auxiliary_loss_mlp": 0.01028247, + "balance_loss_clip": 1.03767836, + "balance_loss_mlp": 1.01677346, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 1.8532221258520352, + "language_loss": 0.65475059, + "learning_rate": 6.53616380369143e-07, + "loss": 0.67590737, + "num_input_tokens_seen": 266590895, + "step": 12358, + "time_per_iteration": 2.562168836593628 + }, + { + "auxiliary_loss_clip": 0.01074626, + "auxiliary_loss_mlp": 0.01037568, + "balance_loss_clip": 1.03957224, + "balance_loss_mlp": 1.02363873, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 1.6975646661772699, + "language_loss": 0.80544257, + "learning_rate": 6.533284114835591e-07, + "loss": 0.82656455, + "num_input_tokens_seen": 266607660, + "step": 12359, + "time_per_iteration": 2.592529058456421 + }, + { + "auxiliary_loss_clip": 0.01099414, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.0365901, + "balance_loss_mlp": 1.01425552, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 1.9978542263312757, + "language_loss": 0.68160659, + "learning_rate": 6.530404936638956e-07, + "loss": 0.70286471, + "num_input_tokens_seen": 266624260, + "step": 12360, + "time_per_iteration": 2.431338310241699 + }, + { + "auxiliary_loss_clip": 0.01098749, + "auxiliary_loss_mlp": 0.00778665, + "balance_loss_clip": 1.03629255, + "balance_loss_mlp": 1.00073397, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 2.1986221148695675, + "language_loss": 0.72567791, + "learning_rate": 6.527526269210715e-07, + "loss": 0.744452, + "num_input_tokens_seen": 266644210, + "step": 12361, + "time_per_iteration": 2.5276525020599365 + }, + { + "auxiliary_loss_clip": 0.01065853, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.03454471, + "balance_loss_mlp": 1.02498102, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 2.050177697852671, + "language_loss": 0.55955184, + "learning_rate": 6.524648112660027e-07, + "loss": 0.58059824, + "num_input_tokens_seen": 266664230, + "step": 12362, + "time_per_iteration": 2.575960159301758 + }, + { + "auxiliary_loss_clip": 0.01076578, + "auxiliary_loss_mlp": 0.01033827, + "balance_loss_clip": 1.03657532, + "balance_loss_mlp": 1.02064788, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.7188129118335245, + "language_loss": 0.77368569, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79478979, + "num_input_tokens_seen": 266683270, + "step": 12363, + "time_per_iteration": 2.5415804386138916 + }, + { + "auxiliary_loss_clip": 0.01082933, + "auxiliary_loss_mlp": 0.01034336, + "balance_loss_clip": 1.03379631, + "balance_loss_mlp": 1.02224779, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 1.8221568353585695, + "language_loss": 0.78095388, + "learning_rate": 6.518893332627862e-07, + "loss": 0.80212653, + "num_input_tokens_seen": 266701235, + "step": 12364, + "time_per_iteration": 2.4900782108306885 + }, + { + "auxiliary_loss_clip": 0.01097731, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.03583407, + "balance_loss_mlp": 1.01948738, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 2.0895766450955313, + "language_loss": 0.78700709, + "learning_rate": 6.516016709364604e-07, + "loss": 0.80830097, + "num_input_tokens_seen": 266721495, + "step": 12365, + "time_per_iteration": 2.4992899894714355 + }, + { + "auxiliary_loss_clip": 0.01093767, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_clip": 1.04094744, + "balance_loss_mlp": 1.01830626, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 2.1454710199163323, + "language_loss": 0.7708323, + "learning_rate": 6.513140597415346e-07, + "loss": 0.79207742, + "num_input_tokens_seen": 266747400, + "step": 12366, + "time_per_iteration": 2.7965993881225586 + }, + { + "auxiliary_loss_clip": 0.01099017, + "auxiliary_loss_mlp": 0.01028807, + "balance_loss_clip": 1.03778315, + "balance_loss_mlp": 1.01770282, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.5105367055239693, + "language_loss": 0.7142576, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73553586, + "num_input_tokens_seen": 266767630, + "step": 12367, + "time_per_iteration": 4.067622184753418 + }, + { + "auxiliary_loss_clip": 0.01085065, + "auxiliary_loss_mlp": 0.01037501, + "balance_loss_clip": 1.03699183, + "balance_loss_mlp": 1.02506709, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 1.5039131970040707, + "language_loss": 0.7456733, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76689899, + "num_input_tokens_seen": 266788015, + "step": 12368, + "time_per_iteration": 2.626084327697754 + }, + { + "auxiliary_loss_clip": 0.01095385, + "auxiliary_loss_mlp": 0.01034044, + "balance_loss_clip": 1.03734648, + "balance_loss_mlp": 1.0232321, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 1.5330852320567894, + "language_loss": 0.69327468, + "learning_rate": 6.50451533054207e-07, + "loss": 0.71456897, + "num_input_tokens_seen": 266809010, + "step": 12369, + "time_per_iteration": 2.6310179233551025 + }, + { + "auxiliary_loss_clip": 0.01089082, + "auxiliary_loss_mlp": 0.00777187, + "balance_loss_clip": 1.03797531, + "balance_loss_mlp": 1.00072241, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 2.1056986684342784, + "language_loss": 0.75641382, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77507651, + "num_input_tokens_seen": 266825390, + "step": 12370, + "time_per_iteration": 2.4749562740325928 + }, + { + "auxiliary_loss_clip": 0.0111004, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.03777289, + "balance_loss_mlp": 1.02047849, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 1.4501626281843718, + "language_loss": 0.78211391, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80354577, + "num_input_tokens_seen": 266844675, + "step": 12371, + "time_per_iteration": 2.440068483352661 + }, + { + "auxiliary_loss_clip": 0.01093496, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.03902662, + "balance_loss_mlp": 1.01449561, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 1.6560245341398065, + "language_loss": 0.69345969, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71466458, + "num_input_tokens_seen": 266865160, + "step": 12372, + "time_per_iteration": 2.562652826309204 + }, + { + "auxiliary_loss_clip": 0.01084586, + "auxiliary_loss_mlp": 0.01033739, + "balance_loss_clip": 1.03967834, + "balance_loss_mlp": 1.02079868, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 3.3804730685418907, + "language_loss": 0.74859369, + "learning_rate": 6.493022139721245e-07, + "loss": 0.76977694, + "num_input_tokens_seen": 266883285, + "step": 12373, + "time_per_iteration": 2.497469186782837 + }, + { + "auxiliary_loss_clip": 0.01065362, + "auxiliary_loss_mlp": 0.01037403, + "balance_loss_clip": 1.03813243, + "balance_loss_mlp": 1.02313948, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 1.9541877513979788, + "language_loss": 0.77577788, + "learning_rate": 6.49015012220858e-07, + "loss": 0.7968055, + "num_input_tokens_seen": 266900960, + "step": 12374, + "time_per_iteration": 2.6473982334136963 + }, + { + "auxiliary_loss_clip": 0.01051413, + "auxiliary_loss_mlp": 0.01036645, + "balance_loss_clip": 1.03490949, + "balance_loss_mlp": 1.02369297, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.2360523719963727, + "language_loss": 0.76031953, + "learning_rate": 6.487278616990774e-07, + "loss": 0.78120017, + "num_input_tokens_seen": 266917710, + "step": 12375, + "time_per_iteration": 2.6035983562469482 + }, + { + "auxiliary_loss_clip": 0.01095483, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.03560555, + "balance_loss_mlp": 1.01897168, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 1.9727592013106234, + "language_loss": 0.77516282, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79641569, + "num_input_tokens_seen": 266934220, + "step": 12376, + "time_per_iteration": 2.468600034713745 + }, + { + "auxiliary_loss_clip": 0.01072676, + "auxiliary_loss_mlp": 0.01035657, + "balance_loss_clip": 1.03162956, + "balance_loss_mlp": 1.02133965, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 1.6793452644481959, + "language_loss": 0.79553723, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81662053, + "num_input_tokens_seen": 266955210, + "step": 12377, + "time_per_iteration": 2.583315372467041 + }, + { + "auxiliary_loss_clip": 0.01104303, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.04282188, + "balance_loss_mlp": 1.01677275, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 2.0506470292173624, + "language_loss": 0.67240548, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69374895, + "num_input_tokens_seen": 266976555, + "step": 12378, + "time_per_iteration": 2.8647913932800293 + }, + { + "auxiliary_loss_clip": 0.01084409, + "auxiliary_loss_mlp": 0.01035672, + "balance_loss_clip": 1.04331994, + "balance_loss_mlp": 1.02159917, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 1.9143609772445267, + "language_loss": 0.7172274, + "learning_rate": 6.475797721245648e-07, + "loss": 0.73842824, + "num_input_tokens_seen": 266997640, + "step": 12379, + "time_per_iteration": 2.6428513526916504 + }, + { + "auxiliary_loss_clip": 0.01073681, + "auxiliary_loss_mlp": 0.00780462, + "balance_loss_clip": 1.03468037, + "balance_loss_mlp": 1.00078201, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 2.0815088141175893, + "language_loss": 0.65352958, + "learning_rate": 6.472928779135085e-07, + "loss": 0.67207104, + "num_input_tokens_seen": 267016165, + "step": 12380, + "time_per_iteration": 2.534834146499634 + }, + { + "auxiliary_loss_clip": 0.01102176, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.03889012, + "balance_loss_mlp": 1.02079403, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 2.0902627064239727, + "language_loss": 0.78437686, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80573291, + "num_input_tokens_seen": 267034075, + "step": 12381, + "time_per_iteration": 2.482743263244629 + }, + { + "auxiliary_loss_clip": 0.01073782, + "auxiliary_loss_mlp": 0.01041019, + "balance_loss_clip": 1.03760874, + "balance_loss_mlp": 1.02553976, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 2.161100860543288, + "language_loss": 0.72494614, + "learning_rate": 6.467192433866411e-07, + "loss": 0.74609411, + "num_input_tokens_seen": 267053645, + "step": 12382, + "time_per_iteration": 2.53306245803833 + }, + { + "auxiliary_loss_clip": 0.01008043, + "auxiliary_loss_mlp": 0.01003467, + "balance_loss_clip": 1.01335144, + "balance_loss_mlp": 1.00214338, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.6508843683775795, + "language_loss": 0.54600239, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56611747, + "num_input_tokens_seen": 267121830, + "step": 12383, + "time_per_iteration": 3.247413158416748 + }, + { + "auxiliary_loss_clip": 0.0109322, + "auxiliary_loss_mlp": 0.0102851, + "balance_loss_clip": 1.04147768, + "balance_loss_mlp": 1.01586819, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 1.8798369018011787, + "language_loss": 0.7570774, + "learning_rate": 6.461458141259395e-07, + "loss": 0.77829468, + "num_input_tokens_seen": 267141145, + "step": 12384, + "time_per_iteration": 4.005999565124512 + }, + { + "auxiliary_loss_clip": 0.01099012, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.0369699, + "balance_loss_mlp": 1.01572061, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 2.2757546393298878, + "language_loss": 0.79149282, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81276834, + "num_input_tokens_seen": 267159280, + "step": 12385, + "time_per_iteration": 2.486154556274414 + }, + { + "auxiliary_loss_clip": 0.01078045, + "auxiliary_loss_mlp": 0.01035644, + "balance_loss_clip": 1.03687191, + "balance_loss_mlp": 1.02207172, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 2.014047554890883, + "language_loss": 0.81315458, + "learning_rate": 6.455725902183813e-07, + "loss": 0.83429158, + "num_input_tokens_seen": 267179390, + "step": 12386, + "time_per_iteration": 2.561429738998413 + }, + { + "auxiliary_loss_clip": 0.01096136, + "auxiliary_loss_mlp": 0.01035914, + "balance_loss_clip": 1.03560424, + "balance_loss_mlp": 1.02359927, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.6991578111107202, + "language_loss": 0.70800889, + "learning_rate": 6.452860552992037e-07, + "loss": 0.72932941, + "num_input_tokens_seen": 267198165, + "step": 12387, + "time_per_iteration": 2.5214316844940186 + }, + { + "auxiliary_loss_clip": 0.01080101, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.03871441, + "balance_loss_mlp": 1.01731133, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 1.9187196020713557, + "language_loss": 0.70436347, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72545081, + "num_input_tokens_seen": 267214520, + "step": 12388, + "time_per_iteration": 2.5146143436431885 + }, + { + "auxiliary_loss_clip": 0.01100403, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.03601849, + "balance_loss_mlp": 1.01789045, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 1.5715503939126534, + "language_loss": 0.84798932, + "learning_rate": 6.447131395843761e-07, + "loss": 0.86930001, + "num_input_tokens_seen": 267236555, + "step": 12389, + "time_per_iteration": 2.496915817260742 + }, + { + "auxiliary_loss_clip": 0.01070031, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.03581643, + "balance_loss_mlp": 1.01896811, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 2.0192981551048437, + "language_loss": 0.79072547, + "learning_rate": 6.444267588104526e-07, + "loss": 0.81173521, + "num_input_tokens_seen": 267254800, + "step": 12390, + "time_per_iteration": 2.599329710006714 + }, + { + "auxiliary_loss_clip": 0.01090885, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.03641427, + "balance_loss_mlp": 1.01994455, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 1.755827640649318, + "language_loss": 0.84752297, + "learning_rate": 6.441404294400014e-07, + "loss": 0.8687588, + "num_input_tokens_seen": 267274610, + "step": 12391, + "time_per_iteration": 2.5119504928588867 + }, + { + "auxiliary_loss_clip": 0.01110688, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.03733349, + "balance_loss_mlp": 1.0175705, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 1.7175023810561736, + "language_loss": 0.73675466, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75815678, + "num_input_tokens_seen": 267292600, + "step": 12392, + "time_per_iteration": 2.441067934036255 + }, + { + "auxiliary_loss_clip": 0.01097787, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.03814864, + "balance_loss_mlp": 1.0224601, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 1.5380709765260259, + "language_loss": 0.77119905, + "learning_rate": 6.435679249529487e-07, + "loss": 0.79251975, + "num_input_tokens_seen": 267311295, + "step": 12393, + "time_per_iteration": 3.9532315731048584 + }, + { + "auxiliary_loss_clip": 0.01099475, + "auxiliary_loss_mlp": 0.01035051, + "balance_loss_clip": 1.03800046, + "balance_loss_mlp": 1.02142525, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 1.879187375376356, + "language_loss": 0.7246145, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74595976, + "num_input_tokens_seen": 267328390, + "step": 12394, + "time_per_iteration": 2.4745326042175293 + }, + { + "auxiliary_loss_clip": 0.01059173, + "auxiliary_loss_mlp": 0.00777553, + "balance_loss_clip": 1.04260683, + "balance_loss_mlp": 1.00058317, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 1.8331827120952484, + "language_loss": 0.81726366, + "learning_rate": 6.429956262100535e-07, + "loss": 0.83563089, + "num_input_tokens_seen": 267348185, + "step": 12395, + "time_per_iteration": 2.6326138973236084 + }, + { + "auxiliary_loss_clip": 0.01103349, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.03792667, + "balance_loss_mlp": 1.01963937, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 2.1325044652820724, + "language_loss": 0.71117532, + "learning_rate": 6.427095540197937e-07, + "loss": 0.73253322, + "num_input_tokens_seen": 267367010, + "step": 12396, + "time_per_iteration": 3.9156363010406494 + }, + { + "auxiliary_loss_clip": 0.0107453, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.03962505, + "balance_loss_mlp": 1.02123404, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 1.8893623481238133, + "language_loss": 0.67623436, + "learning_rate": 6.424235332981245e-07, + "loss": 0.69731629, + "num_input_tokens_seen": 267386605, + "step": 12397, + "time_per_iteration": 2.638795852661133 + }, + { + "auxiliary_loss_clip": 0.01108364, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.03625178, + "balance_loss_mlp": 1.0248214, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 2.0302654809965754, + "language_loss": 0.76531661, + "learning_rate": 6.421375640558908e-07, + "loss": 0.78677398, + "num_input_tokens_seen": 267404135, + "step": 12398, + "time_per_iteration": 2.440140962600708 + }, + { + "auxiliary_loss_clip": 0.01097778, + "auxiliary_loss_mlp": 0.01025803, + "balance_loss_clip": 1.0373404, + "balance_loss_mlp": 1.01329827, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 1.6364354685763576, + "language_loss": 0.77815241, + "learning_rate": 6.418516463039363e-07, + "loss": 0.79938829, + "num_input_tokens_seen": 267423120, + "step": 12399, + "time_per_iteration": 2.5112950801849365 + }, + { + "auxiliary_loss_clip": 0.01085772, + "auxiliary_loss_mlp": 0.0104067, + "balance_loss_clip": 1.03397012, + "balance_loss_mlp": 1.02901721, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 2.242833619115987, + "language_loss": 0.73880863, + "learning_rate": 6.415657800531038e-07, + "loss": 0.76007301, + "num_input_tokens_seen": 267441250, + "step": 12400, + "time_per_iteration": 2.492032527923584 + }, + { + "auxiliary_loss_clip": 0.01096109, + "auxiliary_loss_mlp": 0.01031957, + "balance_loss_clip": 1.03519988, + "balance_loss_mlp": 1.02036989, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 1.8738913895496654, + "language_loss": 0.81680441, + "learning_rate": 6.412799653142327e-07, + "loss": 0.83808506, + "num_input_tokens_seen": 267462820, + "step": 12401, + "time_per_iteration": 2.554194450378418 + }, + { + "auxiliary_loss_clip": 0.01078809, + "auxiliary_loss_mlp": 0.01032481, + "balance_loss_clip": 1.03927457, + "balance_loss_mlp": 1.02099502, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 3.2941733905433455, + "language_loss": 0.6492126, + "learning_rate": 6.409942020981611e-07, + "loss": 0.67032552, + "num_input_tokens_seen": 267483065, + "step": 12402, + "time_per_iteration": 2.5706779956817627 + }, + { + "auxiliary_loss_clip": 0.0107792, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.03420985, + "balance_loss_mlp": 1.01740468, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 1.6423254083642056, + "language_loss": 0.7335372, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75460428, + "num_input_tokens_seen": 267504825, + "step": 12403, + "time_per_iteration": 2.7107644081115723 + }, + { + "auxiliary_loss_clip": 0.01008074, + "auxiliary_loss_mlp": 0.01005024, + "balance_loss_clip": 1.01441956, + "balance_loss_mlp": 1.00368333, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.8291292242889238, + "language_loss": 0.58794963, + "learning_rate": 6.404228302777621e-07, + "loss": 0.60808063, + "num_input_tokens_seen": 267559260, + "step": 12404, + "time_per_iteration": 2.943676233291626 + }, + { + "auxiliary_loss_clip": 0.01109925, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.0364182, + "balance_loss_mlp": 1.01931751, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 1.8364772635231212, + "language_loss": 0.7767601, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79816633, + "num_input_tokens_seen": 267578720, + "step": 12405, + "time_per_iteration": 2.431344509124756 + }, + { + "auxiliary_loss_clip": 0.01084578, + "auxiliary_loss_mlp": 0.01038684, + "balance_loss_clip": 1.03367817, + "balance_loss_mlp": 1.02597666, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 1.6940382717505609, + "language_loss": 0.69442201, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71565461, + "num_input_tokens_seen": 267598250, + "step": 12406, + "time_per_iteration": 2.503938674926758 + }, + { + "auxiliary_loss_clip": 0.01047197, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.0359236, + "balance_loss_mlp": 1.022596, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 2.51776684948797, + "language_loss": 0.64898878, + "learning_rate": 6.39566159239002e-07, + "loss": 0.66982156, + "num_input_tokens_seen": 267615430, + "step": 12407, + "time_per_iteration": 4.1107378005981445 + }, + { + "auxiliary_loss_clip": 0.0107339, + "auxiliary_loss_mlp": 0.01034401, + "balance_loss_clip": 1.03819513, + "balance_loss_mlp": 1.020787, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 1.9897092134208278, + "language_loss": 0.72155833, + "learning_rate": 6.392807053872212e-07, + "loss": 0.7426362, + "num_input_tokens_seen": 267635075, + "step": 12408, + "time_per_iteration": 2.58467435836792 + }, + { + "auxiliary_loss_clip": 0.01106316, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.0389483, + "balance_loss_mlp": 1.02030385, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 15.500163047731748, + "language_loss": 0.72769082, + "learning_rate": 6.38995303134053e-07, + "loss": 0.74909258, + "num_input_tokens_seen": 267654105, + "step": 12409, + "time_per_iteration": 2.52128529548645 + }, + { + "auxiliary_loss_clip": 0.01096687, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.03583598, + "balance_loss_mlp": 1.01962209, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 1.616055802920365, + "language_loss": 0.65809923, + "learning_rate": 6.38709952490319e-07, + "loss": 0.67937517, + "num_input_tokens_seen": 267673090, + "step": 12410, + "time_per_iteration": 2.5753166675567627 + }, + { + "auxiliary_loss_clip": 0.01100118, + "auxiliary_loss_mlp": 0.00776698, + "balance_loss_clip": 1.04317427, + "balance_loss_mlp": 1.00067639, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 2.193789625446016, + "language_loss": 0.84438199, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86315012, + "num_input_tokens_seen": 267690605, + "step": 12411, + "time_per_iteration": 2.4964945316314697 + }, + { + "auxiliary_loss_clip": 0.0108052, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.03973222, + "balance_loss_mlp": 1.01573467, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 1.5379184773352192, + "language_loss": 0.77758431, + "learning_rate": 6.381394060744339e-07, + "loss": 0.7986722, + "num_input_tokens_seen": 267710540, + "step": 12412, + "time_per_iteration": 2.6049771308898926 + }, + { + "auxiliary_loss_clip": 0.01070662, + "auxiliary_loss_mlp": 0.01037725, + "balance_loss_clip": 1.03224778, + "balance_loss_mlp": 1.02491021, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 5.46463053751631, + "language_loss": 0.62457263, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64565653, + "num_input_tokens_seen": 267730780, + "step": 12413, + "time_per_iteration": 2.655902147293091 + }, + { + "auxiliary_loss_clip": 0.01022288, + "auxiliary_loss_mlp": 0.00753401, + "balance_loss_clip": 1.00685346, + "balance_loss_mlp": 1.00020981, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7203805087659254, + "language_loss": 0.54895186, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56670868, + "num_input_tokens_seen": 267794240, + "step": 12414, + "time_per_iteration": 3.0890746116638184 + }, + { + "auxiliary_loss_clip": 0.01080908, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.03238237, + "balance_loss_mlp": 1.01847577, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 1.6005786764751784, + "language_loss": 0.55243337, + "learning_rate": 6.372839737918154e-07, + "loss": 0.57355821, + "num_input_tokens_seen": 267817190, + "step": 12415, + "time_per_iteration": 2.6285791397094727 + }, + { + "auxiliary_loss_clip": 0.01049796, + "auxiliary_loss_mlp": 0.01040436, + "balance_loss_clip": 1.03457069, + "balance_loss_mlp": 1.02571321, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 1.8003554425734896, + "language_loss": 0.74992192, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77082419, + "num_input_tokens_seen": 267836245, + "step": 12416, + "time_per_iteration": 2.660116195678711 + }, + { + "auxiliary_loss_clip": 0.01063002, + "auxiliary_loss_mlp": 0.0104082, + "balance_loss_clip": 1.0328722, + "balance_loss_mlp": 1.02733147, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 1.6129370978570343, + "language_loss": 0.69376349, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71480167, + "num_input_tokens_seen": 267858310, + "step": 12417, + "time_per_iteration": 2.784179925918579 + }, + { + "auxiliary_loss_clip": 0.01083917, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.03970695, + "balance_loss_mlp": 1.01755667, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 1.792490906058003, + "language_loss": 0.73344398, + "learning_rate": 6.364290065781392e-07, + "loss": 0.75458068, + "num_input_tokens_seen": 267876345, + "step": 12418, + "time_per_iteration": 2.5595943927764893 + }, + { + "auxiliary_loss_clip": 0.01100981, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.0392102, + "balance_loss_mlp": 1.01840448, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 1.5822574796263145, + "language_loss": 0.69131082, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71262151, + "num_input_tokens_seen": 267896740, + "step": 12419, + "time_per_iteration": 2.494347095489502 + }, + { + "auxiliary_loss_clip": 0.01104305, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.03562617, + "balance_loss_mlp": 1.02208567, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 1.6952805901355046, + "language_loss": 0.74741352, + "learning_rate": 6.358592869514216e-07, + "loss": 0.7687974, + "num_input_tokens_seen": 267914765, + "step": 12420, + "time_per_iteration": 2.4570930004119873 + }, + { + "auxiliary_loss_clip": 0.01104847, + "auxiliary_loss_mlp": 0.01028472, + "balance_loss_clip": 1.03938985, + "balance_loss_mlp": 1.01610398, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 1.7024225024356683, + "language_loss": 0.67272854, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69406176, + "num_input_tokens_seen": 267934085, + "step": 12421, + "time_per_iteration": 2.445549964904785 + }, + { + "auxiliary_loss_clip": 0.01097061, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.04597068, + "balance_loss_mlp": 1.02138984, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 1.7045852255896554, + "language_loss": 0.72277176, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74409544, + "num_input_tokens_seen": 267955170, + "step": 12422, + "time_per_iteration": 2.5554864406585693 + }, + { + "auxiliary_loss_clip": 0.01074057, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.03672576, + "balance_loss_mlp": 1.02047884, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 2.1342981573233533, + "language_loss": 0.74866593, + "learning_rate": 6.350050955009796e-07, + "loss": 0.76974243, + "num_input_tokens_seen": 267974980, + "step": 12423, + "time_per_iteration": 4.066561460494995 + }, + { + "auxiliary_loss_clip": 0.01097394, + "auxiliary_loss_mlp": 0.01024824, + "balance_loss_clip": 1.03652978, + "balance_loss_mlp": 1.01335061, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.39055257613934, + "language_loss": 0.67432851, + "learning_rate": 6.347204685245929e-07, + "loss": 0.69555068, + "num_input_tokens_seen": 267994985, + "step": 12424, + "time_per_iteration": 2.5049471855163574 + }, + { + "auxiliary_loss_clip": 0.01105971, + "auxiliary_loss_mlp": 0.01040567, + "balance_loss_clip": 1.03943825, + "balance_loss_mlp": 1.02750146, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 2.066984796402006, + "language_loss": 0.74195075, + "learning_rate": 6.344358933197418e-07, + "loss": 0.76341617, + "num_input_tokens_seen": 268014985, + "step": 12425, + "time_per_iteration": 2.6012966632843018 + }, + { + "auxiliary_loss_clip": 0.01075657, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.03744841, + "balance_loss_mlp": 1.02221251, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 1.8012566308565947, + "language_loss": 0.69763708, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71875417, + "num_input_tokens_seen": 268034395, + "step": 12426, + "time_per_iteration": 2.552302837371826 + }, + { + "auxiliary_loss_clip": 0.01069172, + "auxiliary_loss_mlp": 0.01030549, + "balance_loss_clip": 1.03557515, + "balance_loss_mlp": 1.01846719, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 1.5008887495088055, + "language_loss": 0.65629935, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67729652, + "num_input_tokens_seen": 268054485, + "step": 12427, + "time_per_iteration": 2.5203752517700195 + }, + { + "auxiliary_loss_clip": 0.01112028, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.03851867, + "balance_loss_mlp": 1.01741767, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.74275945568113, + "language_loss": 0.74814886, + "learning_rate": 6.335824784423118e-07, + "loss": 0.76957595, + "num_input_tokens_seen": 268072250, + "step": 12428, + "time_per_iteration": 2.4326813220977783 + }, + { + "auxiliary_loss_clip": 0.01107015, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.03918409, + "balance_loss_mlp": 1.01804662, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 1.9353599763590759, + "language_loss": 0.58048952, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60187793, + "num_input_tokens_seen": 268089840, + "step": 12429, + "time_per_iteration": 2.4609220027923584 + }, + { + "auxiliary_loss_clip": 0.0110581, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.04414821, + "balance_loss_mlp": 1.02063012, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 1.9078521312864918, + "language_loss": 0.60456628, + "learning_rate": 6.330137942461595e-07, + "loss": 0.6259619, + "num_input_tokens_seen": 268109360, + "step": 12430, + "time_per_iteration": 2.551680326461792 + }, + { + "auxiliary_loss_clip": 0.01090128, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.0358125, + "balance_loss_mlp": 1.01748824, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 1.4975169051868082, + "language_loss": 0.75768268, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77888238, + "num_input_tokens_seen": 268131840, + "step": 12431, + "time_per_iteration": 2.5662949085235596 + }, + { + "auxiliary_loss_clip": 0.01102855, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.03886724, + "balance_loss_mlp": 1.0193553, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 1.9423264839274512, + "language_loss": 0.75664973, + "learning_rate": 6.32445317395021e-07, + "loss": 0.77799892, + "num_input_tokens_seen": 268148300, + "step": 12432, + "time_per_iteration": 4.013429641723633 + }, + { + "auxiliary_loss_clip": 0.01095759, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.04358339, + "balance_loss_mlp": 1.019804, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 2.304654197805759, + "language_loss": 0.69998151, + "learning_rate": 6.321611567507787e-07, + "loss": 0.72128463, + "num_input_tokens_seen": 268166450, + "step": 12433, + "time_per_iteration": 2.517061233520508 + }, + { + "auxiliary_loss_clip": 0.01065156, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.03524351, + "balance_loss_mlp": 1.01890039, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 1.9005982399652561, + "language_loss": 0.67234826, + "learning_rate": 6.318770479751232e-07, + "loss": 0.69332373, + "num_input_tokens_seen": 268186165, + "step": 12434, + "time_per_iteration": 2.584076166152954 + }, + { + "auxiliary_loss_clip": 0.01105304, + "auxiliary_loss_mlp": 0.01028485, + "balance_loss_clip": 1.03724813, + "balance_loss_mlp": 1.0174706, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 1.4328936384842355, + "language_loss": 0.79344869, + "learning_rate": 6.315929910788263e-07, + "loss": 0.81478667, + "num_input_tokens_seen": 268208145, + "step": 12435, + "time_per_iteration": 2.5009677410125732 + }, + { + "auxiliary_loss_clip": 0.01084843, + "auxiliary_loss_mlp": 0.01027926, + "balance_loss_clip": 1.03785515, + "balance_loss_mlp": 1.01537955, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 1.822476936558607, + "language_loss": 0.6782971, + "learning_rate": 6.313089860726604e-07, + "loss": 0.69942474, + "num_input_tokens_seen": 268228345, + "step": 12436, + "time_per_iteration": 4.121602773666382 + }, + { + "auxiliary_loss_clip": 0.01088092, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.03742599, + "balance_loss_mlp": 1.01869857, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 1.5223834443753261, + "language_loss": 0.70708466, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72827864, + "num_input_tokens_seen": 268250260, + "step": 12437, + "time_per_iteration": 2.6526107788085938 + }, + { + "auxiliary_loss_clip": 0.0107206, + "auxiliary_loss_mlp": 0.01024346, + "balance_loss_clip": 1.03330326, + "balance_loss_mlp": 1.0134201, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 1.790778721823799, + "language_loss": 0.67242086, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69338489, + "num_input_tokens_seen": 268268440, + "step": 12438, + "time_per_iteration": 2.596068859100342 + }, + { + "auxiliary_loss_clip": 0.01087248, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.0349009, + "balance_loss_mlp": 1.02014208, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.6353049680151555, + "language_loss": 0.80436158, + "learning_rate": 6.304572825026344e-07, + "loss": 0.82555687, + "num_input_tokens_seen": 268285765, + "step": 12439, + "time_per_iteration": 2.467668056488037 + }, + { + "auxiliary_loss_clip": 0.01075622, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.03465044, + "balance_loss_mlp": 1.02125716, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 2.0590230790945814, + "language_loss": 0.70732999, + "learning_rate": 6.301734851646674e-07, + "loss": 0.72841984, + "num_input_tokens_seen": 268304015, + "step": 12440, + "time_per_iteration": 2.5313220024108887 + }, + { + "auxiliary_loss_clip": 0.01087841, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.03967345, + "balance_loss_mlp": 1.01987684, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.9295635646756422, + "language_loss": 0.74342096, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76461595, + "num_input_tokens_seen": 268323290, + "step": 12441, + "time_per_iteration": 2.497302293777466 + }, + { + "auxiliary_loss_clip": 0.0110159, + "auxiliary_loss_mlp": 0.00778097, + "balance_loss_clip": 1.03829265, + "balance_loss_mlp": 1.00063229, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.424359727737345, + "language_loss": 0.82845676, + "learning_rate": 6.296060463313698e-07, + "loss": 0.84725356, + "num_input_tokens_seen": 268339490, + "step": 12442, + "time_per_iteration": 2.4810070991516113 + }, + { + "auxiliary_loss_clip": 0.01059303, + "auxiliary_loss_mlp": 0.01032001, + "balance_loss_clip": 1.03762269, + "balance_loss_mlp": 1.0180887, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 2.1735963335749724, + "language_loss": 0.62347388, + "learning_rate": 6.293224048575565e-07, + "loss": 0.64438695, + "num_input_tokens_seen": 268359865, + "step": 12443, + "time_per_iteration": 2.6591737270355225 + }, + { + "auxiliary_loss_clip": 0.01074427, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.03901982, + "balance_loss_mlp": 1.01779819, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 2.954812585204027, + "language_loss": 0.71295398, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73399556, + "num_input_tokens_seen": 268377065, + "step": 12444, + "time_per_iteration": 2.552016496658325 + }, + { + "auxiliary_loss_clip": 0.01065134, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.03967571, + "balance_loss_mlp": 1.0196265, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.4929019297311001, + "language_loss": 0.69027489, + "learning_rate": 6.287552778493786e-07, + "loss": 0.71125376, + "num_input_tokens_seen": 268396935, + "step": 12445, + "time_per_iteration": 2.595855236053467 + }, + { + "auxiliary_loss_clip": 0.01097488, + "auxiliary_loss_mlp": 0.01024719, + "balance_loss_clip": 1.03676963, + "balance_loss_mlp": 1.01273251, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 1.9636118786671468, + "language_loss": 0.74155235, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76277435, + "num_input_tokens_seen": 268414460, + "step": 12446, + "time_per_iteration": 2.4444468021392822 + }, + { + "auxiliary_loss_clip": 0.01096074, + "auxiliary_loss_mlp": 0.00779203, + "balance_loss_clip": 1.03894937, + "balance_loss_mlp": 1.00075114, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 2.0955588513033434, + "language_loss": 0.73261094, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75136375, + "num_input_tokens_seen": 268432225, + "step": 12447, + "time_per_iteration": 4.037806510925293 + }, + { + "auxiliary_loss_clip": 0.01072471, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.03837478, + "balance_loss_mlp": 1.0181911, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 3.1188019643155096, + "language_loss": 0.72225106, + "learning_rate": 6.279049773470109e-07, + "loss": 0.74326873, + "num_input_tokens_seen": 268449270, + "step": 12448, + "time_per_iteration": 2.634077310562134 + }, + { + "auxiliary_loss_clip": 0.01112362, + "auxiliary_loss_mlp": 0.0103664, + "balance_loss_clip": 1.03825617, + "balance_loss_mlp": 1.02415299, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 2.525248193575339, + "language_loss": 0.73582232, + "learning_rate": 6.276216478918543e-07, + "loss": 0.75731242, + "num_input_tokens_seen": 268467250, + "step": 12449, + "time_per_iteration": 2.4940009117126465 + }, + { + "auxiliary_loss_clip": 0.01081475, + "auxiliary_loss_mlp": 0.01035379, + "balance_loss_clip": 1.04017365, + "balance_loss_mlp": 1.02193248, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 1.9870176640751442, + "language_loss": 0.61168677, + "learning_rate": 6.273383704774225e-07, + "loss": 0.6328553, + "num_input_tokens_seen": 268487270, + "step": 12450, + "time_per_iteration": 2.6021509170532227 + }, + { + "auxiliary_loss_clip": 0.01104962, + "auxiliary_loss_mlp": 0.0102582, + "balance_loss_clip": 1.03625846, + "balance_loss_mlp": 1.01423883, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 2.732948561336978, + "language_loss": 0.70352316, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72483093, + "num_input_tokens_seen": 268508020, + "step": 12451, + "time_per_iteration": 2.487609624862671 + }, + { + "auxiliary_loss_clip": 0.01105806, + "auxiliary_loss_mlp": 0.01028122, + "balance_loss_clip": 1.03833485, + "balance_loss_mlp": 1.01559949, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 2.440152963026617, + "language_loss": 0.80671728, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82805657, + "num_input_tokens_seen": 268527375, + "step": 12452, + "time_per_iteration": 2.510768175125122 + }, + { + "auxiliary_loss_clip": 0.01120776, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.04312098, + "balance_loss_mlp": 1.02249074, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 3.366939107334647, + "language_loss": 0.7160235, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73758548, + "num_input_tokens_seen": 268544870, + "step": 12453, + "time_per_iteration": 2.448838949203491 + }, + { + "auxiliary_loss_clip": 0.0109107, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.03881228, + "balance_loss_mlp": 1.02356732, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 1.6817845608493103, + "language_loss": 0.73884451, + "learning_rate": 6.262057814417517e-07, + "loss": 0.76011443, + "num_input_tokens_seen": 268564580, + "step": 12454, + "time_per_iteration": 2.5122363567352295 + }, + { + "auxiliary_loss_clip": 0.01000478, + "auxiliary_loss_mlp": 0.0100479, + "balance_loss_clip": 1.00409114, + "balance_loss_mlp": 1.00332952, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7332906877127235, + "language_loss": 0.59385478, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61390746, + "num_input_tokens_seen": 268629550, + "step": 12455, + "time_per_iteration": 3.2147216796875 + }, + { + "auxiliary_loss_clip": 0.01072924, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.03432059, + "balance_loss_mlp": 1.01577771, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 1.6697466482974548, + "language_loss": 0.79668474, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81769598, + "num_input_tokens_seen": 268646645, + "step": 12456, + "time_per_iteration": 2.5246658325195312 + }, + { + "auxiliary_loss_clip": 0.0102048, + "auxiliary_loss_mlp": 0.01000999, + "balance_loss_clip": 1.00537705, + "balance_loss_mlp": 0.99973547, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.8389442870395895, + "language_loss": 0.61476153, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63497633, + "num_input_tokens_seen": 268702275, + "step": 12457, + "time_per_iteration": 2.981950044631958 + }, + { + "auxiliary_loss_clip": 0.01098327, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.04076767, + "balance_loss_mlp": 1.02110958, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 2.024333755179961, + "language_loss": 0.67374015, + "learning_rate": 6.250740259166711e-07, + "loss": 0.69505847, + "num_input_tokens_seen": 268716265, + "step": 12458, + "time_per_iteration": 2.5271177291870117 + }, + { + "auxiliary_loss_clip": 0.01058787, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.0352354, + "balance_loss_mlp": 1.01802945, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 1.6984179591268465, + "language_loss": 0.79756981, + "learning_rate": 6.247912173519106e-07, + "loss": 0.81845897, + "num_input_tokens_seen": 268734330, + "step": 12459, + "time_per_iteration": 2.572354793548584 + }, + { + "auxiliary_loss_clip": 0.01074066, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.03543174, + "balance_loss_mlp": 1.02183259, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.6738114528241845, + "language_loss": 0.80624163, + "learning_rate": 6.245084609352043e-07, + "loss": 0.82732892, + "num_input_tokens_seen": 268753500, + "step": 12460, + "time_per_iteration": 2.5650863647460938 + }, + { + "auxiliary_loss_clip": 0.01086101, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.03749549, + "balance_loss_mlp": 1.02074003, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.8394249789401917, + "language_loss": 0.85944235, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88064837, + "num_input_tokens_seen": 268772055, + "step": 12461, + "time_per_iteration": 2.5378592014312744 + }, + { + "auxiliary_loss_clip": 0.01093387, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.03915918, + "balance_loss_mlp": 1.02306736, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 2.080720170511464, + "language_loss": 0.69863141, + "learning_rate": 6.239431045888435e-07, + "loss": 0.71991658, + "num_input_tokens_seen": 268792265, + "step": 12462, + "time_per_iteration": 2.5234317779541016 + }, + { + "auxiliary_loss_clip": 0.0111021, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.03735137, + "balance_loss_mlp": 1.01856875, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 1.8275478867925783, + "language_loss": 0.70917094, + "learning_rate": 6.236605046806267e-07, + "loss": 0.73058641, + "num_input_tokens_seen": 268812735, + "step": 12463, + "time_per_iteration": 3.9369466304779053 + }, + { + "auxiliary_loss_clip": 0.01076083, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.03630853, + "balance_loss_mlp": 1.01929116, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 1.904316772780487, + "language_loss": 0.77421761, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79528666, + "num_input_tokens_seen": 268833090, + "step": 12464, + "time_per_iteration": 2.614760160446167 + }, + { + "auxiliary_loss_clip": 0.01086619, + "auxiliary_loss_mlp": 0.01029208, + "balance_loss_clip": 1.03524363, + "balance_loss_mlp": 1.01743579, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.9166908937561868, + "language_loss": 0.78551614, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80667442, + "num_input_tokens_seen": 268851880, + "step": 12465, + "time_per_iteration": 2.517880439758301 + }, + { + "auxiliary_loss_clip": 0.01083092, + "auxiliary_loss_mlp": 0.01037148, + "balance_loss_clip": 1.03685284, + "balance_loss_mlp": 1.02286124, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.657974726195749, + "language_loss": 0.74437833, + "learning_rate": 6.22813018144422e-07, + "loss": 0.76558077, + "num_input_tokens_seen": 268867910, + "step": 12466, + "time_per_iteration": 2.521212339401245 + }, + { + "auxiliary_loss_clip": 0.01101651, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.03700113, + "balance_loss_mlp": 1.02106452, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 1.852239014968496, + "language_loss": 0.66890168, + "learning_rate": 6.22530627064209e-07, + "loss": 0.69025016, + "num_input_tokens_seen": 268887260, + "step": 12467, + "time_per_iteration": 2.4801573753356934 + }, + { + "auxiliary_loss_clip": 0.01062239, + "auxiliary_loss_mlp": 0.00780945, + "balance_loss_clip": 1.03455281, + "balance_loss_mlp": 1.00065958, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 3.667378687261668, + "language_loss": 0.76732564, + "learning_rate": 6.222482882177735e-07, + "loss": 0.78575742, + "num_input_tokens_seen": 268902520, + "step": 12468, + "time_per_iteration": 2.554455518722534 + }, + { + "auxiliary_loss_clip": 0.0107284, + "auxiliary_loss_mlp": 0.01031213, + "balance_loss_clip": 1.03886652, + "balance_loss_mlp": 1.01839173, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 2.894703314172131, + "language_loss": 0.69601583, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71705639, + "num_input_tokens_seen": 268920970, + "step": 12469, + "time_per_iteration": 2.5415239334106445 + }, + { + "auxiliary_loss_clip": 0.01091521, + "auxiliary_loss_mlp": 0.01033484, + "balance_loss_clip": 1.03783691, + "balance_loss_mlp": 1.02060914, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 2.0262177238527768, + "language_loss": 0.69282341, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71407354, + "num_input_tokens_seen": 268936600, + "step": 12470, + "time_per_iteration": 2.5489377975463867 + }, + { + "auxiliary_loss_clip": 0.01084662, + "auxiliary_loss_mlp": 0.01037421, + "balance_loss_clip": 1.03628957, + "balance_loss_mlp": 1.02296066, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 1.7370104800653725, + "language_loss": 0.7531755, + "learning_rate": 6.214015851881793e-07, + "loss": 0.77439636, + "num_input_tokens_seen": 268956560, + "step": 12471, + "time_per_iteration": 4.05262017250061 + }, + { + "auxiliary_loss_clip": 0.01088524, + "auxiliary_loss_mlp": 0.01036877, + "balance_loss_clip": 1.03734636, + "balance_loss_mlp": 1.02284038, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 2.1861367816188455, + "language_loss": 0.76793826, + "learning_rate": 6.211194553838929e-07, + "loss": 0.78919232, + "num_input_tokens_seen": 268973945, + "step": 12472, + "time_per_iteration": 2.493684768676758 + }, + { + "auxiliary_loss_clip": 0.01097068, + "auxiliary_loss_mlp": 0.0077691, + "balance_loss_clip": 1.03585935, + "balance_loss_mlp": 1.00064027, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 2.5214572739163357, + "language_loss": 0.84301269, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86175251, + "num_input_tokens_seen": 268993245, + "step": 12473, + "time_per_iteration": 2.4869866371154785 + }, + { + "auxiliary_loss_clip": 0.01085315, + "auxiliary_loss_mlp": 0.0103666, + "balance_loss_clip": 1.03791225, + "balance_loss_mlp": 1.02285576, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 2.0556342698899743, + "language_loss": 0.74005198, + "learning_rate": 6.205553526478829e-07, + "loss": 0.76127172, + "num_input_tokens_seen": 269012125, + "step": 12474, + "time_per_iteration": 2.5634422302246094 + }, + { + "auxiliary_loss_clip": 0.0108884, + "auxiliary_loss_mlp": 0.01039079, + "balance_loss_clip": 1.03668594, + "balance_loss_mlp": 1.02575755, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 2.0838440340173054, + "language_loss": 0.74458784, + "learning_rate": 6.202733797375492e-07, + "loss": 0.76586705, + "num_input_tokens_seen": 269030545, + "step": 12475, + "time_per_iteration": 3.8765411376953125 + }, + { + "auxiliary_loss_clip": 0.01108016, + "auxiliary_loss_mlp": 0.01037946, + "balance_loss_clip": 1.03917933, + "balance_loss_mlp": 1.02467227, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 2.221446505773977, + "language_loss": 0.80401242, + "learning_rate": 6.199914591465878e-07, + "loss": 0.82547206, + "num_input_tokens_seen": 269048180, + "step": 12476, + "time_per_iteration": 2.4802448749542236 + }, + { + "auxiliary_loss_clip": 0.01077447, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.0362649, + "balance_loss_mlp": 1.02232051, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.8318963238654438, + "language_loss": 0.77916551, + "learning_rate": 6.19709590885688e-07, + "loss": 0.80028856, + "num_input_tokens_seen": 269068600, + "step": 12477, + "time_per_iteration": 2.6102569103240967 + }, + { + "auxiliary_loss_clip": 0.01013959, + "auxiliary_loss_mlp": 0.01006528, + "balance_loss_clip": 1.00972724, + "balance_loss_mlp": 1.00515747, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.8060344434254789, + "language_loss": 0.54406023, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56426501, + "num_input_tokens_seen": 269119045, + "step": 12478, + "time_per_iteration": 3.0461814403533936 + }, + { + "auxiliary_loss_clip": 0.01088458, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.04049504, + "balance_loss_mlp": 1.02210689, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.7046089103664257, + "language_loss": 0.7999627, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82119, + "num_input_tokens_seen": 269136755, + "step": 12479, + "time_per_iteration": 2.5223140716552734 + }, + { + "auxiliary_loss_clip": 0.01104384, + "auxiliary_loss_mlp": 0.01041322, + "balance_loss_clip": 1.03848171, + "balance_loss_mlp": 1.02779174, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 2.8825973809948735, + "language_loss": 0.62711531, + "learning_rate": 6.188643001902369e-07, + "loss": 0.64857233, + "num_input_tokens_seen": 269156120, + "step": 12480, + "time_per_iteration": 2.4825780391693115 + }, + { + "auxiliary_loss_clip": 0.0108435, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.034724, + "balance_loss_mlp": 1.02286518, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 1.6469412105958203, + "language_loss": 0.77544343, + "learning_rate": 6.185826413564512e-07, + "loss": 0.79663259, + "num_input_tokens_seen": 269175650, + "step": 12481, + "time_per_iteration": 2.5120925903320312 + }, + { + "auxiliary_loss_clip": 0.01072565, + "auxiliary_loss_mlp": 0.01036136, + "balance_loss_clip": 1.03672147, + "balance_loss_mlp": 1.02296352, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 3.7294549277818283, + "language_loss": 0.71263814, + "learning_rate": 6.183010349061501e-07, + "loss": 0.73372519, + "num_input_tokens_seen": 269197080, + "step": 12482, + "time_per_iteration": 2.614487886428833 + }, + { + "auxiliary_loss_clip": 0.01111635, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.0385468, + "balance_loss_mlp": 1.02203679, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 1.6439427881388753, + "language_loss": 0.70131993, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72278047, + "num_input_tokens_seen": 269218600, + "step": 12483, + "time_per_iteration": 2.4877090454101562 + }, + { + "auxiliary_loss_clip": 0.01110505, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.03738856, + "balance_loss_mlp": 1.01735163, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 1.668868331322045, + "language_loss": 0.74297678, + "learning_rate": 6.177379791987131e-07, + "loss": 0.7643708, + "num_input_tokens_seen": 269239245, + "step": 12484, + "time_per_iteration": 2.449413776397705 + }, + { + "auxiliary_loss_clip": 0.01089273, + "auxiliary_loss_mlp": 0.01028839, + "balance_loss_clip": 1.03714228, + "balance_loss_mlp": 1.01640511, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 2.0180034029034606, + "language_loss": 0.84701622, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86819726, + "num_input_tokens_seen": 269258520, + "step": 12485, + "time_per_iteration": 2.509442090988159 + }, + { + "auxiliary_loss_clip": 0.01078827, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.04093385, + "balance_loss_mlp": 1.0162642, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 1.5461612023113258, + "language_loss": 0.77816904, + "learning_rate": 6.171751331533323e-07, + "loss": 0.79924315, + "num_input_tokens_seen": 269278320, + "step": 12486, + "time_per_iteration": 4.104748725891113 + }, + { + "auxiliary_loss_clip": 0.01100268, + "auxiliary_loss_mlp": 0.01032082, + "balance_loss_clip": 1.03533125, + "balance_loss_mlp": 1.01828361, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 2.037219370266155, + "language_loss": 0.72401869, + "learning_rate": 6.168937887805932e-07, + "loss": 0.7453422, + "num_input_tokens_seen": 269298025, + "step": 12487, + "time_per_iteration": 2.501659870147705 + }, + { + "auxiliary_loss_clip": 0.01087533, + "auxiliary_loss_mlp": 0.01030952, + "balance_loss_clip": 1.03480136, + "balance_loss_mlp": 1.01834559, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 1.8382697629899551, + "language_loss": 0.67552018, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69670504, + "num_input_tokens_seen": 269316770, + "step": 12488, + "time_per_iteration": 2.5452704429626465 + }, + { + "auxiliary_loss_clip": 0.01053955, + "auxiliary_loss_mlp": 0.01034038, + "balance_loss_clip": 1.03925133, + "balance_loss_mlp": 1.02075219, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 1.6224060747467304, + "language_loss": 0.77006769, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79094762, + "num_input_tokens_seen": 269334755, + "step": 12489, + "time_per_iteration": 2.6394293308258057 + }, + { + "auxiliary_loss_clip": 0.01099627, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.03925192, + "balance_loss_mlp": 1.01711893, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 1.8476576982611554, + "language_loss": 0.74956691, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77084821, + "num_input_tokens_seen": 269353810, + "step": 12490, + "time_per_iteration": 2.545645236968994 + }, + { + "auxiliary_loss_clip": 0.01110848, + "auxiliary_loss_mlp": 0.01032508, + "balance_loss_clip": 1.03937435, + "balance_loss_mlp": 1.02013397, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 1.5983603630408445, + "language_loss": 0.77952945, + "learning_rate": 6.157689358715527e-07, + "loss": 0.80096304, + "num_input_tokens_seen": 269372910, + "step": 12491, + "time_per_iteration": 2.442758560180664 + }, + { + "auxiliary_loss_clip": 0.01098704, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.03627992, + "balance_loss_mlp": 1.01986969, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 2.258779340819769, + "language_loss": 0.76455998, + "learning_rate": 6.154878538430899e-07, + "loss": 0.78585839, + "num_input_tokens_seen": 269391545, + "step": 12492, + "time_per_iteration": 2.493413209915161 + }, + { + "auxiliary_loss_clip": 0.01079201, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.0344975, + "balance_loss_mlp": 1.01760149, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 1.9117129435636688, + "language_loss": 0.71274614, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73382783, + "num_input_tokens_seen": 269408530, + "step": 12493, + "time_per_iteration": 2.539933204650879 + }, + { + "auxiliary_loss_clip": 0.01102244, + "auxiliary_loss_mlp": 0.0077793, + "balance_loss_clip": 1.03888607, + "balance_loss_mlp": 1.00067735, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 1.891904609293717, + "language_loss": 0.8061313, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82493305, + "num_input_tokens_seen": 269425930, + "step": 12494, + "time_per_iteration": 2.503875255584717 + }, + { + "auxiliary_loss_clip": 0.0111177, + "auxiliary_loss_mlp": 0.01027651, + "balance_loss_clip": 1.03781962, + "balance_loss_mlp": 1.01467502, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 2.1759684503315633, + "language_loss": 0.7871896, + "learning_rate": 6.146449228053634e-07, + "loss": 0.80858374, + "num_input_tokens_seen": 269443945, + "step": 12495, + "time_per_iteration": 2.4496235847473145 + }, + { + "auxiliary_loss_clip": 0.01111725, + "auxiliary_loss_mlp": 0.00777454, + "balance_loss_clip": 1.0383296, + "balance_loss_mlp": 1.00060534, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 1.897352694736259, + "language_loss": 0.7083751, + "learning_rate": 6.143640508441898e-07, + "loss": 0.72726691, + "num_input_tokens_seen": 269463625, + "step": 12496, + "time_per_iteration": 2.462860107421875 + }, + { + "auxiliary_loss_clip": 0.01069363, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.03512537, + "balance_loss_mlp": 1.01874042, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 1.6865160551941665, + "language_loss": 0.78499776, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80599594, + "num_input_tokens_seen": 269483415, + "step": 12497, + "time_per_iteration": 2.6011464595794678 + }, + { + "auxiliary_loss_clip": 0.01105982, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.04164946, + "balance_loss_mlp": 1.02127433, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 2.271077845362606, + "language_loss": 0.76538694, + "learning_rate": 6.13802464562855e-07, + "loss": 0.78678834, + "num_input_tokens_seen": 269504635, + "step": 12498, + "time_per_iteration": 2.5737764835357666 + }, + { + "auxiliary_loss_clip": 0.01087216, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.03758681, + "balance_loss_mlp": 1.02053165, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 1.885779875639859, + "language_loss": 0.7444694, + "learning_rate": 6.135217502639878e-07, + "loss": 0.76566207, + "num_input_tokens_seen": 269523955, + "step": 12499, + "time_per_iteration": 2.4852006435394287 + }, + { + "auxiliary_loss_clip": 0.01095801, + "auxiliary_loss_mlp": 0.01025096, + "balance_loss_clip": 1.03330004, + "balance_loss_mlp": 1.01364028, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 2.183478326688834, + "language_loss": 0.79547518, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81668413, + "num_input_tokens_seen": 269544410, + "step": 12500, + "time_per_iteration": 2.5075275897979736 + }, + { + "auxiliary_loss_clip": 0.01108661, + "auxiliary_loss_mlp": 0.01038858, + "balance_loss_clip": 1.03855014, + "balance_loss_mlp": 1.0237304, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 2.394341304432419, + "language_loss": 0.73490632, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75638151, + "num_input_tokens_seen": 269563315, + "step": 12501, + "time_per_iteration": 2.4595587253570557 + }, + { + "auxiliary_loss_clip": 0.01086624, + "auxiliary_loss_mlp": 0.01025166, + "balance_loss_clip": 1.03403497, + "balance_loss_mlp": 1.01263738, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 7.364386548173012, + "language_loss": 0.78146386, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80258179, + "num_input_tokens_seen": 269583950, + "step": 12502, + "time_per_iteration": 3.977666139602661 + }, + { + "auxiliary_loss_clip": 0.01089753, + "auxiliary_loss_mlp": 0.01038831, + "balance_loss_clip": 1.03747916, + "balance_loss_mlp": 1.02574229, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.5013175749021714, + "language_loss": 0.70448864, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72577447, + "num_input_tokens_seen": 269600120, + "step": 12503, + "time_per_iteration": 2.458033323287964 + }, + { + "auxiliary_loss_clip": 0.01029363, + "auxiliary_loss_mlp": 0.01006518, + "balance_loss_clip": 1.00579453, + "balance_loss_mlp": 1.00521314, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.9953740960297939, + "language_loss": 0.64001566, + "learning_rate": 6.121189676133903e-07, + "loss": 0.66037446, + "num_input_tokens_seen": 269659815, + "step": 12504, + "time_per_iteration": 2.9365458488464355 + }, + { + "auxiliary_loss_clip": 0.01068465, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.03111339, + "balance_loss_mlp": 1.02295566, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 1.5485521214286284, + "language_loss": 0.68848044, + "learning_rate": 6.118385689264896e-07, + "loss": 0.70951492, + "num_input_tokens_seen": 269684565, + "step": 12505, + "time_per_iteration": 2.706467390060425 + }, + { + "auxiliary_loss_clip": 0.0101847, + "auxiliary_loss_mlp": 0.00753146, + "balance_loss_clip": 1.00522292, + "balance_loss_mlp": 1.00035214, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.6471098912028146, + "language_loss": 0.55104285, + "learning_rate": 6.11558222878809e-07, + "loss": 0.56875902, + "num_input_tokens_seen": 269752325, + "step": 12506, + "time_per_iteration": 3.141982078552246 + }, + { + "auxiliary_loss_clip": 0.01100521, + "auxiliary_loss_mlp": 0.01036818, + "balance_loss_clip": 1.0369848, + "balance_loss_mlp": 1.0238657, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 2.1220772448416523, + "language_loss": 0.78346062, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80483401, + "num_input_tokens_seen": 269770630, + "step": 12507, + "time_per_iteration": 2.4556655883789062 + }, + { + "auxiliary_loss_clip": 0.01084021, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.03879583, + "balance_loss_mlp": 1.0162189, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 1.6928555053226206, + "language_loss": 0.71615177, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73727, + "num_input_tokens_seen": 269787280, + "step": 12508, + "time_per_iteration": 2.472543954849243 + }, + { + "auxiliary_loss_clip": 0.0109605, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.03530777, + "balance_loss_mlp": 1.01758182, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 1.8251430533058701, + "language_loss": 0.72202921, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74328816, + "num_input_tokens_seen": 269805205, + "step": 12509, + "time_per_iteration": 2.451249122619629 + }, + { + "auxiliary_loss_clip": 0.01116243, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.03868997, + "balance_loss_mlp": 1.02456903, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 1.7275175209709959, + "language_loss": 0.6202935, + "learning_rate": 6.104373652928785e-07, + "loss": 0.64183927, + "num_input_tokens_seen": 269824820, + "step": 12510, + "time_per_iteration": 2.4786365032196045 + }, + { + "auxiliary_loss_clip": 0.01095913, + "auxiliary_loss_mlp": 0.01032256, + "balance_loss_clip": 1.03713775, + "balance_loss_mlp": 1.01990604, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 1.6365104624978342, + "language_loss": 0.81828094, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83956265, + "num_input_tokens_seen": 269842825, + "step": 12511, + "time_per_iteration": 4.051864862442017 + }, + { + "auxiliary_loss_clip": 0.01088943, + "auxiliary_loss_mlp": 0.01038112, + "balance_loss_clip": 1.03759849, + "balance_loss_mlp": 1.02499902, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 1.8849321519115831, + "language_loss": 0.75535917, + "learning_rate": 6.098772526115412e-07, + "loss": 0.77662969, + "num_input_tokens_seen": 269859000, + "step": 12512, + "time_per_iteration": 2.467304229736328 + }, + { + "auxiliary_loss_clip": 0.0109382, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.03568065, + "balance_loss_mlp": 1.01747966, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.8740457418630692, + "language_loss": 0.82375824, + "learning_rate": 6.095972753359537e-07, + "loss": 0.84498525, + "num_input_tokens_seen": 269878895, + "step": 12513, + "time_per_iteration": 2.5238728523254395 + }, + { + "auxiliary_loss_clip": 0.01103254, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.04231048, + "balance_loss_mlp": 1.01913655, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 3.127135479966143, + "language_loss": 0.7470594, + "learning_rate": 6.093173507845771e-07, + "loss": 0.76841652, + "num_input_tokens_seen": 269897280, + "step": 12514, + "time_per_iteration": 2.499136209487915 + }, + { + "auxiliary_loss_clip": 0.01091455, + "auxiliary_loss_mlp": 0.01031526, + "balance_loss_clip": 1.03801513, + "balance_loss_mlp": 1.02026057, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 1.815236934385393, + "language_loss": 0.68660116, + "learning_rate": 6.090374789680271e-07, + "loss": 0.70783103, + "num_input_tokens_seen": 269914640, + "step": 12515, + "time_per_iteration": 3.846132278442383 + }, + { + "auxiliary_loss_clip": 0.01100107, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.03753757, + "balance_loss_mlp": 1.01922286, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 1.5558512864724388, + "language_loss": 0.70133632, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72265017, + "num_input_tokens_seen": 269934960, + "step": 12516, + "time_per_iteration": 2.5339064598083496 + }, + { + "auxiliary_loss_clip": 0.01063378, + "auxiliary_loss_mlp": 0.01026926, + "balance_loss_clip": 1.03820467, + "balance_loss_mlp": 1.0154283, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 1.553791878311067, + "language_loss": 0.89354515, + "learning_rate": 6.084778935818495e-07, + "loss": 0.9144482, + "num_input_tokens_seen": 269956655, + "step": 12517, + "time_per_iteration": 2.6268980503082275 + }, + { + "auxiliary_loss_clip": 0.01087852, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.03663659, + "balance_loss_mlp": 1.02430952, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.6199475680951672, + "language_loss": 0.74555218, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76679504, + "num_input_tokens_seen": 269976835, + "step": 12518, + "time_per_iteration": 2.552536964416504 + }, + { + "auxiliary_loss_clip": 0.01003286, + "auxiliary_loss_mlp": 0.01002974, + "balance_loss_clip": 1.02112412, + "balance_loss_mlp": 1.00158525, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.706144478669645, + "language_loss": 0.55659854, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57666111, + "num_input_tokens_seen": 270040630, + "step": 12519, + "time_per_iteration": 3.2310683727264404 + }, + { + "auxiliary_loss_clip": 0.01093307, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.03752708, + "balance_loss_mlp": 1.02232718, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.5170126236648747, + "language_loss": 0.77934426, + "learning_rate": 6.07638911279029e-07, + "loss": 0.80061108, + "num_input_tokens_seen": 270059695, + "step": 12520, + "time_per_iteration": 2.532562732696533 + }, + { + "auxiliary_loss_clip": 0.01093942, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.03483868, + "balance_loss_mlp": 1.02497363, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 2.2324808442975623, + "language_loss": 0.7367357, + "learning_rate": 6.07359356094229e-07, + "loss": 0.75804418, + "num_input_tokens_seen": 270078420, + "step": 12521, + "time_per_iteration": 2.4729373455047607 + }, + { + "auxiliary_loss_clip": 0.01090269, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.04013348, + "balance_loss_mlp": 1.02291799, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 1.8650436038318954, + "language_loss": 0.67276734, + "learning_rate": 6.070798537185016e-07, + "loss": 0.69402862, + "num_input_tokens_seen": 270097040, + "step": 12522, + "time_per_iteration": 2.557190418243408 + }, + { + "auxiliary_loss_clip": 0.01103223, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.03987074, + "balance_loss_mlp": 1.02691817, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 1.6425934606020571, + "language_loss": 0.77994633, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80137169, + "num_input_tokens_seen": 270116365, + "step": 12523, + "time_per_iteration": 2.513749599456787 + }, + { + "auxiliary_loss_clip": 0.01108559, + "auxiliary_loss_mlp": 0.01028841, + "balance_loss_clip": 1.03710234, + "balance_loss_mlp": 1.01692033, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 3.293449931400946, + "language_loss": 0.80739027, + "learning_rate": 6.065210074366571e-07, + "loss": 0.82876426, + "num_input_tokens_seen": 270135395, + "step": 12524, + "time_per_iteration": 2.453763246536255 + }, + { + "auxiliary_loss_clip": 0.01100638, + "auxiliary_loss_mlp": 0.00776259, + "balance_loss_clip": 1.03901732, + "balance_loss_mlp": 1.00069976, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.621559403223363, + "language_loss": 0.7405566, + "learning_rate": 6.062416635517326e-07, + "loss": 0.75932562, + "num_input_tokens_seen": 270156425, + "step": 12525, + "time_per_iteration": 3.94576358795166 + }, + { + "auxiliary_loss_clip": 0.01079378, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.03742659, + "balance_loss_mlp": 1.01841509, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 1.920540899740019, + "language_loss": 0.72561741, + "learning_rate": 6.059623725182641e-07, + "loss": 0.74671364, + "num_input_tokens_seen": 270176905, + "step": 12526, + "time_per_iteration": 2.5789437294006348 + }, + { + "auxiliary_loss_clip": 0.01086743, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.03606391, + "balance_loss_mlp": 1.01627171, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 1.7860600135887477, + "language_loss": 0.72295868, + "learning_rate": 6.056831343468414e-07, + "loss": 0.74410164, + "num_input_tokens_seen": 270196640, + "step": 12527, + "time_per_iteration": 2.571167230606079 + }, + { + "auxiliary_loss_clip": 0.01077273, + "auxiliary_loss_mlp": 0.01023006, + "balance_loss_clip": 1.03863287, + "balance_loss_mlp": 1.01263499, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 2.0430169047698774, + "language_loss": 0.80824423, + "learning_rate": 6.054039490480539e-07, + "loss": 0.829247, + "num_input_tokens_seen": 270213905, + "step": 12528, + "time_per_iteration": 2.53082275390625 + }, + { + "auxiliary_loss_clip": 0.0106415, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.04135871, + "balance_loss_mlp": 1.02089429, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 2.3345921754268186, + "language_loss": 0.85051644, + "learning_rate": 6.051248166324892e-07, + "loss": 0.87150317, + "num_input_tokens_seen": 270231995, + "step": 12529, + "time_per_iteration": 2.6118972301483154 + }, + { + "auxiliary_loss_clip": 0.01083012, + "auxiliary_loss_mlp": 0.01031678, + "balance_loss_clip": 1.03972173, + "balance_loss_mlp": 1.01906586, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 1.7973299361576947, + "language_loss": 0.73844874, + "learning_rate": 6.048457371107303e-07, + "loss": 0.75959563, + "num_input_tokens_seen": 270251480, + "step": 12530, + "time_per_iteration": 2.5470428466796875 + }, + { + "auxiliary_loss_clip": 0.00995515, + "auxiliary_loss_mlp": 0.01000703, + "balance_loss_clip": 1.02357626, + "balance_loss_mlp": 0.99940366, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8305735695293878, + "language_loss": 0.63634127, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65630347, + "num_input_tokens_seen": 270306480, + "step": 12531, + "time_per_iteration": 3.0133113861083984 + }, + { + "auxiliary_loss_clip": 0.0109114, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.0386858, + "balance_loss_mlp": 1.01523399, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 3.185181096880692, + "language_loss": 0.69923198, + "learning_rate": 6.042877367909633e-07, + "loss": 0.72042155, + "num_input_tokens_seen": 270324595, + "step": 12532, + "time_per_iteration": 2.5191657543182373 + }, + { + "auxiliary_loss_clip": 0.01080778, + "auxiliary_loss_mlp": 0.01027846, + "balance_loss_clip": 1.03630221, + "balance_loss_mlp": 1.01721263, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.5922492722779085, + "language_loss": 0.77532768, + "learning_rate": 6.040088160141132e-07, + "loss": 0.7964139, + "num_input_tokens_seen": 270344375, + "step": 12533, + "time_per_iteration": 2.5142459869384766 + }, + { + "auxiliary_loss_clip": 0.01022487, + "auxiliary_loss_mlp": 0.01005321, + "balance_loss_clip": 1.00725389, + "balance_loss_mlp": 1.00402153, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.7846490904246437, + "language_loss": 0.57261455, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59289265, + "num_input_tokens_seen": 270405235, + "step": 12534, + "time_per_iteration": 3.0696053504943848 + }, + { + "auxiliary_loss_clip": 0.01088993, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.03716564, + "balance_loss_mlp": 1.01351368, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.4436484402063765, + "language_loss": 0.71179116, + "learning_rate": 6.03451133279365e-07, + "loss": 0.73294127, + "num_input_tokens_seen": 270425820, + "step": 12535, + "time_per_iteration": 2.5423600673675537 + }, + { + "auxiliary_loss_clip": 0.01085964, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.03266573, + "balance_loss_mlp": 1.01643062, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 2.0554901273124004, + "language_loss": 0.81008899, + "learning_rate": 6.031723713426135e-07, + "loss": 0.83123702, + "num_input_tokens_seen": 270447120, + "step": 12536, + "time_per_iteration": 2.5560686588287354 + }, + { + "auxiliary_loss_clip": 0.01078139, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_clip": 1.03326774, + "balance_loss_mlp": 1.02325118, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 2.1253581152277237, + "language_loss": 0.74385893, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76498866, + "num_input_tokens_seen": 270468680, + "step": 12537, + "time_per_iteration": 2.566923141479492 + }, + { + "auxiliary_loss_clip": 0.01110072, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.03677225, + "balance_loss_mlp": 1.02132726, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 1.649976718254607, + "language_loss": 0.74207574, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76351178, + "num_input_tokens_seen": 270486310, + "step": 12538, + "time_per_iteration": 2.409292221069336 + }, + { + "auxiliary_loss_clip": 0.01072745, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.03717899, + "balance_loss_mlp": 1.02279615, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 1.7481087121746912, + "language_loss": 0.67886841, + "learning_rate": 6.023364033816956e-07, + "loss": 0.6999464, + "num_input_tokens_seen": 270507210, + "step": 12539, + "time_per_iteration": 2.5630099773406982 + }, + { + "auxiliary_loss_clip": 0.01108558, + "auxiliary_loss_mlp": 0.0102786, + "balance_loss_clip": 1.03783989, + "balance_loss_mlp": 1.01575446, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 1.6884739196984428, + "language_loss": 0.74714637, + "learning_rate": 6.020578533797229e-07, + "loss": 0.76851058, + "num_input_tokens_seen": 270525250, + "step": 12540, + "time_per_iteration": 2.4530649185180664 + }, + { + "auxiliary_loss_clip": 0.01111858, + "auxiliary_loss_mlp": 0.0102827, + "balance_loss_clip": 1.03743136, + "balance_loss_mlp": 1.01596713, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 2.389601208401037, + "language_loss": 0.72794813, + "learning_rate": 6.017793563878566e-07, + "loss": 0.74934942, + "num_input_tokens_seen": 270539295, + "step": 12541, + "time_per_iteration": 2.3624656200408936 + }, + { + "auxiliary_loss_clip": 0.01108516, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.03717852, + "balance_loss_mlp": 1.02086937, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 1.8910796101689544, + "language_loss": 0.72210371, + "learning_rate": 6.015009124166576e-07, + "loss": 0.74352098, + "num_input_tokens_seen": 270562815, + "step": 12542, + "time_per_iteration": 4.12801718711853 + }, + { + "auxiliary_loss_clip": 0.01084535, + "auxiliary_loss_mlp": 0.01026404, + "balance_loss_clip": 1.03424942, + "balance_loss_mlp": 1.01437569, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 1.8418582718285008, + "language_loss": 0.84546494, + "learning_rate": 6.012225214766844e-07, + "loss": 0.86657435, + "num_input_tokens_seen": 270579055, + "step": 12543, + "time_per_iteration": 2.472139358520508 + }, + { + "auxiliary_loss_clip": 0.01078322, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.04022193, + "balance_loss_mlp": 1.01954532, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.612014224331545, + "language_loss": 0.7361418, + "learning_rate": 6.009441835784927e-07, + "loss": 0.75723839, + "num_input_tokens_seen": 270599080, + "step": 12544, + "time_per_iteration": 2.566938877105713 + }, + { + "auxiliary_loss_clip": 0.01094864, + "auxiliary_loss_mlp": 0.01032688, + "balance_loss_clip": 1.03622162, + "balance_loss_mlp": 1.02107716, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 1.8359254311448074, + "language_loss": 0.68506724, + "learning_rate": 6.006658987326383e-07, + "loss": 0.7063427, + "num_input_tokens_seen": 270618715, + "step": 12545, + "time_per_iteration": 2.481013536453247 + }, + { + "auxiliary_loss_clip": 0.01086514, + "auxiliary_loss_mlp": 0.01030699, + "balance_loss_clip": 1.03399134, + "balance_loss_mlp": 1.01840258, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 2.0585014142902462, + "language_loss": 0.68820131, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70937347, + "num_input_tokens_seen": 270635695, + "step": 12546, + "time_per_iteration": 2.487691879272461 + }, + { + "auxiliary_loss_clip": 0.01094688, + "auxiliary_loss_mlp": 0.01035242, + "balance_loss_clip": 1.03577042, + "balance_loss_mlp": 1.02262998, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 2.2070882400935936, + "language_loss": 0.73345196, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75475127, + "num_input_tokens_seen": 270654325, + "step": 12547, + "time_per_iteration": 2.457097291946411 + }, + { + "auxiliary_loss_clip": 0.01109099, + "auxiliary_loss_mlp": 0.01026547, + "balance_loss_clip": 1.03744102, + "balance_loss_mlp": 1.01334441, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 1.9451281477108506, + "language_loss": 0.67964101, + "learning_rate": 5.998313626146099e-07, + "loss": 0.70099747, + "num_input_tokens_seen": 270674260, + "step": 12548, + "time_per_iteration": 2.429086923599243 + }, + { + "auxiliary_loss_clip": 0.01088242, + "auxiliary_loss_mlp": 0.01031475, + "balance_loss_clip": 1.03565812, + "balance_loss_mlp": 1.01948833, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 1.8512016888602127, + "language_loss": 0.87169337, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89289057, + "num_input_tokens_seen": 270692200, + "step": 12549, + "time_per_iteration": 2.551741600036621 + }, + { + "auxiliary_loss_clip": 0.0106955, + "auxiliary_loss_mlp": 0.01033212, + "balance_loss_clip": 1.04374897, + "balance_loss_mlp": 1.02217293, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 2.0772487529546124, + "language_loss": 0.77169764, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79272521, + "num_input_tokens_seen": 270709675, + "step": 12550, + "time_per_iteration": 4.134663343429565 + }, + { + "auxiliary_loss_clip": 0.01110262, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.03681135, + "balance_loss_mlp": 1.0140785, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.5272603217128915, + "language_loss": 0.69799912, + "learning_rate": 5.98997304347386e-07, + "loss": 0.71935755, + "num_input_tokens_seen": 270733055, + "step": 12551, + "time_per_iteration": 2.4982852935791016 + }, + { + "auxiliary_loss_clip": 0.01090755, + "auxiliary_loss_mlp": 0.0102944, + "balance_loss_clip": 1.03921819, + "balance_loss_mlp": 1.01696491, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 2.2613593942257224, + "language_loss": 0.86115986, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88236183, + "num_input_tokens_seen": 270749275, + "step": 12552, + "time_per_iteration": 2.5279455184936523 + }, + { + "auxiliary_loss_clip": 0.01099959, + "auxiliary_loss_mlp": 0.01031059, + "balance_loss_clip": 1.03672814, + "balance_loss_mlp": 1.01942444, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 1.966796555243514, + "language_loss": 0.78179854, + "learning_rate": 5.98441531115812e-07, + "loss": 0.80310869, + "num_input_tokens_seen": 270768230, + "step": 12553, + "time_per_iteration": 2.5030312538146973 + }, + { + "auxiliary_loss_clip": 0.01099521, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.03845048, + "balance_loss_mlp": 1.01891506, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 2.1570575959480083, + "language_loss": 0.63207924, + "learning_rate": 5.981637242156135e-07, + "loss": 0.65338433, + "num_input_tokens_seen": 270786285, + "step": 12554, + "time_per_iteration": 3.943486452102661 + }, + { + "auxiliary_loss_clip": 0.01087585, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.03477144, + "balance_loss_mlp": 1.02301168, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.525222687578907, + "language_loss": 0.73275328, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75397688, + "num_input_tokens_seen": 270805505, + "step": 12555, + "time_per_iteration": 2.551936626434326 + }, + { + "auxiliary_loss_clip": 0.01087825, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.04083467, + "balance_loss_mlp": 1.01925373, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 1.8184696853235724, + "language_loss": 0.78333122, + "learning_rate": 5.976082698990645e-07, + "loss": 0.80452758, + "num_input_tokens_seen": 270824610, + "step": 12556, + "time_per_iteration": 2.4942591190338135 + }, + { + "auxiliary_loss_clip": 0.01020069, + "auxiliary_loss_mlp": 0.0100728, + "balance_loss_clip": 1.00598907, + "balance_loss_mlp": 1.00599289, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.7106000718092084, + "language_loss": 0.50415719, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52443063, + "num_input_tokens_seen": 270886155, + "step": 12557, + "time_per_iteration": 3.01735520362854 + }, + { + "auxiliary_loss_clip": 0.01099621, + "auxiliary_loss_mlp": 0.01034, + "balance_loss_clip": 1.03906775, + "balance_loss_mlp": 1.02119672, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 1.6547491030958186, + "language_loss": 0.71640122, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73773742, + "num_input_tokens_seen": 270905325, + "step": 12558, + "time_per_iteration": 2.4997575283050537 + }, + { + "auxiliary_loss_clip": 0.01085105, + "auxiliary_loss_mlp": 0.01041175, + "balance_loss_clip": 1.03401756, + "balance_loss_mlp": 1.02799678, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.7267541718053303, + "language_loss": 0.8003493, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82161212, + "num_input_tokens_seen": 270927535, + "step": 12559, + "time_per_iteration": 2.579906940460205 + }, + { + "auxiliary_loss_clip": 0.010639, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.03805125, + "balance_loss_mlp": 1.01876307, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 1.6788673910729637, + "language_loss": 0.78782719, + "learning_rate": 5.96497999496199e-07, + "loss": 0.80878049, + "num_input_tokens_seen": 270946920, + "step": 12560, + "time_per_iteration": 2.587895154953003 + }, + { + "auxiliary_loss_clip": 0.01059796, + "auxiliary_loss_mlp": 0.0103848, + "balance_loss_clip": 1.03416336, + "balance_loss_mlp": 1.02586222, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 1.9359220301140725, + "language_loss": 0.70382607, + "learning_rate": 5.96220564921515e-07, + "loss": 0.72480881, + "num_input_tokens_seen": 270965705, + "step": 12561, + "time_per_iteration": 2.55222225189209 + }, + { + "auxiliary_loss_clip": 0.01082493, + "auxiliary_loss_mlp": 0.00781597, + "balance_loss_clip": 1.0324173, + "balance_loss_mlp": 1.0006268, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.626196718267652, + "language_loss": 0.7579664, + "learning_rate": 5.959431835782889e-07, + "loss": 0.77660739, + "num_input_tokens_seen": 270986550, + "step": 12562, + "time_per_iteration": 2.584995746612549 + }, + { + "auxiliary_loss_clip": 0.01090723, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.04306865, + "balance_loss_mlp": 1.01828372, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 2.1867033653237153, + "language_loss": 0.75731653, + "learning_rate": 5.956658554770371e-07, + "loss": 0.77852696, + "num_input_tokens_seen": 271006250, + "step": 12563, + "time_per_iteration": 2.5197041034698486 + }, + { + "auxiliary_loss_clip": 0.01087169, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.03844535, + "balance_loss_mlp": 1.02238023, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 3.8416427916850324, + "language_loss": 0.674281, + "learning_rate": 5.953885806282768e-07, + "loss": 0.69553447, + "num_input_tokens_seen": 271025575, + "step": 12564, + "time_per_iteration": 4.1125524044036865 + }, + { + "auxiliary_loss_clip": 0.01084005, + "auxiliary_loss_mlp": 0.01039093, + "balance_loss_clip": 1.03634357, + "balance_loss_mlp": 1.02573562, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 2.115694200195861, + "language_loss": 0.68306977, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70430076, + "num_input_tokens_seen": 271045805, + "step": 12565, + "time_per_iteration": 2.51619553565979 + }, + { + "auxiliary_loss_clip": 0.01089071, + "auxiliary_loss_mlp": 0.01028846, + "balance_loss_clip": 1.03495288, + "balance_loss_mlp": 1.01613188, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 1.6673011371453896, + "language_loss": 0.75329727, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77447647, + "num_input_tokens_seen": 271066065, + "step": 12566, + "time_per_iteration": 2.5538747310638428 + }, + { + "auxiliary_loss_clip": 0.01105939, + "auxiliary_loss_mlp": 0.0103645, + "balance_loss_clip": 1.03932381, + "balance_loss_mlp": 1.02293181, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 1.8355013485655578, + "language_loss": 0.74350226, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76492614, + "num_input_tokens_seen": 271085870, + "step": 12567, + "time_per_iteration": 2.5635218620300293 + }, + { + "auxiliary_loss_clip": 0.0110764, + "auxiliary_loss_mlp": 0.01028168, + "balance_loss_clip": 1.03602862, + "balance_loss_mlp": 1.01676571, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 2.46875666416473, + "language_loss": 0.63253641, + "learning_rate": 5.942800139684073e-07, + "loss": 0.65389448, + "num_input_tokens_seen": 271104260, + "step": 12568, + "time_per_iteration": 2.4791266918182373 + }, + { + "auxiliary_loss_clip": 0.01028613, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.03557777, + "balance_loss_mlp": 1.02678895, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 1.941546739618011, + "language_loss": 0.66859555, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68928349, + "num_input_tokens_seen": 271125745, + "step": 12569, + "time_per_iteration": 3.0108578205108643 + }, + { + "auxiliary_loss_clip": 0.01103405, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.03842545, + "balance_loss_mlp": 1.02280402, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.584487144407197, + "language_loss": 0.67166072, + "learning_rate": 5.93726050426697e-07, + "loss": 0.69306636, + "num_input_tokens_seen": 271147145, + "step": 12570, + "time_per_iteration": 2.894282341003418 + }, + { + "auxiliary_loss_clip": 0.01111039, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.03790855, + "balance_loss_mlp": 1.02001774, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 1.7650823530662414, + "language_loss": 0.71305192, + "learning_rate": 5.934491486396647e-07, + "loss": 0.73449135, + "num_input_tokens_seen": 271170865, + "step": 12571, + "time_per_iteration": 2.775466203689575 + }, + { + "auxiliary_loss_clip": 0.01069485, + "auxiliary_loss_mlp": 0.01030687, + "balance_loss_clip": 1.03743947, + "balance_loss_mlp": 1.01727557, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.6535077558731064, + "language_loss": 0.7388956, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75989723, + "num_input_tokens_seen": 271191450, + "step": 12572, + "time_per_iteration": 2.617969512939453 + }, + { + "auxiliary_loss_clip": 0.01092977, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.0411303, + "balance_loss_mlp": 1.01931202, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 2.17304718494024, + "language_loss": 0.76770443, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78894842, + "num_input_tokens_seen": 271207335, + "step": 12573, + "time_per_iteration": 2.478132486343384 + }, + { + "auxiliary_loss_clip": 0.01089271, + "auxiliary_loss_mlp": 0.01032179, + "balance_loss_clip": 1.04136586, + "balance_loss_mlp": 1.02000809, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 1.6248528395042532, + "language_loss": 0.69394565, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71516013, + "num_input_tokens_seen": 271226895, + "step": 12574, + "time_per_iteration": 2.587935447692871 + }, + { + "auxiliary_loss_clip": 0.0107468, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.03205311, + "balance_loss_mlp": 1.02073717, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 2.247592766424466, + "language_loss": 0.71941715, + "learning_rate": 5.923420749619974e-07, + "loss": 0.74050128, + "num_input_tokens_seen": 271244375, + "step": 12575, + "time_per_iteration": 2.5169661045074463 + }, + { + "auxiliary_loss_clip": 0.01107526, + "auxiliary_loss_mlp": 0.00777492, + "balance_loss_clip": 1.03578711, + "balance_loss_mlp": 1.00066972, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 3.3318497257407182, + "language_loss": 0.71963918, + "learning_rate": 5.92065439962673e-07, + "loss": 0.73848933, + "num_input_tokens_seen": 271259530, + "step": 12576, + "time_per_iteration": 2.4129345417022705 + }, + { + "auxiliary_loss_clip": 0.01077656, + "auxiliary_loss_mlp": 0.01030699, + "balance_loss_clip": 1.03911924, + "balance_loss_mlp": 1.01832521, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 3.811197409016126, + "language_loss": 0.67225671, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69334024, + "num_input_tokens_seen": 271276835, + "step": 12577, + "time_per_iteration": 2.5289547443389893 + }, + { + "auxiliary_loss_clip": 0.01088044, + "auxiliary_loss_mlp": 0.01035273, + "balance_loss_clip": 1.03561354, + "balance_loss_mlp": 1.02350152, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 19.06425768067926, + "language_loss": 0.78200543, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80323857, + "num_input_tokens_seen": 271296275, + "step": 12578, + "time_per_iteration": 2.5233359336853027 + }, + { + "auxiliary_loss_clip": 0.01100299, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.03745615, + "balance_loss_mlp": 1.01720178, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 1.5922274457711632, + "language_loss": 0.75697571, + "learning_rate": 5.912358553407641e-07, + "loss": 0.77827299, + "num_input_tokens_seen": 271315685, + "step": 12579, + "time_per_iteration": 2.5284688472747803 + }, + { + "auxiliary_loss_clip": 0.0106993, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.03691459, + "balance_loss_mlp": 1.01883221, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 2.781250424211468, + "language_loss": 0.62909132, + "learning_rate": 5.90959433960437e-07, + "loss": 0.65011299, + "num_input_tokens_seen": 271336790, + "step": 12580, + "time_per_iteration": 2.7241411209106445 + }, + { + "auxiliary_loss_clip": 0.01069399, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.03676593, + "balance_loss_mlp": 1.01963544, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 2.5565553879802256, + "language_loss": 0.74795264, + "learning_rate": 5.906830660110691e-07, + "loss": 0.76896513, + "num_input_tokens_seen": 271355470, + "step": 12581, + "time_per_iteration": 4.314194440841675 + }, + { + "auxiliary_loss_clip": 0.01070048, + "auxiliary_loss_mlp": 0.01027725, + "balance_loss_clip": 1.03767288, + "balance_loss_mlp": 1.0154227, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 1.9386655873214576, + "language_loss": 0.6317482, + "learning_rate": 5.904067515031412e-07, + "loss": 0.65272593, + "num_input_tokens_seen": 271375810, + "step": 12582, + "time_per_iteration": 2.607361316680908 + }, + { + "auxiliary_loss_clip": 0.01029547, + "auxiliary_loss_mlp": 0.01003686, + "balance_loss_clip": 1.00580227, + "balance_loss_mlp": 1.00240445, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9524168859168248, + "language_loss": 0.60720789, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62754023, + "num_input_tokens_seen": 271424775, + "step": 12583, + "time_per_iteration": 2.758105754852295 + }, + { + "auxiliary_loss_clip": 0.01084447, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.0391283, + "balance_loss_mlp": 1.02177083, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 2.0993662296011646, + "language_loss": 0.78702629, + "learning_rate": 5.898542828535125e-07, + "loss": 0.80821419, + "num_input_tokens_seen": 271440500, + "step": 12584, + "time_per_iteration": 2.473708152770996 + }, + { + "auxiliary_loss_clip": 0.0108121, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.03391588, + "balance_loss_mlp": 1.02163815, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 2.1839858179788663, + "language_loss": 0.77595568, + "learning_rate": 5.895781287327612e-07, + "loss": 0.79711998, + "num_input_tokens_seen": 271458180, + "step": 12585, + "time_per_iteration": 2.531717300415039 + }, + { + "auxiliary_loss_clip": 0.01114441, + "auxiliary_loss_mlp": 0.01034365, + "balance_loss_clip": 1.03973484, + "balance_loss_mlp": 1.02149034, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.8294158215060292, + "language_loss": 0.8301028, + "learning_rate": 5.893020280953493e-07, + "loss": 0.85159081, + "num_input_tokens_seen": 271475730, + "step": 12586, + "time_per_iteration": 2.4277830123901367 + }, + { + "auxiliary_loss_clip": 0.01112976, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.03830898, + "balance_loss_mlp": 1.01920176, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 2.3060147373478044, + "language_loss": 0.83919281, + "learning_rate": 5.890259809517459e-07, + "loss": 0.86063331, + "num_input_tokens_seen": 271495030, + "step": 12587, + "time_per_iteration": 2.433682680130005 + }, + { + "auxiliary_loss_clip": 0.0107889, + "auxiliary_loss_mlp": 0.01025779, + "balance_loss_clip": 1.03788424, + "balance_loss_mlp": 1.01350689, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 1.9101878547723874, + "language_loss": 0.71349519, + "learning_rate": 5.88749987312418e-07, + "loss": 0.73454189, + "num_input_tokens_seen": 271515355, + "step": 12588, + "time_per_iteration": 2.5554206371307373 + }, + { + "auxiliary_loss_clip": 0.01112498, + "auxiliary_loss_mlp": 0.00778458, + "balance_loss_clip": 1.03804207, + "balance_loss_mlp": 1.00064135, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 3.335970736607098, + "language_loss": 0.69132936, + "learning_rate": 5.884740471878327e-07, + "loss": 0.71023893, + "num_input_tokens_seen": 271535090, + "step": 12589, + "time_per_iteration": 4.292093276977539 + }, + { + "auxiliary_loss_clip": 0.01099964, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.03807271, + "balance_loss_mlp": 1.01682043, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 1.6372042081105787, + "language_loss": 0.9235813, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94487149, + "num_input_tokens_seen": 271551075, + "step": 12590, + "time_per_iteration": 2.541537284851074 + }, + { + "auxiliary_loss_clip": 0.01082322, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.03461301, + "balance_loss_mlp": 1.017519, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 1.7811295400991662, + "language_loss": 0.65562618, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67674619, + "num_input_tokens_seen": 271571035, + "step": 12591, + "time_per_iteration": 2.633009910583496 + }, + { + "auxiliary_loss_clip": 0.01098344, + "auxiliary_loss_mlp": 0.01026216, + "balance_loss_clip": 1.03810728, + "balance_loss_mlp": 1.01502228, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 1.420702610039667, + "language_loss": 0.73584396, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75708956, + "num_input_tokens_seen": 271592950, + "step": 12592, + "time_per_iteration": 2.554856300354004 + }, + { + "auxiliary_loss_clip": 0.01100591, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.03754854, + "balance_loss_mlp": 1.02204025, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.876476061113279, + "language_loss": 0.71525115, + "learning_rate": 5.873708220461522e-07, + "loss": 0.73660165, + "num_input_tokens_seen": 271608835, + "step": 12593, + "time_per_iteration": 3.8632469177246094 + }, + { + "auxiliary_loss_clip": 0.01113319, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.03895402, + "balance_loss_mlp": 1.01792741, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 4.728995046752604, + "language_loss": 0.66225147, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68368894, + "num_input_tokens_seen": 271627730, + "step": 12594, + "time_per_iteration": 2.4388716220855713 + }, + { + "auxiliary_loss_clip": 0.01080009, + "auxiliary_loss_mlp": 0.01029746, + "balance_loss_clip": 1.03751206, + "balance_loss_mlp": 1.01781309, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 1.7866657962291528, + "language_loss": 0.8092562, + "learning_rate": 5.86819530835722e-07, + "loss": 0.83035374, + "num_input_tokens_seen": 271646415, + "step": 12595, + "time_per_iteration": 2.5547900199890137 + }, + { + "auxiliary_loss_clip": 0.01064978, + "auxiliary_loss_mlp": 0.01031952, + "balance_loss_clip": 1.04000819, + "balance_loss_mlp": 1.02031755, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 2.032499679366096, + "language_loss": 0.71409905, + "learning_rate": 5.865439656071993e-07, + "loss": 0.73506844, + "num_input_tokens_seen": 271666240, + "step": 12596, + "time_per_iteration": 2.6080362796783447 + }, + { + "auxiliary_loss_clip": 0.01040481, + "auxiliary_loss_mlp": 0.0103122, + "balance_loss_clip": 1.04393935, + "balance_loss_mlp": 1.01977563, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.4861746713970119, + "language_loss": 0.80557132, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82628834, + "num_input_tokens_seen": 271686370, + "step": 12597, + "time_per_iteration": 2.8779375553131104 + }, + { + "auxiliary_loss_clip": 0.01088284, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_clip": 1.04362273, + "balance_loss_mlp": 1.01707375, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 1.714533685749461, + "language_loss": 0.83342034, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85460544, + "num_input_tokens_seen": 271705050, + "step": 12598, + "time_per_iteration": 3.01021671295166 + }, + { + "auxiliary_loss_clip": 0.01087488, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.04131961, + "balance_loss_mlp": 1.01570475, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.7227183318308612, + "language_loss": 0.62738895, + "learning_rate": 5.857175915537845e-07, + "loss": 0.64853311, + "num_input_tokens_seen": 271724915, + "step": 12599, + "time_per_iteration": 2.54990553855896 + }, + { + "auxiliary_loss_clip": 0.01087852, + "auxiliary_loss_mlp": 0.0077933, + "balance_loss_clip": 1.03630126, + "balance_loss_mlp": 1.00069094, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.779378082904454, + "language_loss": 0.63176501, + "learning_rate": 5.854422407815161e-07, + "loss": 0.65043682, + "num_input_tokens_seen": 271742410, + "step": 12600, + "time_per_iteration": 2.4992895126342773 + }, + { + "auxiliary_loss_clip": 0.01082224, + "auxiliary_loss_mlp": 0.01035167, + "balance_loss_clip": 1.03383803, + "balance_loss_mlp": 1.02172565, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 2.6931138983095386, + "language_loss": 0.66269654, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68387043, + "num_input_tokens_seen": 271761425, + "step": 12601, + "time_per_iteration": 2.498598575592041 + }, + { + "auxiliary_loss_clip": 0.01080688, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.03503764, + "balance_loss_mlp": 1.01810384, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.658520024713853, + "language_loss": 0.67913896, + "learning_rate": 5.848917001679335e-07, + "loss": 0.70024157, + "num_input_tokens_seen": 271780875, + "step": 12602, + "time_per_iteration": 2.4864957332611084 + }, + { + "auxiliary_loss_clip": 0.01101788, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.0378325, + "balance_loss_mlp": 1.02129996, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 4.318654509754827, + "language_loss": 0.66896617, + "learning_rate": 5.846165103474967e-07, + "loss": 0.69032621, + "num_input_tokens_seen": 271799490, + "step": 12603, + "time_per_iteration": 2.4435856342315674 + }, + { + "auxiliary_loss_clip": 0.01086679, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.03384471, + "balance_loss_mlp": 1.01766098, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 2.40785239707449, + "language_loss": 0.61595476, + "learning_rate": 5.843413741985439e-07, + "loss": 0.63711071, + "num_input_tokens_seen": 271817040, + "step": 12604, + "time_per_iteration": 4.06962251663208 + }, + { + "auxiliary_loss_clip": 0.01111443, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.03990507, + "balance_loss_mlp": 1.02339244, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 1.8950447226128924, + "language_loss": 0.80067807, + "learning_rate": 5.840662917315076e-07, + "loss": 0.82215273, + "num_input_tokens_seen": 271835480, + "step": 12605, + "time_per_iteration": 2.4203875064849854 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.03738904, + "balance_loss_mlp": 1.01882005, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 2.5720282456444, + "language_loss": 0.79443967, + "learning_rate": 5.837912629568198e-07, + "loss": 0.81589025, + "num_input_tokens_seen": 271849835, + "step": 12606, + "time_per_iteration": 2.4126243591308594 + }, + { + "auxiliary_loss_clip": 0.01093619, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.0355444, + "balance_loss_mlp": 1.0184828, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.4943302008314565, + "language_loss": 0.73349321, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75472116, + "num_input_tokens_seen": 271869560, + "step": 12607, + "time_per_iteration": 2.5419726371765137 + }, + { + "auxiliary_loss_clip": 0.01086038, + "auxiliary_loss_mlp": 0.0103294, + "balance_loss_clip": 1.03810167, + "balance_loss_mlp": 1.02025008, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 2.304874095995425, + "language_loss": 0.74801576, + "learning_rate": 5.83241366526202e-07, + "loss": 0.76920563, + "num_input_tokens_seen": 271887950, + "step": 12608, + "time_per_iteration": 2.4635114669799805 + }, + { + "auxiliary_loss_clip": 0.01077224, + "auxiliary_loss_mlp": 0.00777884, + "balance_loss_clip": 1.03798842, + "balance_loss_mlp": 1.00059962, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.615163579191823, + "language_loss": 0.71416819, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73271924, + "num_input_tokens_seen": 271907700, + "step": 12609, + "time_per_iteration": 2.582219362258911 + }, + { + "auxiliary_loss_clip": 0.01110735, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.03613281, + "balance_loss_mlp": 1.01871872, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 1.808066507589195, + "language_loss": 0.81469309, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83612186, + "num_input_tokens_seen": 271926840, + "step": 12610, + "time_per_iteration": 2.439692258834839 + }, + { + "auxiliary_loss_clip": 0.01096142, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.03928781, + "balance_loss_mlp": 1.02143097, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 1.7360792210987281, + "language_loss": 0.70360988, + "learning_rate": 5.824169248335488e-07, + "loss": 0.7249099, + "num_input_tokens_seen": 271946465, + "step": 12611, + "time_per_iteration": 2.4996085166931152 + }, + { + "auxiliary_loss_clip": 0.01109828, + "auxiliary_loss_mlp": 0.01027362, + "balance_loss_clip": 1.03769016, + "balance_loss_mlp": 1.01519656, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 1.532248109159315, + "language_loss": 0.70964712, + "learning_rate": 5.821422184318893e-07, + "loss": 0.73101902, + "num_input_tokens_seen": 271967295, + "step": 12612, + "time_per_iteration": 2.433835029602051 + }, + { + "auxiliary_loss_clip": 0.01054109, + "auxiliary_loss_mlp": 0.01038209, + "balance_loss_clip": 1.03692698, + "balance_loss_mlp": 1.02633572, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 1.4064217091535167, + "language_loss": 0.59710014, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61802328, + "num_input_tokens_seen": 271987960, + "step": 12613, + "time_per_iteration": 2.6419687271118164 + }, + { + "auxiliary_loss_clip": 0.01085911, + "auxiliary_loss_mlp": 0.01039327, + "balance_loss_clip": 1.03473783, + "balance_loss_mlp": 1.02588582, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 2.91135487671217, + "language_loss": 0.59863073, + "learning_rate": 5.815929669349135e-07, + "loss": 0.61988306, + "num_input_tokens_seen": 272011780, + "step": 12614, + "time_per_iteration": 2.6010148525238037 + }, + { + "auxiliary_loss_clip": 0.01077572, + "auxiliary_loss_mlp": 0.01027715, + "balance_loss_clip": 1.03444719, + "balance_loss_mlp": 1.01517415, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 1.8288371184001, + "language_loss": 0.73337978, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75443268, + "num_input_tokens_seen": 272030825, + "step": 12615, + "time_per_iteration": 2.636965036392212 + }, + { + "auxiliary_loss_clip": 0.01011489, + "auxiliary_loss_mlp": 0.01002891, + "balance_loss_clip": 1.01028562, + "balance_loss_mlp": 1.00167513, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.8056886169256616, + "language_loss": 0.67650974, + "learning_rate": 5.810439305824828e-07, + "loss": 0.6966536, + "num_input_tokens_seen": 272095825, + "step": 12616, + "time_per_iteration": 3.167332649230957 + }, + { + "auxiliary_loss_clip": 0.01081529, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.03768826, + "balance_loss_mlp": 1.02294421, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.7624515239430634, + "language_loss": 0.84655702, + "learning_rate": 5.807694931114979e-07, + "loss": 0.86772537, + "num_input_tokens_seen": 272113950, + "step": 12617, + "time_per_iteration": 2.609905242919922 + }, + { + "auxiliary_loss_clip": 0.01077497, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.03830612, + "balance_loss_mlp": 1.02083611, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 2.527518315047915, + "language_loss": 0.75104511, + "learning_rate": 5.804951094578757e-07, + "loss": 0.77214587, + "num_input_tokens_seen": 272130315, + "step": 12618, + "time_per_iteration": 2.517651081085205 + }, + { + "auxiliary_loss_clip": 0.01090044, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.03821516, + "balance_loss_mlp": 1.02155042, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 2.598748739602318, + "language_loss": 0.77334714, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79459435, + "num_input_tokens_seen": 272149080, + "step": 12619, + "time_per_iteration": 2.511033058166504 + }, + { + "auxiliary_loss_clip": 0.01070272, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.03320301, + "balance_loss_mlp": 1.02241087, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 1.7637711783100305, + "language_loss": 0.82373226, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84478074, + "num_input_tokens_seen": 272168285, + "step": 12620, + "time_per_iteration": 2.607206344604492 + }, + { + "auxiliary_loss_clip": 0.01087038, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.03488719, + "balance_loss_mlp": 1.02212381, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 2.647478620930922, + "language_loss": 0.83095706, + "learning_rate": 5.796722815052242e-07, + "loss": 0.85218465, + "num_input_tokens_seen": 272184585, + "step": 12621, + "time_per_iteration": 3.9177474975585938 + }, + { + "auxiliary_loss_clip": 0.010897, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.03693604, + "balance_loss_mlp": 1.02092135, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 2.0542630586483837, + "language_loss": 0.73710603, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75833195, + "num_input_tokens_seen": 272200205, + "step": 12622, + "time_per_iteration": 2.495867967605591 + }, + { + "auxiliary_loss_clip": 0.01021469, + "auxiliary_loss_mlp": 0.01010217, + "balance_loss_clip": 1.00786424, + "balance_loss_mlp": 1.00891733, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 1.0339961195624303, + "language_loss": 0.60949159, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62980843, + "num_input_tokens_seen": 272259670, + "step": 12623, + "time_per_iteration": 3.0661633014678955 + }, + { + "auxiliary_loss_clip": 0.01108, + "auxiliary_loss_mlp": 0.01035801, + "balance_loss_clip": 1.03773308, + "balance_loss_mlp": 1.02447033, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 1.9734324345464256, + "language_loss": 0.67312771, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69456571, + "num_input_tokens_seen": 272277925, + "step": 12624, + "time_per_iteration": 2.428316116333008 + }, + { + "auxiliary_loss_clip": 0.01107537, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.03720534, + "balance_loss_mlp": 1.01758862, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 1.7405337154152565, + "language_loss": 0.76049906, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78187585, + "num_input_tokens_seen": 272296010, + "step": 12625, + "time_per_iteration": 2.4234402179718018 + }, + { + "auxiliary_loss_clip": 0.01085027, + "auxiliary_loss_mlp": 0.01044913, + "balance_loss_clip": 1.03499734, + "balance_loss_mlp": 1.03152013, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 1.6330574220236584, + "language_loss": 0.62588584, + "learning_rate": 5.783019789020977e-07, + "loss": 0.64718521, + "num_input_tokens_seen": 272318330, + "step": 12626, + "time_per_iteration": 2.569528102874756 + }, + { + "auxiliary_loss_clip": 0.0108178, + "auxiliary_loss_mlp": 0.00779382, + "balance_loss_clip": 1.0447278, + "balance_loss_mlp": 1.00068069, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 2.3748754924373308, + "language_loss": 0.73529577, + "learning_rate": 5.780280800727084e-07, + "loss": 0.75390738, + "num_input_tokens_seen": 272335265, + "step": 12627, + "time_per_iteration": 2.6008517742156982 + }, + { + "auxiliary_loss_clip": 0.01100362, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.03771591, + "balance_loss_mlp": 1.02338958, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 2.409534073512265, + "language_loss": 0.68960232, + "learning_rate": 5.777542351646356e-07, + "loss": 0.71096516, + "num_input_tokens_seen": 272354795, + "step": 12628, + "time_per_iteration": 2.4837028980255127 + }, + { + "auxiliary_loss_clip": 0.01107803, + "auxiliary_loss_mlp": 0.01036518, + "balance_loss_clip": 1.04455948, + "balance_loss_mlp": 1.02304697, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 2.047338011993226, + "language_loss": 0.6320585, + "learning_rate": 5.774804441882648e-07, + "loss": 0.65350175, + "num_input_tokens_seen": 272372875, + "step": 12629, + "time_per_iteration": 4.046267747879028 + }, + { + "auxiliary_loss_clip": 0.0108406, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.03413522, + "balance_loss_mlp": 1.02004695, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.660874453646014, + "language_loss": 0.77812856, + "learning_rate": 5.772067071539786e-07, + "loss": 0.79928613, + "num_input_tokens_seen": 272394715, + "step": 12630, + "time_per_iteration": 2.629655361175537 + }, + { + "auxiliary_loss_clip": 0.01027881, + "auxiliary_loss_mlp": 0.01006443, + "balance_loss_clip": 1.00429308, + "balance_loss_mlp": 1.00522089, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8464170916124696, + "language_loss": 0.6145699, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63491321, + "num_input_tokens_seen": 272458775, + "step": 12631, + "time_per_iteration": 3.060415029525757 + }, + { + "auxiliary_loss_clip": 0.01083652, + "auxiliary_loss_mlp": 0.00779272, + "balance_loss_clip": 1.03805184, + "balance_loss_mlp": 1.00077379, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 1.708389336505774, + "language_loss": 0.74017012, + "learning_rate": 5.766593949531767e-07, + "loss": 0.75879937, + "num_input_tokens_seen": 272479355, + "step": 12632, + "time_per_iteration": 4.092661380767822 + }, + { + "auxiliary_loss_clip": 0.01089033, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.03699756, + "balance_loss_mlp": 1.01778269, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 2.0563936666648677, + "language_loss": 0.75095665, + "learning_rate": 5.763858198074154e-07, + "loss": 0.7721473, + "num_input_tokens_seen": 272493555, + "step": 12633, + "time_per_iteration": 2.4793825149536133 + }, + { + "auxiliary_loss_clip": 0.01085788, + "auxiliary_loss_mlp": 0.01030604, + "balance_loss_clip": 1.03678679, + "balance_loss_mlp": 1.01957154, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 3.0898081764996688, + "language_loss": 0.73561728, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75678122, + "num_input_tokens_seen": 272508925, + "step": 12634, + "time_per_iteration": 2.5273478031158447 + }, + { + "auxiliary_loss_clip": 0.01111936, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.03894687, + "balance_loss_mlp": 1.01968777, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 1.8537568483857416, + "language_loss": 0.64475369, + "learning_rate": 5.758388314770408e-07, + "loss": 0.66619325, + "num_input_tokens_seen": 272528805, + "step": 12635, + "time_per_iteration": 2.546757936477661 + }, + { + "auxiliary_loss_clip": 0.01058798, + "auxiliary_loss_mlp": 0.01046373, + "balance_loss_clip": 1.03664255, + "balance_loss_mlp": 1.03099465, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 1.617360511538018, + "language_loss": 0.68861997, + "learning_rate": 5.7556541831317e-07, + "loss": 0.70967168, + "num_input_tokens_seen": 272546655, + "step": 12636, + "time_per_iteration": 2.5532960891723633 + }, + { + "auxiliary_loss_clip": 0.01092701, + "auxiliary_loss_mlp": 0.01031523, + "balance_loss_clip": 1.03782749, + "balance_loss_mlp": 1.01968527, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 1.9747391456267684, + "language_loss": 0.81245714, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83369935, + "num_input_tokens_seen": 272564010, + "step": 12637, + "time_per_iteration": 2.5151500701904297 + }, + { + "auxiliary_loss_clip": 0.01098663, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.03696775, + "balance_loss_mlp": 1.02395952, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 2.466448813212417, + "language_loss": 0.66478264, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68613183, + "num_input_tokens_seen": 272585840, + "step": 12638, + "time_per_iteration": 2.62332820892334 + }, + { + "auxiliary_loss_clip": 0.01112595, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.03904128, + "balance_loss_mlp": 1.01802313, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 2.1237656287121665, + "language_loss": 0.65730256, + "learning_rate": 5.747455029512323e-07, + "loss": 0.67874646, + "num_input_tokens_seen": 272602300, + "step": 12639, + "time_per_iteration": 2.3968417644500732 + }, + { + "auxiliary_loss_clip": 0.01098506, + "auxiliary_loss_mlp": 0.01032515, + "balance_loss_clip": 1.03547478, + "balance_loss_mlp": 1.01955056, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 2.1153974714688593, + "language_loss": 0.69867539, + "learning_rate": 5.744723059083572e-07, + "loss": 0.7199856, + "num_input_tokens_seen": 272619595, + "step": 12640, + "time_per_iteration": 2.4723122119903564 + }, + { + "auxiliary_loss_clip": 0.01092713, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.0395447, + "balance_loss_mlp": 1.02206743, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 2.1908610832733557, + "language_loss": 0.66933918, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69062167, + "num_input_tokens_seen": 272638825, + "step": 12641, + "time_per_iteration": 2.5297625064849854 + }, + { + "auxiliary_loss_clip": 0.01097737, + "auxiliary_loss_mlp": 0.01034361, + "balance_loss_clip": 1.03526676, + "balance_loss_mlp": 1.02105141, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 6.866258962684853, + "language_loss": 0.6672802, + "learning_rate": 5.73926074001422e-07, + "loss": 0.6886012, + "num_input_tokens_seen": 272657240, + "step": 12642, + "time_per_iteration": 2.479062795639038 + }, + { + "auxiliary_loss_clip": 0.01089967, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.03819239, + "balance_loss_mlp": 1.02045155, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 1.9064350761075326, + "language_loss": 0.75700378, + "learning_rate": 5.736530391580765e-07, + "loss": 0.77822775, + "num_input_tokens_seen": 272677520, + "step": 12643, + "time_per_iteration": 4.1263182163238525 + }, + { + "auxiliary_loss_clip": 0.01078823, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.04049516, + "balance_loss_mlp": 1.02128565, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 1.9002346791712381, + "language_loss": 0.78771651, + "learning_rate": 5.733800584019508e-07, + "loss": 0.80885887, + "num_input_tokens_seen": 272696770, + "step": 12644, + "time_per_iteration": 2.5257742404937744 + }, + { + "auxiliary_loss_clip": 0.01084366, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.03305435, + "balance_loss_mlp": 1.02056646, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.9511213594556756, + "language_loss": 0.80627418, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82744825, + "num_input_tokens_seen": 272718340, + "step": 12645, + "time_per_iteration": 2.532318353652954 + }, + { + "auxiliary_loss_clip": 0.01086979, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.03753352, + "balance_loss_mlp": 1.0218333, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.5029661796163545, + "language_loss": 0.72873074, + "learning_rate": 5.728342591927611e-07, + "loss": 0.74994719, + "num_input_tokens_seen": 272739575, + "step": 12646, + "time_per_iteration": 2.5188586711883545 + }, + { + "auxiliary_loss_clip": 0.01097915, + "auxiliary_loss_mlp": 0.0103521, + "balance_loss_clip": 1.03608489, + "balance_loss_mlp": 1.02343202, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 6.937729263798156, + "language_loss": 0.67494488, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69627619, + "num_input_tokens_seen": 272758710, + "step": 12647, + "time_per_iteration": 2.46285343170166 + }, + { + "auxiliary_loss_clip": 0.01019569, + "auxiliary_loss_mlp": 0.01006034, + "balance_loss_clip": 1.00407028, + "balance_loss_mlp": 1.00454986, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6823449532475714, + "language_loss": 0.48977664, + "learning_rate": 5.722886764566415e-07, + "loss": 0.51003265, + "num_input_tokens_seen": 272814855, + "step": 12648, + "time_per_iteration": 2.9948360919952393 + }, + { + "auxiliary_loss_clip": 0.01095444, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.03523171, + "balance_loss_mlp": 1.02235508, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 2.3352047269146667, + "language_loss": 0.7643522, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78564757, + "num_input_tokens_seen": 272834400, + "step": 12649, + "time_per_iteration": 2.4549965858459473 + }, + { + "auxiliary_loss_clip": 0.01070059, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.03600073, + "balance_loss_mlp": 1.02017534, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 1.8408409602440436, + "language_loss": 0.6866942, + "learning_rate": 5.717433102763462e-07, + "loss": 0.70771682, + "num_input_tokens_seen": 272854760, + "step": 12650, + "time_per_iteration": 2.575037717819214 + }, + { + "auxiliary_loss_clip": 0.0101895, + "auxiliary_loss_mlp": 0.01001295, + "balance_loss_clip": 1.00478661, + "balance_loss_mlp": 1.00011468, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.751668361996298, + "language_loss": 0.62713891, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64734137, + "num_input_tokens_seen": 272919030, + "step": 12651, + "time_per_iteration": 3.035489559173584 + }, + { + "auxiliary_loss_clip": 0.0107447, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.03523874, + "balance_loss_mlp": 1.02103949, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.4868919787305441, + "language_loss": 0.71277755, + "learning_rate": 5.711981607345951e-07, + "loss": 0.7338444, + "num_input_tokens_seen": 272938925, + "step": 12652, + "time_per_iteration": 2.6270411014556885 + }, + { + "auxiliary_loss_clip": 0.01061931, + "auxiliary_loss_mlp": 0.01038391, + "balance_loss_clip": 1.0358479, + "balance_loss_mlp": 1.02562332, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 2.2554259519482, + "language_loss": 0.80282342, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82382673, + "num_input_tokens_seen": 272954945, + "step": 12653, + "time_per_iteration": 2.633107900619507 + }, + { + "auxiliary_loss_clip": 0.01116049, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.04011011, + "balance_loss_mlp": 1.02090216, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 1.5479726975039874, + "language_loss": 0.80281174, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82430816, + "num_input_tokens_seen": 272972855, + "step": 12654, + "time_per_iteration": 2.4765501022338867 + }, + { + "auxiliary_loss_clip": 0.01080035, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.03917325, + "balance_loss_mlp": 1.02086377, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 2.1819570896430425, + "language_loss": 0.79199696, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81313765, + "num_input_tokens_seen": 272989895, + "step": 12655, + "time_per_iteration": 2.544433832168579 + }, + { + "auxiliary_loss_clip": 0.01092004, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.03610563, + "balance_loss_mlp": 1.01764917, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.8748798515426301, + "language_loss": 0.68251932, + "learning_rate": 5.701085118974505e-07, + "loss": 0.70372236, + "num_input_tokens_seen": 273011695, + "step": 12656, + "time_per_iteration": 2.5306570529937744 + }, + { + "auxiliary_loss_clip": 0.01102159, + "auxiliary_loss_mlp": 0.01031753, + "balance_loss_clip": 1.03352761, + "balance_loss_mlp": 1.01835418, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 2.0505935154037065, + "language_loss": 0.73391712, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75525624, + "num_input_tokens_seen": 273028815, + "step": 12657, + "time_per_iteration": 2.4453413486480713 + }, + { + "auxiliary_loss_clip": 0.01012549, + "auxiliary_loss_mlp": 0.00999921, + "balance_loss_clip": 1.00647807, + "balance_loss_mlp": 0.99860328, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8577024813666639, + "language_loss": 0.64941275, + "learning_rate": 5.695640127673347e-07, + "loss": 0.66953748, + "num_input_tokens_seen": 273084080, + "step": 12658, + "time_per_iteration": 3.014397382736206 + }, + { + "auxiliary_loss_clip": 0.01099076, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.04310632, + "balance_loss_mlp": 1.02497244, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 2.7739983385208755, + "language_loss": 0.79381359, + "learning_rate": 5.692918445605293e-07, + "loss": 0.81517863, + "num_input_tokens_seen": 273102295, + "step": 12659, + "time_per_iteration": 2.4693186283111572 + }, + { + "auxiliary_loss_clip": 0.01096627, + "auxiliary_loss_mlp": 0.01028256, + "balance_loss_clip": 1.03549695, + "balance_loss_mlp": 1.01560163, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.5136353160329699, + "language_loss": 0.69066584, + "learning_rate": 5.690197306063209e-07, + "loss": 0.71191466, + "num_input_tokens_seen": 273123400, + "step": 12660, + "time_per_iteration": 2.503993034362793 + }, + { + "auxiliary_loss_clip": 0.01110557, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.03697026, + "balance_loss_mlp": 1.02175236, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 1.6512710351757531, + "language_loss": 0.70753747, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72898757, + "num_input_tokens_seen": 273145150, + "step": 12661, + "time_per_iteration": 3.9222466945648193 + }, + { + "auxiliary_loss_clip": 0.01098241, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.03494334, + "balance_loss_mlp": 1.02091885, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.87569359135601, + "language_loss": 0.83296549, + "learning_rate": 5.68475665496966e-07, + "loss": 0.8542794, + "num_input_tokens_seen": 273165180, + "step": 12662, + "time_per_iteration": 2.545300006866455 + }, + { + "auxiliary_loss_clip": 0.01085781, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.03518903, + "balance_loss_mlp": 1.02881706, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.7657224743291782, + "language_loss": 0.69004333, + "learning_rate": 5.682037143624505e-07, + "loss": 0.71130788, + "num_input_tokens_seen": 273184005, + "step": 12663, + "time_per_iteration": 2.5261077880859375 + }, + { + "auxiliary_loss_clip": 0.01098277, + "auxiliary_loss_mlp": 0.01026797, + "balance_loss_clip": 1.03777766, + "balance_loss_mlp": 1.01481688, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 1.818868996415043, + "language_loss": 0.70312905, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72437978, + "num_input_tokens_seen": 273203565, + "step": 12664, + "time_per_iteration": 2.473376989364624 + }, + { + "auxiliary_loss_clip": 0.01105476, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.03954208, + "balance_loss_mlp": 1.0216012, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 24.68451236292453, + "language_loss": 0.79283297, + "learning_rate": 5.676599749853066e-07, + "loss": 0.81423777, + "num_input_tokens_seen": 273221645, + "step": 12665, + "time_per_iteration": 2.4708220958709717 + }, + { + "auxiliary_loss_clip": 0.01110315, + "auxiliary_loss_mlp": 0.00776892, + "balance_loss_clip": 1.04015589, + "balance_loss_mlp": 1.00070882, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 1.7141834505066673, + "language_loss": 0.87730861, + "learning_rate": 5.673881867632959e-07, + "loss": 0.89618069, + "num_input_tokens_seen": 273242040, + "step": 12666, + "time_per_iteration": 2.4844751358032227 + }, + { + "auxiliary_loss_clip": 0.01063806, + "auxiliary_loss_mlp": 0.01035793, + "balance_loss_clip": 1.04134011, + "balance_loss_mlp": 1.02282357, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 2.386123786083154, + "language_loss": 0.83236009, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85335606, + "num_input_tokens_seen": 273257365, + "step": 12667, + "time_per_iteration": 2.543212652206421 + }, + { + "auxiliary_loss_clip": 0.01086039, + "auxiliary_loss_mlp": 0.01038226, + "balance_loss_clip": 1.03643823, + "balance_loss_mlp": 1.02675784, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.6934177680745108, + "language_loss": 0.78584266, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80708528, + "num_input_tokens_seen": 273274710, + "step": 12668, + "time_per_iteration": 4.072836875915527 + }, + { + "auxiliary_loss_clip": 0.01076845, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.03499699, + "balance_loss_mlp": 1.02000308, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 1.856020662145507, + "language_loss": 0.64236218, + "learning_rate": 5.6657314808718e-07, + "loss": 0.66344833, + "num_input_tokens_seen": 273292870, + "step": 12669, + "time_per_iteration": 2.534457206726074 + }, + { + "auxiliary_loss_clip": 0.01086035, + "auxiliary_loss_mlp": 0.01039066, + "balance_loss_clip": 1.03508425, + "balance_loss_mlp": 1.02534533, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 1.8015257014910404, + "language_loss": 0.66366249, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68491352, + "num_input_tokens_seen": 273312375, + "step": 12670, + "time_per_iteration": 2.5441536903381348 + }, + { + "auxiliary_loss_clip": 0.01101007, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.03712118, + "balance_loss_mlp": 1.0238179, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.6312351331983337, + "language_loss": 0.72993773, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75130951, + "num_input_tokens_seen": 273332590, + "step": 12671, + "time_per_iteration": 2.493377447128296 + }, + { + "auxiliary_loss_clip": 0.01072745, + "auxiliary_loss_mlp": 0.01036029, + "balance_loss_clip": 1.03106356, + "balance_loss_mlp": 1.02354145, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 1.7392472564610346, + "language_loss": 0.73511684, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75620461, + "num_input_tokens_seen": 273352885, + "step": 12672, + "time_per_iteration": 4.031461000442505 + }, + { + "auxiliary_loss_clip": 0.01002904, + "auxiliary_loss_mlp": 0.01001981, + "balance_loss_clip": 1.00809026, + "balance_loss_mlp": 1.000664, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7696563102977529, + "language_loss": 0.56681144, + "learning_rate": 5.654871908800506e-07, + "loss": 0.5868603, + "num_input_tokens_seen": 273411730, + "step": 12673, + "time_per_iteration": 3.0623724460601807 + }, + { + "auxiliary_loss_clip": 0.01100129, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.03727007, + "balance_loss_mlp": 1.01677537, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 1.851732666265631, + "language_loss": 0.7506423, + "learning_rate": 5.652158375447102e-07, + "loss": 0.77193856, + "num_input_tokens_seen": 273430020, + "step": 12674, + "time_per_iteration": 2.5011351108551025 + }, + { + "auxiliary_loss_clip": 0.01080668, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.03245568, + "balance_loss_mlp": 1.02520299, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 3.228655204495105, + "language_loss": 0.72432041, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74550927, + "num_input_tokens_seen": 273448690, + "step": 12675, + "time_per_iteration": 2.579730272293091 + }, + { + "auxiliary_loss_clip": 0.01095681, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.03513217, + "balance_loss_mlp": 1.02020335, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 2.249048025173843, + "language_loss": 0.72881889, + "learning_rate": 5.646732941057936e-07, + "loss": 0.75009155, + "num_input_tokens_seen": 273465190, + "step": 12676, + "time_per_iteration": 2.472357749938965 + }, + { + "auxiliary_loss_clip": 0.01081755, + "auxiliary_loss_mlp": 0.00779624, + "balance_loss_clip": 1.03961301, + "balance_loss_mlp": 1.00070059, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 5.826227774204625, + "language_loss": 0.54217142, + "learning_rate": 5.644021040227927e-07, + "loss": 0.56078523, + "num_input_tokens_seen": 273478620, + "step": 12677, + "time_per_iteration": 2.5396084785461426 + }, + { + "auxiliary_loss_clip": 0.01057599, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.03483009, + "balance_loss_mlp": 1.02494538, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 1.9992808299628253, + "language_loss": 0.78962785, + "learning_rate": 5.641309683778064e-07, + "loss": 0.81058133, + "num_input_tokens_seen": 273497635, + "step": 12678, + "time_per_iteration": 2.5848779678344727 + }, + { + "auxiliary_loss_clip": 0.01074418, + "auxiliary_loss_mlp": 0.01040344, + "balance_loss_clip": 1.03342843, + "balance_loss_mlp": 1.02677166, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 2.8077339355793702, + "language_loss": 0.77288997, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79403758, + "num_input_tokens_seen": 273513955, + "step": 12679, + "time_per_iteration": 2.5227742195129395 + }, + { + "auxiliary_loss_clip": 0.01099095, + "auxiliary_loss_mlp": 0.01027069, + "balance_loss_clip": 1.03726315, + "balance_loss_mlp": 1.01467693, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.3794764319667872, + "language_loss": 0.80114102, + "learning_rate": 5.635888604430059e-07, + "loss": 0.8224026, + "num_input_tokens_seen": 273533970, + "step": 12680, + "time_per_iteration": 2.496851682662964 + }, + { + "auxiliary_loss_clip": 0.01089351, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.03667641, + "balance_loss_mlp": 1.01606488, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 1.9731933287469203, + "language_loss": 0.62830186, + "learning_rate": 5.633178881737493e-07, + "loss": 0.6494962, + "num_input_tokens_seen": 273553090, + "step": 12681, + "time_per_iteration": 2.512617826461792 + }, + { + "auxiliary_loss_clip": 0.01072934, + "auxiliary_loss_mlp": 0.01036741, + "balance_loss_clip": 1.03552568, + "balance_loss_mlp": 1.02498102, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 2.7902668077107005, + "language_loss": 0.7629649, + "learning_rate": 5.63046970383622e-07, + "loss": 0.78406167, + "num_input_tokens_seen": 273572460, + "step": 12682, + "time_per_iteration": 2.549285888671875 + }, + { + "auxiliary_loss_clip": 0.01084503, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.03357029, + "balance_loss_mlp": 1.01833582, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.4911766563940865, + "language_loss": 0.68285912, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70399928, + "num_input_tokens_seen": 273592815, + "step": 12683, + "time_per_iteration": 4.0233376026153564 + }, + { + "auxiliary_loss_clip": 0.01074218, + "auxiliary_loss_mlp": 0.00781726, + "balance_loss_clip": 1.03646445, + "balance_loss_mlp": 1.00068521, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 1.9881346631345613, + "language_loss": 0.82890773, + "learning_rate": 5.625052982818472e-07, + "loss": 0.84746718, + "num_input_tokens_seen": 273611790, + "step": 12684, + "time_per_iteration": 2.585624933242798 + }, + { + "auxiliary_loss_clip": 0.01090725, + "auxiliary_loss_mlp": 0.01037843, + "balance_loss_clip": 1.03843403, + "balance_loss_mlp": 1.02416372, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 2.545102129821603, + "language_loss": 0.8264361, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84772182, + "num_input_tokens_seen": 273628340, + "step": 12685, + "time_per_iteration": 2.467445135116577 + }, + { + "auxiliary_loss_clip": 0.0107997, + "auxiliary_loss_mlp": 0.00778189, + "balance_loss_clip": 1.03634894, + "balance_loss_mlp": 1.00065637, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 2.0991920856053277, + "language_loss": 0.77174902, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79033053, + "num_input_tokens_seen": 273646585, + "step": 12686, + "time_per_iteration": 2.6084816455841064 + }, + { + "auxiliary_loss_clip": 0.01056375, + "auxiliary_loss_mlp": 0.0105588, + "balance_loss_clip": 1.03365302, + "balance_loss_mlp": 1.03931022, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 2.2528117555918237, + "language_loss": 0.71864915, + "learning_rate": 5.616931989794198e-07, + "loss": 0.7397716, + "num_input_tokens_seen": 273665410, + "step": 12687, + "time_per_iteration": 2.5740818977355957 + }, + { + "auxiliary_loss_clip": 0.0108485, + "auxiliary_loss_mlp": 0.01044719, + "balance_loss_clip": 1.03284693, + "balance_loss_mlp": 1.03003836, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 3.1050093332317217, + "language_loss": 0.6518333, + "learning_rate": 5.614226082797369e-07, + "loss": 0.67312896, + "num_input_tokens_seen": 273683035, + "step": 12688, + "time_per_iteration": 2.484930992126465 + }, + { + "auxiliary_loss_clip": 0.01098914, + "auxiliary_loss_mlp": 0.01028881, + "balance_loss_clip": 1.037498, + "balance_loss_mlp": 1.01743686, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 1.9809116579927344, + "language_loss": 0.71243536, + "learning_rate": 5.611520721310515e-07, + "loss": 0.73371327, + "num_input_tokens_seen": 273700130, + "step": 12689, + "time_per_iteration": 2.4644339084625244 + }, + { + "auxiliary_loss_clip": 0.01079659, + "auxiliary_loss_mlp": 0.01037162, + "balance_loss_clip": 1.0350318, + "balance_loss_mlp": 1.02433491, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 1.7779364803245477, + "language_loss": 0.69651723, + "learning_rate": 5.608815905436238e-07, + "loss": 0.71768546, + "num_input_tokens_seen": 273720310, + "step": 12690, + "time_per_iteration": 2.6654653549194336 + }, + { + "auxiliary_loss_clip": 0.01084283, + "auxiliary_loss_mlp": 0.01040902, + "balance_loss_clip": 1.03347445, + "balance_loss_mlp": 1.02741933, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.5260638728297797, + "language_loss": 0.69506437, + "learning_rate": 5.606111635277109e-07, + "loss": 0.71631622, + "num_input_tokens_seen": 273744475, + "step": 12691, + "time_per_iteration": 2.6333019733428955 + }, + { + "auxiliary_loss_clip": 0.01096879, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_clip": 1.04067802, + "balance_loss_mlp": 1.02029347, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 1.600141262602501, + "language_loss": 0.81635398, + "learning_rate": 5.603407910935662e-07, + "loss": 0.83763862, + "num_input_tokens_seen": 273764635, + "step": 12692, + "time_per_iteration": 2.4979212284088135 + }, + { + "auxiliary_loss_clip": 0.01084826, + "auxiliary_loss_mlp": 0.01029242, + "balance_loss_clip": 1.04545546, + "balance_loss_mlp": 1.01764846, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 4.618889778683295, + "language_loss": 0.77059048, + "learning_rate": 5.600704732514438e-07, + "loss": 0.79173118, + "num_input_tokens_seen": 273780115, + "step": 12693, + "time_per_iteration": 2.5230610370635986 + }, + { + "auxiliary_loss_clip": 0.01076888, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.04158211, + "balance_loss_mlp": 1.01943851, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 2.126754883219944, + "language_loss": 0.72655725, + "learning_rate": 5.598002100115933e-07, + "loss": 0.74765086, + "num_input_tokens_seen": 273796605, + "step": 12694, + "time_per_iteration": 2.5328266620635986 + }, + { + "auxiliary_loss_clip": 0.01096607, + "auxiliary_loss_mlp": 0.01027915, + "balance_loss_clip": 1.03673363, + "balance_loss_mlp": 1.01550508, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 1.9662480486888563, + "language_loss": 0.7059108, + "learning_rate": 5.595300013842625e-07, + "loss": 0.72715604, + "num_input_tokens_seen": 273816515, + "step": 12695, + "time_per_iteration": 2.4743754863739014 + }, + { + "auxiliary_loss_clip": 0.01108757, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.03644919, + "balance_loss_mlp": 1.02032661, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.4385123824508224, + "language_loss": 0.72339451, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74480122, + "num_input_tokens_seen": 273837060, + "step": 12696, + "time_per_iteration": 2.4643988609313965 + }, + { + "auxiliary_loss_clip": 0.01054741, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.03694069, + "balance_loss_mlp": 1.02428126, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 2.3945076919180495, + "language_loss": 0.71303916, + "learning_rate": 5.589897480081453e-07, + "loss": 0.73396504, + "num_input_tokens_seen": 273853365, + "step": 12697, + "time_per_iteration": 2.6003899574279785 + }, + { + "auxiliary_loss_clip": 0.01074381, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.03803372, + "balance_loss_mlp": 1.0184375, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 2.0114623867465347, + "language_loss": 0.67174751, + "learning_rate": 5.587197032798461e-07, + "loss": 0.69279194, + "num_input_tokens_seen": 273870750, + "step": 12698, + "time_per_iteration": 2.628498077392578 + }, + { + "auxiliary_loss_clip": 0.01097897, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.03427148, + "balance_loss_mlp": 1.01539159, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 1.7540855575532726, + "language_loss": 0.72439611, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74565518, + "num_input_tokens_seen": 273890890, + "step": 12699, + "time_per_iteration": 2.4905364513397217 + }, + { + "auxiliary_loss_clip": 0.01087144, + "auxiliary_loss_mlp": 0.01032774, + "balance_loss_clip": 1.03444934, + "balance_loss_mlp": 1.02162838, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 1.664816775680221, + "language_loss": 0.73328215, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75448132, + "num_input_tokens_seen": 273914015, + "step": 12700, + "time_per_iteration": 4.0787577629089355 + }, + { + "auxiliary_loss_clip": 0.01109514, + "auxiliary_loss_mlp": 0.01030515, + "balance_loss_clip": 1.03596413, + "balance_loss_mlp": 1.01851654, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 2.1553861824061236, + "language_loss": 0.69072652, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71212685, + "num_input_tokens_seen": 273927415, + "step": 12701, + "time_per_iteration": 2.4339993000030518 + }, + { + "auxiliary_loss_clip": 0.01082335, + "auxiliary_loss_mlp": 0.01033241, + "balance_loss_clip": 1.03900075, + "balance_loss_mlp": 1.02114105, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 1.626440016172939, + "language_loss": 0.64607823, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66723394, + "num_input_tokens_seen": 273946690, + "step": 12702, + "time_per_iteration": 2.5069708824157715 + }, + { + "auxiliary_loss_clip": 0.01078812, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.03873861, + "balance_loss_mlp": 1.01901913, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 2.3399100739675713, + "language_loss": 0.65577734, + "learning_rate": 5.57370299645477e-07, + "loss": 0.6768719, + "num_input_tokens_seen": 273966870, + "step": 12703, + "time_per_iteration": 2.5849616527557373 + }, + { + "auxiliary_loss_clip": 0.01087668, + "auxiliary_loss_mlp": 0.01025866, + "balance_loss_clip": 1.03760171, + "balance_loss_mlp": 1.01358712, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 1.8090671833904424, + "language_loss": 0.83453184, + "learning_rate": 5.571005829916668e-07, + "loss": 0.85566717, + "num_input_tokens_seen": 273986360, + "step": 12704, + "time_per_iteration": 2.49429988861084 + }, + { + "auxiliary_loss_clip": 0.01086643, + "auxiliary_loss_mlp": 0.01030688, + "balance_loss_clip": 1.03476644, + "balance_loss_mlp": 1.01850438, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.4886787156880406, + "language_loss": 0.6807887, + "learning_rate": 5.568309210527469e-07, + "loss": 0.70196205, + "num_input_tokens_seen": 274009745, + "step": 12705, + "time_per_iteration": 2.5803637504577637 + }, + { + "auxiliary_loss_clip": 0.0108631, + "auxiliary_loss_mlp": 0.01027362, + "balance_loss_clip": 1.0371058, + "balance_loss_mlp": 1.01503587, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.6670086614071589, + "language_loss": 0.73778582, + "learning_rate": 5.565613138389427e-07, + "loss": 0.75892258, + "num_input_tokens_seen": 274028775, + "step": 12706, + "time_per_iteration": 2.6206977367401123 + }, + { + "auxiliary_loss_clip": 0.01093441, + "auxiliary_loss_mlp": 0.01034, + "balance_loss_clip": 1.03447223, + "balance_loss_mlp": 1.02150106, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 1.9025619688834718, + "language_loss": 0.78406847, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80534285, + "num_input_tokens_seen": 274047520, + "step": 12707, + "time_per_iteration": 3.9782519340515137 + }, + { + "auxiliary_loss_clip": 0.01082613, + "auxiliary_loss_mlp": 0.01027133, + "balance_loss_clip": 1.03561711, + "balance_loss_mlp": 1.01513457, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 2.1770639017819695, + "language_loss": 0.79992127, + "learning_rate": 5.560222636275751e-07, + "loss": 0.82101882, + "num_input_tokens_seen": 274065350, + "step": 12708, + "time_per_iteration": 2.4671287536621094 + }, + { + "auxiliary_loss_clip": 0.01019792, + "auxiliary_loss_mlp": 0.01000376, + "balance_loss_clip": 1.00907922, + "balance_loss_mlp": 0.99916029, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8171742847899006, + "language_loss": 0.56468588, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58488756, + "num_input_tokens_seen": 274122315, + "step": 12709, + "time_per_iteration": 3.0861079692840576 + }, + { + "auxiliary_loss_clip": 0.01098961, + "auxiliary_loss_mlp": 0.01036588, + "balance_loss_clip": 1.0359509, + "balance_loss_mlp": 1.02322507, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1.9871127002600903, + "language_loss": 0.63405937, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65541482, + "num_input_tokens_seen": 274140555, + "step": 12710, + "time_per_iteration": 2.4579575061798096 + }, + { + "auxiliary_loss_clip": 0.01068265, + "auxiliary_loss_mlp": 0.00780097, + "balance_loss_clip": 1.03813124, + "balance_loss_mlp": 1.00068355, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 5.644058473470487, + "language_loss": 0.65033436, + "learning_rate": 5.552140990044154e-07, + "loss": 0.66881794, + "num_input_tokens_seen": 274161125, + "step": 12711, + "time_per_iteration": 4.067026615142822 + }, + { + "auxiliary_loss_clip": 0.01088482, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.03789091, + "balance_loss_mlp": 1.01974273, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.6652924821434123, + "language_loss": 0.72863066, + "learning_rate": 5.549448203559293e-07, + "loss": 0.74983543, + "num_input_tokens_seen": 274180835, + "step": 12712, + "time_per_iteration": 2.533705949783325 + }, + { + "auxiliary_loss_clip": 0.01075481, + "auxiliary_loss_mlp": 0.01029317, + "balance_loss_clip": 1.03739488, + "balance_loss_mlp": 1.01819479, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 1.5688483707898302, + "language_loss": 0.80426109, + "learning_rate": 5.546755965040804e-07, + "loss": 0.8253091, + "num_input_tokens_seen": 274201190, + "step": 12713, + "time_per_iteration": 2.5470147132873535 + }, + { + "auxiliary_loss_clip": 0.01101046, + "auxiliary_loss_mlp": 0.00778948, + "balance_loss_clip": 1.03589284, + "balance_loss_mlp": 1.00075245, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 2.9614365751111085, + "language_loss": 0.83368993, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85248989, + "num_input_tokens_seen": 274217595, + "step": 12714, + "time_per_iteration": 2.4710166454315186 + }, + { + "auxiliary_loss_clip": 0.01102423, + "auxiliary_loss_mlp": 0.01034646, + "balance_loss_clip": 1.03690791, + "balance_loss_mlp": 1.02195024, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.5621152555373168, + "language_loss": 0.7279228, + "learning_rate": 5.541373132311287e-07, + "loss": 0.74929345, + "num_input_tokens_seen": 274237885, + "step": 12715, + "time_per_iteration": 2.4867570400238037 + }, + { + "auxiliary_loss_clip": 0.01072863, + "auxiliary_loss_mlp": 0.01030341, + "balance_loss_clip": 1.03687871, + "balance_loss_mlp": 1.01822352, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 1.6666037248370167, + "language_loss": 0.63084489, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65187693, + "num_input_tokens_seen": 274258820, + "step": 12716, + "time_per_iteration": 2.588102102279663 + }, + { + "auxiliary_loss_clip": 0.0111184, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.0368278, + "balance_loss_mlp": 1.02177095, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 1.5990669936601238, + "language_loss": 0.79949272, + "learning_rate": 5.535992492672068e-07, + "loss": 0.82096058, + "num_input_tokens_seen": 274278835, + "step": 12717, + "time_per_iteration": 2.439319133758545 + }, + { + "auxiliary_loss_clip": 0.01108523, + "auxiliary_loss_mlp": 0.01037935, + "balance_loss_clip": 1.03720284, + "balance_loss_mlp": 1.02614498, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 4.0997371621009675, + "language_loss": 0.66303837, + "learning_rate": 5.53330299551638e-07, + "loss": 0.68450296, + "num_input_tokens_seen": 274297110, + "step": 12718, + "time_per_iteration": 2.493929147720337 + }, + { + "auxiliary_loss_clip": 0.01071358, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.04196715, + "balance_loss_mlp": 1.02243018, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 2.8003862657874556, + "language_loss": 0.77257317, + "learning_rate": 5.530614046939286e-07, + "loss": 0.79362696, + "num_input_tokens_seen": 274315610, + "step": 12719, + "time_per_iteration": 2.5417866706848145 + }, + { + "auxiliary_loss_clip": 0.01109168, + "auxiliary_loss_mlp": 0.01028845, + "balance_loss_clip": 1.03615284, + "balance_loss_mlp": 1.01586294, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 1.6992853391819964, + "language_loss": 0.70150995, + "learning_rate": 5.527925647042754e-07, + "loss": 0.72289002, + "num_input_tokens_seen": 274333975, + "step": 12720, + "time_per_iteration": 2.4435489177703857 + }, + { + "auxiliary_loss_clip": 0.0107801, + "auxiliary_loss_mlp": 0.01030201, + "balance_loss_clip": 1.03837037, + "balance_loss_mlp": 1.01827371, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 1.975411644912589, + "language_loss": 0.73879933, + "learning_rate": 5.52523779592875e-07, + "loss": 0.75988138, + "num_input_tokens_seen": 274353695, + "step": 12721, + "time_per_iteration": 2.575495481491089 + }, + { + "auxiliary_loss_clip": 0.0107281, + "auxiliary_loss_mlp": 0.01028735, + "balance_loss_clip": 1.03717589, + "balance_loss_mlp": 1.01638508, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 1.7493709387660497, + "language_loss": 0.73459685, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75561225, + "num_input_tokens_seen": 274371120, + "step": 12722, + "time_per_iteration": 4.029497861862183 + }, + { + "auxiliary_loss_clip": 0.01098708, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.03585744, + "balance_loss_mlp": 1.01898217, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 2.050703422296442, + "language_loss": 0.74119842, + "learning_rate": 5.519863740455912e-07, + "loss": 0.76249814, + "num_input_tokens_seen": 274389665, + "step": 12723, + "time_per_iteration": 2.484347343444824 + }, + { + "auxiliary_loss_clip": 0.01109584, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.03556478, + "balance_loss_mlp": 1.01819849, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 1.9223557474322512, + "language_loss": 0.73040223, + "learning_rate": 5.517177536300881e-07, + "loss": 0.75181007, + "num_input_tokens_seen": 274408750, + "step": 12724, + "time_per_iteration": 2.4648327827453613 + }, + { + "auxiliary_loss_clip": 0.01095208, + "auxiliary_loss_mlp": 0.01024306, + "balance_loss_clip": 1.03604674, + "balance_loss_mlp": 1.01265979, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 3.1522698985956406, + "language_loss": 0.842282, + "learning_rate": 5.514491881335935e-07, + "loss": 0.86347711, + "num_input_tokens_seen": 274424600, + "step": 12725, + "time_per_iteration": 2.4092626571655273 + }, + { + "auxiliary_loss_clip": 0.01077493, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.04507649, + "balance_loss_mlp": 1.02234769, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 2.051904069249849, + "language_loss": 0.77666521, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79779398, + "num_input_tokens_seen": 274443075, + "step": 12726, + "time_per_iteration": 2.580375909805298 + }, + { + "auxiliary_loss_clip": 0.01098435, + "auxiliary_loss_mlp": 0.01031064, + "balance_loss_clip": 1.03612089, + "balance_loss_mlp": 1.01872551, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 4.135930609769161, + "language_loss": 0.70638287, + "learning_rate": 5.509122219383615e-07, + "loss": 0.72767782, + "num_input_tokens_seen": 274463240, + "step": 12727, + "time_per_iteration": 2.500709295272827 + }, + { + "auxiliary_loss_clip": 0.01103157, + "auxiliary_loss_mlp": 0.01026202, + "balance_loss_clip": 1.03394866, + "balance_loss_mlp": 1.01474595, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 1.665126667571751, + "language_loss": 0.7939043, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81519789, + "num_input_tokens_seen": 274482750, + "step": 12728, + "time_per_iteration": 2.450868606567383 + }, + { + "auxiliary_loss_clip": 0.01111377, + "auxiliary_loss_mlp": 0.01031906, + "balance_loss_clip": 1.03804195, + "balance_loss_mlp": 1.01918006, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 2.7325508613138183, + "language_loss": 0.55505693, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57648981, + "num_input_tokens_seen": 274503545, + "step": 12729, + "time_per_iteration": 2.468790054321289 + }, + { + "auxiliary_loss_clip": 0.01086074, + "auxiliary_loss_mlp": 0.00777285, + "balance_loss_clip": 1.03565824, + "balance_loss_mlp": 1.00063348, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 2.0835380649041393, + "language_loss": 0.77922559, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79785919, + "num_input_tokens_seen": 274523825, + "step": 12730, + "time_per_iteration": 2.5215322971343994 + }, + { + "auxiliary_loss_clip": 0.01102712, + "auxiliary_loss_mlp": 0.01041683, + "balance_loss_clip": 1.03943825, + "balance_loss_mlp": 1.02850461, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 1.6382399018687224, + "language_loss": 0.68803751, + "learning_rate": 5.498389490239495e-07, + "loss": 0.70948148, + "num_input_tokens_seen": 274541625, + "step": 12731, + "time_per_iteration": 2.460440158843994 + }, + { + "auxiliary_loss_clip": 0.01110686, + "auxiliary_loss_mlp": 0.0103138, + "balance_loss_clip": 1.03677714, + "balance_loss_mlp": 1.0190177, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 2.1442416399992323, + "language_loss": 0.7015714, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72299206, + "num_input_tokens_seen": 274557580, + "step": 12732, + "time_per_iteration": 2.4179904460906982 + }, + { + "auxiliary_loss_clip": 0.01091567, + "auxiliary_loss_mlp": 0.01026794, + "balance_loss_clip": 1.03692985, + "balance_loss_mlp": 1.01405656, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 2.166647130741053, + "language_loss": 0.78479922, + "learning_rate": 5.493026424675653e-07, + "loss": 0.80598289, + "num_input_tokens_seen": 274578135, + "step": 12733, + "time_per_iteration": 2.5844855308532715 + }, + { + "auxiliary_loss_clip": 0.01097438, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.03579152, + "balance_loss_mlp": 1.01993883, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 1.7140157478912066, + "language_loss": 0.77376175, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79505974, + "num_input_tokens_seen": 274595655, + "step": 12734, + "time_per_iteration": 2.5358760356903076 + }, + { + "auxiliary_loss_clip": 0.01085887, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.03555179, + "balance_loss_mlp": 1.02213705, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1.8353051943314782, + "language_loss": 0.72924614, + "learning_rate": 5.48766555953535e-07, + "loss": 0.75046062, + "num_input_tokens_seen": 274616305, + "step": 12735, + "time_per_iteration": 2.6276862621307373 + }, + { + "auxiliary_loss_clip": 0.01080883, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.03611732, + "balance_loss_mlp": 1.02182746, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.4930825895046385, + "language_loss": 0.72815669, + "learning_rate": 5.484985952378145e-07, + "loss": 0.74930859, + "num_input_tokens_seen": 274638110, + "step": 12736, + "time_per_iteration": 2.6190197467803955 + }, + { + "auxiliary_loss_clip": 0.011044, + "auxiliary_loss_mlp": 0.00778935, + "balance_loss_clip": 1.04332256, + "balance_loss_mlp": 1.00077224, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 3.989956516219445, + "language_loss": 0.77515453, + "learning_rate": 5.482306895631728e-07, + "loss": 0.79398787, + "num_input_tokens_seen": 274656565, + "step": 12737, + "time_per_iteration": 2.5001003742218018 + }, + { + "auxiliary_loss_clip": 0.01087366, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.03770995, + "balance_loss_mlp": 1.01774311, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 1.7495436343981887, + "language_loss": 0.76514757, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78632462, + "num_input_tokens_seen": 274674215, + "step": 12738, + "time_per_iteration": 4.019729375839233 + }, + { + "auxiliary_loss_clip": 0.01091119, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.03711581, + "balance_loss_mlp": 1.01738095, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 1.8751574309442542, + "language_loss": 0.62475652, + "learning_rate": 5.476950433777603e-07, + "loss": 0.64597023, + "num_input_tokens_seen": 274693445, + "step": 12739, + "time_per_iteration": 2.6083855628967285 + }, + { + "auxiliary_loss_clip": 0.01110947, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.0377773, + "balance_loss_mlp": 1.02041054, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 1.7664963743203206, + "language_loss": 0.79022372, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81167042, + "num_input_tokens_seen": 274712815, + "step": 12740, + "time_per_iteration": 2.418431043624878 + }, + { + "auxiliary_loss_clip": 0.01098811, + "auxiliary_loss_mlp": 0.01034701, + "balance_loss_clip": 1.03528702, + "balance_loss_mlp": 1.02211893, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 2.3592571431605878, + "language_loss": 0.65313506, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67447019, + "num_input_tokens_seen": 274732690, + "step": 12741, + "time_per_iteration": 2.5621228218078613 + }, + { + "auxiliary_loss_clip": 0.01082274, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.03321648, + "balance_loss_mlp": 1.01465821, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.6030419356871637, + "language_loss": 0.7596159, + "learning_rate": 5.468919871616386e-07, + "loss": 0.78071356, + "num_input_tokens_seen": 274752460, + "step": 12742, + "time_per_iteration": 2.506061315536499 + }, + { + "auxiliary_loss_clip": 0.01084366, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.03736842, + "balance_loss_mlp": 1.02239871, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.4324340741913748, + "language_loss": 0.76765645, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78883946, + "num_input_tokens_seen": 274773070, + "step": 12743, + "time_per_iteration": 2.5463788509368896 + }, + { + "auxiliary_loss_clip": 0.01084375, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.03318071, + "balance_loss_mlp": 1.01959789, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 1.9784263359062952, + "language_loss": 0.74869114, + "learning_rate": 5.463568918439805e-07, + "loss": 0.76985073, + "num_input_tokens_seen": 274790220, + "step": 12744, + "time_per_iteration": 2.5010294914245605 + }, + { + "auxiliary_loss_clip": 0.01100228, + "auxiliary_loss_mlp": 0.01032335, + "balance_loss_clip": 1.03658175, + "balance_loss_mlp": 1.01975226, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.187782720959486, + "language_loss": 0.71311074, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73443639, + "num_input_tokens_seen": 274805095, + "step": 12745, + "time_per_iteration": 2.485064744949341 + }, + { + "auxiliary_loss_clip": 0.01093508, + "auxiliary_loss_mlp": 0.01044503, + "balance_loss_clip": 1.03269315, + "balance_loss_mlp": 1.03044271, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 2.1987718196992727, + "language_loss": 0.77006119, + "learning_rate": 5.458220170154896e-07, + "loss": 0.79144132, + "num_input_tokens_seen": 274821800, + "step": 12746, + "time_per_iteration": 2.4370622634887695 + }, + { + "auxiliary_loss_clip": 0.0100863, + "auxiliary_loss_mlp": 0.00999448, + "balance_loss_clip": 1.01315367, + "balance_loss_mlp": 0.99808866, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6619720019160427, + "language_loss": 0.56767964, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58776033, + "num_input_tokens_seen": 274886970, + "step": 12747, + "time_per_iteration": 4.593010902404785 + }, + { + "auxiliary_loss_clip": 0.01105698, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.03588617, + "balance_loss_mlp": 1.02172184, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.6376246317805865, + "language_loss": 0.72713411, + "learning_rate": 5.452873627572956e-07, + "loss": 0.74851394, + "num_input_tokens_seen": 274907240, + "step": 12748, + "time_per_iteration": 2.4779231548309326 + }, + { + "auxiliary_loss_clip": 0.01073525, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.03279889, + "balance_loss_mlp": 1.01917362, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 1.754196607446163, + "language_loss": 0.68914747, + "learning_rate": 5.450201183674052e-07, + "loss": 0.71020198, + "num_input_tokens_seen": 274924650, + "step": 12749, + "time_per_iteration": 2.540968894958496 + }, + { + "auxiliary_loss_clip": 0.01098974, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.03672552, + "balance_loss_mlp": 1.01977646, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 2.0634541767180195, + "language_loss": 0.73381758, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75513011, + "num_input_tokens_seen": 274944550, + "step": 12750, + "time_per_iteration": 2.5221474170684814 + }, + { + "auxiliary_loss_clip": 0.01097508, + "auxiliary_loss_mlp": 0.01029028, + "balance_loss_clip": 1.03859305, + "balance_loss_mlp": 1.01782274, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 3.237404097048176, + "language_loss": 0.75797701, + "learning_rate": 5.444857951167026e-07, + "loss": 0.7792424, + "num_input_tokens_seen": 274961330, + "step": 12751, + "time_per_iteration": 3.8744728565216064 + }, + { + "auxiliary_loss_clip": 0.0107568, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.03493166, + "balance_loss_mlp": 1.02351761, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 1.771447194211212, + "language_loss": 0.61360717, + "learning_rate": 5.442187162761537e-07, + "loss": 0.63472199, + "num_input_tokens_seen": 274981655, + "step": 12752, + "time_per_iteration": 2.574404001235962 + }, + { + "auxiliary_loss_clip": 0.01100456, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.03729677, + "balance_loss_mlp": 1.02204132, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 2.17964905895517, + "language_loss": 0.68869179, + "learning_rate": 5.439516926389767e-07, + "loss": 0.7100426, + "num_input_tokens_seen": 274999970, + "step": 12753, + "time_per_iteration": 2.488309383392334 + }, + { + "auxiliary_loss_clip": 0.01099977, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.03747118, + "balance_loss_mlp": 1.0250541, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 2.2322863765407943, + "language_loss": 0.62542129, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64679003, + "num_input_tokens_seen": 275015805, + "step": 12754, + "time_per_iteration": 2.4667248725891113 + }, + { + "auxiliary_loss_clip": 0.01108459, + "auxiliary_loss_mlp": 0.01029103, + "balance_loss_clip": 1.0381546, + "balance_loss_mlp": 1.01765299, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 2.3574720325347664, + "language_loss": 0.80264723, + "learning_rate": 5.434178110152401e-07, + "loss": 0.82402283, + "num_input_tokens_seen": 275031810, + "step": 12755, + "time_per_iteration": 2.451502799987793 + }, + { + "auxiliary_loss_clip": 0.01108035, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.03683937, + "balance_loss_mlp": 1.01952553, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 1.9752340521727139, + "language_loss": 0.70060861, + "learning_rate": 5.431509530489242e-07, + "loss": 0.72200143, + "num_input_tokens_seen": 275049325, + "step": 12756, + "time_per_iteration": 2.4273478984832764 + }, + { + "auxiliary_loss_clip": 0.01098076, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.03688252, + "balance_loss_mlp": 1.02522326, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 1.565641633109043, + "language_loss": 0.70393503, + "learning_rate": 5.428841503264706e-07, + "loss": 0.72528243, + "num_input_tokens_seen": 275070865, + "step": 12757, + "time_per_iteration": 2.526906967163086 + }, + { + "auxiliary_loss_clip": 0.01092353, + "auxiliary_loss_mlp": 0.01035091, + "balance_loss_clip": 1.04025459, + "balance_loss_mlp": 1.02238965, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 2.2393601233113345, + "language_loss": 0.75854766, + "learning_rate": 5.426174028579955e-07, + "loss": 0.77982205, + "num_input_tokens_seen": 275088015, + "step": 12758, + "time_per_iteration": 2.5481255054473877 + }, + { + "auxiliary_loss_clip": 0.0109497, + "auxiliary_loss_mlp": 0.01037436, + "balance_loss_clip": 1.03408587, + "balance_loss_mlp": 1.02571821, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 2.5841854768397585, + "language_loss": 0.7658264, + "learning_rate": 5.423507106536156e-07, + "loss": 0.78715044, + "num_input_tokens_seen": 275106975, + "step": 12759, + "time_per_iteration": 2.46081280708313 + }, + { + "auxiliary_loss_clip": 0.01087205, + "auxiliary_loss_mlp": 0.01026032, + "balance_loss_clip": 1.0346303, + "balance_loss_mlp": 1.01451695, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 2.013252123399931, + "language_loss": 0.68490398, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70603633, + "num_input_tokens_seen": 275129560, + "step": 12760, + "time_per_iteration": 2.635517120361328 + }, + { + "auxiliary_loss_clip": 0.01090326, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03832936, + "balance_loss_mlp": 1.0186305, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.4520165609797289, + "language_loss": 0.79411197, + "learning_rate": 5.418174920775871e-07, + "loss": 0.8153255, + "num_input_tokens_seen": 275151180, + "step": 12761, + "time_per_iteration": 2.5563597679138184 + }, + { + "auxiliary_loss_clip": 0.01085107, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.03650188, + "balance_loss_mlp": 1.01907992, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 1.6520732284502568, + "language_loss": 0.65684271, + "learning_rate": 5.415509657261589e-07, + "loss": 0.67800409, + "num_input_tokens_seen": 275170605, + "step": 12762, + "time_per_iteration": 4.089935541152954 + }, + { + "auxiliary_loss_clip": 0.01101328, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.03876925, + "balance_loss_mlp": 1.01670587, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 1.7002677114012847, + "language_loss": 0.74243283, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76373982, + "num_input_tokens_seen": 275188750, + "step": 12763, + "time_per_iteration": 2.457963466644287 + }, + { + "auxiliary_loss_clip": 0.01086222, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.03744984, + "balance_loss_mlp": 1.01983285, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 1.4453592601336462, + "language_loss": 0.70831722, + "learning_rate": 5.410180789470067e-07, + "loss": 0.72949296, + "num_input_tokens_seen": 275211365, + "step": 12764, + "time_per_iteration": 2.622386932373047 + }, + { + "auxiliary_loss_clip": 0.01098465, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.03753924, + "balance_loss_mlp": 1.01705408, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 1.8833099364307897, + "language_loss": 0.69406796, + "learning_rate": 5.40751718539491e-07, + "loss": 0.71534228, + "num_input_tokens_seen": 275231670, + "step": 12765, + "time_per_iteration": 2.5124247074127197 + }, + { + "auxiliary_loss_clip": 0.01080491, + "auxiliary_loss_mlp": 0.01026783, + "balance_loss_clip": 1.03435743, + "balance_loss_mlp": 1.0159173, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 1.7179312815977412, + "language_loss": 0.60893238, + "learning_rate": 5.404854134668162e-07, + "loss": 0.63000512, + "num_input_tokens_seen": 275249425, + "step": 12766, + "time_per_iteration": 2.454448699951172 + }, + { + "auxiliary_loss_clip": 0.01014639, + "auxiliary_loss_mlp": 0.00999623, + "balance_loss_clip": 1.02441072, + "balance_loss_mlp": 0.99838936, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7327510868978085, + "language_loss": 0.60791636, + "learning_rate": 5.402191637390803e-07, + "loss": 0.62805891, + "num_input_tokens_seen": 275312485, + "step": 12767, + "time_per_iteration": 3.233798027038574 + }, + { + "auxiliary_loss_clip": 0.01085184, + "auxiliary_loss_mlp": 0.01023399, + "balance_loss_clip": 1.03746498, + "balance_loss_mlp": 1.012146, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 1.7503741411424845, + "language_loss": 0.69285131, + "learning_rate": 5.399529693663801e-07, + "loss": 0.7139371, + "num_input_tokens_seen": 275331680, + "step": 12768, + "time_per_iteration": 2.508331775665283 + }, + { + "auxiliary_loss_clip": 0.01104916, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.03879666, + "balance_loss_mlp": 1.01975536, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 1.6111863291956228, + "language_loss": 0.71029121, + "learning_rate": 5.3968683035881e-07, + "loss": 0.73166597, + "num_input_tokens_seen": 275351615, + "step": 12769, + "time_per_iteration": 2.5030369758605957 + }, + { + "auxiliary_loss_clip": 0.01099293, + "auxiliary_loss_mlp": 0.01026154, + "balance_loss_clip": 1.03736377, + "balance_loss_mlp": 1.01407766, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 2.45361410272004, + "language_loss": 0.80623722, + "learning_rate": 5.394207467264611e-07, + "loss": 0.8274917, + "num_input_tokens_seen": 275368815, + "step": 12770, + "time_per_iteration": 2.491438865661621 + }, + { + "auxiliary_loss_clip": 0.01073125, + "auxiliary_loss_mlp": 0.01035635, + "balance_loss_clip": 1.03461826, + "balance_loss_mlp": 1.0238874, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.8832823399495522, + "language_loss": 0.78534782, + "learning_rate": 5.391547184794245e-07, + "loss": 0.80643547, + "num_input_tokens_seen": 275389345, + "step": 12771, + "time_per_iteration": 2.6331725120544434 + }, + { + "auxiliary_loss_clip": 0.0110699, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.03577316, + "balance_loss_mlp": 1.01902151, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.358990430464829, + "language_loss": 0.68310249, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70447904, + "num_input_tokens_seen": 275411240, + "step": 12772, + "time_per_iteration": 2.4775259494781494 + }, + { + "auxiliary_loss_clip": 0.01093626, + "auxiliary_loss_mlp": 0.01024779, + "balance_loss_clip": 1.0352397, + "balance_loss_mlp": 1.01340628, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 1.4893642152613928, + "language_loss": 0.73048997, + "learning_rate": 5.386228281816349e-07, + "loss": 0.751674, + "num_input_tokens_seen": 275432010, + "step": 12773, + "time_per_iteration": 2.4980084896087646 + }, + { + "auxiliary_loss_clip": 0.01070729, + "auxiliary_loss_mlp": 0.01028099, + "balance_loss_clip": 1.03302288, + "balance_loss_mlp": 1.01728034, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.6411276275216675, + "language_loss": 0.81335664, + "learning_rate": 5.383569661510512e-07, + "loss": 0.83434492, + "num_input_tokens_seen": 275453710, + "step": 12774, + "time_per_iteration": 2.604945659637451 + }, + { + "auxiliary_loss_clip": 0.01099275, + "auxiliary_loss_mlp": 0.00777151, + "balance_loss_clip": 1.03786945, + "balance_loss_mlp": 1.00062943, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.651791789209086, + "language_loss": 0.69960529, + "learning_rate": 5.380911595461177e-07, + "loss": 0.7183696, + "num_input_tokens_seen": 275472915, + "step": 12775, + "time_per_iteration": 2.5469796657562256 + }, + { + "auxiliary_loss_clip": 0.00996681, + "auxiliary_loss_mlp": 0.01000353, + "balance_loss_clip": 1.01218402, + "balance_loss_mlp": 0.99922067, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.708252795046028, + "language_loss": 0.56831026, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58828062, + "num_input_tokens_seen": 275534785, + "step": 12776, + "time_per_iteration": 3.1931161880493164 + }, + { + "auxiliary_loss_clip": 0.01094903, + "auxiliary_loss_mlp": 0.01033812, + "balance_loss_clip": 1.0342418, + "balance_loss_mlp": 1.02205813, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 1.8885817298373984, + "language_loss": 0.73737901, + "learning_rate": 5.375597126535188e-07, + "loss": 0.75866616, + "num_input_tokens_seen": 275553205, + "step": 12777, + "time_per_iteration": 2.471508026123047 + }, + { + "auxiliary_loss_clip": 0.01082034, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.03920674, + "balance_loss_mlp": 1.02408504, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 2.375234480637604, + "language_loss": 0.70777953, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72895527, + "num_input_tokens_seen": 275571490, + "step": 12778, + "time_per_iteration": 4.034358739852905 + }, + { + "auxiliary_loss_clip": 0.01093786, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.03748107, + "balance_loss_mlp": 1.02044833, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 1.788592064212169, + "language_loss": 0.70395136, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72520751, + "num_input_tokens_seen": 275589665, + "step": 12779, + "time_per_iteration": 2.473217010498047 + }, + { + "auxiliary_loss_clip": 0.01089311, + "auxiliary_loss_mlp": 0.0103021, + "balance_loss_clip": 1.0413785, + "balance_loss_mlp": 1.01834249, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 1.6124571436449378, + "language_loss": 0.58245945, + "learning_rate": 5.367629582589133e-07, + "loss": 0.60365462, + "num_input_tokens_seen": 275615605, + "step": 12780, + "time_per_iteration": 2.910571575164795 + }, + { + "auxiliary_loss_clip": 0.0110074, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.03601575, + "balance_loss_mlp": 1.02522779, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 2.3867346668255873, + "language_loss": 0.68197966, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70337653, + "num_input_tokens_seen": 275634965, + "step": 12781, + "time_per_iteration": 2.4607253074645996 + }, + { + "auxiliary_loss_clip": 0.01058705, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.03431082, + "balance_loss_mlp": 1.01951885, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.5663751786875235, + "language_loss": 0.79536474, + "learning_rate": 5.362320660762016e-07, + "loss": 0.81626284, + "num_input_tokens_seen": 275655785, + "step": 12782, + "time_per_iteration": 2.5955097675323486 + }, + { + "auxiliary_loss_clip": 0.01083307, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.03763556, + "balance_loss_mlp": 1.01669955, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 1.7696814595843673, + "language_loss": 0.66572177, + "learning_rate": 5.35966703239153e-07, + "loss": 0.68684673, + "num_input_tokens_seen": 275676160, + "step": 12783, + "time_per_iteration": 2.5248701572418213 + }, + { + "auxiliary_loss_clip": 0.01087186, + "auxiliary_loss_mlp": 0.01034408, + "balance_loss_clip": 1.03574729, + "balance_loss_mlp": 1.02172375, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 1.711084706126839, + "language_loss": 0.69264799, + "learning_rate": 5.357013959183938e-07, + "loss": 0.71386391, + "num_input_tokens_seen": 275695660, + "step": 12784, + "time_per_iteration": 2.482046604156494 + }, + { + "auxiliary_loss_clip": 0.01067199, + "auxiliary_loss_mlp": 0.01024188, + "balance_loss_clip": 1.03828359, + "balance_loss_mlp": 1.01372766, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 1.6865740260778805, + "language_loss": 0.80585682, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82677078, + "num_input_tokens_seen": 275714025, + "step": 12785, + "time_per_iteration": 2.5817339420318604 + }, + { + "auxiliary_loss_clip": 0.01094541, + "auxiliary_loss_mlp": 0.01037015, + "balance_loss_clip": 1.03458619, + "balance_loss_mlp": 1.02257276, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 3.164084612145753, + "language_loss": 0.77620411, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79751968, + "num_input_tokens_seen": 275737300, + "step": 12786, + "time_per_iteration": 4.172705173492432 + }, + { + "auxiliary_loss_clip": 0.01107183, + "auxiliary_loss_mlp": 0.01027071, + "balance_loss_clip": 1.0357008, + "balance_loss_mlp": 1.01555562, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 2.0162290280957973, + "language_loss": 0.59050792, + "learning_rate": 5.349058071544468e-07, + "loss": 0.61185044, + "num_input_tokens_seen": 275757895, + "step": 12787, + "time_per_iteration": 2.497263193130493 + }, + { + "auxiliary_loss_clip": 0.01084926, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.03568566, + "balance_loss_mlp": 1.01752484, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.569261196357148, + "language_loss": 0.75850928, + "learning_rate": 5.346407219994292e-07, + "loss": 0.77965111, + "num_input_tokens_seen": 275776745, + "step": 12788, + "time_per_iteration": 2.5035359859466553 + }, + { + "auxiliary_loss_clip": 0.01071954, + "auxiliary_loss_mlp": 0.00777508, + "balance_loss_clip": 1.04042029, + "balance_loss_mlp": 1.00061607, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 1.685340437966531, + "language_loss": 0.66699874, + "learning_rate": 5.343756924109821e-07, + "loss": 0.68549341, + "num_input_tokens_seen": 275797205, + "step": 12789, + "time_per_iteration": 2.603498697280884 + }, + { + "auxiliary_loss_clip": 0.01089052, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.03629375, + "balance_loss_mlp": 1.02041459, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 1.6944000227543585, + "language_loss": 0.69045997, + "learning_rate": 5.341107183991553e-07, + "loss": 0.71169102, + "num_input_tokens_seen": 275817935, + "step": 12790, + "time_per_iteration": 3.9974722862243652 + }, + { + "auxiliary_loss_clip": 0.01082056, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.03551269, + "balance_loss_mlp": 1.01982343, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 1.4759277449358295, + "language_loss": 0.68576431, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70690274, + "num_input_tokens_seen": 275837145, + "step": 12791, + "time_per_iteration": 2.486234426498413 + }, + { + "auxiliary_loss_clip": 0.01095704, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.03696227, + "balance_loss_mlp": 1.02047491, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 1.7035613383993948, + "language_loss": 0.79842532, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81969815, + "num_input_tokens_seen": 275855705, + "step": 12792, + "time_per_iteration": 2.4439127445220947 + }, + { + "auxiliary_loss_clip": 0.01089973, + "auxiliary_loss_mlp": 0.00778312, + "balance_loss_clip": 1.04776108, + "balance_loss_mlp": 1.0006969, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.7907805316470131, + "language_loss": 0.72704989, + "learning_rate": 5.333161299238673e-07, + "loss": 0.74573272, + "num_input_tokens_seen": 275873930, + "step": 12793, + "time_per_iteration": 2.557831287384033 + }, + { + "auxiliary_loss_clip": 0.01071026, + "auxiliary_loss_mlp": 0.0103182, + "balance_loss_clip": 1.03787017, + "balance_loss_mlp": 1.02008367, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.7552294257423777, + "language_loss": 0.63748825, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65851676, + "num_input_tokens_seen": 275895895, + "step": 12794, + "time_per_iteration": 2.6771318912506104 + }, + { + "auxiliary_loss_clip": 0.01086368, + "auxiliary_loss_mlp": 0.01035048, + "balance_loss_clip": 1.03500295, + "balance_loss_mlp": 1.02257872, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 2.549874053922251, + "language_loss": 0.76449835, + "learning_rate": 5.327866823409319e-07, + "loss": 0.78571248, + "num_input_tokens_seen": 275917825, + "step": 12795, + "time_per_iteration": 2.5402863025665283 + }, + { + "auxiliary_loss_clip": 0.01075676, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.03692222, + "balance_loss_mlp": 1.0161469, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.7383004742843438, + "language_loss": 0.71434677, + "learning_rate": 5.325220419997601e-07, + "loss": 0.73538435, + "num_input_tokens_seen": 275937890, + "step": 12796, + "time_per_iteration": 2.5824549198150635 + }, + { + "auxiliary_loss_clip": 0.0110881, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.03722119, + "balance_loss_mlp": 1.01604033, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 1.958095405959963, + "language_loss": 0.65078175, + "learning_rate": 5.32257457305499e-07, + "loss": 0.67214811, + "num_input_tokens_seen": 275954495, + "step": 12797, + "time_per_iteration": 2.431363105773926 + }, + { + "auxiliary_loss_clip": 0.01073151, + "auxiliary_loss_mlp": 0.01032141, + "balance_loss_clip": 1.03356361, + "balance_loss_mlp": 1.01904595, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 1.81852283908313, + "language_loss": 0.91656005, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93761301, + "num_input_tokens_seen": 275972395, + "step": 12798, + "time_per_iteration": 2.5755326747894287 + }, + { + "auxiliary_loss_clip": 0.01061154, + "auxiliary_loss_mlp": 0.01024968, + "balance_loss_clip": 1.03498387, + "balance_loss_mlp": 1.01328564, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 2.000303202539186, + "language_loss": 0.82629538, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84715658, + "num_input_tokens_seen": 275989020, + "step": 12799, + "time_per_iteration": 2.5281107425689697 + }, + { + "auxiliary_loss_clip": 0.01060946, + "auxiliary_loss_mlp": 0.01026373, + "balance_loss_clip": 1.03991258, + "balance_loss_mlp": 1.01393342, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 2.2317118363377073, + "language_loss": 0.77941895, + "learning_rate": 5.314640372045045e-07, + "loss": 0.80029207, + "num_input_tokens_seen": 276006525, + "step": 12800, + "time_per_iteration": 2.589770555496216 + }, + { + "auxiliary_loss_clip": 0.01092401, + "auxiliary_loss_mlp": 0.01025825, + "balance_loss_clip": 1.0361321, + "balance_loss_mlp": 1.01286101, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 3.0362771170637757, + "language_loss": 0.83701396, + "learning_rate": 5.31199675198198e-07, + "loss": 0.85819626, + "num_input_tokens_seen": 276027130, + "step": 12801, + "time_per_iteration": 4.146435022354126 + }, + { + "auxiliary_loss_clip": 0.01087846, + "auxiliary_loss_mlp": 0.01028382, + "balance_loss_clip": 1.03576589, + "balance_loss_mlp": 1.01647341, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 2.01887059366828, + "language_loss": 0.72293746, + "learning_rate": 5.30935368888947e-07, + "loss": 0.74409974, + "num_input_tokens_seen": 276045715, + "step": 12802, + "time_per_iteration": 2.5033962726593018 + }, + { + "auxiliary_loss_clip": 0.01085064, + "auxiliary_loss_mlp": 0.01032557, + "balance_loss_clip": 1.03470421, + "balance_loss_mlp": 1.02074361, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 1.6936398341624128, + "language_loss": 0.75940365, + "learning_rate": 5.306711182867747e-07, + "loss": 0.78057981, + "num_input_tokens_seen": 276065375, + "step": 12803, + "time_per_iteration": 2.525583267211914 + }, + { + "auxiliary_loss_clip": 0.01016199, + "auxiliary_loss_mlp": 0.01004444, + "balance_loss_clip": 1.01124501, + "balance_loss_mlp": 1.00301313, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.732569001294166, + "language_loss": 0.55807942, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57828587, + "num_input_tokens_seen": 276131405, + "step": 12804, + "time_per_iteration": 3.093324661254883 + }, + { + "auxiliary_loss_clip": 0.01014292, + "auxiliary_loss_mlp": 0.01008089, + "balance_loss_clip": 1.00967908, + "balance_loss_mlp": 1.00672388, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.7477072165733571, + "language_loss": 0.53968489, + "learning_rate": 5.301427842437429e-07, + "loss": 0.55990875, + "num_input_tokens_seen": 276200755, + "step": 12805, + "time_per_iteration": 3.2014071941375732 + }, + { + "auxiliary_loss_clip": 0.01078684, + "auxiliary_loss_mlp": 0.01035394, + "balance_loss_clip": 1.03818154, + "balance_loss_mlp": 1.02324653, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 2.121114139149138, + "language_loss": 0.72818637, + "learning_rate": 5.298787008229187e-07, + "loss": 0.74932706, + "num_input_tokens_seen": 276217880, + "step": 12806, + "time_per_iteration": 2.5374555587768555 + }, + { + "auxiliary_loss_clip": 0.01084675, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.03485608, + "balance_loss_mlp": 1.02666855, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 2.0137484575745987, + "language_loss": 0.7500568, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77129275, + "num_input_tokens_seen": 276234810, + "step": 12807, + "time_per_iteration": 2.4887149333953857 + }, + { + "auxiliary_loss_clip": 0.01103936, + "auxiliary_loss_mlp": 0.01032832, + "balance_loss_clip": 1.03834796, + "balance_loss_mlp": 1.0204277, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 2.196796666151385, + "language_loss": 0.80143142, + "learning_rate": 5.293507012327218e-07, + "loss": 0.82279909, + "num_input_tokens_seen": 276252850, + "step": 12808, + "time_per_iteration": 2.5357797145843506 + }, + { + "auxiliary_loss_clip": 0.01102191, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.0384382, + "balance_loss_mlp": 1.01967609, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 2.0733925489456904, + "language_loss": 0.79330921, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81465101, + "num_input_tokens_seen": 276272525, + "step": 12809, + "time_per_iteration": 2.5591940879821777 + }, + { + "auxiliary_loss_clip": 0.01073179, + "auxiliary_loss_mlp": 0.01025501, + "balance_loss_clip": 1.03412676, + "balance_loss_mlp": 1.01453948, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 1.608300739760332, + "language_loss": 0.70259994, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72358668, + "num_input_tokens_seen": 276294210, + "step": 12810, + "time_per_iteration": 2.5997791290283203 + }, + { + "auxiliary_loss_clip": 0.01085697, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.03386784, + "balance_loss_mlp": 1.02378726, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 2.613165797436213, + "language_loss": 0.78244388, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80369073, + "num_input_tokens_seen": 276310290, + "step": 12811, + "time_per_iteration": 2.4610836505889893 + }, + { + "auxiliary_loss_clip": 0.01003505, + "auxiliary_loss_mlp": 0.01004854, + "balance_loss_clip": 1.00600874, + "balance_loss_mlp": 1.00329196, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.8089993018177897, + "language_loss": 0.56627393, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58635747, + "num_input_tokens_seen": 276371715, + "step": 12812, + "time_per_iteration": 3.184932231903076 + }, + { + "auxiliary_loss_clip": 0.01071789, + "auxiliary_loss_mlp": 0.01035161, + "balance_loss_clip": 1.03562629, + "balance_loss_mlp": 1.02298939, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 1.7432225013646585, + "language_loss": 0.71971023, + "learning_rate": 5.280316783577836e-07, + "loss": 0.74077976, + "num_input_tokens_seen": 276389895, + "step": 12813, + "time_per_iteration": 2.5636420249938965 + }, + { + "auxiliary_loss_clip": 0.01099116, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.03602433, + "balance_loss_mlp": 1.01576495, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 1.5919309578098115, + "language_loss": 0.66196913, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68324995, + "num_input_tokens_seen": 276408990, + "step": 12814, + "time_per_iteration": 2.4559926986694336 + }, + { + "auxiliary_loss_clip": 0.01085724, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.03377414, + "balance_loss_mlp": 1.01880312, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 1.9486247746052974, + "language_loss": 0.66094071, + "learning_rate": 5.275044598581018e-07, + "loss": 0.68210453, + "num_input_tokens_seen": 276428190, + "step": 12815, + "time_per_iteration": 2.509536027908325 + }, + { + "auxiliary_loss_clip": 0.01098873, + "auxiliary_loss_mlp": 0.01029012, + "balance_loss_clip": 1.03702641, + "balance_loss_mlp": 1.01687622, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 2.2871436003136774, + "language_loss": 0.65085578, + "learning_rate": 5.272409343590322e-07, + "loss": 0.67213458, + "num_input_tokens_seen": 276446855, + "step": 12816, + "time_per_iteration": 2.4487557411193848 + }, + { + "auxiliary_loss_clip": 0.01099371, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.03753829, + "balance_loss_mlp": 1.02048349, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 2.4495143798023165, + "language_loss": 0.71735156, + "learning_rate": 5.26977464707133e-07, + "loss": 0.73866987, + "num_input_tokens_seen": 276462000, + "step": 12817, + "time_per_iteration": 3.9142873287200928 + }, + { + "auxiliary_loss_clip": 0.01068282, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.03815639, + "balance_loss_mlp": 1.01904058, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 1.971444057579933, + "language_loss": 0.60917652, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63016278, + "num_input_tokens_seen": 276481190, + "step": 12818, + "time_per_iteration": 2.5702223777770996 + }, + { + "auxiliary_loss_clip": 0.01095299, + "auxiliary_loss_mlp": 0.01026587, + "balance_loss_clip": 1.03572392, + "balance_loss_mlp": 1.01561952, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 1.6980259418031118, + "language_loss": 0.67301869, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69423759, + "num_input_tokens_seen": 276499520, + "step": 12819, + "time_per_iteration": 2.4620249271392822 + }, + { + "auxiliary_loss_clip": 0.01110791, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.0378859, + "balance_loss_mlp": 1.01930153, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 2.2079231503933134, + "language_loss": 0.57561541, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59703672, + "num_input_tokens_seen": 276519110, + "step": 12820, + "time_per_iteration": 2.424304246902466 + }, + { + "auxiliary_loss_clip": 0.01081502, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.03686595, + "balance_loss_mlp": 1.01920366, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 1.8733778549843996, + "language_loss": 0.8093341, + "learning_rate": 5.259241447710343e-07, + "loss": 0.83046639, + "num_input_tokens_seen": 276538805, + "step": 12821, + "time_per_iteration": 2.551295757293701 + }, + { + "auxiliary_loss_clip": 0.01109947, + "auxiliary_loss_mlp": 0.01033527, + "balance_loss_clip": 1.03776526, + "balance_loss_mlp": 1.02127838, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 2.189149977323606, + "language_loss": 0.68669879, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70813352, + "num_input_tokens_seen": 276554770, + "step": 12822, + "time_per_iteration": 2.381263017654419 + }, + { + "auxiliary_loss_clip": 0.01087108, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.03604972, + "balance_loss_mlp": 1.02104843, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.6162571552475145, + "language_loss": 0.72336745, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74457318, + "num_input_tokens_seen": 276574535, + "step": 12823, + "time_per_iteration": 2.586148977279663 + }, + { + "auxiliary_loss_clip": 0.0110416, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.03816843, + "balance_loss_mlp": 1.02620697, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 1.7752248557794883, + "language_loss": 0.76588017, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78732824, + "num_input_tokens_seen": 276592925, + "step": 12824, + "time_per_iteration": 2.4461476802825928 + }, + { + "auxiliary_loss_clip": 0.0108949, + "auxiliary_loss_mlp": 0.01029198, + "balance_loss_clip": 1.04099739, + "balance_loss_mlp": 1.01663303, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 2.057966433374108, + "language_loss": 0.72295481, + "learning_rate": 5.248717191885592e-07, + "loss": 0.74414176, + "num_input_tokens_seen": 276610540, + "step": 12825, + "time_per_iteration": 4.12800931930542 + }, + { + "auxiliary_loss_clip": 0.01105552, + "auxiliary_loss_mlp": 0.01034831, + "balance_loss_clip": 1.03704011, + "balance_loss_mlp": 1.02380371, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.5178490477061473, + "language_loss": 0.73686051, + "learning_rate": 5.246087526105343e-07, + "loss": 0.7582643, + "num_input_tokens_seen": 276629200, + "step": 12826, + "time_per_iteration": 2.4132192134857178 + }, + { + "auxiliary_loss_clip": 0.01109232, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.03456783, + "balance_loss_mlp": 1.01949, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 1.6137678420542005, + "language_loss": 0.81285441, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83427137, + "num_input_tokens_seen": 276648655, + "step": 12827, + "time_per_iteration": 2.444885492324829 + }, + { + "auxiliary_loss_clip": 0.01029419, + "auxiliary_loss_mlp": 0.01003066, + "balance_loss_clip": 1.00591028, + "balance_loss_mlp": 1.00173652, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.871603184284265, + "language_loss": 0.55167007, + "learning_rate": 5.240829873054051e-07, + "loss": 0.5719949, + "num_input_tokens_seen": 276716500, + "step": 12828, + "time_per_iteration": 3.161386013031006 + }, + { + "auxiliary_loss_clip": 0.01063628, + "auxiliary_loss_mlp": 0.01030709, + "balance_loss_clip": 1.03162622, + "balance_loss_mlp": 1.01900816, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 1.7870042370664516, + "language_loss": 0.70061952, + "learning_rate": 5.23820188598238e-07, + "loss": 0.72156286, + "num_input_tokens_seen": 276733535, + "step": 12829, + "time_per_iteration": 3.906813621520996 + }, + { + "auxiliary_loss_clip": 0.01083343, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.03996801, + "balance_loss_mlp": 1.01896548, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 22.38624379529385, + "language_loss": 0.7917257, + "learning_rate": 5.235574458679579e-07, + "loss": 0.81287903, + "num_input_tokens_seen": 276749575, + "step": 12830, + "time_per_iteration": 2.4571902751922607 + }, + { + "auxiliary_loss_clip": 0.01101092, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.03759718, + "balance_loss_mlp": 1.02089703, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 1.916036888186953, + "language_loss": 0.78061527, + "learning_rate": 5.232947591245269e-07, + "loss": 0.80196249, + "num_input_tokens_seen": 276769460, + "step": 12831, + "time_per_iteration": 2.495783805847168 + }, + { + "auxiliary_loss_clip": 0.01082269, + "auxiliary_loss_mlp": 0.01032645, + "balance_loss_clip": 1.03656697, + "balance_loss_mlp": 1.01950788, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 1.4830429509834777, + "language_loss": 0.61234289, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63349199, + "num_input_tokens_seen": 276790820, + "step": 12832, + "time_per_iteration": 2.5828206539154053 + }, + { + "auxiliary_loss_clip": 0.01083938, + "auxiliary_loss_mlp": 0.01036765, + "balance_loss_clip": 1.03657782, + "balance_loss_mlp": 1.02443266, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.5637610548748426, + "language_loss": 0.79594582, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81715286, + "num_input_tokens_seen": 276811345, + "step": 12833, + "time_per_iteration": 2.5254695415496826 + }, + { + "auxiliary_loss_clip": 0.01005096, + "auxiliary_loss_mlp": 0.01001613, + "balance_loss_clip": 1.02487385, + "balance_loss_mlp": 1.0003916, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8379016374778164, + "language_loss": 0.55356252, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57362962, + "num_input_tokens_seen": 276870950, + "step": 12834, + "time_per_iteration": 3.1724634170532227 + }, + { + "auxiliary_loss_clip": 0.01066313, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.034199, + "balance_loss_mlp": 1.01895404, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 2.355823654928672, + "language_loss": 0.73240894, + "learning_rate": 5.222445722184903e-07, + "loss": 0.75339395, + "num_input_tokens_seen": 276890760, + "step": 12835, + "time_per_iteration": 2.6005311012268066 + }, + { + "auxiliary_loss_clip": 0.01073308, + "auxiliary_loss_mlp": 0.00777608, + "balance_loss_clip": 1.03291738, + "balance_loss_mlp": 1.00068915, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 2.3034391074016316, + "language_loss": 0.70148134, + "learning_rate": 5.219821655586814e-07, + "loss": 0.71999049, + "num_input_tokens_seen": 276909625, + "step": 12836, + "time_per_iteration": 2.5204668045043945 + }, + { + "auxiliary_loss_clip": 0.01087989, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.03762424, + "balance_loss_mlp": 1.0186125, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 1.7548125854557424, + "language_loss": 0.59635693, + "learning_rate": 5.217198149454575e-07, + "loss": 0.61754382, + "num_input_tokens_seen": 276930760, + "step": 12837, + "time_per_iteration": 2.621713399887085 + }, + { + "auxiliary_loss_clip": 0.01030203, + "auxiliary_loss_mlp": 0.01004481, + "balance_loss_clip": 1.02127385, + "balance_loss_mlp": 1.00304413, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.860598588862306, + "language_loss": 0.55764723, + "learning_rate": 5.214575203887666e-07, + "loss": 0.57799399, + "num_input_tokens_seen": 276989580, + "step": 12838, + "time_per_iteration": 3.0170209407806396 + }, + { + "auxiliary_loss_clip": 0.01097066, + "auxiliary_loss_mlp": 0.01026171, + "balance_loss_clip": 1.03512859, + "balance_loss_mlp": 1.01473343, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.46196165617155, + "language_loss": 0.69304955, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71428192, + "num_input_tokens_seen": 277005450, + "step": 12839, + "time_per_iteration": 2.434296131134033 + }, + { + "auxiliary_loss_clip": 0.01097034, + "auxiliary_loss_mlp": 0.01025068, + "balance_loss_clip": 1.03749228, + "balance_loss_mlp": 1.01332629, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 1.8665869079210784, + "language_loss": 0.80378067, + "learning_rate": 5.209330994847647e-07, + "loss": 0.82500172, + "num_input_tokens_seen": 277023055, + "step": 12840, + "time_per_iteration": 2.481398344039917 + }, + { + "auxiliary_loss_clip": 0.01099568, + "auxiliary_loss_mlp": 0.00778125, + "balance_loss_clip": 1.03767037, + "balance_loss_mlp": 1.00064945, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 1.8541053406451204, + "language_loss": 0.80023158, + "learning_rate": 5.206709731573402e-07, + "loss": 0.81900853, + "num_input_tokens_seen": 277041150, + "step": 12841, + "time_per_iteration": 4.042226076126099 + }, + { + "auxiliary_loss_clip": 0.01074684, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.0379498, + "balance_loss_mlp": 1.01603317, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 1.4247340998106361, + "language_loss": 0.76635951, + "learning_rate": 5.204089029262208e-07, + "loss": 0.78738683, + "num_input_tokens_seen": 277063895, + "step": 12842, + "time_per_iteration": 2.586920976638794 + }, + { + "auxiliary_loss_clip": 0.01057486, + "auxiliary_loss_mlp": 0.00778635, + "balance_loss_clip": 1.03595519, + "balance_loss_mlp": 1.00070214, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 1.8396278796573273, + "language_loss": 0.68564653, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70400774, + "num_input_tokens_seen": 277084045, + "step": 12843, + "time_per_iteration": 2.5930745601654053 + }, + { + "auxiliary_loss_clip": 0.01086983, + "auxiliary_loss_mlp": 0.01028916, + "balance_loss_clip": 1.03268385, + "balance_loss_mlp": 1.01722169, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 2.3912549699553822, + "language_loss": 0.73683858, + "learning_rate": 5.198849307926465e-07, + "loss": 0.75799751, + "num_input_tokens_seen": 277102625, + "step": 12844, + "time_per_iteration": 2.4917943477630615 + }, + { + "auxiliary_loss_clip": 0.01092788, + "auxiliary_loss_mlp": 0.01035965, + "balance_loss_clip": 1.03366137, + "balance_loss_mlp": 1.02331686, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.4121348165797651, + "language_loss": 0.71689177, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73817933, + "num_input_tokens_seen": 277123210, + "step": 12845, + "time_per_iteration": 2.5250003337860107 + }, + { + "auxiliary_loss_clip": 0.01106277, + "auxiliary_loss_mlp": 0.01034865, + "balance_loss_clip": 1.03598285, + "balance_loss_mlp": 1.02356935, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 1.8889737524263277, + "language_loss": 0.64304668, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66445816, + "num_input_tokens_seen": 277144895, + "step": 12846, + "time_per_iteration": 2.5437276363372803 + }, + { + "auxiliary_loss_clip": 0.01022005, + "auxiliary_loss_mlp": 0.00753377, + "balance_loss_clip": 1.00695968, + "balance_loss_mlp": 1.00014329, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.7994049123306484, + "language_loss": 0.61716044, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63491428, + "num_input_tokens_seen": 277205160, + "step": 12847, + "time_per_iteration": 2.9769911766052246 + }, + { + "auxiliary_loss_clip": 0.01106079, + "auxiliary_loss_mlp": 0.01026476, + "balance_loss_clip": 1.03393674, + "balance_loss_mlp": 1.01451921, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 1.6395889716447574, + "language_loss": 0.7868731, + "learning_rate": 5.188376601182732e-07, + "loss": 0.80819863, + "num_input_tokens_seen": 277223005, + "step": 12848, + "time_per_iteration": 2.4221949577331543 + }, + { + "auxiliary_loss_clip": 0.01072543, + "auxiliary_loss_mlp": 0.01040626, + "balance_loss_clip": 1.03460443, + "balance_loss_mlp": 1.02725041, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 1.9813672830854356, + "language_loss": 0.72823888, + "learning_rate": 5.185759828394261e-07, + "loss": 0.74937063, + "num_input_tokens_seen": 277241785, + "step": 12849, + "time_per_iteration": 2.505598306655884 + }, + { + "auxiliary_loss_clip": 0.01107153, + "auxiliary_loss_mlp": 0.01033278, + "balance_loss_clip": 1.03563714, + "balance_loss_mlp": 1.02135086, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 1.8312870874737748, + "language_loss": 0.77884722, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80025148, + "num_input_tokens_seen": 277259050, + "step": 12850, + "time_per_iteration": 2.379192590713501 + }, + { + "auxiliary_loss_clip": 0.01051953, + "auxiliary_loss_mlp": 0.00777255, + "balance_loss_clip": 1.03169549, + "balance_loss_mlp": 1.00064993, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.5453507863978007, + "language_loss": 0.79929411, + "learning_rate": 5.180527968188935e-07, + "loss": 0.81758618, + "num_input_tokens_seen": 277278235, + "step": 12851, + "time_per_iteration": 2.6621861457824707 + }, + { + "auxiliary_loss_clip": 0.0109659, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.03555262, + "balance_loss_mlp": 1.01672935, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 1.5307039731318886, + "language_loss": 0.73776102, + "learning_rate": 5.177912880970474e-07, + "loss": 0.75902832, + "num_input_tokens_seen": 277298355, + "step": 12852, + "time_per_iteration": 2.4753713607788086 + }, + { + "auxiliary_loss_clip": 0.01106142, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.03504109, + "balance_loss_mlp": 1.02026916, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 1.8462571353048527, + "language_loss": 0.82736623, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84874815, + "num_input_tokens_seen": 277316095, + "step": 12853, + "time_per_iteration": 2.4352054595947266 + }, + { + "auxiliary_loss_clip": 0.01029645, + "auxiliary_loss_mlp": 0.01004214, + "balance_loss_clip": 1.00619411, + "balance_loss_mlp": 1.00289655, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.8034353500182604, + "language_loss": 0.54472309, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56506169, + "num_input_tokens_seen": 277380130, + "step": 12854, + "time_per_iteration": 3.070286750793457 + }, + { + "auxiliary_loss_clip": 0.01098688, + "auxiliary_loss_mlp": 0.01028496, + "balance_loss_clip": 1.03567398, + "balance_loss_mlp": 1.01500773, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 1.764086763027134, + "language_loss": 0.72115874, + "learning_rate": 5.170070992041826e-07, + "loss": 0.74243057, + "num_input_tokens_seen": 277404015, + "step": 12855, + "time_per_iteration": 2.575446844100952 + }, + { + "auxiliary_loss_clip": 0.0110855, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.03642535, + "balance_loss_mlp": 1.01874685, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 1.7059004183792916, + "language_loss": 0.6828562, + "learning_rate": 5.167458153638254e-07, + "loss": 0.70425707, + "num_input_tokens_seen": 277421375, + "step": 12856, + "time_per_iteration": 3.934061050415039 + }, + { + "auxiliary_loss_clip": 0.01080444, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.0356487, + "balance_loss_mlp": 1.01932871, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 1.7359757532420705, + "language_loss": 0.78782785, + "learning_rate": 5.164845877686162e-07, + "loss": 0.80894339, + "num_input_tokens_seen": 277440170, + "step": 12857, + "time_per_iteration": 2.582516670227051 + }, + { + "auxiliary_loss_clip": 0.01064952, + "auxiliary_loss_mlp": 0.00776813, + "balance_loss_clip": 1.04288816, + "balance_loss_mlp": 1.00057316, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 2.094461982363808, + "language_loss": 0.78629291, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80471063, + "num_input_tokens_seen": 277456880, + "step": 12858, + "time_per_iteration": 2.596281051635742 + }, + { + "auxiliary_loss_clip": 0.01108363, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.03569138, + "balance_loss_mlp": 1.02030671, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 2.234514500694907, + "language_loss": 0.77183449, + "learning_rate": 5.159623013532591e-07, + "loss": 0.79324228, + "num_input_tokens_seen": 277475365, + "step": 12859, + "time_per_iteration": 2.430300712585449 + }, + { + "auxiliary_loss_clip": 0.01096683, + "auxiliary_loss_mlp": 0.01029585, + "balance_loss_clip": 1.03874373, + "balance_loss_mlp": 1.01881492, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 1.3632628280377495, + "language_loss": 0.67616868, + "learning_rate": 5.157012425529186e-07, + "loss": 0.69743139, + "num_input_tokens_seen": 277494975, + "step": 12860, + "time_per_iteration": 2.4735469818115234 + }, + { + "auxiliary_loss_clip": 0.01110716, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.03605819, + "balance_loss_mlp": 1.02476561, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 2.1942186231265435, + "language_loss": 0.74732852, + "learning_rate": 5.154402400373343e-07, + "loss": 0.76880908, + "num_input_tokens_seen": 277510520, + "step": 12861, + "time_per_iteration": 2.396406888961792 + }, + { + "auxiliary_loss_clip": 0.01101275, + "auxiliary_loss_mlp": 0.01029817, + "balance_loss_clip": 1.037099, + "balance_loss_mlp": 1.01718688, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 2.183491785652533, + "language_loss": 0.74743706, + "learning_rate": 5.15179293816405e-07, + "loss": 0.76874799, + "num_input_tokens_seen": 277530505, + "step": 12862, + "time_per_iteration": 2.49959397315979 + }, + { + "auxiliary_loss_clip": 0.01064094, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.03342319, + "balance_loss_mlp": 1.01881611, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.5429390043423288, + "language_loss": 0.83281749, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85376054, + "num_input_tokens_seen": 277550810, + "step": 12863, + "time_per_iteration": 2.5818183422088623 + }, + { + "auxiliary_loss_clip": 0.01106587, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.03580892, + "balance_loss_mlp": 1.01725912, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.6227014525447014, + "language_loss": 0.73094618, + "learning_rate": 5.146575702980898e-07, + "loss": 0.75230026, + "num_input_tokens_seen": 277567680, + "step": 12864, + "time_per_iteration": 3.8674466609954834 + }, + { + "auxiliary_loss_clip": 0.01086378, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.0332768, + "balance_loss_mlp": 1.01873946, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 2.1758302200988022, + "language_loss": 0.82650316, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84766424, + "num_input_tokens_seen": 277588970, + "step": 12865, + "time_per_iteration": 2.548021078109741 + }, + { + "auxiliary_loss_clip": 0.01114564, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.03957307, + "balance_loss_mlp": 1.01631665, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 2.1243615072466366, + "language_loss": 0.72049904, + "learning_rate": 5.141360720771077e-07, + "loss": 0.74194407, + "num_input_tokens_seen": 277605450, + "step": 12866, + "time_per_iteration": 2.4142324924468994 + }, + { + "auxiliary_loss_clip": 0.0106487, + "auxiliary_loss_mlp": 0.00777332, + "balance_loss_clip": 1.03650343, + "balance_loss_mlp": 1.00059831, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 2.3486922568390027, + "language_loss": 0.6468792, + "learning_rate": 5.138754074778371e-07, + "loss": 0.6653012, + "num_input_tokens_seen": 277622530, + "step": 12867, + "time_per_iteration": 2.546600341796875 + }, + { + "auxiliary_loss_clip": 0.01094917, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.0348984, + "balance_loss_mlp": 1.02458954, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.9943027439723646, + "language_loss": 0.71209645, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73340857, + "num_input_tokens_seen": 277642700, + "step": 12868, + "time_per_iteration": 2.4781248569488525 + }, + { + "auxiliary_loss_clip": 0.01100359, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.03734326, + "balance_loss_mlp": 1.01815069, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 2.0202156476695547, + "language_loss": 0.7799238, + "learning_rate": 5.133542473511578e-07, + "loss": 0.80122614, + "num_input_tokens_seen": 277660005, + "step": 12869, + "time_per_iteration": 3.815016984939575 + }, + { + "auxiliary_loss_clip": 0.01095811, + "auxiliary_loss_mlp": 0.01026797, + "balance_loss_clip": 1.03694749, + "balance_loss_mlp": 1.01509118, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 1.5964867151028685, + "language_loss": 0.73505569, + "learning_rate": 5.130937518435124e-07, + "loss": 0.75628179, + "num_input_tokens_seen": 277682890, + "step": 12870, + "time_per_iteration": 2.5351338386535645 + }, + { + "auxiliary_loss_clip": 0.01099288, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.03612947, + "balance_loss_mlp": 1.0165782, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 2.050514180632564, + "language_loss": 0.75990069, + "learning_rate": 5.12833312719501e-07, + "loss": 0.78118074, + "num_input_tokens_seen": 277699330, + "step": 12871, + "time_per_iteration": 2.4361913204193115 + }, + { + "auxiliary_loss_clip": 0.01085677, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.03462398, + "balance_loss_mlp": 1.01908755, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 1.6794213414646815, + "language_loss": 0.69012642, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71128803, + "num_input_tokens_seen": 277718750, + "step": 12872, + "time_per_iteration": 2.5031328201293945 + }, + { + "auxiliary_loss_clip": 0.01107692, + "auxiliary_loss_mlp": 0.0103208, + "balance_loss_clip": 1.03531504, + "balance_loss_mlp": 1.01907396, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 2.0368562275535838, + "language_loss": 0.85239595, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87379366, + "num_input_tokens_seen": 277734645, + "step": 12873, + "time_per_iteration": 2.406538724899292 + }, + { + "auxiliary_loss_clip": 0.01111367, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.0378747, + "balance_loss_mlp": 1.01888919, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 2.6242767210984876, + "language_loss": 0.65589952, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67732233, + "num_input_tokens_seen": 277755535, + "step": 12874, + "time_per_iteration": 2.4870364665985107 + }, + { + "auxiliary_loss_clip": 0.01070781, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.04011893, + "balance_loss_mlp": 1.0165379, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 1.5498633443232992, + "language_loss": 0.62428218, + "learning_rate": 5.117921202572785e-07, + "loss": 0.64527619, + "num_input_tokens_seen": 277775585, + "step": 12875, + "time_per_iteration": 2.5994088649749756 + }, + { + "auxiliary_loss_clip": 0.01099773, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.0358429, + "balance_loss_mlp": 1.0188942, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 1.8660501162930265, + "language_loss": 0.65111154, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67242342, + "num_input_tokens_seen": 277794795, + "step": 12876, + "time_per_iteration": 2.4910290241241455 + }, + { + "auxiliary_loss_clip": 0.01086959, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.03985083, + "balance_loss_mlp": 1.0213387, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 2.094303530089199, + "language_loss": 0.71259326, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73379385, + "num_input_tokens_seen": 277813235, + "step": 12877, + "time_per_iteration": 2.5116052627563477 + }, + { + "auxiliary_loss_clip": 0.01073167, + "auxiliary_loss_mlp": 0.01037702, + "balance_loss_clip": 1.03469086, + "balance_loss_mlp": 1.02338529, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 1.934194585936924, + "language_loss": 0.82664979, + "learning_rate": 5.110118184224736e-07, + "loss": 0.84775853, + "num_input_tokens_seen": 277832560, + "step": 12878, + "time_per_iteration": 2.5259995460510254 + }, + { + "auxiliary_loss_clip": 0.01089271, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.03795588, + "balance_loss_mlp": 1.02121305, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 1.840503906565402, + "language_loss": 0.73173523, + "learning_rate": 5.10751830722885e-07, + "loss": 0.7529695, + "num_input_tokens_seen": 277850120, + "step": 12879, + "time_per_iteration": 2.4770140647888184 + }, + { + "auxiliary_loss_clip": 0.01085896, + "auxiliary_loss_mlp": 0.01026928, + "balance_loss_clip": 1.03720284, + "balance_loss_mlp": 1.01546562, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 1.7330635194254245, + "language_loss": 0.79772508, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81885332, + "num_input_tokens_seen": 277871020, + "step": 12880, + "time_per_iteration": 4.192495822906494 + }, + { + "auxiliary_loss_clip": 0.01086991, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.03830409, + "balance_loss_mlp": 1.02245212, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 1.4509847376282503, + "language_loss": 0.70183825, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72305489, + "num_input_tokens_seen": 277891525, + "step": 12881, + "time_per_iteration": 2.516888380050659 + }, + { + "auxiliary_loss_clip": 0.01090118, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.03454649, + "balance_loss_mlp": 1.02471972, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 2.4169568433952926, + "language_loss": 0.84453565, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86581481, + "num_input_tokens_seen": 277910425, + "step": 12882, + "time_per_iteration": 2.477705717086792 + }, + { + "auxiliary_loss_clip": 0.01010605, + "auxiliary_loss_mlp": 0.01003216, + "balance_loss_clip": 1.02147377, + "balance_loss_mlp": 1.00183952, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.7823240488875011, + "language_loss": 0.60381174, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62395, + "num_input_tokens_seen": 277972795, + "step": 12883, + "time_per_iteration": 3.07382869720459 + }, + { + "auxiliary_loss_clip": 0.01065496, + "auxiliary_loss_mlp": 0.01038658, + "balance_loss_clip": 1.03698802, + "balance_loss_mlp": 1.02472806, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 2.0249527936219303, + "language_loss": 0.72732604, + "learning_rate": 5.094527395086416e-07, + "loss": 0.74836755, + "num_input_tokens_seen": 277990675, + "step": 12884, + "time_per_iteration": 2.5397608280181885 + }, + { + "auxiliary_loss_clip": 0.01098457, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.03673589, + "balance_loss_mlp": 1.02195036, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 1.5382136999328215, + "language_loss": 0.80993986, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83125609, + "num_input_tokens_seen": 278010050, + "step": 12885, + "time_per_iteration": 2.4659690856933594 + }, + { + "auxiliary_loss_clip": 0.01105793, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.03521574, + "balance_loss_mlp": 1.02355266, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 1.8557646123847884, + "language_loss": 0.64064777, + "learning_rate": 5.089334986059029e-07, + "loss": 0.66205382, + "num_input_tokens_seen": 278030660, + "step": 12886, + "time_per_iteration": 2.4759013652801514 + }, + { + "auxiliary_loss_clip": 0.01073005, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.03467941, + "balance_loss_mlp": 1.02123404, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 1.9371513317765494, + "language_loss": 0.69957602, + "learning_rate": 5.086739629616987e-07, + "loss": 0.7206254, + "num_input_tokens_seen": 278047645, + "step": 12887, + "time_per_iteration": 2.504586935043335 + }, + { + "auxiliary_loss_clip": 0.0109752, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.03646696, + "balance_loss_mlp": 1.01697755, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 2.06666698075909, + "language_loss": 0.70570368, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72696006, + "num_input_tokens_seen": 278066170, + "step": 12888, + "time_per_iteration": 2.449967622756958 + }, + { + "auxiliary_loss_clip": 0.010968, + "auxiliary_loss_mlp": 0.01030336, + "balance_loss_clip": 1.03415954, + "balance_loss_mlp": 1.01796818, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 1.7273649787075527, + "language_loss": 0.81640303, + "learning_rate": 5.081550613368279e-07, + "loss": 0.83767438, + "num_input_tokens_seen": 278085545, + "step": 12889, + "time_per_iteration": 2.4584896564483643 + }, + { + "auxiliary_loss_clip": 0.01078128, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.03823102, + "balance_loss_mlp": 1.01842833, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 2.2465625517014196, + "language_loss": 0.79449916, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81558156, + "num_input_tokens_seen": 278102995, + "step": 12890, + "time_per_iteration": 2.5105886459350586 + }, + { + "auxiliary_loss_clip": 0.0108341, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.0379281, + "balance_loss_mlp": 1.01924562, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 2.333679732497441, + "language_loss": 0.66479415, + "learning_rate": 5.076363859955932e-07, + "loss": 0.6859408, + "num_input_tokens_seen": 278121460, + "step": 12891, + "time_per_iteration": 2.5305838584899902 + }, + { + "auxiliary_loss_clip": 0.01097084, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.03466356, + "balance_loss_mlp": 1.01976013, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.6579151990110979, + "language_loss": 0.78698778, + "learning_rate": 5.073771332059257e-07, + "loss": 0.80827701, + "num_input_tokens_seen": 278143905, + "step": 12892, + "time_per_iteration": 2.5333409309387207 + }, + { + "auxiliary_loss_clip": 0.01103207, + "auxiliary_loss_mlp": 0.01030751, + "balance_loss_clip": 1.04069602, + "balance_loss_mlp": 1.01831722, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 1.945651323980429, + "language_loss": 0.66644228, + "learning_rate": 5.071179370166669e-07, + "loss": 0.68778187, + "num_input_tokens_seen": 278160850, + "step": 12893, + "time_per_iteration": 2.4382739067077637 + }, + { + "auxiliary_loss_clip": 0.01021986, + "auxiliary_loss_mlp": 0.01004406, + "balance_loss_clip": 1.00828099, + "balance_loss_mlp": 1.00305939, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.8175042780983386, + "language_loss": 0.58476937, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60503328, + "num_input_tokens_seen": 278219950, + "step": 12894, + "time_per_iteration": 3.104271411895752 + }, + { + "auxiliary_loss_clip": 0.01090636, + "auxiliary_loss_mlp": 0.01028695, + "balance_loss_clip": 1.03830957, + "balance_loss_mlp": 1.01668429, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 2.353066044078089, + "language_loss": 0.78437853, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80557185, + "num_input_tokens_seen": 278237805, + "step": 12895, + "time_per_iteration": 3.9408328533172607 + }, + { + "auxiliary_loss_clip": 0.01073596, + "auxiliary_loss_mlp": 0.01032176, + "balance_loss_clip": 1.03903794, + "balance_loss_mlp": 1.01873517, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 1.688058618741464, + "language_loss": 0.67590594, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69696367, + "num_input_tokens_seen": 278257660, + "step": 12896, + "time_per_iteration": 2.5349888801574707 + }, + { + "auxiliary_loss_clip": 0.01085214, + "auxiliary_loss_mlp": 0.01037027, + "balance_loss_clip": 1.0357101, + "balance_loss_mlp": 1.02665567, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.6983261306327353, + "language_loss": 0.69064027, + "learning_rate": 5.060817184602629e-07, + "loss": 0.71186268, + "num_input_tokens_seen": 278275110, + "step": 12897, + "time_per_iteration": 2.502031087875366 + }, + { + "auxiliary_loss_clip": 0.01111079, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.0385536, + "balance_loss_mlp": 1.02384257, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 1.8082970460170356, + "language_loss": 0.74958992, + "learning_rate": 5.058228054204364e-07, + "loss": 0.7710712, + "num_input_tokens_seen": 278293035, + "step": 12898, + "time_per_iteration": 2.455655336380005 + }, + { + "auxiliary_loss_clip": 0.01096347, + "auxiliary_loss_mlp": 0.00777978, + "balance_loss_clip": 1.03465319, + "balance_loss_mlp": 1.00067711, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 2.230453295271947, + "language_loss": 0.70382309, + "learning_rate": 5.055639490399588e-07, + "loss": 0.72256637, + "num_input_tokens_seen": 278311010, + "step": 12899, + "time_per_iteration": 2.444246768951416 + }, + { + "auxiliary_loss_clip": 0.01075958, + "auxiliary_loss_mlp": 0.01037027, + "balance_loss_clip": 1.03315544, + "balance_loss_mlp": 1.02430129, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 1.8531374094633144, + "language_loss": 0.75119936, + "learning_rate": 5.053051493286453e-07, + "loss": 0.77232921, + "num_input_tokens_seen": 278329900, + "step": 12900, + "time_per_iteration": 2.5299627780914307 + }, + { + "auxiliary_loss_clip": 0.0109148, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.03770018, + "balance_loss_mlp": 1.0242362, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 1.6115049879039522, + "language_loss": 0.77410245, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79536879, + "num_input_tokens_seen": 278349980, + "step": 12901, + "time_per_iteration": 2.5228960514068604 + }, + { + "auxiliary_loss_clip": 0.0109819, + "auxiliary_loss_mlp": 0.01030858, + "balance_loss_clip": 1.0386126, + "balance_loss_mlp": 1.01865649, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 1.519716922227088, + "language_loss": 0.77256382, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79385424, + "num_input_tokens_seen": 278372485, + "step": 12902, + "time_per_iteration": 2.5175511837005615 + }, + { + "auxiliary_loss_clip": 0.01097653, + "auxiliary_loss_mlp": 0.01030316, + "balance_loss_clip": 1.03555322, + "balance_loss_mlp": 1.01867533, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 2.8433711762720284, + "language_loss": 0.73063421, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75191391, + "num_input_tokens_seen": 278391660, + "step": 12903, + "time_per_iteration": 3.9625160694122314 + }, + { + "auxiliary_loss_clip": 0.01089292, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.03974485, + "balance_loss_mlp": 1.01656425, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 2.5616934689272237, + "language_loss": 0.76066625, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78184211, + "num_input_tokens_seen": 278409125, + "step": 12904, + "time_per_iteration": 2.4929263591766357 + }, + { + "auxiliary_loss_clip": 0.01104673, + "auxiliary_loss_mlp": 0.01024677, + "balance_loss_clip": 1.03611779, + "balance_loss_mlp": 1.0134778, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.121319920229027, + "language_loss": 0.68155962, + "learning_rate": 5.040120011529576e-07, + "loss": 0.7028532, + "num_input_tokens_seen": 278429450, + "step": 12905, + "time_per_iteration": 2.5134518146514893 + }, + { + "auxiliary_loss_clip": 0.01093524, + "auxiliary_loss_mlp": 0.00776856, + "balance_loss_clip": 1.03814769, + "balance_loss_mlp": 1.00056458, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 1.656853792786583, + "language_loss": 0.67382437, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69252813, + "num_input_tokens_seen": 278449925, + "step": 12906, + "time_per_iteration": 2.5501444339752197 + }, + { + "auxiliary_loss_clip": 0.01072269, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.03194022, + "balance_loss_mlp": 1.02107513, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 1.9717359483233607, + "language_loss": 0.81787628, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83893239, + "num_input_tokens_seen": 278467255, + "step": 12907, + "time_per_iteration": 2.5549886226654053 + }, + { + "auxiliary_loss_clip": 0.01094977, + "auxiliary_loss_mlp": 0.01031699, + "balance_loss_clip": 1.03621078, + "balance_loss_mlp": 1.02027905, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.2667491148612786, + "language_loss": 0.67031324, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69158, + "num_input_tokens_seen": 278484250, + "step": 12908, + "time_per_iteration": 2.512261390686035 + }, + { + "auxiliary_loss_clip": 0.01078388, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.03546572, + "balance_loss_mlp": 1.02319169, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.657947998760325, + "language_loss": 0.70353526, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72467029, + "num_input_tokens_seen": 278502740, + "step": 12909, + "time_per_iteration": 3.9908370971679688 + }, + { + "auxiliary_loss_clip": 0.01094621, + "auxiliary_loss_mlp": 0.01032554, + "balance_loss_clip": 1.03562605, + "balance_loss_mlp": 1.02104414, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 1.6772745770977813, + "language_loss": 0.67958301, + "learning_rate": 5.027202711775324e-07, + "loss": 0.70085478, + "num_input_tokens_seen": 278523890, + "step": 12910, + "time_per_iteration": 2.551107883453369 + }, + { + "auxiliary_loss_clip": 0.01068952, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.04049659, + "balance_loss_mlp": 1.02276361, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 1.8163698062196498, + "language_loss": 0.71750838, + "learning_rate": 5.024620954742646e-07, + "loss": 0.73854041, + "num_input_tokens_seen": 278543185, + "step": 12911, + "time_per_iteration": 2.6360037326812744 + }, + { + "auxiliary_loss_clip": 0.01112258, + "auxiliary_loss_mlp": 0.00778451, + "balance_loss_clip": 1.03849387, + "balance_loss_mlp": 1.00061774, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 2.58034087753321, + "language_loss": 0.63103545, + "learning_rate": 5.022039765577836e-07, + "loss": 0.64994252, + "num_input_tokens_seen": 278559220, + "step": 12912, + "time_per_iteration": 2.4732913970947266 + }, + { + "auxiliary_loss_clip": 0.01001084, + "auxiliary_loss_mlp": 0.01004541, + "balance_loss_clip": 1.00631523, + "balance_loss_mlp": 1.00327098, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 0.7681442075136409, + "language_loss": 0.53177154, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55182779, + "num_input_tokens_seen": 278618185, + "step": 12913, + "time_per_iteration": 3.1614763736724854 + }, + { + "auxiliary_loss_clip": 0.01091155, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.03989637, + "balance_loss_mlp": 1.02210832, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 2.2451286637043104, + "language_loss": 0.62323868, + "learning_rate": 5.016879091243338e-07, + "loss": 0.64449835, + "num_input_tokens_seen": 278636210, + "step": 12914, + "time_per_iteration": 2.561840295791626 + }, + { + "auxiliary_loss_clip": 0.01087749, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.03698277, + "balance_loss_mlp": 1.01696634, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 1.6861128851310787, + "language_loss": 0.82238555, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84355009, + "num_input_tokens_seen": 278653305, + "step": 12915, + "time_per_iteration": 2.50539231300354 + }, + { + "auxiliary_loss_clip": 0.01101395, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.0408783, + "balance_loss_mlp": 1.02247906, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 1.6247688416968662, + "language_loss": 0.74576193, + "learning_rate": 5.011720689554603e-07, + "loss": 0.76713127, + "num_input_tokens_seen": 278671850, + "step": 12916, + "time_per_iteration": 2.564072608947754 + }, + { + "auxiliary_loss_clip": 0.01057001, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.03663158, + "balance_loss_mlp": 1.02042699, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.5586083618897322, + "language_loss": 0.65479499, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67569852, + "num_input_tokens_seen": 278697860, + "step": 12917, + "time_per_iteration": 2.887450695037842 + }, + { + "auxiliary_loss_clip": 0.01099833, + "auxiliary_loss_mlp": 0.01032038, + "balance_loss_clip": 1.0365901, + "balance_loss_mlp": 1.02006996, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 2.0199456349505693, + "language_loss": 0.64678127, + "learning_rate": 5.006564561294065e-07, + "loss": 0.6681, + "num_input_tokens_seen": 278720655, + "step": 12918, + "time_per_iteration": 2.5692050457000732 + }, + { + "auxiliary_loss_clip": 0.01107361, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.03664374, + "balance_loss_mlp": 1.0220089, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.234754058301588, + "language_loss": 0.73745334, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75886273, + "num_input_tokens_seen": 278737375, + "step": 12919, + "time_per_iteration": 3.9148964881896973 + }, + { + "auxiliary_loss_clip": 0.01073742, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.04165137, + "balance_loss_mlp": 1.01941347, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 2.0829494729543585, + "language_loss": 0.79345894, + "learning_rate": 5.001410707243792e-07, + "loss": 0.81451619, + "num_input_tokens_seen": 278756510, + "step": 12920, + "time_per_iteration": 2.6671431064605713 + }, + { + "auxiliary_loss_clip": 0.01101124, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.03847945, + "balance_loss_mlp": 1.01824296, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 2.058707214328914, + "language_loss": 0.70786905, + "learning_rate": 4.998834633291829e-07, + "loss": 0.72918779, + "num_input_tokens_seen": 278775410, + "step": 12921, + "time_per_iteration": 2.4771556854248047 + }, + { + "auxiliary_loss_clip": 0.01102563, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.03827477, + "balance_loss_mlp": 1.01798344, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 1.8660411106177472, + "language_loss": 0.76122117, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78255641, + "num_input_tokens_seen": 278794260, + "step": 12922, + "time_per_iteration": 2.50061297416687 + }, + { + "auxiliary_loss_clip": 0.01066251, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.03817582, + "balance_loss_mlp": 1.02880704, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.8067070777315897, + "language_loss": 0.80850136, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82958007, + "num_input_tokens_seen": 278813290, + "step": 12923, + "time_per_iteration": 2.5501749515533447 + }, + { + "auxiliary_loss_clip": 0.01074275, + "auxiliary_loss_mlp": 0.01035295, + "balance_loss_clip": 1.04043841, + "balance_loss_mlp": 1.02389872, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 1.8308213544696383, + "language_loss": 0.92330718, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94440293, + "num_input_tokens_seen": 278830610, + "step": 12924, + "time_per_iteration": 2.56514048576355 + }, + { + "auxiliary_loss_clip": 0.01096879, + "auxiliary_loss_mlp": 0.01028277, + "balance_loss_clip": 1.03563428, + "balance_loss_mlp": 1.01629078, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 2.483213512852805, + "language_loss": 0.65802526, + "learning_rate": 4.988536026917401e-07, + "loss": 0.67927682, + "num_input_tokens_seen": 278849530, + "step": 12925, + "time_per_iteration": 2.5010807514190674 + }, + { + "auxiliary_loss_clip": 0.01077655, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.0370096, + "balance_loss_mlp": 1.02165008, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 1.8193860058484115, + "language_loss": 0.7206744, + "learning_rate": 4.985962798170314e-07, + "loss": 0.74178582, + "num_input_tokens_seen": 278869005, + "step": 12926, + "time_per_iteration": 2.586059808731079 + }, + { + "auxiliary_loss_clip": 0.0110129, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.03808331, + "balance_loss_mlp": 1.01496029, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 2.2059333102986582, + "language_loss": 0.65366375, + "learning_rate": 4.983390138757027e-07, + "loss": 0.67495304, + "num_input_tokens_seen": 278888790, + "step": 12927, + "time_per_iteration": 2.5108025074005127 + }, + { + "auxiliary_loss_clip": 0.01087968, + "auxiliary_loss_mlp": 0.0103588, + "balance_loss_clip": 1.0377562, + "balance_loss_mlp": 1.02308869, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 1.7524352677542963, + "language_loss": 0.72419661, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74543512, + "num_input_tokens_seen": 278908150, + "step": 12928, + "time_per_iteration": 2.553917407989502 + }, + { + "auxiliary_loss_clip": 0.01071881, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.04027724, + "balance_loss_mlp": 1.01830912, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.6037490855101508, + "language_loss": 0.74158734, + "learning_rate": 4.978246528322036e-07, + "loss": 0.7626096, + "num_input_tokens_seen": 278927425, + "step": 12929, + "time_per_iteration": 2.603178024291992 + }, + { + "auxiliary_loss_clip": 0.01071555, + "auxiliary_loss_mlp": 0.01031197, + "balance_loss_clip": 1.03588343, + "balance_loss_mlp": 1.01901352, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 1.8410018247055289, + "language_loss": 0.77264738, + "learning_rate": 4.975675577495377e-07, + "loss": 0.79367495, + "num_input_tokens_seen": 278946475, + "step": 12930, + "time_per_iteration": 2.5852456092834473 + }, + { + "auxiliary_loss_clip": 0.01108795, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.03797102, + "balance_loss_mlp": 1.01997352, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 1.8342653602339096, + "language_loss": 0.79875207, + "learning_rate": 4.973105196392613e-07, + "loss": 0.82015955, + "num_input_tokens_seen": 278964345, + "step": 12931, + "time_per_iteration": 2.4354395866394043 + }, + { + "auxiliary_loss_clip": 0.01017649, + "auxiliary_loss_mlp": 0.01003287, + "balance_loss_clip": 1.03702164, + "balance_loss_mlp": 1.00167739, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.8201759863304294, + "language_loss": 0.59739572, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61760509, + "num_input_tokens_seen": 279022380, + "step": 12932, + "time_per_iteration": 3.0937705039978027 + }, + { + "auxiliary_loss_clip": 0.01099042, + "auxiliary_loss_mlp": 0.01033433, + "balance_loss_clip": 1.03716016, + "balance_loss_mlp": 1.02151227, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 1.488742182882803, + "language_loss": 0.75959682, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78092158, + "num_input_tokens_seen": 279044275, + "step": 12933, + "time_per_iteration": 2.5566070079803467 + }, + { + "auxiliary_loss_clip": 0.01086583, + "auxiliary_loss_mlp": 0.01033887, + "balance_loss_clip": 1.0372175, + "balance_loss_mlp": 1.02054739, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 2.0868495466431605, + "language_loss": 0.73796511, + "learning_rate": 4.965397472402215e-07, + "loss": 0.75916982, + "num_input_tokens_seen": 279063375, + "step": 12934, + "time_per_iteration": 4.019164562225342 + }, + { + "auxiliary_loss_clip": 0.0107092, + "auxiliary_loss_mlp": 0.01028608, + "balance_loss_clip": 1.03743219, + "balance_loss_mlp": 1.01579881, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 2.026806305909409, + "language_loss": 0.70300186, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72399712, + "num_input_tokens_seen": 279082680, + "step": 12935, + "time_per_iteration": 2.5933783054351807 + }, + { + "auxiliary_loss_clip": 0.01085385, + "auxiliary_loss_mlp": 0.00778721, + "balance_loss_clip": 1.03754282, + "balance_loss_mlp": 1.00065303, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.622253177981217, + "language_loss": 0.83881211, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85745311, + "num_input_tokens_seen": 279099805, + "step": 12936, + "time_per_iteration": 2.5556859970092773 + }, + { + "auxiliary_loss_clip": 0.01103738, + "auxiliary_loss_mlp": 0.01031018, + "balance_loss_clip": 1.03752983, + "balance_loss_mlp": 1.01910317, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 6.9679962686125165, + "language_loss": 0.67493391, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69628143, + "num_input_tokens_seen": 279117975, + "step": 12937, + "time_per_iteration": 2.518937110900879 + }, + { + "auxiliary_loss_clip": 0.01111697, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.03804386, + "balance_loss_mlp": 1.01877165, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 1.6019414033877788, + "language_loss": 0.87241542, + "learning_rate": 4.955128489126777e-07, + "loss": 0.8938452, + "num_input_tokens_seen": 279137255, + "step": 12938, + "time_per_iteration": 2.461178779602051 + }, + { + "auxiliary_loss_clip": 0.01098666, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.03616333, + "balance_loss_mlp": 1.01897812, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 2.280096560387159, + "language_loss": 0.85148478, + "learning_rate": 4.95256266932218e-07, + "loss": 0.8727833, + "num_input_tokens_seen": 279154500, + "step": 12939, + "time_per_iteration": 2.459061861038208 + }, + { + "auxiliary_loss_clip": 0.01104909, + "auxiliary_loss_mlp": 0.00777378, + "balance_loss_clip": 1.03542686, + "balance_loss_mlp": 1.00056171, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 1.682233681092348, + "language_loss": 0.68875283, + "learning_rate": 4.949997420117915e-07, + "loss": 0.70757574, + "num_input_tokens_seen": 279173635, + "step": 12940, + "time_per_iteration": 2.432530641555786 + }, + { + "auxiliary_loss_clip": 0.01080834, + "auxiliary_loss_mlp": 0.01023638, + "balance_loss_clip": 1.04114652, + "balance_loss_mlp": 1.0125401, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 1.5134663386996812, + "language_loss": 0.77576733, + "learning_rate": 4.947432741611255e-07, + "loss": 0.79681206, + "num_input_tokens_seen": 279194430, + "step": 12941, + "time_per_iteration": 2.62325119972229 + }, + { + "auxiliary_loss_clip": 0.01103865, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.03671956, + "balance_loss_mlp": 1.02163804, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 2.8458933253292584, + "language_loss": 0.72955704, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75095606, + "num_input_tokens_seen": 279212920, + "step": 12942, + "time_per_iteration": 2.57314395904541 + }, + { + "auxiliary_loss_clip": 0.01059429, + "auxiliary_loss_mlp": 0.01037625, + "balance_loss_clip": 1.03693318, + "balance_loss_mlp": 1.02435136, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 3.4613689095916005, + "language_loss": 0.67659044, + "learning_rate": 4.942305097079751e-07, + "loss": 0.69756103, + "num_input_tokens_seen": 279232310, + "step": 12943, + "time_per_iteration": 4.369645595550537 + }, + { + "auxiliary_loss_clip": 0.01002358, + "auxiliary_loss_mlp": 0.01011325, + "balance_loss_clip": 1.00409293, + "balance_loss_mlp": 1.00982881, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7721627845510091, + "language_loss": 0.5852555, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60539234, + "num_input_tokens_seen": 279295375, + "step": 12944, + "time_per_iteration": 3.245314836502075 + }, + { + "auxiliary_loss_clip": 0.01111022, + "auxiliary_loss_mlp": 0.01035103, + "balance_loss_clip": 1.03681612, + "balance_loss_mlp": 1.02117372, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 2.0544818071027993, + "language_loss": 0.67198503, + "learning_rate": 4.937179736505428e-07, + "loss": 0.69344628, + "num_input_tokens_seen": 279313660, + "step": 12945, + "time_per_iteration": 2.438117265701294 + }, + { + "auxiliary_loss_clip": 0.01096427, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.0347451, + "balance_loss_mlp": 1.02144694, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 2.3289163901743275, + "language_loss": 0.69780397, + "learning_rate": 4.93461791294516e-07, + "loss": 0.7191115, + "num_input_tokens_seen": 279334495, + "step": 12946, + "time_per_iteration": 2.5004630088806152 + }, + { + "auxiliary_loss_clip": 0.01108955, + "auxiliary_loss_mlp": 0.01030378, + "balance_loss_clip": 1.03675067, + "balance_loss_mlp": 1.01781321, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 1.8221486000420153, + "language_loss": 0.65352625, + "learning_rate": 4.932056660665689e-07, + "loss": 0.67491961, + "num_input_tokens_seen": 279352985, + "step": 12947, + "time_per_iteration": 2.4406652450561523 + }, + { + "auxiliary_loss_clip": 0.01052747, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.03432536, + "balance_loss_mlp": 1.02402115, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 2.183050220834742, + "language_loss": 0.6609301, + "learning_rate": 4.929495979764147e-07, + "loss": 0.68183798, + "num_input_tokens_seen": 279371360, + "step": 12948, + "time_per_iteration": 4.090183734893799 + }, + { + "auxiliary_loss_clip": 0.01109478, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.03682327, + "balance_loss_mlp": 1.02073503, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 1.811163992180685, + "language_loss": 0.75134254, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77276641, + "num_input_tokens_seen": 279389400, + "step": 12949, + "time_per_iteration": 2.4625790119171143 + }, + { + "auxiliary_loss_clip": 0.01112833, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.03851008, + "balance_loss_mlp": 1.0204792, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 1.442289992142634, + "language_loss": 0.68919486, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71065378, + "num_input_tokens_seen": 279409715, + "step": 12950, + "time_per_iteration": 2.474665641784668 + }, + { + "auxiliary_loss_clip": 0.01094838, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.03718281, + "balance_loss_mlp": 1.01767099, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 1.7411398970290475, + "language_loss": 0.71921515, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74046576, + "num_input_tokens_seen": 279427705, + "step": 12951, + "time_per_iteration": 2.489478588104248 + }, + { + "auxiliary_loss_clip": 0.01084681, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.03384995, + "balance_loss_mlp": 1.02158594, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 2.355107681279081, + "language_loss": 0.65769023, + "learning_rate": 4.919258971878877e-07, + "loss": 0.67887521, + "num_input_tokens_seen": 279448215, + "step": 12952, + "time_per_iteration": 2.5636305809020996 + }, + { + "auxiliary_loss_clip": 0.01080353, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.03466547, + "balance_loss_mlp": 1.0162735, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 1.5544901384476666, + "language_loss": 0.81356502, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83464509, + "num_input_tokens_seen": 279466260, + "step": 12953, + "time_per_iteration": 2.5381405353546143 + }, + { + "auxiliary_loss_clip": 0.01112718, + "auxiliary_loss_mlp": 0.01035477, + "balance_loss_clip": 1.03893352, + "balance_loss_mlp": 1.02287626, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 2.4257520586227588, + "language_loss": 0.7654171, + "learning_rate": 4.91414389872737e-07, + "loss": 0.78689903, + "num_input_tokens_seen": 279484520, + "step": 12954, + "time_per_iteration": 2.4180595874786377 + }, + { + "auxiliary_loss_clip": 0.01098746, + "auxiliary_loss_mlp": 0.01029939, + "balance_loss_clip": 1.0347904, + "balance_loss_mlp": 1.01817322, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.6030588827363816, + "language_loss": 0.72737366, + "learning_rate": 4.911587220188905e-07, + "loss": 0.7486605, + "num_input_tokens_seen": 279503130, + "step": 12955, + "time_per_iteration": 2.458547830581665 + }, + { + "auxiliary_loss_clip": 0.01078626, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.03464115, + "balance_loss_mlp": 1.02501488, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.4931458195703247, + "language_loss": 0.68723089, + "learning_rate": 4.909031113804551e-07, + "loss": 0.70839345, + "num_input_tokens_seen": 279521930, + "step": 12956, + "time_per_iteration": 2.5086116790771484 + }, + { + "auxiliary_loss_clip": 0.010743, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.03451478, + "balance_loss_mlp": 1.0219512, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 1.6883685142730251, + "language_loss": 0.75590318, + "learning_rate": 4.906475579671252e-07, + "loss": 0.77698314, + "num_input_tokens_seen": 279542375, + "step": 12957, + "time_per_iteration": 2.5984129905700684 + }, + { + "auxiliary_loss_clip": 0.01044151, + "auxiliary_loss_mlp": 0.01025968, + "balance_loss_clip": 1.03967452, + "balance_loss_mlp": 1.01411867, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 1.620121832872776, + "language_loss": 0.77936387, + "learning_rate": 4.903920617885917e-07, + "loss": 0.80006504, + "num_input_tokens_seen": 279561885, + "step": 12958, + "time_per_iteration": 2.6862337589263916 + }, + { + "auxiliary_loss_clip": 0.01096275, + "auxiliary_loss_mlp": 0.01040991, + "balance_loss_clip": 1.03477287, + "balance_loss_mlp": 1.02614999, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 1.9257896309830447, + "language_loss": 0.71997267, + "learning_rate": 4.901366228545418e-07, + "loss": 0.74134535, + "num_input_tokens_seen": 279579965, + "step": 12959, + "time_per_iteration": 3.9199535846710205 + }, + { + "auxiliary_loss_clip": 0.01092889, + "auxiliary_loss_mlp": 0.0077804, + "balance_loss_clip": 1.035923, + "balance_loss_mlp": 1.00060797, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 1.9368546824377821, + "language_loss": 0.7778827, + "learning_rate": 4.898812411746632e-07, + "loss": 0.796592, + "num_input_tokens_seen": 279599030, + "step": 12960, + "time_per_iteration": 2.4947385787963867 + }, + { + "auxiliary_loss_clip": 0.01102347, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.03759456, + "balance_loss_mlp": 1.02242553, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 1.9203014383473078, + "language_loss": 0.74986398, + "learning_rate": 4.896259167586385e-07, + "loss": 0.77124232, + "num_input_tokens_seen": 279614400, + "step": 12961, + "time_per_iteration": 2.5574512481689453 + }, + { + "auxiliary_loss_clip": 0.0108526, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.03750062, + "balance_loss_mlp": 1.02380943, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 1.6814529690539954, + "language_loss": 0.73842335, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75962794, + "num_input_tokens_seen": 279633745, + "step": 12962, + "time_per_iteration": 2.608811378479004 + }, + { + "auxiliary_loss_clip": 0.01098081, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.0367794, + "balance_loss_mlp": 1.01680422, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 1.8666777064612667, + "language_loss": 0.69596326, + "learning_rate": 4.891154397568795e-07, + "loss": 0.71723121, + "num_input_tokens_seen": 279651165, + "step": 12963, + "time_per_iteration": 2.4637036323547363 + }, + { + "auxiliary_loss_clip": 0.01097792, + "auxiliary_loss_mlp": 0.00776662, + "balance_loss_clip": 1.03726017, + "balance_loss_mlp": 1.00060463, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 4.849301031040931, + "language_loss": 0.6388464, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65759087, + "num_input_tokens_seen": 279671175, + "step": 12964, + "time_per_iteration": 2.529498338699341 + }, + { + "auxiliary_loss_clip": 0.01088898, + "auxiliary_loss_mlp": 0.01028013, + "balance_loss_clip": 1.03601193, + "balance_loss_mlp": 1.01634812, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 1.6573532694428352, + "language_loss": 0.76646107, + "learning_rate": 4.88605191926694e-07, + "loss": 0.78763014, + "num_input_tokens_seen": 279688675, + "step": 12965, + "time_per_iteration": 2.6051931381225586 + }, + { + "auxiliary_loss_clip": 0.01087683, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.03235114, + "balance_loss_mlp": 1.02312994, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 1.7585920343018955, + "language_loss": 0.72734588, + "learning_rate": 4.883501539751289e-07, + "loss": 0.74858034, + "num_input_tokens_seen": 279710245, + "step": 12966, + "time_per_iteration": 2.4850072860717773 + }, + { + "auxiliary_loss_clip": 0.01088114, + "auxiliary_loss_mlp": 0.00775644, + "balance_loss_clip": 1.0400933, + "balance_loss_mlp": 1.00066781, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 1.5101401939766048, + "language_loss": 0.74265772, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76129532, + "num_input_tokens_seen": 279729045, + "step": 12967, + "time_per_iteration": 2.5223777294158936 + }, + { + "auxiliary_loss_clip": 0.01109919, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.03699863, + "balance_loss_mlp": 1.01826429, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 2.3283820451982367, + "language_loss": 0.7238304, + "learning_rate": 4.878402500474073e-07, + "loss": 0.74524164, + "num_input_tokens_seen": 279748350, + "step": 12968, + "time_per_iteration": 2.4138097763061523 + }, + { + "auxiliary_loss_clip": 0.01088424, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.04095805, + "balance_loss_mlp": 1.0256381, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 1.907295308457861, + "language_loss": 0.61006057, + "learning_rate": 4.875853840905874e-07, + "loss": 0.6313231, + "num_input_tokens_seen": 279765620, + "step": 12969, + "time_per_iteration": 2.4741432666778564 + }, + { + "auxiliary_loss_clip": 0.01094145, + "auxiliary_loss_mlp": 0.01032939, + "balance_loss_clip": 1.0399102, + "balance_loss_mlp": 1.02184057, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.8775099330718643, + "language_loss": 0.70336121, + "learning_rate": 4.873305754846811e-07, + "loss": 0.72463202, + "num_input_tokens_seen": 279782485, + "step": 12970, + "time_per_iteration": 2.447267532348633 + }, + { + "auxiliary_loss_clip": 0.01075056, + "auxiliary_loss_mlp": 0.00778346, + "balance_loss_clip": 1.03948998, + "balance_loss_mlp": 1.00061917, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.8290576112697532, + "language_loss": 0.72007489, + "learning_rate": 4.870758242393507e-07, + "loss": 0.7386089, + "num_input_tokens_seen": 279804170, + "step": 12971, + "time_per_iteration": 2.64775013923645 + }, + { + "auxiliary_loss_clip": 0.01069787, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.03216898, + "balance_loss_mlp": 1.0200851, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 1.9147202755034465, + "language_loss": 0.74449337, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76552618, + "num_input_tokens_seen": 279823730, + "step": 12972, + "time_per_iteration": 2.5153446197509766 + }, + { + "auxiliary_loss_clip": 0.01109308, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.03639567, + "balance_loss_mlp": 1.01709676, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 1.9879492184076013, + "language_loss": 0.71647131, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73786211, + "num_input_tokens_seen": 279843035, + "step": 12973, + "time_per_iteration": 2.4000155925750732 + }, + { + "auxiliary_loss_clip": 0.01097112, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.03718901, + "balance_loss_mlp": 1.02029252, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 1.7365413218110515, + "language_loss": 0.77455634, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79584229, + "num_input_tokens_seen": 279861450, + "step": 12974, + "time_per_iteration": 3.914130210876465 + }, + { + "auxiliary_loss_clip": 0.01077319, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.03719258, + "balance_loss_mlp": 1.01982474, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.778267266318874, + "language_loss": 0.69201875, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71311283, + "num_input_tokens_seen": 279878660, + "step": 12975, + "time_per_iteration": 2.515577793121338 + }, + { + "auxiliary_loss_clip": 0.01075559, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.03766537, + "balance_loss_mlp": 1.01983571, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 1.9610063637781787, + "language_loss": 0.8189764, + "learning_rate": 4.858029287593739e-07, + "loss": 0.84004986, + "num_input_tokens_seen": 279895685, + "step": 12976, + "time_per_iteration": 2.5415709018707275 + }, + { + "auxiliary_loss_clip": 0.01088174, + "auxiliary_loss_mlp": 0.00777408, + "balance_loss_clip": 1.03313851, + "balance_loss_mlp": 1.0006392, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.5073449190282064, + "language_loss": 0.66045123, + "learning_rate": 4.85548521880289e-07, + "loss": 0.67910707, + "num_input_tokens_seen": 279917240, + "step": 12977, + "time_per_iteration": 2.5416224002838135 + }, + { + "auxiliary_loss_clip": 0.01089674, + "auxiliary_loss_mlp": 0.0102668, + "balance_loss_clip": 1.03915596, + "balance_loss_mlp": 1.01545632, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 1.4643597174205547, + "language_loss": 0.74967277, + "learning_rate": 4.852941724293554e-07, + "loss": 0.77083623, + "num_input_tokens_seen": 279938665, + "step": 12978, + "time_per_iteration": 2.585840940475464 + }, + { + "auxiliary_loss_clip": 0.01086436, + "auxiliary_loss_mlp": 0.01039924, + "balance_loss_clip": 1.03490782, + "balance_loss_mlp": 1.02545774, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 2.1517414371656436, + "language_loss": 0.62344497, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64470863, + "num_input_tokens_seen": 279957965, + "step": 12979, + "time_per_iteration": 2.52323579788208 + }, + { + "auxiliary_loss_clip": 0.01109254, + "auxiliary_loss_mlp": 0.01027352, + "balance_loss_clip": 1.03770566, + "balance_loss_mlp": 1.01515055, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 2.0243291078702534, + "language_loss": 0.77232975, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79369581, + "num_input_tokens_seen": 279977490, + "step": 12980, + "time_per_iteration": 2.47757625579834 + }, + { + "auxiliary_loss_clip": 0.01111928, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.03825068, + "balance_loss_mlp": 1.0168699, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 2.1743027676899347, + "language_loss": 0.78381914, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80522811, + "num_input_tokens_seen": 279994220, + "step": 12981, + "time_per_iteration": 2.4320242404937744 + }, + { + "auxiliary_loss_clip": 0.01068575, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.03607523, + "balance_loss_mlp": 1.02071357, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 1.776315601573394, + "language_loss": 0.72793913, + "learning_rate": 4.842773491000067e-07, + "loss": 0.74895042, + "num_input_tokens_seen": 280012590, + "step": 12982, + "time_per_iteration": 4.17628812789917 + }, + { + "auxiliary_loss_clip": 0.01084844, + "auxiliary_loss_mlp": 0.01031526, + "balance_loss_clip": 1.03640652, + "balance_loss_mlp": 1.01978397, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.4113601362755797, + "language_loss": 0.73413706, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75530076, + "num_input_tokens_seen": 280033700, + "step": 12983, + "time_per_iteration": 2.552476167678833 + }, + { + "auxiliary_loss_clip": 0.01087154, + "auxiliary_loss_mlp": 0.01027558, + "balance_loss_clip": 1.04157424, + "balance_loss_mlp": 1.01593482, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 2.535133639142541, + "language_loss": 0.74924701, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77039415, + "num_input_tokens_seen": 280052215, + "step": 12984, + "time_per_iteration": 2.4701995849609375 + }, + { + "auxiliary_loss_clip": 0.01084142, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.03296649, + "balance_loss_mlp": 1.0239172, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 2.3594333750195027, + "language_loss": 0.81429571, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83549905, + "num_input_tokens_seen": 280070525, + "step": 12985, + "time_per_iteration": 2.477011203765869 + }, + { + "auxiliary_loss_clip": 0.01088219, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.03660989, + "balance_loss_mlp": 1.01825869, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 1.437718627416533, + "language_loss": 0.77210259, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79328752, + "num_input_tokens_seen": 280089855, + "step": 12986, + "time_per_iteration": 2.4857094287872314 + }, + { + "auxiliary_loss_clip": 0.01098837, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.03499138, + "balance_loss_mlp": 1.0213989, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 1.790061127387644, + "language_loss": 0.73965484, + "learning_rate": 4.830076132284859e-07, + "loss": 0.76098073, + "num_input_tokens_seen": 280109960, + "step": 12987, + "time_per_iteration": 2.5396769046783447 + }, + { + "auxiliary_loss_clip": 0.01023506, + "auxiliary_loss_mlp": 0.00999982, + "balance_loss_clip": 1.01012778, + "balance_loss_mlp": 0.9985401, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7336554910631207, + "language_loss": 0.55067515, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57090998, + "num_input_tokens_seen": 280169805, + "step": 12988, + "time_per_iteration": 4.467754602432251 + }, + { + "auxiliary_loss_clip": 0.01078353, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.03441274, + "balance_loss_mlp": 1.02786565, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 2.625637437878165, + "language_loss": 0.80968505, + "learning_rate": 4.82500121484009e-07, + "loss": 0.8308664, + "num_input_tokens_seen": 280184630, + "step": 12989, + "time_per_iteration": 2.449106454849243 + }, + { + "auxiliary_loss_clip": 0.0107719, + "auxiliary_loss_mlp": 0.0102837, + "balance_loss_clip": 1.03688216, + "balance_loss_mlp": 1.01632428, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 1.5959206282421938, + "language_loss": 0.70337337, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72442895, + "num_input_tokens_seen": 280203880, + "step": 12990, + "time_per_iteration": 2.54276180267334 + }, + { + "auxiliary_loss_clip": 0.01086537, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.03571904, + "balance_loss_mlp": 1.02067173, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 2.145270434415028, + "language_loss": 0.77830207, + "learning_rate": 4.819928599145184e-07, + "loss": 0.79950678, + "num_input_tokens_seen": 280220460, + "step": 12991, + "time_per_iteration": 2.4700846672058105 + }, + { + "auxiliary_loss_clip": 0.01075284, + "auxiliary_loss_mlp": 0.01037786, + "balance_loss_clip": 1.03774726, + "balance_loss_mlp": 1.0248518, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.772852456931894, + "language_loss": 0.65754604, + "learning_rate": 4.817393154694398e-07, + "loss": 0.67867672, + "num_input_tokens_seen": 280242680, + "step": 12992, + "time_per_iteration": 2.738086223602295 + }, + { + "auxiliary_loss_clip": 0.01111595, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.03781104, + "balance_loss_mlp": 1.01955533, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 1.8517871085575819, + "language_loss": 0.61801291, + "learning_rate": 4.814858285969578e-07, + "loss": 0.63944423, + "num_input_tokens_seen": 280260655, + "step": 12993, + "time_per_iteration": 2.429055690765381 + }, + { + "auxiliary_loss_clip": 0.01084847, + "auxiliary_loss_mlp": 0.01028591, + "balance_loss_clip": 1.03454733, + "balance_loss_mlp": 1.01629484, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.6604616355562036, + "language_loss": 0.68578875, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70692319, + "num_input_tokens_seen": 280281185, + "step": 12994, + "time_per_iteration": 2.537234306335449 + }, + { + "auxiliary_loss_clip": 0.01108042, + "auxiliary_loss_mlp": 0.01027289, + "balance_loss_clip": 1.03679752, + "balance_loss_mlp": 1.01550555, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 2.2604657545145996, + "language_loss": 0.69677711, + "learning_rate": 4.809790276082335e-07, + "loss": 0.71813041, + "num_input_tokens_seen": 280298255, + "step": 12995, + "time_per_iteration": 2.4411909580230713 + }, + { + "auxiliary_loss_clip": 0.0106711, + "auxiliary_loss_mlp": 0.01028118, + "balance_loss_clip": 1.03602803, + "balance_loss_mlp": 1.01690078, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 1.7262283189502776, + "language_loss": 0.75033438, + "learning_rate": 4.807257135112088e-07, + "loss": 0.77128667, + "num_input_tokens_seen": 280319000, + "step": 12996, + "time_per_iteration": 2.5738213062286377 + }, + { + "auxiliary_loss_clip": 0.01113405, + "auxiliary_loss_mlp": 0.01033915, + "balance_loss_clip": 1.03774321, + "balance_loss_mlp": 1.02114797, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 2.560749296210905, + "language_loss": 0.68191385, + "learning_rate": 4.804724570252167e-07, + "loss": 0.70338708, + "num_input_tokens_seen": 280336375, + "step": 12997, + "time_per_iteration": 2.3978774547576904 + }, + { + "auxiliary_loss_clip": 0.01112946, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.03732467, + "balance_loss_mlp": 1.02138162, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.8897123436236325, + "language_loss": 0.82223678, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84371156, + "num_input_tokens_seen": 280358760, + "step": 12998, + "time_per_iteration": 3.8886327743530273 + }, + { + "auxiliary_loss_clip": 0.01082288, + "auxiliary_loss_mlp": 0.01035299, + "balance_loss_clip": 1.03175282, + "balance_loss_mlp": 1.02195394, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 1.9711909481359082, + "language_loss": 0.74537474, + "learning_rate": 4.799661169247453e-07, + "loss": 0.7665506, + "num_input_tokens_seen": 280377085, + "step": 12999, + "time_per_iteration": 2.4806277751922607 + }, + { + "auxiliary_loss_clip": 0.01097295, + "auxiliary_loss_mlp": 0.01042143, + "balance_loss_clip": 1.03513265, + "balance_loss_mlp": 1.0278672, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 1.7935613318777235, + "language_loss": 0.84498656, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86638093, + "num_input_tokens_seen": 280395465, + "step": 13000, + "time_per_iteration": 2.4707231521606445 + }, + { + "auxiliary_loss_clip": 0.01099301, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.03739154, + "balance_loss_mlp": 1.01598251, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 2.414459476631889, + "language_loss": 0.66249228, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68376589, + "num_input_tokens_seen": 280412775, + "step": 13001, + "time_per_iteration": 2.4519269466400146 + }, + { + "auxiliary_loss_clip": 0.01073183, + "auxiliary_loss_mlp": 0.01037536, + "balance_loss_clip": 1.03423142, + "balance_loss_mlp": 1.02527511, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 1.5662207604897025, + "language_loss": 0.66656756, + "learning_rate": 4.792070390968027e-07, + "loss": 0.68767476, + "num_input_tokens_seen": 280432905, + "step": 13002, + "time_per_iteration": 2.670454740524292 + }, + { + "auxiliary_loss_clip": 0.01104382, + "auxiliary_loss_mlp": 0.01034783, + "balance_loss_clip": 1.04004729, + "balance_loss_mlp": 1.0220511, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 2.6725937597813845, + "language_loss": 0.73471075, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75610244, + "num_input_tokens_seen": 280450785, + "step": 13003, + "time_per_iteration": 2.5023927688598633 + }, + { + "auxiliary_loss_clip": 0.0110062, + "auxiliary_loss_mlp": 0.01034988, + "balance_loss_clip": 1.03726828, + "balance_loss_mlp": 1.02260208, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 1.7264937338703186, + "language_loss": 0.62074029, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64209634, + "num_input_tokens_seen": 280468400, + "step": 13004, + "time_per_iteration": 2.4475643634796143 + }, + { + "auxiliary_loss_clip": 0.01101859, + "auxiliary_loss_mlp": 0.01030013, + "balance_loss_clip": 1.03407979, + "balance_loss_mlp": 1.0188849, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 2.443921661559563, + "language_loss": 0.82940137, + "learning_rate": 4.784484802864403e-07, + "loss": 0.85072011, + "num_input_tokens_seen": 280483930, + "step": 13005, + "time_per_iteration": 2.3840932846069336 + }, + { + "auxiliary_loss_clip": 0.01068756, + "auxiliary_loss_mlp": 0.00777361, + "balance_loss_clip": 1.03170896, + "balance_loss_mlp": 1.00049746, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 1.8287421851368768, + "language_loss": 0.72843236, + "learning_rate": 4.781957427316432e-07, + "loss": 0.74689353, + "num_input_tokens_seen": 280503465, + "step": 13006, + "time_per_iteration": 2.5482101440429688 + }, + { + "auxiliary_loss_clip": 0.01101031, + "auxiliary_loss_mlp": 0.00778495, + "balance_loss_clip": 1.03839362, + "balance_loss_mlp": 1.00060666, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 2.064543525183666, + "language_loss": 0.72114706, + "learning_rate": 4.779430628838157e-07, + "loss": 0.73994231, + "num_input_tokens_seen": 280523375, + "step": 13007, + "time_per_iteration": 2.466804027557373 + }, + { + "auxiliary_loss_clip": 0.01110289, + "auxiliary_loss_mlp": 0.01029454, + "balance_loss_clip": 1.03604937, + "balance_loss_mlp": 1.01659727, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 1.857503896149125, + "language_loss": 0.69189137, + "learning_rate": 4.776904407525397e-07, + "loss": 0.71328878, + "num_input_tokens_seen": 280542920, + "step": 13008, + "time_per_iteration": 2.4165899753570557 + }, + { + "auxiliary_loss_clip": 0.01083649, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.03726673, + "balance_loss_mlp": 1.01629305, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 2.088672505773417, + "language_loss": 0.69794846, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71907866, + "num_input_tokens_seen": 280561700, + "step": 13009, + "time_per_iteration": 2.5341174602508545 + }, + { + "auxiliary_loss_clip": 0.01069229, + "auxiliary_loss_mlp": 0.01027223, + "balance_loss_clip": 1.03240991, + "balance_loss_mlp": 1.01506376, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 1.7856635429904864, + "language_loss": 0.81524086, + "learning_rate": 4.771853696779586e-07, + "loss": 0.83620536, + "num_input_tokens_seen": 280580605, + "step": 13010, + "time_per_iteration": 2.544450044631958 + }, + { + "auxiliary_loss_clip": 0.01093869, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.0333426, + "balance_loss_mlp": 1.02084112, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.6715667662572053, + "language_loss": 0.62347603, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64473641, + "num_input_tokens_seen": 280601495, + "step": 13011, + "time_per_iteration": 2.514768362045288 + }, + { + "auxiliary_loss_clip": 0.01096245, + "auxiliary_loss_mlp": 0.01027569, + "balance_loss_clip": 1.03583479, + "balance_loss_mlp": 1.01685238, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.6935993004961245, + "language_loss": 0.6987623, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72000045, + "num_input_tokens_seen": 280622760, + "step": 13012, + "time_per_iteration": 2.49756121635437 + }, + { + "auxiliary_loss_clip": 0.01028304, + "auxiliary_loss_mlp": 0.01006001, + "balance_loss_clip": 1.00485921, + "balance_loss_mlp": 1.00482714, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.7028075197291899, + "language_loss": 0.55025268, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57059574, + "num_input_tokens_seen": 280687115, + "step": 13013, + "time_per_iteration": 4.508320093154907 + }, + { + "auxiliary_loss_clip": 0.0108529, + "auxiliary_loss_mlp": 0.0103573, + "balance_loss_clip": 1.03540182, + "balance_loss_mlp": 1.02337956, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 1.7923672444071277, + "language_loss": 0.65553707, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67674726, + "num_input_tokens_seen": 280705000, + "step": 13014, + "time_per_iteration": 2.5109689235687256 + }, + { + "auxiliary_loss_clip": 0.01014741, + "auxiliary_loss_mlp": 0.01002666, + "balance_loss_clip": 1.01580048, + "balance_loss_mlp": 1.00153923, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.7267553229427406, + "language_loss": 0.58445513, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60462916, + "num_input_tokens_seen": 280773525, + "step": 13015, + "time_per_iteration": 3.1520493030548096 + }, + { + "auxiliary_loss_clip": 0.01080601, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.03530014, + "balance_loss_mlp": 1.02162325, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 3.30807349538009, + "language_loss": 0.74188137, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76301754, + "num_input_tokens_seen": 280791915, + "step": 13016, + "time_per_iteration": 2.4848153591156006 + }, + { + "auxiliary_loss_clip": 0.01108967, + "auxiliary_loss_mlp": 0.01031971, + "balance_loss_clip": 1.03634834, + "balance_loss_mlp": 1.01822639, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 1.6264521785204786, + "language_loss": 0.74963224, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77104163, + "num_input_tokens_seen": 280811460, + "step": 13017, + "time_per_iteration": 2.433137893676758 + }, + { + "auxiliary_loss_clip": 0.01083721, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.03443658, + "balance_loss_mlp": 1.02186084, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 2.2580658652226044, + "language_loss": 0.75365615, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77484429, + "num_input_tokens_seen": 280825415, + "step": 13018, + "time_per_iteration": 2.467620611190796 + }, + { + "auxiliary_loss_clip": 0.01106771, + "auxiliary_loss_mlp": 0.01028941, + "balance_loss_clip": 1.03530312, + "balance_loss_mlp": 1.01647747, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 1.4942393364516622, + "language_loss": 0.77318662, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79454374, + "num_input_tokens_seen": 280845335, + "step": 13019, + "time_per_iteration": 2.4322049617767334 + }, + { + "auxiliary_loss_clip": 0.01064237, + "auxiliary_loss_mlp": 0.0102803, + "balance_loss_clip": 1.0371244, + "balance_loss_mlp": 1.0167706, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 1.4807577787005686, + "language_loss": 0.67731291, + "learning_rate": 4.746634805529852e-07, + "loss": 0.69823563, + "num_input_tokens_seen": 280867145, + "step": 13020, + "time_per_iteration": 2.6469264030456543 + }, + { + "auxiliary_loss_clip": 0.01095352, + "auxiliary_loss_mlp": 0.01027541, + "balance_loss_clip": 1.03750837, + "balance_loss_mlp": 1.01554275, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 2.2144169611494813, + "language_loss": 0.62612128, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64735019, + "num_input_tokens_seen": 280886185, + "step": 13021, + "time_per_iteration": 2.466681957244873 + }, + { + "auxiliary_loss_clip": 0.01106129, + "auxiliary_loss_mlp": 0.01034431, + "balance_loss_clip": 1.03633761, + "balance_loss_mlp": 1.02325511, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 1.7027457424139965, + "language_loss": 0.69222289, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71362853, + "num_input_tokens_seen": 280907665, + "step": 13022, + "time_per_iteration": 4.08112096786499 + }, + { + "auxiliary_loss_clip": 0.00997913, + "auxiliary_loss_mlp": 0.01000989, + "balance_loss_clip": 1.02622628, + "balance_loss_mlp": 0.99959421, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.6404323142718895, + "language_loss": 0.56224275, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58223176, + "num_input_tokens_seen": 280971405, + "step": 13023, + "time_per_iteration": 3.2998180389404297 + }, + { + "auxiliary_loss_clip": 0.01079045, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.01749957, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 1.6539186503146617, + "language_loss": 0.67251647, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69360316, + "num_input_tokens_seen": 280989615, + "step": 13024, + "time_per_iteration": 2.5090267658233643 + }, + { + "auxiliary_loss_clip": 0.01110677, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.03683734, + "balance_loss_mlp": 1.0156846, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 1.6869916489820378, + "language_loss": 0.77838689, + "learning_rate": 4.734047044272498e-07, + "loss": 0.79977447, + "num_input_tokens_seen": 281009450, + "step": 13025, + "time_per_iteration": 2.4584662914276123 + }, + { + "auxiliary_loss_clip": 0.01083459, + "auxiliary_loss_mlp": 0.01034336, + "balance_loss_clip": 1.03427601, + "balance_loss_mlp": 1.02273083, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 1.7562836968911555, + "language_loss": 0.78706259, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80824053, + "num_input_tokens_seen": 281028120, + "step": 13026, + "time_per_iteration": 2.539957046508789 + }, + { + "auxiliary_loss_clip": 0.01095635, + "auxiliary_loss_mlp": 0.01027544, + "balance_loss_clip": 1.03703666, + "balance_loss_mlp": 1.01549196, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 1.9799971317537746, + "language_loss": 0.75252086, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77375263, + "num_input_tokens_seen": 281042130, + "step": 13027, + "time_per_iteration": 2.4271562099456787 + }, + { + "auxiliary_loss_clip": 0.01099107, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.03743947, + "balance_loss_mlp": 1.01667356, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 1.9759215449166485, + "language_loss": 0.70311356, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72438693, + "num_input_tokens_seen": 281060945, + "step": 13028, + "time_per_iteration": 3.8470001220703125 + }, + { + "auxiliary_loss_clip": 0.01063787, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.03657293, + "balance_loss_mlp": 1.02337313, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 2.1555786485570128, + "language_loss": 0.69217062, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.71317041, + "num_input_tokens_seen": 281079270, + "step": 13029, + "time_per_iteration": 2.5600078105926514 + }, + { + "auxiliary_loss_clip": 0.01085028, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.03845429, + "balance_loss_mlp": 1.01631415, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 1.9540345788264664, + "language_loss": 0.80991083, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83105063, + "num_input_tokens_seen": 281099500, + "step": 13030, + "time_per_iteration": 2.6133511066436768 + }, + { + "auxiliary_loss_clip": 0.01101897, + "auxiliary_loss_mlp": 0.01028767, + "balance_loss_clip": 1.03610504, + "balance_loss_mlp": 1.01644659, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 1.760001208134612, + "language_loss": 0.70141989, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72272646, + "num_input_tokens_seen": 281121250, + "step": 13031, + "time_per_iteration": 2.5566301345825195 + }, + { + "auxiliary_loss_clip": 0.01073017, + "auxiliary_loss_mlp": 0.01030612, + "balance_loss_clip": 1.03551531, + "balance_loss_mlp": 1.01866746, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 1.9781381658958657, + "language_loss": 0.78865433, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80969059, + "num_input_tokens_seen": 281138760, + "step": 13032, + "time_per_iteration": 2.517488956451416 + }, + { + "auxiliary_loss_clip": 0.01104685, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.03901982, + "balance_loss_mlp": 1.02420568, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 2.719122892209791, + "language_loss": 0.62617558, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.6475901, + "num_input_tokens_seen": 281157420, + "step": 13033, + "time_per_iteration": 2.4399209022521973 + }, + { + "auxiliary_loss_clip": 0.01097362, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.03613579, + "balance_loss_mlp": 1.01765704, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.6973209968749774, + "language_loss": 0.71977574, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.74104702, + "num_input_tokens_seen": 281174620, + "step": 13034, + "time_per_iteration": 2.428420066833496 + }, + { + "auxiliary_loss_clip": 0.01111168, + "auxiliary_loss_mlp": 0.00778578, + "balance_loss_clip": 1.03734744, + "balance_loss_mlp": 1.00050759, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 2.3857817077098127, + "language_loss": 0.72020823, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.7391057, + "num_input_tokens_seen": 281193865, + "step": 13035, + "time_per_iteration": 2.422863006591797 + }, + { + "auxiliary_loss_clip": 0.01110446, + "auxiliary_loss_mlp": 0.01033089, + "balance_loss_clip": 1.0380336, + "balance_loss_mlp": 1.02008963, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 1.994421461561947, + "language_loss": 0.65828729, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.67972255, + "num_input_tokens_seen": 281212250, + "step": 13036, + "time_per_iteration": 2.4420528411865234 + }, + { + "auxiliary_loss_clip": 0.01104079, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_clip": 1.03825533, + "balance_loss_mlp": 1.02202082, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 2.3399344198321326, + "language_loss": 0.72774565, + "learning_rate": 4.703895486362031e-07, + "loss": 0.74913383, + "num_input_tokens_seen": 281230850, + "step": 13037, + "time_per_iteration": 2.4867095947265625 + }, + { + "auxiliary_loss_clip": 0.0107235, + "auxiliary_loss_mlp": 0.01035199, + "balance_loss_clip": 1.03384662, + "balance_loss_mlp": 1.02236021, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 2.4129961166444107, + "language_loss": 0.60225749, + "learning_rate": 4.701386624460717e-07, + "loss": 0.62333298, + "num_input_tokens_seen": 281249810, + "step": 13038, + "time_per_iteration": 3.9883615970611572 + }, + { + "auxiliary_loss_clip": 0.01087398, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.03638482, + "balance_loss_mlp": 1.0167582, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 1.8462093056577107, + "language_loss": 0.68184912, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70300543, + "num_input_tokens_seen": 281273730, + "step": 13039, + "time_per_iteration": 2.6535353660583496 + }, + { + "auxiliary_loss_clip": 0.01074769, + "auxiliary_loss_mlp": 0.01022879, + "balance_loss_clip": 1.03714788, + "balance_loss_mlp": 1.01253176, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 2.0228822877287893, + "language_loss": 0.69459486, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71557134, + "num_input_tokens_seen": 281293670, + "step": 13040, + "time_per_iteration": 2.6263034343719482 + }, + { + "auxiliary_loss_clip": 0.01067352, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.03579462, + "balance_loss_mlp": 1.02037048, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 1.8492477868212196, + "language_loss": 0.67437994, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.69538087, + "num_input_tokens_seen": 281313070, + "step": 13041, + "time_per_iteration": 2.5659823417663574 + }, + { + "auxiliary_loss_clip": 0.0102087, + "auxiliary_loss_mlp": 0.0075298, + "balance_loss_clip": 1.0069983, + "balance_loss_mlp": 1.00014961, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6571963087679621, + "language_loss": 0.57426816, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59200668, + "num_input_tokens_seen": 281374880, + "step": 13042, + "time_per_iteration": 3.0241804122924805 + }, + { + "auxiliary_loss_clip": 0.01086421, + "auxiliary_loss_mlp": 0.01029399, + "balance_loss_clip": 1.0359087, + "balance_loss_mlp": 1.0165956, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 2.3370835979357705, + "language_loss": 0.84148926, + "learning_rate": 4.688851018730369e-07, + "loss": 0.86264747, + "num_input_tokens_seen": 281392620, + "step": 13043, + "time_per_iteration": 2.5687623023986816 + }, + { + "auxiliary_loss_clip": 0.0109449, + "auxiliary_loss_mlp": 0.01024087, + "balance_loss_clip": 1.03626668, + "balance_loss_mlp": 1.01229095, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.4678359273336918, + "language_loss": 0.88492405, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.90610981, + "num_input_tokens_seen": 281413140, + "step": 13044, + "time_per_iteration": 2.543417453765869 + }, + { + "auxiliary_loss_clip": 0.01092774, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.03692913, + "balance_loss_mlp": 1.01818371, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 1.8056394946838716, + "language_loss": 0.78654754, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.80778122, + "num_input_tokens_seen": 281430860, + "step": 13045, + "time_per_iteration": 2.524940252304077 + }, + { + "auxiliary_loss_clip": 0.01081541, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.03509057, + "balance_loss_mlp": 1.01827526, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.4924661026871704, + "language_loss": 0.72142166, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.74254084, + "num_input_tokens_seen": 281451385, + "step": 13046, + "time_per_iteration": 2.521181583404541 + }, + { + "auxiliary_loss_clip": 0.01070276, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.0388298, + "balance_loss_mlp": 1.02439284, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.4616982735420225, + "language_loss": 0.63150263, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65257561, + "num_input_tokens_seen": 281472255, + "step": 13047, + "time_per_iteration": 2.608886241912842 + }, + { + "auxiliary_loss_clip": 0.01097074, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.03647852, + "balance_loss_mlp": 1.01793432, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.4904739861308562, + "language_loss": 0.73083019, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75209618, + "num_input_tokens_seen": 281492860, + "step": 13048, + "time_per_iteration": 2.4886133670806885 + }, + { + "auxiliary_loss_clip": 0.01089727, + "auxiliary_loss_mlp": 0.01031726, + "balance_loss_clip": 1.04191601, + "balance_loss_mlp": 1.01959062, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 1.8414509818030984, + "language_loss": 0.74900115, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.77021569, + "num_input_tokens_seen": 281511815, + "step": 13049, + "time_per_iteration": 2.5321204662323 + }, + { + "auxiliary_loss_clip": 0.01111886, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.03591895, + "balance_loss_mlp": 1.02153218, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 4.684250082769345, + "language_loss": 0.73035818, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.75182664, + "num_input_tokens_seen": 281530090, + "step": 13050, + "time_per_iteration": 2.434931516647339 + }, + { + "auxiliary_loss_clip": 0.01096883, + "auxiliary_loss_mlp": 0.0103243, + "balance_loss_clip": 1.03459942, + "balance_loss_mlp": 1.02006197, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 2.6079308205419482, + "language_loss": 0.73993802, + "learning_rate": 4.668824245713825e-07, + "loss": 0.76123112, + "num_input_tokens_seen": 281547075, + "step": 13051, + "time_per_iteration": 2.5330381393432617 + }, + { + "auxiliary_loss_clip": 0.01111245, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.03804111, + "balance_loss_mlp": 1.02120924, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 1.9661225218985119, + "language_loss": 0.7307601, + "learning_rate": 4.666323514209227e-07, + "loss": 0.75221014, + "num_input_tokens_seen": 281568080, + "step": 13052, + "time_per_iteration": 2.5803275108337402 + }, + { + "auxiliary_loss_clip": 0.01083793, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.03781414, + "balance_loss_mlp": 1.01850224, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 3.4632732067592906, + "language_loss": 0.68974966, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71088636, + "num_input_tokens_seen": 281586925, + "step": 13053, + "time_per_iteration": 3.9198427200317383 + }, + { + "auxiliary_loss_clip": 0.01091647, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.03538394, + "balance_loss_mlp": 1.0199095, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 2.4732419805834644, + "language_loss": 0.69874454, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.71997684, + "num_input_tokens_seen": 281603915, + "step": 13054, + "time_per_iteration": 2.4695050716400146 + }, + { + "auxiliary_loss_clip": 0.0109963, + "auxiliary_loss_mlp": 0.0103218, + "balance_loss_clip": 1.03602362, + "balance_loss_mlp": 1.01940703, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 1.6084925807311112, + "language_loss": 0.75914681, + "learning_rate": 4.658824808801938e-07, + "loss": 0.78046489, + "num_input_tokens_seen": 281624220, + "step": 13055, + "time_per_iteration": 2.4934604167938232 + }, + { + "auxiliary_loss_clip": 0.01114129, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.03850245, + "balance_loss_mlp": 1.01899898, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 1.9537734661404138, + "language_loss": 0.74717331, + "learning_rate": 4.656326403684283e-07, + "loss": 0.76863384, + "num_input_tokens_seen": 281642325, + "step": 13056, + "time_per_iteration": 2.409350633621216 + }, + { + "auxiliary_loss_clip": 0.01056166, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.04355836, + "balance_loss_mlp": 1.01509666, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 1.6026835251041973, + "language_loss": 0.70129204, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72212505, + "num_input_tokens_seen": 281663065, + "step": 13057, + "time_per_iteration": 2.6474409103393555 + }, + { + "auxiliary_loss_clip": 0.01068375, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.04099667, + "balance_loss_mlp": 1.01955473, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 3.331446933869388, + "language_loss": 0.76625657, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.78725499, + "num_input_tokens_seen": 281681005, + "step": 13058, + "time_per_iteration": 2.5397121906280518 + }, + { + "auxiliary_loss_clip": 0.01100325, + "auxiliary_loss_mlp": 0.01034981, + "balance_loss_clip": 1.03727746, + "balance_loss_mlp": 1.02282131, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 2.0698380315223597, + "language_loss": 0.70806926, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.72942227, + "num_input_tokens_seen": 281697965, + "step": 13059, + "time_per_iteration": 2.4533915519714355 + }, + { + "auxiliary_loss_clip": 0.01080603, + "auxiliary_loss_mlp": 0.01040661, + "balance_loss_clip": 1.03451085, + "balance_loss_mlp": 1.02716088, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 1.7451815842184768, + "language_loss": 0.76303756, + "learning_rate": 4.646338602497144e-07, + "loss": 0.78425026, + "num_input_tokens_seen": 281716035, + "step": 13060, + "time_per_iteration": 2.5187196731567383 + }, + { + "auxiliary_loss_clip": 0.01083122, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.036183, + "balance_loss_mlp": 1.01911759, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 2.1482712681542333, + "language_loss": 0.77384508, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79499406, + "num_input_tokens_seen": 281732815, + "step": 13061, + "time_per_iteration": 2.4680912494659424 + }, + { + "auxiliary_loss_clip": 0.01076303, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.03691983, + "balance_loss_mlp": 1.01758182, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 1.8646844091399655, + "language_loss": 0.74058843, + "learning_rate": 4.641348194799164e-07, + "loss": 0.76165676, + "num_input_tokens_seen": 281751980, + "step": 13062, + "time_per_iteration": 4.100548982620239 + }, + { + "auxiliary_loss_clip": 0.01096062, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.03419161, + "balance_loss_mlp": 1.0212003, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 1.5662289204028224, + "language_loss": 0.68295205, + "learning_rate": 4.638853864505297e-07, + "loss": 0.7042433, + "num_input_tokens_seen": 281772670, + "step": 13063, + "time_per_iteration": 2.4802916049957275 + }, + { + "auxiliary_loss_clip": 0.01097357, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.03741813, + "balance_loss_mlp": 1.02159643, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 1.6512214212554117, + "language_loss": 0.72962153, + "learning_rate": 4.636360116707625e-07, + "loss": 0.75093341, + "num_input_tokens_seen": 281792930, + "step": 13064, + "time_per_iteration": 2.5266308784484863 + }, + { + "auxiliary_loss_clip": 0.01081754, + "auxiliary_loss_mlp": 0.01032437, + "balance_loss_clip": 1.03656936, + "balance_loss_mlp": 1.01990199, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 1.7751107550508793, + "language_loss": 0.6812014, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70234329, + "num_input_tokens_seen": 281811805, + "step": 13065, + "time_per_iteration": 2.518594264984131 + }, + { + "auxiliary_loss_clip": 0.01099502, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.04216194, + "balance_loss_mlp": 1.02298069, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 1.7865589610460209, + "language_loss": 0.76342559, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78477025, + "num_input_tokens_seen": 281831885, + "step": 13066, + "time_per_iteration": 2.4843809604644775 + }, + { + "auxiliary_loss_clip": 0.01028636, + "auxiliary_loss_mlp": 0.01003966, + "balance_loss_clip": 1.00532889, + "balance_loss_mlp": 1.00267887, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.7061922075189971, + "language_loss": 0.53461945, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55494547, + "num_input_tokens_seen": 281900310, + "step": 13067, + "time_per_iteration": 3.0643632411956787 + }, + { + "auxiliary_loss_clip": 0.01064566, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.03490043, + "balance_loss_mlp": 1.01795292, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 1.5820482908084184, + "language_loss": 0.67696142, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69791126, + "num_input_tokens_seen": 281918870, + "step": 13068, + "time_per_iteration": 4.0406341552734375 + }, + { + "auxiliary_loss_clip": 0.01078022, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.03783178, + "balance_loss_mlp": 1.01851404, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 1.8996011354421332, + "language_loss": 0.6806587, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70174265, + "num_input_tokens_seen": 281936905, + "step": 13069, + "time_per_iteration": 2.5548863410949707 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.03866231, + "balance_loss_mlp": 1.01774836, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 2.06657826117903, + "language_loss": 0.76699495, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.78831375, + "num_input_tokens_seen": 281955625, + "step": 13070, + "time_per_iteration": 2.495884895324707 + }, + { + "auxiliary_loss_clip": 0.01050728, + "auxiliary_loss_mlp": 0.01037928, + "balance_loss_clip": 1.0295651, + "balance_loss_mlp": 1.02456498, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.6453370089858355, + "language_loss": 0.6559577, + "learning_rate": 4.618920199958083e-07, + "loss": 0.6768443, + "num_input_tokens_seen": 281973285, + "step": 13071, + "time_per_iteration": 2.5459601879119873 + }, + { + "auxiliary_loss_clip": 0.01063102, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.03115523, + "balance_loss_mlp": 1.01864672, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.5729493794082055, + "language_loss": 0.74108654, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76202536, + "num_input_tokens_seen": 281991410, + "step": 13072, + "time_per_iteration": 2.589670181274414 + }, + { + "auxiliary_loss_clip": 0.01097724, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.04056668, + "balance_loss_mlp": 1.01852059, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 1.8637797949641466, + "language_loss": 0.71836352, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73965508, + "num_input_tokens_seen": 282010845, + "step": 13073, + "time_per_iteration": 2.487412691116333 + }, + { + "auxiliary_loss_clip": 0.01083803, + "auxiliary_loss_mlp": 0.01032299, + "balance_loss_clip": 1.03835583, + "balance_loss_mlp": 1.01920962, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 1.6264863943788994, + "language_loss": 0.76577485, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78693581, + "num_input_tokens_seen": 282029635, + "step": 13074, + "time_per_iteration": 2.5158090591430664 + }, + { + "auxiliary_loss_clip": 0.01067599, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.03478098, + "balance_loss_mlp": 1.01943731, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.8723353374473588, + "language_loss": 0.75344735, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77443576, + "num_input_tokens_seen": 282050285, + "step": 13075, + "time_per_iteration": 2.545942544937134 + }, + { + "auxiliary_loss_clip": 0.0108148, + "auxiliary_loss_mlp": 0.01024924, + "balance_loss_clip": 1.03899014, + "balance_loss_mlp": 1.01361728, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 1.7854793870538712, + "language_loss": 0.68795216, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.7090162, + "num_input_tokens_seen": 282071040, + "step": 13076, + "time_per_iteration": 2.576559066772461 + }, + { + "auxiliary_loss_clip": 0.01096135, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.03513122, + "balance_loss_mlp": 1.01770091, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 2.067226520160794, + "language_loss": 0.79730427, + "learning_rate": 4.603994445488282e-07, + "loss": 0.81856596, + "num_input_tokens_seen": 282086610, + "step": 13077, + "time_per_iteration": 3.8223037719726562 + }, + { + "auxiliary_loss_clip": 0.01098763, + "auxiliary_loss_mlp": 0.01030666, + "balance_loss_clip": 1.03842187, + "balance_loss_mlp": 1.01810133, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.5367131359260529, + "language_loss": 0.70906281, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.73035705, + "num_input_tokens_seen": 282107440, + "step": 13078, + "time_per_iteration": 2.5737528800964355 + }, + { + "auxiliary_loss_clip": 0.01095244, + "auxiliary_loss_mlp": 0.01030853, + "balance_loss_clip": 1.03534758, + "balance_loss_mlp": 1.01884222, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.6293063848161786, + "language_loss": 0.81448483, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83574581, + "num_input_tokens_seen": 282127290, + "step": 13079, + "time_per_iteration": 2.5014212131500244 + }, + { + "auxiliary_loss_clip": 0.01076173, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.03602886, + "balance_loss_mlp": 1.01833963, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.4354330172685112, + "language_loss": 0.68575954, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70682383, + "num_input_tokens_seen": 282147505, + "step": 13080, + "time_per_iteration": 2.6085119247436523 + }, + { + "auxiliary_loss_clip": 0.01097051, + "auxiliary_loss_mlp": 0.01032214, + "balance_loss_clip": 1.03670335, + "balance_loss_mlp": 1.01976252, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 1.8020909507197738, + "language_loss": 0.69422001, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71551263, + "num_input_tokens_seen": 282166450, + "step": 13081, + "time_per_iteration": 2.4446051120758057 + }, + { + "auxiliary_loss_clip": 0.01086142, + "auxiliary_loss_mlp": 0.01036847, + "balance_loss_clip": 1.03575718, + "balance_loss_mlp": 1.02502155, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 2.000376695452657, + "language_loss": 0.68300545, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70423537, + "num_input_tokens_seen": 282186465, + "step": 13082, + "time_per_iteration": 2.49373722076416 + }, + { + "auxiliary_loss_clip": 0.01081969, + "auxiliary_loss_mlp": 0.01034636, + "balance_loss_clip": 1.03653991, + "balance_loss_mlp": 1.02177954, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 1.7740975843095717, + "language_loss": 0.66278082, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68394685, + "num_input_tokens_seen": 282207180, + "step": 13083, + "time_per_iteration": 2.5267844200134277 + }, + { + "auxiliary_loss_clip": 0.01091872, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.04012597, + "balance_loss_mlp": 1.01839042, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 2.3959644412502863, + "language_loss": 0.7420221, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.76325917, + "num_input_tokens_seen": 282225865, + "step": 13084, + "time_per_iteration": 2.4876716136932373 + }, + { + "auxiliary_loss_clip": 0.01084763, + "auxiliary_loss_mlp": 0.01039597, + "balance_loss_clip": 1.0365746, + "balance_loss_mlp": 1.0265379, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 3.9205451057321214, + "language_loss": 0.70644295, + "learning_rate": 4.584126136854591e-07, + "loss": 0.72768652, + "num_input_tokens_seen": 282242895, + "step": 13085, + "time_per_iteration": 2.4419126510620117 + }, + { + "auxiliary_loss_clip": 0.01085542, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.03536725, + "balance_loss_mlp": 1.02003694, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 1.8002088540382362, + "language_loss": 0.72017604, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74136353, + "num_input_tokens_seen": 282260425, + "step": 13086, + "time_per_iteration": 2.5023703575134277 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01028113, + "balance_loss_clip": 1.0351491, + "balance_loss_mlp": 1.01636529, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 2.1876787140790652, + "language_loss": 0.7485739, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.76992583, + "num_input_tokens_seen": 282279335, + "step": 13087, + "time_per_iteration": 2.409851551055908 + }, + { + "auxiliary_loss_clip": 0.01085785, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.03481126, + "balance_loss_mlp": 1.02096295, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 1.6300838143498917, + "language_loss": 0.71325213, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73443359, + "num_input_tokens_seen": 282299905, + "step": 13088, + "time_per_iteration": 2.5345664024353027 + }, + { + "auxiliary_loss_clip": 0.01028247, + "auxiliary_loss_mlp": 0.01003745, + "balance_loss_clip": 1.00478852, + "balance_loss_mlp": 1.00235009, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.6724647293666999, + "language_loss": 0.55438924, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57470912, + "num_input_tokens_seen": 282367620, + "step": 13089, + "time_per_iteration": 3.070687770843506 + }, + { + "auxiliary_loss_clip": 0.01021554, + "auxiliary_loss_mlp": 0.01002288, + "balance_loss_clip": 1.00877786, + "balance_loss_mlp": 1.00100064, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.7218707885057537, + "language_loss": 0.49993554, + "learning_rate": 4.571727439470976e-07, + "loss": 0.52017391, + "num_input_tokens_seen": 282435695, + "step": 13090, + "time_per_iteration": 3.1135940551757812 + }, + { + "auxiliary_loss_clip": 0.01096855, + "auxiliary_loss_mlp": 0.01029367, + "balance_loss_clip": 1.03670669, + "balance_loss_mlp": 1.01794624, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 1.6406664768236985, + "language_loss": 0.83466828, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.85593051, + "num_input_tokens_seen": 282456025, + "step": 13091, + "time_per_iteration": 2.5016653537750244 + }, + { + "auxiliary_loss_clip": 0.01020756, + "auxiliary_loss_mlp": 0.01000162, + "balance_loss_clip": 1.00705922, + "balance_loss_mlp": 0.99888098, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.7379759311008713, + "language_loss": 0.64029682, + "learning_rate": 4.566772055150947e-07, + "loss": 0.66050595, + "num_input_tokens_seen": 282520995, + "step": 13092, + "time_per_iteration": 4.511024713516235 + }, + { + "auxiliary_loss_clip": 0.01092721, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.04294848, + "balance_loss_mlp": 1.02192903, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 2.101101411618829, + "language_loss": 0.79258758, + "learning_rate": 4.564295240788285e-07, + "loss": 0.81386298, + "num_input_tokens_seen": 282539355, + "step": 13093, + "time_per_iteration": 2.474940061569214 + }, + { + "auxiliary_loss_clip": 0.01081808, + "auxiliary_loss_mlp": 0.01027049, + "balance_loss_clip": 1.03870618, + "balance_loss_mlp": 1.01521146, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 1.8945306177717403, + "language_loss": 0.75551492, + "learning_rate": 4.561819011749106e-07, + "loss": 0.77660346, + "num_input_tokens_seen": 282555735, + "step": 13094, + "time_per_iteration": 2.4748969078063965 + }, + { + "auxiliary_loss_clip": 0.01064827, + "auxiliary_loss_mlp": 0.01038921, + "balance_loss_clip": 1.03315735, + "balance_loss_mlp": 1.02508664, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.5767405833934063, + "language_loss": 0.79829216, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.81932962, + "num_input_tokens_seen": 282574550, + "step": 13095, + "time_per_iteration": 2.594498872756958 + }, + { + "auxiliary_loss_clip": 0.01099102, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.03612852, + "balance_loss_mlp": 1.01690531, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 2.12167155683069, + "language_loss": 0.67965472, + "learning_rate": 4.556868310016715e-07, + "loss": 0.70093918, + "num_input_tokens_seen": 282596520, + "step": 13096, + "time_per_iteration": 2.537909507751465 + }, + { + "auxiliary_loss_clip": 0.0108122, + "auxiliary_loss_mlp": 0.01027163, + "balance_loss_clip": 1.03290558, + "balance_loss_mlp": 1.01651132, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.435446066315284, + "language_loss": 0.70551842, + "learning_rate": 4.55439383751125e-07, + "loss": 0.7266022, + "num_input_tokens_seen": 282620560, + "step": 13097, + "time_per_iteration": 2.7260594367980957 + }, + { + "auxiliary_loss_clip": 0.0109204, + "auxiliary_loss_mlp": 0.010375, + "balance_loss_clip": 1.03968084, + "balance_loss_mlp": 1.02501321, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 1.5563930835749755, + "language_loss": 0.80737078, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82866621, + "num_input_tokens_seen": 282639830, + "step": 13098, + "time_per_iteration": 2.515362024307251 + }, + { + "auxiliary_loss_clip": 0.01069908, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.03420842, + "balance_loss_mlp": 1.01799309, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 1.6295568622646774, + "language_loss": 0.74279487, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76378906, + "num_input_tokens_seen": 282660130, + "step": 13099, + "time_per_iteration": 2.567202568054199 + }, + { + "auxiliary_loss_clip": 0.01086341, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.03435826, + "balance_loss_mlp": 1.01505756, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.577012338440914, + "language_loss": 0.78277975, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80391705, + "num_input_tokens_seen": 282681125, + "step": 13100, + "time_per_iteration": 2.542212724685669 + }, + { + "auxiliary_loss_clip": 0.01101468, + "auxiliary_loss_mlp": 0.00779476, + "balance_loss_clip": 1.03586578, + "balance_loss_mlp": 1.00063038, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 2.4186649327936776, + "language_loss": 0.66373962, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.68254906, + "num_input_tokens_seen": 282696690, + "step": 13101, + "time_per_iteration": 4.2875449657440186 + }, + { + "auxiliary_loss_clip": 0.01085174, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.03537691, + "balance_loss_mlp": 1.02020741, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.8034700435454087, + "language_loss": 0.77944267, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.80061805, + "num_input_tokens_seen": 282721210, + "step": 13102, + "time_per_iteration": 2.675083637237549 + }, + { + "auxiliary_loss_clip": 0.01097294, + "auxiliary_loss_mlp": 0.01040049, + "balance_loss_clip": 1.03433084, + "balance_loss_mlp": 1.028301, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 4.503669855733965, + "language_loss": 0.82435697, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84573036, + "num_input_tokens_seen": 282738505, + "step": 13103, + "time_per_iteration": 2.486084222793579 + }, + { + "auxiliary_loss_clip": 0.01101476, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.03785253, + "balance_loss_mlp": 1.02076364, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 2.1673997728759984, + "language_loss": 0.80265403, + "learning_rate": 4.537088934794913e-07, + "loss": 0.82400733, + "num_input_tokens_seen": 282756895, + "step": 13104, + "time_per_iteration": 2.488070249557495 + }, + { + "auxiliary_loss_clip": 0.01109318, + "auxiliary_loss_mlp": 0.01034484, + "balance_loss_clip": 1.03666437, + "balance_loss_mlp": 1.02217591, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.6102813215010618, + "language_loss": 0.74017692, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76161492, + "num_input_tokens_seen": 282774955, + "step": 13105, + "time_per_iteration": 2.408487319946289 + }, + { + "auxiliary_loss_clip": 0.01051504, + "auxiliary_loss_mlp": 0.01039955, + "balance_loss_clip": 1.03192401, + "balance_loss_mlp": 1.02728355, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 1.576544761623414, + "language_loss": 0.75844848, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.7793631, + "num_input_tokens_seen": 282793165, + "step": 13106, + "time_per_iteration": 2.6158385276794434 + }, + { + "auxiliary_loss_clip": 0.01061673, + "auxiliary_loss_mlp": 0.01031771, + "balance_loss_clip": 1.03979111, + "balance_loss_mlp": 1.02014196, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 2.2107335864118896, + "language_loss": 0.73297787, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75391233, + "num_input_tokens_seen": 282809820, + "step": 13107, + "time_per_iteration": 3.788947582244873 + }, + { + "auxiliary_loss_clip": 0.01106361, + "auxiliary_loss_mlp": 0.01032405, + "balance_loss_clip": 1.03536272, + "balance_loss_mlp": 1.02039468, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.571480982889577, + "language_loss": 0.73452723, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75591493, + "num_input_tokens_seen": 282828600, + "step": 13108, + "time_per_iteration": 2.424351215362549 + }, + { + "auxiliary_loss_clip": 0.0102871, + "auxiliary_loss_mlp": 0.01000686, + "balance_loss_clip": 1.00523114, + "balance_loss_mlp": 0.99936849, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 0.8917588652559852, + "language_loss": 0.60399282, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62428677, + "num_input_tokens_seen": 282882775, + "step": 13109, + "time_per_iteration": 2.9633431434631348 + }, + { + "auxiliary_loss_clip": 0.01068692, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.03530192, + "balance_loss_mlp": 1.01927257, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.9642464715885686, + "language_loss": 0.72193325, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.74293578, + "num_input_tokens_seen": 282902680, + "step": 13110, + "time_per_iteration": 2.5959949493408203 + }, + { + "auxiliary_loss_clip": 0.01060621, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.03688836, + "balance_loss_mlp": 1.01822793, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.4068360670597626, + "language_loss": 0.75435483, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77525568, + "num_input_tokens_seen": 282923625, + "step": 13111, + "time_per_iteration": 2.6199684143066406 + }, + { + "auxiliary_loss_clip": 0.01093246, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.03420448, + "balance_loss_mlp": 1.02180529, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 1.8624193626567311, + "language_loss": 0.61266828, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63394284, + "num_input_tokens_seen": 282941955, + "step": 13112, + "time_per_iteration": 2.466154098510742 + }, + { + "auxiliary_loss_clip": 0.01088519, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.0378418, + "balance_loss_mlp": 1.01448607, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 1.6572924148118402, + "language_loss": 0.67367017, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69482487, + "num_input_tokens_seen": 282961280, + "step": 13113, + "time_per_iteration": 2.485344648361206 + }, + { + "auxiliary_loss_clip": 0.01072449, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.03509104, + "balance_loss_mlp": 1.02032959, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 2.644355043527286, + "language_loss": 0.58489788, + "learning_rate": 4.5124174933361e-07, + "loss": 0.60595131, + "num_input_tokens_seen": 282978210, + "step": 13114, + "time_per_iteration": 2.4869186878204346 + }, + { + "auxiliary_loss_clip": 0.01064273, + "auxiliary_loss_mlp": 0.0103013, + "balance_loss_clip": 1.03955197, + "balance_loss_mlp": 1.01777351, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 1.5853110729299522, + "language_loss": 0.66911465, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69005871, + "num_input_tokens_seen": 282998845, + "step": 13115, + "time_per_iteration": 2.5881495475769043 + }, + { + "auxiliary_loss_clip": 0.01084272, + "auxiliary_loss_mlp": 0.01038969, + "balance_loss_clip": 1.03394914, + "balance_loss_mlp": 1.0258739, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 3.5449416318431117, + "language_loss": 0.88902342, + "learning_rate": 4.50749024954048e-07, + "loss": 0.91025591, + "num_input_tokens_seen": 283015200, + "step": 13116, + "time_per_iteration": 3.9006099700927734 + }, + { + "auxiliary_loss_clip": 0.01093559, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.03681302, + "balance_loss_mlp": 1.01823211, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 2.9632372460408054, + "language_loss": 0.72717988, + "learning_rate": 4.505027508812245e-07, + "loss": 0.74843103, + "num_input_tokens_seen": 283033680, + "step": 13117, + "time_per_iteration": 2.4852640628814697 + }, + { + "auxiliary_loss_clip": 0.01097699, + "auxiliary_loss_mlp": 0.01027828, + "balance_loss_clip": 1.0384531, + "balance_loss_mlp": 1.01637244, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.5350126701492304, + "language_loss": 0.80073452, + "learning_rate": 4.502565355654926e-07, + "loss": 0.82198977, + "num_input_tokens_seen": 283050620, + "step": 13118, + "time_per_iteration": 2.441603660583496 + }, + { + "auxiliary_loss_clip": 0.01095457, + "auxiliary_loss_mlp": 0.01026336, + "balance_loss_clip": 1.0359391, + "balance_loss_mlp": 1.01421881, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 3.212387883217661, + "language_loss": 0.73317897, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75439692, + "num_input_tokens_seen": 283070215, + "step": 13119, + "time_per_iteration": 2.452587127685547 + }, + { + "auxiliary_loss_clip": 0.01096274, + "auxiliary_loss_mlp": 0.01024815, + "balance_loss_clip": 1.03454721, + "balance_loss_mlp": 1.01231575, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 1.3920907913560217, + "language_loss": 0.72183466, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.74304557, + "num_input_tokens_seen": 283091485, + "step": 13120, + "time_per_iteration": 2.478449821472168 + }, + { + "auxiliary_loss_clip": 0.01086264, + "auxiliary_loss_mlp": 0.00781775, + "balance_loss_clip": 1.0367893, + "balance_loss_mlp": 1.00066304, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.5740805503135522, + "language_loss": 0.79159737, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.81027776, + "num_input_tokens_seen": 283115040, + "step": 13121, + "time_per_iteration": 2.61837100982666 + }, + { + "auxiliary_loss_clip": 0.01095508, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.0341022, + "balance_loss_mlp": 1.01970744, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 2.0586599312505496, + "language_loss": 0.80117369, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.82244903, + "num_input_tokens_seen": 283136925, + "step": 13122, + "time_per_iteration": 2.518143653869629 + }, + { + "auxiliary_loss_clip": 0.01080715, + "auxiliary_loss_mlp": 0.01027537, + "balance_loss_clip": 1.03411245, + "balance_loss_mlp": 1.01628947, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 3.5691667716328546, + "language_loss": 0.77993047, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80101299, + "num_input_tokens_seen": 283155725, + "step": 13123, + "time_per_iteration": 2.4567224979400635 + }, + { + "auxiliary_loss_clip": 0.01090857, + "auxiliary_loss_mlp": 0.01032684, + "balance_loss_clip": 1.04163718, + "balance_loss_mlp": 1.0208168, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 2.2736956222763145, + "language_loss": 0.67126441, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69249976, + "num_input_tokens_seen": 283173845, + "step": 13124, + "time_per_iteration": 2.519443988800049 + }, + { + "auxiliary_loss_clip": 0.01087279, + "auxiliary_loss_mlp": 0.01026745, + "balance_loss_clip": 1.03424692, + "balance_loss_mlp": 1.01393592, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 2.2404393276669605, + "language_loss": 0.72614443, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.74728465, + "num_input_tokens_seen": 283191985, + "step": 13125, + "time_per_iteration": 2.530273199081421 + }, + { + "auxiliary_loss_clip": 0.01092902, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.03558409, + "balance_loss_mlp": 1.01560092, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 2.0276338363350597, + "language_loss": 0.72417498, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74539095, + "num_input_tokens_seen": 283210855, + "step": 13126, + "time_per_iteration": 2.5258524417877197 + }, + { + "auxiliary_loss_clip": 0.0109134, + "auxiliary_loss_mlp": 0.01030148, + "balance_loss_clip": 1.03428197, + "balance_loss_mlp": 1.01711297, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 1.9434379298590836, + "language_loss": 0.76398915, + "learning_rate": 4.480432433327845e-07, + "loss": 0.78520405, + "num_input_tokens_seen": 283229665, + "step": 13127, + "time_per_iteration": 2.468291759490967 + }, + { + "auxiliary_loss_clip": 0.01092754, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.03491044, + "balance_loss_mlp": 1.02534866, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 2.5586823990710625, + "language_loss": 0.8566612, + "learning_rate": 4.47797616101103e-07, + "loss": 0.87797064, + "num_input_tokens_seen": 283248615, + "step": 13128, + "time_per_iteration": 2.479962110519409 + }, + { + "auxiliary_loss_clip": 0.01097665, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.03511548, + "balance_loss_mlp": 1.02437162, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 2.0839038294617005, + "language_loss": 0.69302356, + "learning_rate": 4.475520477290904e-07, + "loss": 0.71435887, + "num_input_tokens_seen": 283267135, + "step": 13129, + "time_per_iteration": 2.463879108428955 + }, + { + "auxiliary_loss_clip": 0.01020378, + "auxiliary_loss_mlp": 0.01017116, + "balance_loss_clip": 1.00517392, + "balance_loss_mlp": 1.01580453, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7173694701100658, + "language_loss": 0.61543083, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63580579, + "num_input_tokens_seen": 283328940, + "step": 13130, + "time_per_iteration": 3.041703939437866 + }, + { + "auxiliary_loss_clip": 0.01099768, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.0379473, + "balance_loss_mlp": 1.01651418, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 1.6719307553294636, + "language_loss": 0.73745501, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.75873494, + "num_input_tokens_seen": 283350000, + "step": 13131, + "time_per_iteration": 3.962228775024414 + }, + { + "auxiliary_loss_clip": 0.01089751, + "auxiliary_loss_mlp": 0.01029087, + "balance_loss_clip": 1.03661823, + "balance_loss_mlp": 1.01515746, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 2.2038882015822434, + "language_loss": 0.69150817, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.71269655, + "num_input_tokens_seen": 283368020, + "step": 13132, + "time_per_iteration": 2.4667701721191406 + }, + { + "auxiliary_loss_clip": 0.01100419, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.03735387, + "balance_loss_mlp": 1.02525508, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 2.100222832019172, + "language_loss": 0.62298411, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64437085, + "num_input_tokens_seen": 283387030, + "step": 13133, + "time_per_iteration": 2.4412624835968018 + }, + { + "auxiliary_loss_clip": 0.01082241, + "auxiliary_loss_mlp": 0.01038008, + "balance_loss_clip": 1.03718209, + "balance_loss_mlp": 1.024436, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 2.3170403025037616, + "language_loss": 0.79790086, + "learning_rate": 4.463250890899195e-07, + "loss": 0.8191033, + "num_input_tokens_seen": 283402090, + "step": 13134, + "time_per_iteration": 2.4932444095611572 + }, + { + "auxiliary_loss_clip": 0.010974, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.03342199, + "balance_loss_mlp": 1.01887715, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 1.7947682979504689, + "language_loss": 0.80355233, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82483816, + "num_input_tokens_seen": 283421035, + "step": 13135, + "time_per_iteration": 2.4262795448303223 + }, + { + "auxiliary_loss_clip": 0.01096944, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.0362978, + "balance_loss_mlp": 1.02018476, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.544466667579244, + "language_loss": 0.72434938, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74565029, + "num_input_tokens_seen": 283441830, + "step": 13136, + "time_per_iteration": 2.4706966876983643 + }, + { + "auxiliary_loss_clip": 0.01116203, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.03761101, + "balance_loss_mlp": 1.02390647, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 2.8637837334807084, + "language_loss": 0.70728111, + "learning_rate": 4.455896208180778e-07, + "loss": 0.72881484, + "num_input_tokens_seen": 283459540, + "step": 13137, + "time_per_iteration": 2.38736629486084 + }, + { + "auxiliary_loss_clip": 0.011083, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.03769875, + "balance_loss_mlp": 1.02024198, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.689902009760123, + "language_loss": 0.7378543, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.75927114, + "num_input_tokens_seen": 283478790, + "step": 13138, + "time_per_iteration": 2.401531219482422 + }, + { + "auxiliary_loss_clip": 0.01071513, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.03826523, + "balance_loss_mlp": 1.01866746, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 2.0538072725074614, + "language_loss": 0.68471128, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.70573282, + "num_input_tokens_seen": 283495720, + "step": 13139, + "time_per_iteration": 2.477672576904297 + }, + { + "auxiliary_loss_clip": 0.0101969, + "auxiliary_loss_mlp": 0.01003939, + "balance_loss_clip": 1.00481343, + "balance_loss_mlp": 1.00262225, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.8938821517816783, + "language_loss": 0.60180938, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62204564, + "num_input_tokens_seen": 283558795, + "step": 13140, + "time_per_iteration": 4.545746088027954 + }, + { + "auxiliary_loss_clip": 0.01109695, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.03745961, + "balance_loss_mlp": 1.01988828, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.6240765911043789, + "language_loss": 0.76095486, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.78238046, + "num_input_tokens_seen": 283579305, + "step": 13141, + "time_per_iteration": 2.488247871398926 + }, + { + "auxiliary_loss_clip": 0.01099004, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.03584802, + "balance_loss_mlp": 1.02185249, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 2.3918594372794875, + "language_loss": 0.68930423, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.71063882, + "num_input_tokens_seen": 283597840, + "step": 13142, + "time_per_iteration": 2.515260934829712 + }, + { + "auxiliary_loss_clip": 0.00984346, + "auxiliary_loss_mlp": 0.01007829, + "balance_loss_clip": 1.00925398, + "balance_loss_mlp": 1.00645196, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.8329467223388107, + "language_loss": 0.60055709, + "learning_rate": 4.441202759969049e-07, + "loss": 0.62047887, + "num_input_tokens_seen": 283647950, + "step": 13143, + "time_per_iteration": 3.0877432823181152 + }, + { + "auxiliary_loss_clip": 0.01081561, + "auxiliary_loss_mlp": 0.01031462, + "balance_loss_clip": 1.03833234, + "balance_loss_mlp": 1.01908779, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.7320190725909053, + "language_loss": 0.74739814, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76852834, + "num_input_tokens_seen": 283670645, + "step": 13144, + "time_per_iteration": 3.2446630001068115 + }, + { + "auxiliary_loss_clip": 0.01103207, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.03692865, + "balance_loss_mlp": 1.02156138, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 1.8610534689298381, + "language_loss": 0.83095634, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.8523342, + "num_input_tokens_seen": 283688830, + "step": 13145, + "time_per_iteration": 2.473395347595215 + }, + { + "auxiliary_loss_clip": 0.01094152, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_clip": 1.03558946, + "balance_loss_mlp": 1.01987171, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.725907326326199, + "language_loss": 0.72928113, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.75053501, + "num_input_tokens_seen": 283708625, + "step": 13146, + "time_per_iteration": 2.4718801975250244 + }, + { + "auxiliary_loss_clip": 0.01110462, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.0358963, + "balance_loss_mlp": 1.01870739, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 1.9307398210623292, + "language_loss": 0.75802827, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.7794432, + "num_input_tokens_seen": 283725710, + "step": 13147, + "time_per_iteration": 3.8337743282318115 + }, + { + "auxiliary_loss_clip": 0.01097146, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.03993011, + "balance_loss_mlp": 1.02438998, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 1.7407041718422747, + "language_loss": 0.72036606, + "learning_rate": 4.428974443697087e-07, + "loss": 0.74171335, + "num_input_tokens_seen": 283744150, + "step": 13148, + "time_per_iteration": 2.4753007888793945 + }, + { + "auxiliary_loss_clip": 0.01096237, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.03351855, + "balance_loss_mlp": 1.01767278, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 9.699839336574023, + "language_loss": 0.71529412, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.73655903, + "num_input_tokens_seen": 283764170, + "step": 13149, + "time_per_iteration": 2.5057308673858643 + }, + { + "auxiliary_loss_clip": 0.01077147, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.03321218, + "balance_loss_mlp": 1.02089095, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 1.8892226114811235, + "language_loss": 0.65713531, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67826247, + "num_input_tokens_seen": 283784305, + "step": 13150, + "time_per_iteration": 2.545609474182129 + }, + { + "auxiliary_loss_clip": 0.01108387, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.03619373, + "balance_loss_mlp": 1.01922774, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 8.918472830653064, + "language_loss": 0.7009201, + "learning_rate": 4.421644538650231e-07, + "loss": 0.72230965, + "num_input_tokens_seen": 283804040, + "step": 13151, + "time_per_iteration": 2.413341522216797 + }, + { + "auxiliary_loss_clip": 0.01090633, + "auxiliary_loss_mlp": 0.01036994, + "balance_loss_clip": 1.03616285, + "balance_loss_mlp": 1.02392268, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 1.6328615056025053, + "language_loss": 0.70137161, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72264791, + "num_input_tokens_seen": 283827120, + "step": 13152, + "time_per_iteration": 2.6516058444976807 + }, + { + "auxiliary_loss_clip": 0.01078328, + "auxiliary_loss_mlp": 0.0077722, + "balance_loss_clip": 1.03686917, + "balance_loss_mlp": 1.00065446, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 1.8901372950634103, + "language_loss": 0.72717476, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.74573028, + "num_input_tokens_seen": 283844820, + "step": 13153, + "time_per_iteration": 2.5179858207702637 + }, + { + "auxiliary_loss_clip": 0.01107587, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.03512204, + "balance_loss_mlp": 1.01697314, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.567522991329649, + "language_loss": 0.7900728, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81143934, + "num_input_tokens_seen": 283862870, + "step": 13154, + "time_per_iteration": 2.456224203109741 + }, + { + "auxiliary_loss_clip": 0.01104499, + "auxiliary_loss_mlp": 0.01030495, + "balance_loss_clip": 1.03717566, + "balance_loss_mlp": 1.01636887, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 2.0780149304006126, + "language_loss": 0.70055783, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72190773, + "num_input_tokens_seen": 283882405, + "step": 13155, + "time_per_iteration": 2.463622570037842 + }, + { + "auxiliary_loss_clip": 0.01110806, + "auxiliary_loss_mlp": 0.01027348, + "balance_loss_clip": 1.03705192, + "balance_loss_mlp": 1.01497436, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 1.6824206994658355, + "language_loss": 0.76810455, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.78948605, + "num_input_tokens_seen": 283902070, + "step": 13156, + "time_per_iteration": 3.7456376552581787 + }, + { + "auxiliary_loss_clip": 0.0107918, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.03323424, + "balance_loss_mlp": 1.02156281, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 2.044552730564926, + "language_loss": 0.65800023, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67913163, + "num_input_tokens_seen": 283924100, + "step": 13157, + "time_per_iteration": 2.590355396270752 + }, + { + "auxiliary_loss_clip": 0.01097898, + "auxiliary_loss_mlp": 0.01039563, + "balance_loss_clip": 1.03347707, + "balance_loss_mlp": 1.02591991, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 2.1513793485461474, + "language_loss": 0.73983848, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76121306, + "num_input_tokens_seen": 283944955, + "step": 13158, + "time_per_iteration": 2.4999945163726807 + }, + { + "auxiliary_loss_clip": 0.01095205, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.03519821, + "balance_loss_mlp": 1.01812434, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 2.261385738871563, + "language_loss": 0.67255509, + "learning_rate": 4.40212412422309e-07, + "loss": 0.6938023, + "num_input_tokens_seen": 283963125, + "step": 13159, + "time_per_iteration": 2.4150047302246094 + }, + { + "auxiliary_loss_clip": 0.0109736, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.03730488, + "balance_loss_mlp": 1.01748145, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 2.3201976191202283, + "language_loss": 0.67310107, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69436884, + "num_input_tokens_seen": 283982850, + "step": 13160, + "time_per_iteration": 2.449274778366089 + }, + { + "auxiliary_loss_clip": 0.01082134, + "auxiliary_loss_mlp": 0.01026659, + "balance_loss_clip": 1.03335059, + "balance_loss_mlp": 1.01634765, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 2.581138945876392, + "language_loss": 0.72838438, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.74947232, + "num_input_tokens_seen": 283998275, + "step": 13161, + "time_per_iteration": 2.5421907901763916 + }, + { + "auxiliary_loss_clip": 0.01082392, + "auxiliary_loss_mlp": 0.01030625, + "balance_loss_clip": 1.03548026, + "balance_loss_mlp": 1.0174644, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 2.0124463886541784, + "language_loss": 0.7330991, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75422931, + "num_input_tokens_seen": 284018750, + "step": 13162, + "time_per_iteration": 2.5216524600982666 + }, + { + "auxiliary_loss_clip": 0.01089522, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.03815341, + "balance_loss_mlp": 1.01802301, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 1.6979606546904262, + "language_loss": 0.71745467, + "learning_rate": 4.392378109401811e-07, + "loss": 0.73865294, + "num_input_tokens_seen": 284037850, + "step": 13163, + "time_per_iteration": 2.484912395477295 + }, + { + "auxiliary_loss_clip": 0.01076821, + "auxiliary_loss_mlp": 0.01034227, + "balance_loss_clip": 1.03449285, + "balance_loss_mlp": 1.02105439, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 2.0331660904288507, + "language_loss": 0.69640493, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.71751547, + "num_input_tokens_seen": 284056380, + "step": 13164, + "time_per_iteration": 2.530879259109497 + }, + { + "auxiliary_loss_clip": 0.01070695, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.03661823, + "balance_loss_mlp": 1.0193584, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 1.985511446788698, + "language_loss": 0.66632468, + "learning_rate": 4.387508652677177e-07, + "loss": 0.6873436, + "num_input_tokens_seen": 284074945, + "step": 13165, + "time_per_iteration": 2.572389841079712 + }, + { + "auxiliary_loss_clip": 0.01059964, + "auxiliary_loss_mlp": 0.01029197, + "balance_loss_clip": 1.03382325, + "balance_loss_mlp": 1.01826549, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 1.947100753982664, + "language_loss": 0.72424114, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74513274, + "num_input_tokens_seen": 284092070, + "step": 13166, + "time_per_iteration": 2.5719501972198486 + }, + { + "auxiliary_loss_clip": 0.0110703, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.03531313, + "balance_loss_mlp": 1.0181917, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 1.7391837497022058, + "language_loss": 0.77425194, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79563522, + "num_input_tokens_seen": 284112255, + "step": 13167, + "time_per_iteration": 2.471569061279297 + }, + { + "auxiliary_loss_clip": 0.01075561, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.03612947, + "balance_loss_mlp": 1.02026343, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 1.6184396711761744, + "language_loss": 0.8411386, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86220765, + "num_input_tokens_seen": 284132330, + "step": 13168, + "time_per_iteration": 2.54038667678833 + }, + { + "auxiliary_loss_clip": 0.01110285, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.03720403, + "balance_loss_mlp": 1.01634574, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 1.6362481179657462, + "language_loss": 0.72631782, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.74770463, + "num_input_tokens_seen": 284150640, + "step": 13169, + "time_per_iteration": 2.433619260787964 + }, + { + "auxiliary_loss_clip": 0.01111123, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.0361762, + "balance_loss_mlp": 1.02084363, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 2.746576436633718, + "language_loss": 0.67282343, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69427323, + "num_input_tokens_seen": 284171910, + "step": 13170, + "time_per_iteration": 2.5761938095092773 + }, + { + "auxiliary_loss_clip": 0.01096437, + "auxiliary_loss_mlp": 0.01024536, + "balance_loss_clip": 1.033337, + "balance_loss_mlp": 1.01297843, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 1.82098595530503, + "language_loss": 0.70692855, + "learning_rate": 4.372914494109412e-07, + "loss": 0.72813833, + "num_input_tokens_seen": 284191340, + "step": 13171, + "time_per_iteration": 3.982302665710449 + }, + { + "auxiliary_loss_clip": 0.0109737, + "auxiliary_loss_mlp": 0.01028012, + "balance_loss_clip": 1.03678155, + "balance_loss_mlp": 1.01587033, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 1.9424156364255984, + "language_loss": 0.6676451, + "learning_rate": 4.370484207842553e-07, + "loss": 0.68889886, + "num_input_tokens_seen": 284212495, + "step": 13172, + "time_per_iteration": 2.5693747997283936 + }, + { + "auxiliary_loss_clip": 0.01083261, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.03280926, + "balance_loss_mlp": 1.02004576, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 1.745098025103658, + "language_loss": 0.79632568, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81748122, + "num_input_tokens_seen": 284230825, + "step": 13173, + "time_per_iteration": 2.4884910583496094 + }, + { + "auxiliary_loss_clip": 0.01072564, + "auxiliary_loss_mlp": 0.01037593, + "balance_loss_clip": 1.03132033, + "balance_loss_mlp": 1.02532053, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 1.9090663666827012, + "language_loss": 0.76994932, + "learning_rate": 4.365625413419365e-07, + "loss": 0.79105085, + "num_input_tokens_seen": 284250365, + "step": 13174, + "time_per_iteration": 2.539539337158203 + }, + { + "auxiliary_loss_clip": 0.010839, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.03255749, + "balance_loss_mlp": 1.01754403, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 1.6319014282808049, + "language_loss": 0.71563894, + "learning_rate": 4.363196905447297e-07, + "loss": 0.73676836, + "num_input_tokens_seen": 284269635, + "step": 13175, + "time_per_iteration": 2.5390565395355225 + }, + { + "auxiliary_loss_clip": 0.01097234, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.03393781, + "balance_loss_mlp": 1.01603246, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 1.7923186125946433, + "language_loss": 0.59653395, + "learning_rate": 4.360768990424364e-07, + "loss": 0.61779565, + "num_input_tokens_seen": 284288380, + "step": 13176, + "time_per_iteration": 2.4411933422088623 + }, + { + "auxiliary_loss_clip": 0.01109632, + "auxiliary_loss_mlp": 0.01031906, + "balance_loss_clip": 1.03867137, + "balance_loss_mlp": 1.01990128, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 1.916944449159079, + "language_loss": 0.73301029, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75442564, + "num_input_tokens_seen": 284306920, + "step": 13177, + "time_per_iteration": 2.3969411849975586 + }, + { + "auxiliary_loss_clip": 0.01091862, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.03525507, + "balance_loss_mlp": 1.02105999, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 2.071611203498628, + "language_loss": 0.64019263, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66144657, + "num_input_tokens_seen": 284324700, + "step": 13178, + "time_per_iteration": 2.4338796138763428 + }, + { + "auxiliary_loss_clip": 0.01088398, + "auxiliary_loss_mlp": 0.01029643, + "balance_loss_clip": 1.03813815, + "balance_loss_mlp": 1.0189383, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.49148398132709, + "language_loss": 0.68684965, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.7080301, + "num_input_tokens_seen": 284345985, + "step": 13179, + "time_per_iteration": 4.0590057373046875 + }, + { + "auxiliary_loss_clip": 0.01106028, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.03459561, + "balance_loss_mlp": 1.01515651, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 3.2502165235748604, + "language_loss": 0.74175143, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76308298, + "num_input_tokens_seen": 284364475, + "step": 13180, + "time_per_iteration": 2.4412126541137695 + }, + { + "auxiliary_loss_clip": 0.01101583, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.04163861, + "balance_loss_mlp": 1.02381563, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 3.8500824278633394, + "language_loss": 0.81661379, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.83799994, + "num_input_tokens_seen": 284382125, + "step": 13181, + "time_per_iteration": 2.447775363922119 + }, + { + "auxiliary_loss_clip": 0.01085918, + "auxiliary_loss_mlp": 0.01036396, + "balance_loss_clip": 1.03863192, + "balance_loss_mlp": 1.02314591, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 1.7165644900783332, + "language_loss": 0.77515638, + "learning_rate": 4.346213957372895e-07, + "loss": 0.79637951, + "num_input_tokens_seen": 284401585, + "step": 13182, + "time_per_iteration": 2.545788526535034 + }, + { + "auxiliary_loss_clip": 0.01095268, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.03602767, + "balance_loss_mlp": 1.02233934, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 1.9619776288285746, + "language_loss": 0.74372751, + "learning_rate": 4.34379019557056e-07, + "loss": 0.76504093, + "num_input_tokens_seen": 284419125, + "step": 13183, + "time_per_iteration": 2.460158348083496 + }, + { + "auxiliary_loss_clip": 0.01079312, + "auxiliary_loss_mlp": 0.01027409, + "balance_loss_clip": 1.03533351, + "balance_loss_mlp": 1.0143199, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 2.0402427530749008, + "language_loss": 0.68337345, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70444065, + "num_input_tokens_seen": 284440445, + "step": 13184, + "time_per_iteration": 2.6265058517456055 + }, + { + "auxiliary_loss_clip": 0.01070439, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.03608811, + "balance_loss_mlp": 1.02025056, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 4.491314883304851, + "language_loss": 0.7063399, + "learning_rate": 4.338944453112907e-07, + "loss": 0.72736907, + "num_input_tokens_seen": 284459370, + "step": 13185, + "time_per_iteration": 2.5512990951538086 + }, + { + "auxiliary_loss_clip": 0.01096281, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.03809834, + "balance_loss_mlp": 1.01731372, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 2.049664223397362, + "language_loss": 0.65363336, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67489326, + "num_input_tokens_seen": 284477525, + "step": 13186, + "time_per_iteration": 3.836840867996216 + }, + { + "auxiliary_loss_clip": 0.01093134, + "auxiliary_loss_mlp": 0.01033981, + "balance_loss_clip": 1.0352751, + "balance_loss_mlp": 1.02179193, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 1.5645589290409783, + "language_loss": 0.7692346, + "learning_rate": 4.334101086130408e-07, + "loss": 0.79050577, + "num_input_tokens_seen": 284496590, + "step": 13187, + "time_per_iteration": 2.481473445892334 + }, + { + "auxiliary_loss_clip": 0.01088889, + "auxiliary_loss_mlp": 0.01027331, + "balance_loss_clip": 1.03556824, + "balance_loss_mlp": 1.01567245, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 2.1318520109128927, + "language_loss": 0.72509944, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.74626166, + "num_input_tokens_seen": 284511470, + "step": 13188, + "time_per_iteration": 2.478344202041626 + }, + { + "auxiliary_loss_clip": 0.01108743, + "auxiliary_loss_mlp": 0.00779368, + "balance_loss_clip": 1.03487396, + "balance_loss_mlp": 1.00056911, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 6.8192411733044205, + "language_loss": 0.63110852, + "learning_rate": 4.329260095357725e-07, + "loss": 0.6499896, + "num_input_tokens_seen": 284531125, + "step": 13189, + "time_per_iteration": 2.4118831157684326 + }, + { + "auxiliary_loss_clip": 0.01071341, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.04353607, + "balance_loss_mlp": 1.01763308, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 2.1672934366542433, + "language_loss": 0.72416687, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74517101, + "num_input_tokens_seen": 284549340, + "step": 13190, + "time_per_iteration": 2.546325922012329 + }, + { + "auxiliary_loss_clip": 0.01093126, + "auxiliary_loss_mlp": 0.01028987, + "balance_loss_clip": 1.03687382, + "balance_loss_mlp": 1.01864552, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 1.8616020358943888, + "language_loss": 0.73230457, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75352567, + "num_input_tokens_seen": 284567060, + "step": 13191, + "time_per_iteration": 2.4948644638061523 + }, + { + "auxiliary_loss_clip": 0.01096042, + "auxiliary_loss_mlp": 0.01039992, + "balance_loss_clip": 1.03509569, + "balance_loss_mlp": 1.02751124, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 1.9418436000909514, + "language_loss": 0.6906653, + "learning_rate": 4.322003066198219e-07, + "loss": 0.71202564, + "num_input_tokens_seen": 284586600, + "step": 13192, + "time_per_iteration": 2.4548935890197754 + }, + { + "auxiliary_loss_clip": 0.01076079, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.03323269, + "balance_loss_mlp": 1.02406991, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 1.701591026132328, + "language_loss": 0.75073278, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77185571, + "num_input_tokens_seen": 284605715, + "step": 13193, + "time_per_iteration": 2.5475285053253174 + }, + { + "auxiliary_loss_clip": 0.0109833, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.03793001, + "balance_loss_mlp": 1.01926661, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 2.010232298191349, + "language_loss": 0.72325718, + "learning_rate": 4.317168019161741e-07, + "loss": 0.74456817, + "num_input_tokens_seen": 284628540, + "step": 13194, + "time_per_iteration": 2.52994441986084 + }, + { + "auxiliary_loss_clip": 0.01113146, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.037202, + "balance_loss_mlp": 1.01963758, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 2.8985007045932436, + "language_loss": 0.70270777, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72416449, + "num_input_tokens_seen": 284646040, + "step": 13195, + "time_per_iteration": 4.018856525421143 + }, + { + "auxiliary_loss_clip": 0.01058532, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.03608418, + "balance_loss_mlp": 1.01508415, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 1.7222129158280615, + "language_loss": 0.77560747, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79646599, + "num_input_tokens_seen": 284665110, + "step": 13196, + "time_per_iteration": 2.602407217025757 + }, + { + "auxiliary_loss_clip": 0.01081159, + "auxiliary_loss_mlp": 0.01037585, + "balance_loss_clip": 1.03941131, + "balance_loss_mlp": 1.02569366, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 1.5148388833326216, + "language_loss": 0.6886282, + "learning_rate": 4.309919909045268e-07, + "loss": 0.70981568, + "num_input_tokens_seen": 284686515, + "step": 13197, + "time_per_iteration": 2.636711359024048 + }, + { + "auxiliary_loss_clip": 0.01097972, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.03765237, + "balance_loss_mlp": 1.01944923, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 3.006224891365202, + "language_loss": 0.64854378, + "learning_rate": 4.30750506215646e-07, + "loss": 0.66983759, + "num_input_tokens_seen": 284707300, + "step": 13198, + "time_per_iteration": 2.548954486846924 + }, + { + "auxiliary_loss_clip": 0.01065261, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.03719604, + "balance_loss_mlp": 1.02378345, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 1.9512727939117789, + "language_loss": 0.72470748, + "learning_rate": 4.30509081032864e-07, + "loss": 0.74574316, + "num_input_tokens_seen": 284723545, + "step": 13199, + "time_per_iteration": 2.5413174629211426 + }, + { + "auxiliary_loss_clip": 0.01083129, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.03282654, + "balance_loss_mlp": 1.01663876, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 1.833677516096379, + "language_loss": 0.80788732, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82900858, + "num_input_tokens_seen": 284742650, + "step": 13200, + "time_per_iteration": 2.535266160964966 + }, + { + "auxiliary_loss_clip": 0.01095262, + "auxiliary_loss_mlp": 0.01030967, + "balance_loss_clip": 1.03704524, + "balance_loss_mlp": 1.01945138, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 2.153836352759506, + "language_loss": 0.77563131, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.7968936, + "num_input_tokens_seen": 284760955, + "step": 13201, + "time_per_iteration": 2.444478750228882 + }, + { + "auxiliary_loss_clip": 0.01104979, + "auxiliary_loss_mlp": 0.0103151, + "balance_loss_clip": 1.03461063, + "balance_loss_mlp": 1.01997685, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 1.7239182237539061, + "language_loss": 0.67308784, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.6944527, + "num_input_tokens_seen": 284780745, + "step": 13202, + "time_per_iteration": 2.451427936553955 + }, + { + "auxiliary_loss_clip": 0.01098082, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.03625619, + "balance_loss_mlp": 1.02072692, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 1.9211076501877946, + "language_loss": 0.74996817, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.77128279, + "num_input_tokens_seen": 284799000, + "step": 13203, + "time_per_iteration": 2.467440605163574 + }, + { + "auxiliary_loss_clip": 0.01060085, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.03785169, + "balance_loss_mlp": 1.01739192, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 1.8151157736411052, + "language_loss": 0.66258746, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68348134, + "num_input_tokens_seen": 284817450, + "step": 13204, + "time_per_iteration": 2.661810874938965 + }, + { + "auxiliary_loss_clip": 0.01056163, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.04012394, + "balance_loss_mlp": 1.01754951, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.3733354382684668, + "language_loss": 0.79404336, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81489861, + "num_input_tokens_seen": 284838865, + "step": 13205, + "time_per_iteration": 2.6679108142852783 + }, + { + "auxiliary_loss_clip": 0.01074466, + "auxiliary_loss_mlp": 0.01028649, + "balance_loss_clip": 1.0322175, + "balance_loss_mlp": 1.01623309, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 2.0762822824509373, + "language_loss": 0.77669042, + "learning_rate": 4.28820771692858e-07, + "loss": 0.79772156, + "num_input_tokens_seen": 284857975, + "step": 13206, + "time_per_iteration": 2.5645592212677 + }, + { + "auxiliary_loss_clip": 0.01085978, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.03580618, + "balance_loss_mlp": 1.0230726, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 2.123497655728947, + "language_loss": 0.79235804, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81358945, + "num_input_tokens_seen": 284877145, + "step": 13207, + "time_per_iteration": 2.542606830596924 + }, + { + "auxiliary_loss_clip": 0.01077517, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.04025733, + "balance_loss_mlp": 1.02180362, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 1.7873084243871544, + "language_loss": 0.841447, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.8625654, + "num_input_tokens_seen": 284895560, + "step": 13208, + "time_per_iteration": 2.589951276779175 + }, + { + "auxiliary_loss_clip": 0.00995378, + "auxiliary_loss_mlp": 0.01003496, + "balance_loss_clip": 1.00907087, + "balance_loss_mlp": 1.00225627, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.7262454374264471, + "language_loss": 0.58317012, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60315889, + "num_input_tokens_seen": 284963135, + "step": 13209, + "time_per_iteration": 3.52121901512146 + }, + { + "auxiliary_loss_clip": 0.0107587, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.03366518, + "balance_loss_mlp": 1.01687551, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 2.30587315298648, + "language_loss": 0.6341821, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.65523994, + "num_input_tokens_seen": 284981755, + "step": 13210, + "time_per_iteration": 2.796393871307373 + }, + { + "auxiliary_loss_clip": 0.0109371, + "auxiliary_loss_mlp": 0.01036403, + "balance_loss_clip": 1.03730559, + "balance_loss_mlp": 1.0243746, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.5689137792633259, + "language_loss": 0.69172275, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.7130239, + "num_input_tokens_seen": 285003060, + "step": 13211, + "time_per_iteration": 3.9682838916778564 + }, + { + "auxiliary_loss_clip": 0.01100999, + "auxiliary_loss_mlp": 0.01038904, + "balance_loss_clip": 1.03740191, + "balance_loss_mlp": 1.02553511, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 1.616468533277012, + "language_loss": 0.71833956, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.73973858, + "num_input_tokens_seen": 285021640, + "step": 13212, + "time_per_iteration": 2.5066587924957275 + }, + { + "auxiliary_loss_clip": 0.01094694, + "auxiliary_loss_mlp": 0.01030001, + "balance_loss_clip": 1.03472698, + "balance_loss_mlp": 1.01804447, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 1.715878583704539, + "language_loss": 0.80843771, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82968462, + "num_input_tokens_seen": 285040490, + "step": 13213, + "time_per_iteration": 2.474224805831909 + }, + { + "auxiliary_loss_clip": 0.01100891, + "auxiliary_loss_mlp": 0.01030394, + "balance_loss_clip": 1.03710985, + "balance_loss_mlp": 1.01709676, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 2.318798519389018, + "language_loss": 0.67306137, + "learning_rate": 4.268948502428327e-07, + "loss": 0.6943742, + "num_input_tokens_seen": 285059270, + "step": 13214, + "time_per_iteration": 2.4523425102233887 + }, + { + "auxiliary_loss_clip": 0.01105578, + "auxiliary_loss_mlp": 0.01029068, + "balance_loss_clip": 1.03547239, + "balance_loss_mlp": 1.01737332, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 1.9666266656005928, + "language_loss": 0.72337008, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74471653, + "num_input_tokens_seen": 285075390, + "step": 13215, + "time_per_iteration": 2.411367177963257 + }, + { + "auxiliary_loss_clip": 0.01065869, + "auxiliary_loss_mlp": 0.01036152, + "balance_loss_clip": 1.04150105, + "balance_loss_mlp": 1.02327776, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.5279773156689394, + "language_loss": 0.78589165, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.80691183, + "num_input_tokens_seen": 285096290, + "step": 13216, + "time_per_iteration": 2.6308860778808594 + }, + { + "auxiliary_loss_clip": 0.01095181, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.03400421, + "balance_loss_mlp": 1.0204953, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.7702200473464877, + "language_loss": 0.74098039, + "learning_rate": 4.261736137111598e-07, + "loss": 0.76225817, + "num_input_tokens_seen": 285116020, + "step": 13217, + "time_per_iteration": 2.476039171218872 + }, + { + "auxiliary_loss_clip": 0.01083452, + "auxiliary_loss_mlp": 0.01031027, + "balance_loss_clip": 1.03577864, + "balance_loss_mlp": 1.01904631, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 1.8275888212983837, + "language_loss": 0.7382648, + "learning_rate": 4.259333208810907e-07, + "loss": 0.75940961, + "num_input_tokens_seen": 285133510, + "step": 13218, + "time_per_iteration": 4.251389980316162 + }, + { + "auxiliary_loss_clip": 0.01098358, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.03438139, + "balance_loss_mlp": 1.02087545, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 1.9269297520310236, + "language_loss": 0.83456755, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85589039, + "num_input_tokens_seen": 285151690, + "step": 13219, + "time_per_iteration": 2.5669078826904297 + }, + { + "auxiliary_loss_clip": 0.0109982, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.03658009, + "balance_loss_mlp": 1.02284718, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 2.6271739243674097, + "language_loss": 0.75374365, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.77511394, + "num_input_tokens_seen": 285170485, + "step": 13220, + "time_per_iteration": 2.4967494010925293 + }, + { + "auxiliary_loss_clip": 0.01083462, + "auxiliary_loss_mlp": 0.01034447, + "balance_loss_clip": 1.03431714, + "balance_loss_mlp": 1.02203131, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 2.929433903273031, + "language_loss": 0.7235207, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74469978, + "num_input_tokens_seen": 285191050, + "step": 13221, + "time_per_iteration": 2.63612699508667 + }, + { + "auxiliary_loss_clip": 0.01098654, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.0382117, + "balance_loss_mlp": 1.01563764, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 3.3694069600807905, + "language_loss": 0.75399053, + "learning_rate": 4.249727465395634e-07, + "loss": 0.77524769, + "num_input_tokens_seen": 285208750, + "step": 13222, + "time_per_iteration": 2.4780406951904297 + }, + { + "auxiliary_loss_clip": 0.01014884, + "auxiliary_loss_mlp": 0.01000955, + "balance_loss_clip": 1.0108211, + "balance_loss_mlp": 0.99971479, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7772983544014324, + "language_loss": 0.67067206, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69083041, + "num_input_tokens_seen": 285264605, + "step": 13223, + "time_per_iteration": 2.973069429397583 + }, + { + "auxiliary_loss_clip": 0.01098927, + "auxiliary_loss_mlp": 0.01033657, + "balance_loss_clip": 1.03763008, + "balance_loss_mlp": 1.02074623, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 2.025112584754389, + "language_loss": 0.71093088, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73225677, + "num_input_tokens_seen": 285283940, + "step": 13224, + "time_per_iteration": 2.4870972633361816 + }, + { + "auxiliary_loss_clip": 0.01028468, + "auxiliary_loss_mlp": 0.01000776, + "balance_loss_clip": 1.0052104, + "balance_loss_mlp": 0.99955368, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.6698542257839308, + "language_loss": 0.54991984, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57021224, + "num_input_tokens_seen": 285349525, + "step": 13225, + "time_per_iteration": 4.538046598434448 + }, + { + "auxiliary_loss_clip": 0.01082233, + "auxiliary_loss_mlp": 0.01021564, + "balance_loss_clip": 1.03244138, + "balance_loss_mlp": 1.01065671, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 1.8699318855170264, + "language_loss": 0.64768457, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.66872263, + "num_input_tokens_seen": 285367355, + "step": 13226, + "time_per_iteration": 2.49295973777771 + }, + { + "auxiliary_loss_clip": 0.01069589, + "auxiliary_loss_mlp": 0.01036612, + "balance_loss_clip": 1.04106772, + "balance_loss_mlp": 1.02478647, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 2.2614533118197615, + "language_loss": 0.70027184, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72133386, + "num_input_tokens_seen": 285386190, + "step": 13227, + "time_per_iteration": 2.7042086124420166 + }, + { + "auxiliary_loss_clip": 0.01065854, + "auxiliary_loss_mlp": 0.01025683, + "balance_loss_clip": 1.03329754, + "balance_loss_mlp": 1.01472759, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 1.794764306165948, + "language_loss": 0.69282329, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71373868, + "num_input_tokens_seen": 285406150, + "step": 13228, + "time_per_iteration": 2.57877254486084 + }, + { + "auxiliary_loss_clip": 0.01062798, + "auxiliary_loss_mlp": 0.01040664, + "balance_loss_clip": 1.03214765, + "balance_loss_mlp": 1.02756262, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.4762278173179673, + "language_loss": 0.7092247, + "learning_rate": 4.232940412119095e-07, + "loss": 0.73025936, + "num_input_tokens_seen": 285429900, + "step": 13229, + "time_per_iteration": 2.7431225776672363 + }, + { + "auxiliary_loss_clip": 0.01102532, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.03796387, + "balance_loss_mlp": 1.02375627, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 1.9981718086110596, + "language_loss": 0.71843946, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.73982549, + "num_input_tokens_seen": 285452555, + "step": 13230, + "time_per_iteration": 2.5301055908203125 + }, + { + "auxiliary_loss_clip": 0.01016845, + "auxiliary_loss_mlp": 0.00999853, + "balance_loss_clip": 1.01323771, + "balance_loss_mlp": 0.9986372, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.8987935207575369, + "language_loss": 0.63538724, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65555423, + "num_input_tokens_seen": 285515700, + "step": 13231, + "time_per_iteration": 3.1032886505126953 + }, + { + "auxiliary_loss_clip": 0.01087494, + "auxiliary_loss_mlp": 0.01028554, + "balance_loss_clip": 1.03530288, + "balance_loss_mlp": 1.01683605, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 1.6679572366352986, + "language_loss": 0.69748354, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.71864402, + "num_input_tokens_seen": 285533910, + "step": 13232, + "time_per_iteration": 2.512064218521118 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.03586745, + "balance_loss_mlp": 1.01560593, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 1.7142833173616636, + "language_loss": 0.7781592, + "learning_rate": 4.223360961792952e-07, + "loss": 0.79942292, + "num_input_tokens_seen": 285554080, + "step": 13233, + "time_per_iteration": 2.5623421669006348 + }, + { + "auxiliary_loss_clip": 0.0110095, + "auxiliary_loss_mlp": 0.01034641, + "balance_loss_clip": 1.03804064, + "balance_loss_mlp": 1.02240968, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 1.9840686875706177, + "language_loss": 0.78507382, + "learning_rate": 4.220967594613769e-07, + "loss": 0.80642974, + "num_input_tokens_seen": 285572325, + "step": 13234, + "time_per_iteration": 3.9860920906066895 + }, + { + "auxiliary_loss_clip": 0.01086549, + "auxiliary_loss_mlp": 0.00776382, + "balance_loss_clip": 1.03625667, + "balance_loss_mlp": 1.00049484, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 1.7675522861627622, + "language_loss": 0.70032287, + "learning_rate": 4.218574825777077e-07, + "loss": 0.71895218, + "num_input_tokens_seen": 285589770, + "step": 13235, + "time_per_iteration": 2.4584877490997314 + }, + { + "auxiliary_loss_clip": 0.01071213, + "auxiliary_loss_mlp": 0.01028418, + "balance_loss_clip": 1.03410077, + "balance_loss_mlp": 1.01598406, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 1.4852731436555973, + "language_loss": 0.67938584, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.70038217, + "num_input_tokens_seen": 285610065, + "step": 13236, + "time_per_iteration": 2.548462152481079 + }, + { + "auxiliary_loss_clip": 0.01062027, + "auxiliary_loss_mlp": 0.01028265, + "balance_loss_clip": 1.03636003, + "balance_loss_mlp": 1.0160346, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.6660441645019983, + "language_loss": 0.75328141, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77418429, + "num_input_tokens_seen": 285628480, + "step": 13237, + "time_per_iteration": 2.5938282012939453 + }, + { + "auxiliary_loss_clip": 0.01099145, + "auxiliary_loss_mlp": 0.01035182, + "balance_loss_clip": 1.03821063, + "balance_loss_mlp": 1.022403, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 2.560743616914568, + "language_loss": 0.71509516, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73643839, + "num_input_tokens_seen": 285647805, + "step": 13238, + "time_per_iteration": 2.450822114944458 + }, + { + "auxiliary_loss_clip": 0.01097506, + "auxiliary_loss_mlp": 0.01027752, + "balance_loss_clip": 1.03867495, + "balance_loss_mlp": 1.01534188, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 2.048977969219583, + "language_loss": 0.73846608, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.75971866, + "num_input_tokens_seen": 285665505, + "step": 13239, + "time_per_iteration": 2.4567511081695557 + }, + { + "auxiliary_loss_clip": 0.01112054, + "auxiliary_loss_mlp": 0.01033495, + "balance_loss_clip": 1.03661895, + "balance_loss_mlp": 1.02067399, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 1.9329873660296193, + "language_loss": 0.69289339, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71434891, + "num_input_tokens_seen": 285685855, + "step": 13240, + "time_per_iteration": 2.471578598022461 + }, + { + "auxiliary_loss_clip": 0.01020999, + "auxiliary_loss_mlp": 0.0100318, + "balance_loss_clip": 1.0066061, + "balance_loss_mlp": 1.00183344, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8968567100142022, + "language_loss": 0.58634782, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60658967, + "num_input_tokens_seen": 285735710, + "step": 13241, + "time_per_iteration": 2.821178436279297 + }, + { + "auxiliary_loss_clip": 0.01079077, + "auxiliary_loss_mlp": 0.01030081, + "balance_loss_clip": 1.03776956, + "balance_loss_mlp": 1.01886296, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 1.8795381005533232, + "language_loss": 0.64627987, + "learning_rate": 4.201842205128772e-07, + "loss": 0.66737139, + "num_input_tokens_seen": 285757045, + "step": 13242, + "time_per_iteration": 2.706157922744751 + }, + { + "auxiliary_loss_clip": 0.01108117, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.03537989, + "balance_loss_mlp": 1.02137268, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 2.0659600379558762, + "language_loss": 0.7588526, + "learning_rate": 4.199454226296526e-07, + "loss": 0.78027481, + "num_input_tokens_seen": 285776050, + "step": 13243, + "time_per_iteration": 2.4318084716796875 + }, + { + "auxiliary_loss_clip": 0.01082925, + "auxiliary_loss_mlp": 0.01030823, + "balance_loss_clip": 1.0379231, + "balance_loss_mlp": 1.01809192, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.7941545602064282, + "language_loss": 0.79285687, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.81399429, + "num_input_tokens_seen": 285796830, + "step": 13244, + "time_per_iteration": 2.6562728881835938 + }, + { + "auxiliary_loss_clip": 0.01102173, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.03586364, + "balance_loss_mlp": 1.01637101, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 2.1806785037700713, + "language_loss": 0.68098706, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70229852, + "num_input_tokens_seen": 285814755, + "step": 13245, + "time_per_iteration": 2.6028242111206055 + }, + { + "auxiliary_loss_clip": 0.01088263, + "auxiliary_loss_mlp": 0.01033675, + "balance_loss_clip": 1.03527153, + "balance_loss_mlp": 1.02154613, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.3973974838874599, + "language_loss": 0.79164255, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81286192, + "num_input_tokens_seen": 285834255, + "step": 13246, + "time_per_iteration": 2.5465099811553955 + }, + { + "auxiliary_loss_clip": 0.01090313, + "auxiliary_loss_mlp": 0.01031745, + "balance_loss_clip": 1.03660679, + "balance_loss_mlp": 1.01968122, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 1.8220806057863015, + "language_loss": 0.66409552, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.68531609, + "num_input_tokens_seen": 285853540, + "step": 13247, + "time_per_iteration": 2.5245258808135986 + }, + { + "auxiliary_loss_clip": 0.01084247, + "auxiliary_loss_mlp": 0.01028911, + "balance_loss_clip": 1.0359056, + "balance_loss_mlp": 1.0181762, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 2.1438378615632923, + "language_loss": 0.71870017, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.73983175, + "num_input_tokens_seen": 285872705, + "step": 13248, + "time_per_iteration": 2.5392730236053467 + }, + { + "auxiliary_loss_clip": 0.01093421, + "auxiliary_loss_mlp": 0.01030641, + "balance_loss_clip": 1.03770876, + "balance_loss_mlp": 1.01866007, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 2.6199293218369344, + "language_loss": 0.75980413, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78104472, + "num_input_tokens_seen": 285890290, + "step": 13249, + "time_per_iteration": 2.516306161880493 + }, + { + "auxiliary_loss_clip": 0.01082464, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.03729868, + "balance_loss_mlp": 1.0207417, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.1329401015572, + "language_loss": 0.6160481, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63719958, + "num_input_tokens_seen": 285909190, + "step": 13250, + "time_per_iteration": 3.9123690128326416 + }, + { + "auxiliary_loss_clip": 0.01087416, + "auxiliary_loss_mlp": 0.01026761, + "balance_loss_clip": 1.03410101, + "balance_loss_mlp": 1.01452971, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 2.1291920662123696, + "language_loss": 0.71733272, + "learning_rate": 4.180371972938206e-07, + "loss": 0.73847449, + "num_input_tokens_seen": 285927570, + "step": 13251, + "time_per_iteration": 2.4833476543426514 + }, + { + "auxiliary_loss_clip": 0.01112832, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.03760886, + "balance_loss_mlp": 1.01782227, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 1.9526375801866593, + "language_loss": 0.73379612, + "learning_rate": 4.177989389787624e-07, + "loss": 0.75523937, + "num_input_tokens_seen": 285945810, + "step": 13252, + "time_per_iteration": 2.434804677963257 + }, + { + "auxiliary_loss_clip": 0.01105195, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.03592896, + "balance_loss_mlp": 1.01790881, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 1.6431568948720954, + "language_loss": 0.66601574, + "learning_rate": 4.175607406609278e-07, + "loss": 0.6873672, + "num_input_tokens_seen": 285964235, + "step": 13253, + "time_per_iteration": 2.4669785499572754 + }, + { + "auxiliary_loss_clip": 0.01086514, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.04119253, + "balance_loss_mlp": 1.0205965, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 1.7109385657657366, + "language_loss": 0.67740226, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.6985985, + "num_input_tokens_seen": 285983710, + "step": 13254, + "time_per_iteration": 2.543973922729492 + }, + { + "auxiliary_loss_clip": 0.01099248, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.03712213, + "balance_loss_mlp": 1.0244782, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 1.8219817613704008, + "language_loss": 0.69189322, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.7132473, + "num_input_tokens_seen": 286003425, + "step": 13255, + "time_per_iteration": 2.47648286819458 + }, + { + "auxiliary_loss_clip": 0.01105559, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.03466177, + "balance_loss_mlp": 1.01917779, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 2.4856181869065033, + "language_loss": 0.7949661, + "learning_rate": 4.168465057810733e-07, + "loss": 0.8163293, + "num_input_tokens_seen": 286020130, + "step": 13256, + "time_per_iteration": 2.3957505226135254 + }, + { + "auxiliary_loss_clip": 0.01096476, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.03666401, + "balance_loss_mlp": 1.01619256, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 1.8894207151366023, + "language_loss": 0.65940279, + "learning_rate": 4.166085475424315e-07, + "loss": 0.68065381, + "num_input_tokens_seen": 286040230, + "step": 13257, + "time_per_iteration": 2.487192153930664 + }, + { + "auxiliary_loss_clip": 0.01091836, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.03654766, + "balance_loss_mlp": 1.02329254, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 2.5163966178936, + "language_loss": 0.71893579, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74021304, + "num_input_tokens_seen": 286059475, + "step": 13258, + "time_per_iteration": 4.011357069015503 + }, + { + "auxiliary_loss_clip": 0.01101297, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.03638494, + "balance_loss_mlp": 1.02106285, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 1.8606018302371248, + "language_loss": 0.68638849, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.70773792, + "num_input_tokens_seen": 286077820, + "step": 13259, + "time_per_iteration": 2.449909210205078 + }, + { + "auxiliary_loss_clip": 0.01094787, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.03530812, + "balance_loss_mlp": 1.0201242, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 1.6744681364180676, + "language_loss": 0.73582202, + "learning_rate": 4.158950331167641e-07, + "loss": 0.75708002, + "num_input_tokens_seen": 286097285, + "step": 13260, + "time_per_iteration": 2.5058887004852295 + }, + { + "auxiliary_loss_clip": 0.01082098, + "auxiliary_loss_mlp": 0.01029208, + "balance_loss_clip": 1.03135395, + "balance_loss_mlp": 1.01772857, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 1.7321910242273806, + "language_loss": 0.77995145, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80106449, + "num_input_tokens_seen": 286116000, + "step": 13261, + "time_per_iteration": 2.485788106918335 + }, + { + "auxiliary_loss_clip": 0.0109478, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.03621435, + "balance_loss_mlp": 1.02101147, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.4809866336077562, + "language_loss": 0.76026356, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78152996, + "num_input_tokens_seen": 286135110, + "step": 13262, + "time_per_iteration": 2.470101833343506 + }, + { + "auxiliary_loss_clip": 0.01079657, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.03765762, + "balance_loss_mlp": 1.01916921, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 2.905774342372271, + "language_loss": 0.7058351, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.72695726, + "num_input_tokens_seen": 286152835, + "step": 13263, + "time_per_iteration": 2.533555746078491 + }, + { + "auxiliary_loss_clip": 0.01104396, + "auxiliary_loss_mlp": 0.0103781, + "balance_loss_clip": 1.03772712, + "balance_loss_mlp": 1.02450669, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 1.9147367897432, + "language_loss": 0.71124941, + "learning_rate": 4.149445215631153e-07, + "loss": 0.7326715, + "num_input_tokens_seen": 286171785, + "step": 13264, + "time_per_iteration": 2.482279062271118 + }, + { + "auxiliary_loss_clip": 0.01107049, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.03696489, + "balance_loss_mlp": 1.02140057, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.7659231965683886, + "language_loss": 0.7688725, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.79027301, + "num_input_tokens_seen": 286190420, + "step": 13265, + "time_per_iteration": 3.852385997772217 + }, + { + "auxiliary_loss_clip": 0.01079522, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.03747511, + "balance_loss_mlp": 1.01820886, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 2.002786780268117, + "language_loss": 0.76109767, + "learning_rate": 4.144696263830285e-07, + "loss": 0.78219497, + "num_input_tokens_seen": 286210105, + "step": 13266, + "time_per_iteration": 2.558738946914673 + }, + { + "auxiliary_loss_clip": 0.0108159, + "auxiliary_loss_mlp": 0.01025537, + "balance_loss_clip": 1.03566825, + "balance_loss_mlp": 1.0139854, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 6.453446252945881, + "language_loss": 0.84397519, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.8650465, + "num_input_tokens_seen": 286228180, + "step": 13267, + "time_per_iteration": 2.494311809539795 + }, + { + "auxiliary_loss_clip": 0.01096508, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.03588998, + "balance_loss_mlp": 1.01903701, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 1.6382736861037845, + "language_loss": 0.7602011, + "learning_rate": 4.139949716968223e-07, + "loss": 0.78147733, + "num_input_tokens_seen": 286247305, + "step": 13268, + "time_per_iteration": 2.487372636795044 + }, + { + "auxiliary_loss_clip": 0.01109445, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.03722119, + "balance_loss_mlp": 1.01708674, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.6172816132484245, + "language_loss": 0.78072023, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.80210745, + "num_input_tokens_seen": 286268145, + "step": 13269, + "time_per_iteration": 2.4343535900115967 + }, + { + "auxiliary_loss_clip": 0.01090625, + "auxiliary_loss_mlp": 0.01037866, + "balance_loss_clip": 1.03280902, + "balance_loss_mlp": 1.02547383, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.8287338420019454, + "language_loss": 0.8188383, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84012318, + "num_input_tokens_seen": 286286775, + "step": 13270, + "time_per_iteration": 2.455617666244507 + }, + { + "auxiliary_loss_clip": 0.01075301, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.0402739, + "balance_loss_mlp": 1.02235663, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 2.0218795249371637, + "language_loss": 0.59550846, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.61662102, + "num_input_tokens_seen": 286305590, + "step": 13271, + "time_per_iteration": 2.534663200378418 + }, + { + "auxiliary_loss_clip": 0.0109139, + "auxiliary_loss_mlp": 0.01034486, + "balance_loss_clip": 1.03553057, + "balance_loss_mlp": 1.0222199, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 1.4645946130028582, + "language_loss": 0.7358489, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75710762, + "num_input_tokens_seen": 286328050, + "step": 13272, + "time_per_iteration": 2.585679769515991 + }, + { + "auxiliary_loss_clip": 0.0105041, + "auxiliary_loss_mlp": 0.01033926, + "balance_loss_clip": 1.03252983, + "balance_loss_mlp": 1.02039623, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 1.796463572253145, + "language_loss": 0.72028446, + "learning_rate": 4.128093876144161e-07, + "loss": 0.74112785, + "num_input_tokens_seen": 286345265, + "step": 13273, + "time_per_iteration": 2.570688486099243 + }, + { + "auxiliary_loss_clip": 0.01089387, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.03634071, + "balance_loss_mlp": 1.02182651, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 2.474935303761688, + "language_loss": 0.75649935, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.77773678, + "num_input_tokens_seen": 286364465, + "step": 13274, + "time_per_iteration": 4.0636491775512695 + }, + { + "auxiliary_loss_clip": 0.0106117, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.03608871, + "balance_loss_mlp": 1.01979554, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.3341346438621304, + "language_loss": 0.77612287, + "learning_rate": 4.12335575223518e-07, + "loss": 0.79704893, + "num_input_tokens_seen": 286385565, + "step": 13275, + "time_per_iteration": 2.6092283725738525 + }, + { + "auxiliary_loss_clip": 0.01100824, + "auxiliary_loss_mlp": 0.01033491, + "balance_loss_clip": 1.03671789, + "balance_loss_mlp": 1.02038383, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 2.114053352699332, + "language_loss": 0.64249504, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66383821, + "num_input_tokens_seen": 286403950, + "step": 13276, + "time_per_iteration": 2.5788683891296387 + }, + { + "auxiliary_loss_clip": 0.0107256, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.03797686, + "balance_loss_mlp": 1.02127624, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 1.6720424524961885, + "language_loss": 0.60803038, + "learning_rate": 4.118620036501945e-07, + "loss": 0.6290831, + "num_input_tokens_seen": 286426160, + "step": 13277, + "time_per_iteration": 2.5879569053649902 + }, + { + "auxiliary_loss_clip": 0.01085106, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.03595543, + "balance_loss_mlp": 1.02079844, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 2.309298487492006, + "language_loss": 0.7928468, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81403083, + "num_input_tokens_seen": 286446610, + "step": 13278, + "time_per_iteration": 2.5518908500671387 + }, + { + "auxiliary_loss_clip": 0.01089271, + "auxiliary_loss_mlp": 0.01036621, + "balance_loss_clip": 1.03599823, + "balance_loss_mlp": 1.02326334, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 2.038296318590381, + "language_loss": 0.63431346, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65557235, + "num_input_tokens_seen": 286465460, + "step": 13279, + "time_per_iteration": 2.499910831451416 + }, + { + "auxiliary_loss_clip": 0.01091976, + "auxiliary_loss_mlp": 0.01027594, + "balance_loss_clip": 1.03419542, + "balance_loss_mlp": 1.01648974, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 1.726723779245028, + "language_loss": 0.70962429, + "learning_rate": 4.111520979802825e-07, + "loss": 0.73082, + "num_input_tokens_seen": 286485720, + "step": 13280, + "time_per_iteration": 2.531456708908081 + }, + { + "auxiliary_loss_clip": 0.01073323, + "auxiliary_loss_mlp": 0.01041022, + "balance_loss_clip": 1.03602445, + "balance_loss_mlp": 1.02659202, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 1.7059754507022704, + "language_loss": 0.62712598, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.64826941, + "num_input_tokens_seen": 286507465, + "step": 13281, + "time_per_iteration": 2.621518850326538 + }, + { + "auxiliary_loss_clip": 0.01100549, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.03450203, + "balance_loss_mlp": 1.02140713, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 2.391470100813837, + "language_loss": 0.80372512, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.8250711, + "num_input_tokens_seen": 286526345, + "step": 13282, + "time_per_iteration": 2.4723851680755615 + }, + { + "auxiliary_loss_clip": 0.01078123, + "auxiliary_loss_mlp": 0.00777567, + "balance_loss_clip": 1.03387594, + "balance_loss_mlp": 1.00056708, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 4.546750845615313, + "language_loss": 0.7174201, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73597699, + "num_input_tokens_seen": 286544095, + "step": 13283, + "time_per_iteration": 2.5133132934570312 + }, + { + "auxiliary_loss_clip": 0.01095259, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.03529608, + "balance_loss_mlp": 1.01972127, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 2.2785331295066644, + "language_loss": 0.73351437, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75478035, + "num_input_tokens_seen": 286560960, + "step": 13284, + "time_per_iteration": 2.4293699264526367 + }, + { + "auxiliary_loss_clip": 0.01081699, + "auxiliary_loss_mlp": 0.0103857, + "balance_loss_clip": 1.03269076, + "balance_loss_mlp": 1.02729881, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 1.637657373972986, + "language_loss": 0.70172668, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72292936, + "num_input_tokens_seen": 286579865, + "step": 13285, + "time_per_iteration": 2.6409356594085693 + }, + { + "auxiliary_loss_clip": 0.01081895, + "auxiliary_loss_mlp": 0.01030476, + "balance_loss_clip": 1.03351891, + "balance_loss_mlp": 1.01865017, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 1.6766685676605888, + "language_loss": 0.73731554, + "learning_rate": 4.097339136128437e-07, + "loss": 0.75843924, + "num_input_tokens_seen": 286597295, + "step": 13286, + "time_per_iteration": 2.496466875076294 + }, + { + "auxiliary_loss_clip": 0.01087623, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.03571045, + "balance_loss_mlp": 1.01993465, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 4.984972655557115, + "language_loss": 0.75320995, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.77440649, + "num_input_tokens_seen": 286616270, + "step": 13287, + "time_per_iteration": 2.546732187271118 + }, + { + "auxiliary_loss_clip": 0.01085153, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.037323, + "balance_loss_mlp": 1.01713514, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 1.9095501081010446, + "language_loss": 0.61724538, + "learning_rate": 4.092616678191863e-07, + "loss": 0.63838851, + "num_input_tokens_seen": 286638315, + "step": 13288, + "time_per_iteration": 2.5629148483276367 + }, + { + "auxiliary_loss_clip": 0.01097712, + "auxiliary_loss_mlp": 0.01034241, + "balance_loss_clip": 1.03832483, + "balance_loss_mlp": 1.0227195, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 2.3534555911309183, + "language_loss": 0.7039417, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72526121, + "num_input_tokens_seen": 286658630, + "step": 13289, + "time_per_iteration": 3.96121883392334 + }, + { + "auxiliary_loss_clip": 0.01077449, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.04048717, + "balance_loss_mlp": 1.02267933, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 1.9848378124996522, + "language_loss": 0.62305009, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64417434, + "num_input_tokens_seen": 286676870, + "step": 13290, + "time_per_iteration": 2.512179136276245 + }, + { + "auxiliary_loss_clip": 0.01102741, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.03808284, + "balance_loss_mlp": 1.01579762, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 1.9393473464947555, + "language_loss": 0.71286076, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73417747, + "num_input_tokens_seen": 286694300, + "step": 13291, + "time_per_iteration": 2.5155978202819824 + }, + { + "auxiliary_loss_clip": 0.01078352, + "auxiliary_loss_mlp": 0.01027628, + "balance_loss_clip": 1.0437305, + "balance_loss_mlp": 1.01646399, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 1.6835815944815908, + "language_loss": 0.63395059, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65501034, + "num_input_tokens_seen": 286714545, + "step": 13292, + "time_per_iteration": 2.573747396469116 + }, + { + "auxiliary_loss_clip": 0.01097234, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.03549862, + "balance_loss_mlp": 1.02207327, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 1.8538741411568678, + "language_loss": 0.56217802, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58348584, + "num_input_tokens_seen": 286734525, + "step": 13293, + "time_per_iteration": 2.6856729984283447 + }, + { + "auxiliary_loss_clip": 0.01084092, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.03898847, + "balance_loss_mlp": 1.02335835, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 2.6888671203408143, + "language_loss": 0.71965289, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.74084961, + "num_input_tokens_seen": 286753430, + "step": 13294, + "time_per_iteration": 2.7306647300720215 + }, + { + "auxiliary_loss_clip": 0.01075305, + "auxiliary_loss_mlp": 0.01036347, + "balance_loss_clip": 1.03446615, + "balance_loss_mlp": 1.02422893, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 1.9879375777719632, + "language_loss": 0.72333157, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74444813, + "num_input_tokens_seen": 286771915, + "step": 13295, + "time_per_iteration": 2.52837872505188 + }, + { + "auxiliary_loss_clip": 0.01078329, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.03708017, + "balance_loss_mlp": 1.02047014, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 1.9482203481368583, + "language_loss": 0.76382732, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78492677, + "num_input_tokens_seen": 286789835, + "step": 13296, + "time_per_iteration": 2.5445213317871094 + }, + { + "auxiliary_loss_clip": 0.01000338, + "auxiliary_loss_mlp": 0.01011582, + "balance_loss_clip": 1.00790644, + "balance_loss_mlp": 1.0103302, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.6930951247133706, + "language_loss": 0.60804588, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62816507, + "num_input_tokens_seen": 286855580, + "step": 13297, + "time_per_iteration": 4.684834957122803 + }, + { + "auxiliary_loss_clip": 0.01087002, + "auxiliary_loss_mlp": 0.01031959, + "balance_loss_clip": 1.03754282, + "balance_loss_mlp": 1.02080131, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 2.027248550007412, + "language_loss": 0.70249903, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.7236886, + "num_input_tokens_seen": 286874360, + "step": 13298, + "time_per_iteration": 2.4710569381713867 + }, + { + "auxiliary_loss_clip": 0.01074006, + "auxiliary_loss_mlp": 0.01036662, + "balance_loss_clip": 1.03497136, + "balance_loss_mlp": 1.02196932, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 2.1243397784784124, + "language_loss": 0.75407958, + "learning_rate": 4.066686308212037e-07, + "loss": 0.7751863, + "num_input_tokens_seen": 286891950, + "step": 13299, + "time_per_iteration": 2.527278184890747 + }, + { + "auxiliary_loss_clip": 0.01080838, + "auxiliary_loss_mlp": 0.01029274, + "balance_loss_clip": 1.03272974, + "balance_loss_mlp": 1.01819396, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.6844759168036585, + "language_loss": 0.77606583, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79716694, + "num_input_tokens_seen": 286911725, + "step": 13300, + "time_per_iteration": 2.5313801765441895 + }, + { + "auxiliary_loss_clip": 0.01069334, + "auxiliary_loss_mlp": 0.01040706, + "balance_loss_clip": 1.03028214, + "balance_loss_mlp": 1.02594233, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 1.9528682987339943, + "language_loss": 0.6411494, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.6622498, + "num_input_tokens_seen": 286931400, + "step": 13301, + "time_per_iteration": 2.554408550262451 + }, + { + "auxiliary_loss_clip": 0.01096069, + "auxiliary_loss_mlp": 0.01034961, + "balance_loss_clip": 1.03514588, + "balance_loss_mlp": 1.0226469, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 1.6882905360846536, + "language_loss": 0.72169858, + "learning_rate": 4.059627072173928e-07, + "loss": 0.74300891, + "num_input_tokens_seen": 286949795, + "step": 13302, + "time_per_iteration": 2.4528493881225586 + }, + { + "auxiliary_loss_clip": 0.01111335, + "auxiliary_loss_mlp": 0.00778592, + "balance_loss_clip": 1.03730512, + "balance_loss_mlp": 1.00064278, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 2.0344956249851247, + "language_loss": 0.83535397, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85425317, + "num_input_tokens_seen": 286968805, + "step": 13303, + "time_per_iteration": 3.9561643600463867 + }, + { + "auxiliary_loss_clip": 0.01106059, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.03609514, + "balance_loss_mlp": 1.01814902, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 1.732761781730804, + "language_loss": 0.59052646, + "learning_rate": 4.054923936969166e-07, + "loss": 0.61187804, + "num_input_tokens_seen": 286990235, + "step": 13304, + "time_per_iteration": 2.4977166652679443 + }, + { + "auxiliary_loss_clip": 0.01109675, + "auxiliary_loss_mlp": 0.01029643, + "balance_loss_clip": 1.03440619, + "balance_loss_mlp": 1.01691127, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 2.1932086412739937, + "language_loss": 0.69068986, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71208304, + "num_input_tokens_seen": 287011060, + "step": 13305, + "time_per_iteration": 2.434758424758911 + }, + { + "auxiliary_loss_clip": 0.01077912, + "auxiliary_loss_mlp": 0.01028759, + "balance_loss_clip": 1.04152703, + "balance_loss_mlp": 1.01755905, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.6334434436817378, + "language_loss": 0.69304901, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71411568, + "num_input_tokens_seen": 287029215, + "step": 13306, + "time_per_iteration": 2.5130810737609863 + }, + { + "auxiliary_loss_clip": 0.01099076, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.03719687, + "balance_loss_mlp": 1.02209783, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 1.5295073470536418, + "language_loss": 0.69530934, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71663892, + "num_input_tokens_seen": 287050855, + "step": 13307, + "time_per_iteration": 2.5626823902130127 + }, + { + "auxiliary_loss_clip": 0.01087293, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.03448272, + "balance_loss_mlp": 1.02141762, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 2.5377648873467105, + "language_loss": 0.77020419, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.79141176, + "num_input_tokens_seen": 287069915, + "step": 13308, + "time_per_iteration": 2.4703094959259033 + }, + { + "auxiliary_loss_clip": 0.01068623, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.03899264, + "balance_loss_mlp": 1.01871693, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.4604168615354862, + "language_loss": 0.79032302, + "learning_rate": 4.0431766816972e-07, + "loss": 0.81133103, + "num_input_tokens_seen": 287091450, + "step": 13309, + "time_per_iteration": 2.652087450027466 + }, + { + "auxiliary_loss_clip": 0.0102845, + "auxiliary_loss_mlp": 0.01001891, + "balance_loss_clip": 1.00493383, + "balance_loss_mlp": 1.00075269, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.9396303013968978, + "language_loss": 0.64684892, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66715229, + "num_input_tokens_seen": 287148365, + "step": 13310, + "time_per_iteration": 2.933288812637329 + }, + { + "auxiliary_loss_clip": 0.01097963, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.03546333, + "balance_loss_mlp": 1.02178884, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 2.0926305569516956, + "language_loss": 0.83047342, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.85179144, + "num_input_tokens_seen": 287168280, + "step": 13311, + "time_per_iteration": 2.520423173904419 + }, + { + "auxiliary_loss_clip": 0.01100043, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.03685236, + "balance_loss_mlp": 1.02042389, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 2.166181028523196, + "language_loss": 0.66603315, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.68736082, + "num_input_tokens_seen": 287185980, + "step": 13312, + "time_per_iteration": 2.4432015419006348 + }, + { + "auxiliary_loss_clip": 0.01112856, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.03905642, + "balance_loss_mlp": 1.02074361, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 1.8672206107814795, + "language_loss": 0.75533187, + "learning_rate": 4.033789768462843e-07, + "loss": 0.7767998, + "num_input_tokens_seen": 287203875, + "step": 13313, + "time_per_iteration": 2.4748525619506836 + }, + { + "auxiliary_loss_clip": 0.01096653, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.03343058, + "balance_loss_mlp": 1.02008367, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 1.3433844206819288, + "language_loss": 0.75834048, + "learning_rate": 4.031444553532575e-07, + "loss": 0.7796284, + "num_input_tokens_seen": 287226445, + "step": 13314, + "time_per_iteration": 2.5572750568389893 + }, + { + "auxiliary_loss_clip": 0.00994836, + "auxiliary_loss_mlp": 0.01002551, + "balance_loss_clip": 1.01297927, + "balance_loss_mlp": 1.00118589, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.8539180586368846, + "language_loss": 0.53785729, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55783117, + "num_input_tokens_seen": 287286240, + "step": 13315, + "time_per_iteration": 3.0355746746063232 + }, + { + "auxiliary_loss_clip": 0.01085771, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.03651094, + "balance_loss_mlp": 1.02111769, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 1.8862381186831216, + "language_loss": 0.71186918, + "learning_rate": 4.026755940348603e-07, + "loss": 0.73305559, + "num_input_tokens_seen": 287310265, + "step": 13316, + "time_per_iteration": 2.629634380340576 + }, + { + "auxiliary_loss_clip": 0.01090777, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.03872013, + "balance_loss_mlp": 1.01957655, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 1.9741165905085045, + "language_loss": 0.64522922, + "learning_rate": 4.024412542272706e-07, + "loss": 0.6664573, + "num_input_tokens_seen": 287331610, + "step": 13317, + "time_per_iteration": 2.65434193611145 + }, + { + "auxiliary_loss_clip": 0.0102821, + "auxiliary_loss_mlp": 0.01003917, + "balance_loss_clip": 1.00470877, + "balance_loss_mlp": 1.00278497, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 0.7651161730130214, + "language_loss": 0.58995008, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61027133, + "num_input_tokens_seen": 287394795, + "step": 13318, + "time_per_iteration": 3.095349073410034 + }, + { + "auxiliary_loss_clip": 0.01075163, + "auxiliary_loss_mlp": 0.01025199, + "balance_loss_clip": 1.03620529, + "balance_loss_mlp": 1.01325428, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 10.209122831872632, + "language_loss": 0.66203094, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68303454, + "num_input_tokens_seen": 287414595, + "step": 13319, + "time_per_iteration": 2.566054582595825 + }, + { + "auxiliary_loss_clip": 0.01111519, + "auxiliary_loss_mlp": 0.00779471, + "balance_loss_clip": 1.03627849, + "balance_loss_mlp": 1.00062263, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 2.9161423053285973, + "language_loss": 0.74101961, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.75992942, + "num_input_tokens_seen": 287434395, + "step": 13320, + "time_per_iteration": 2.4322354793548584 + }, + { + "auxiliary_loss_clip": 0.01095945, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.03544509, + "balance_loss_mlp": 1.01655018, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 1.770821028829361, + "language_loss": 0.80387932, + "learning_rate": 4.015045008816138e-07, + "loss": 0.82513165, + "num_input_tokens_seen": 287450590, + "step": 13321, + "time_per_iteration": 2.4391891956329346 + }, + { + "auxiliary_loss_clip": 0.01038216, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.02754796, + "balance_loss_mlp": 1.02025151, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 1.854705211243867, + "language_loss": 0.65591788, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.67662489, + "num_input_tokens_seen": 287468455, + "step": 13322, + "time_per_iteration": 2.598172903060913 + }, + { + "auxiliary_loss_clip": 0.01099884, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.03657889, + "balance_loss_mlp": 1.01651597, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 1.7872904088746668, + "language_loss": 0.78115892, + "learning_rate": 4.010364878639265e-07, + "loss": 0.80244207, + "num_input_tokens_seen": 287486485, + "step": 13323, + "time_per_iteration": 2.4643800258636475 + }, + { + "auxiliary_loss_clip": 0.0110999, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.03668785, + "balance_loss_mlp": 1.01449394, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 3.4617264737946285, + "language_loss": 0.71771944, + "learning_rate": 4.00802572299932e-07, + "loss": 0.73908627, + "num_input_tokens_seen": 287503940, + "step": 13324, + "time_per_iteration": 2.4999001026153564 + }, + { + "auxiliary_loss_clip": 0.01068684, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.03150582, + "balance_loss_mlp": 1.0202415, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 1.749226448973801, + "language_loss": 0.76393616, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78495514, + "num_input_tokens_seen": 287521660, + "step": 13325, + "time_per_iteration": 2.5197513103485107 + }, + { + "auxiliary_loss_clip": 0.01083597, + "auxiliary_loss_mlp": 0.01028718, + "balance_loss_clip": 1.03285384, + "balance_loss_mlp": 1.01792359, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 1.8217716004633875, + "language_loss": 0.79779929, + "learning_rate": 4.003349231059898e-07, + "loss": 0.8189224, + "num_input_tokens_seen": 287541505, + "step": 13326, + "time_per_iteration": 2.4614663124084473 + }, + { + "auxiliary_loss_clip": 0.01097277, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.03664804, + "balance_loss_mlp": 1.01918244, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 1.8116074917885305, + "language_loss": 0.66292679, + "learning_rate": 4.001011894937765e-07, + "loss": 0.68420601, + "num_input_tokens_seen": 287560015, + "step": 13327, + "time_per_iteration": 2.5545525550842285 + }, + { + "auxiliary_loss_clip": 0.01094756, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.0374186, + "balance_loss_mlp": 1.02069235, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 1.6309585510122517, + "language_loss": 0.73354775, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.75481915, + "num_input_tokens_seen": 287579150, + "step": 13328, + "time_per_iteration": 4.046750545501709 + }, + { + "auxiliary_loss_clip": 0.01051093, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.03312969, + "balance_loss_mlp": 1.02154732, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 2.4651705095986602, + "language_loss": 0.74042439, + "learning_rate": 3.996339042831798e-07, + "loss": 0.76128685, + "num_input_tokens_seen": 287597420, + "step": 13329, + "time_per_iteration": 2.5377049446105957 + }, + { + "auxiliary_loss_clip": 0.01019271, + "auxiliary_loss_mlp": 0.01001935, + "balance_loss_clip": 1.00469911, + "balance_loss_mlp": 1.00080228, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.7143382879937533, + "language_loss": 0.53001213, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.55022418, + "num_input_tokens_seen": 287667280, + "step": 13330, + "time_per_iteration": 3.169539213180542 + }, + { + "auxiliary_loss_clip": 0.01088983, + "auxiliary_loss_mlp": 0.0103569, + "balance_loss_clip": 1.03910875, + "balance_loss_mlp": 1.02211237, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 1.9296731197099297, + "language_loss": 0.72692549, + "learning_rate": 3.991668618167519e-07, + "loss": 0.74817216, + "num_input_tokens_seen": 287687375, + "step": 13331, + "time_per_iteration": 2.5329370498657227 + }, + { + "auxiliary_loss_clip": 0.01093849, + "auxiliary_loss_mlp": 0.01027196, + "balance_loss_clip": 1.03795087, + "balance_loss_mlp": 1.01614523, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 2.038033237287405, + "language_loss": 0.77124071, + "learning_rate": 3.989334316347401e-07, + "loss": 0.7924512, + "num_input_tokens_seen": 287707895, + "step": 13332, + "time_per_iteration": 2.495490789413452 + }, + { + "auxiliary_loss_clip": 0.01109843, + "auxiliary_loss_mlp": 0.01027363, + "balance_loss_clip": 1.03761363, + "balance_loss_mlp": 1.01524544, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 2.1141274260838228, + "language_loss": 0.83783054, + "learning_rate": 3.987000621653338e-07, + "loss": 0.85920262, + "num_input_tokens_seen": 287723990, + "step": 13333, + "time_per_iteration": 2.440140962600708 + }, + { + "auxiliary_loss_clip": 0.01087694, + "auxiliary_loss_mlp": 0.01029128, + "balance_loss_clip": 1.03529835, + "balance_loss_mlp": 1.01687932, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 1.6534724243716894, + "language_loss": 0.72989649, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75106466, + "num_input_tokens_seen": 287742380, + "step": 13334, + "time_per_iteration": 2.523320198059082 + }, + { + "auxiliary_loss_clip": 0.01072334, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.03482735, + "balance_loss_mlp": 1.02059174, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 2.184481629132726, + "language_loss": 0.74719399, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.76824802, + "num_input_tokens_seen": 287760130, + "step": 13335, + "time_per_iteration": 2.586077928543091 + }, + { + "auxiliary_loss_clip": 0.01072907, + "auxiliary_loss_mlp": 0.01027309, + "balance_loss_clip": 1.03758836, + "balance_loss_mlp": 1.01473808, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 1.9144545469109773, + "language_loss": 0.75557119, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.7765733, + "num_input_tokens_seen": 287777565, + "step": 13336, + "time_per_iteration": 4.108017683029175 + }, + { + "auxiliary_loss_clip": 0.01082688, + "auxiliary_loss_mlp": 0.01036061, + "balance_loss_clip": 1.04249763, + "balance_loss_mlp": 1.02283478, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 2.494641555977403, + "language_loss": 0.74829155, + "learning_rate": 3.977671915907068e-07, + "loss": 0.76947904, + "num_input_tokens_seen": 287796310, + "step": 13337, + "time_per_iteration": 2.5707712173461914 + }, + { + "auxiliary_loss_clip": 0.01054385, + "auxiliary_loss_mlp": 0.00778642, + "balance_loss_clip": 1.0394876, + "balance_loss_mlp": 1.00056815, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 1.732516146164518, + "language_loss": 0.80101609, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.81934637, + "num_input_tokens_seen": 287817330, + "step": 13338, + "time_per_iteration": 2.717006206512451 + }, + { + "auxiliary_loss_clip": 0.01073523, + "auxiliary_loss_mlp": 0.01029025, + "balance_loss_clip": 1.03257179, + "balance_loss_mlp": 1.01563811, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 2.3514441473555454, + "language_loss": 0.74611175, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.76713729, + "num_input_tokens_seen": 287835095, + "step": 13339, + "time_per_iteration": 2.5489487648010254 + }, + { + "auxiliary_loss_clip": 0.01094015, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.03566086, + "balance_loss_mlp": 1.01682615, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 1.7958533352840715, + "language_loss": 0.78963333, + "learning_rate": 3.970681765754775e-07, + "loss": 0.81085503, + "num_input_tokens_seen": 287854595, + "step": 13340, + "time_per_iteration": 2.5652661323547363 + }, + { + "auxiliary_loss_clip": 0.01082646, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.03951383, + "balance_loss_mlp": 1.01975322, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 1.6634757340177329, + "language_loss": 0.68124461, + "learning_rate": 3.968352931252936e-07, + "loss": 0.70238221, + "num_input_tokens_seen": 287876960, + "step": 13341, + "time_per_iteration": 2.612433671951294 + }, + { + "auxiliary_loss_clip": 0.01011667, + "auxiliary_loss_mlp": 0.0099852, + "balance_loss_clip": 1.00574911, + "balance_loss_mlp": 0.99736959, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.8156613890612037, + "language_loss": 0.61647457, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.63657647, + "num_input_tokens_seen": 287936530, + "step": 13342, + "time_per_iteration": 4.406428813934326 + }, + { + "auxiliary_loss_clip": 0.0108963, + "auxiliary_loss_mlp": 0.01032994, + "balance_loss_clip": 1.0364629, + "balance_loss_mlp": 1.02072763, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 1.7892355282482169, + "language_loss": 0.63740987, + "learning_rate": 3.963697086102522e-07, + "loss": 0.65863609, + "num_input_tokens_seen": 287954285, + "step": 13343, + "time_per_iteration": 2.5541114807128906 + }, + { + "auxiliary_loss_clip": 0.01084086, + "auxiliary_loss_mlp": 0.01028686, + "balance_loss_clip": 1.03724766, + "balance_loss_mlp": 1.017766, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 1.9986208798822918, + "language_loss": 0.69004583, + "learning_rate": 3.96137007563051e-07, + "loss": 0.71117353, + "num_input_tokens_seen": 287971595, + "step": 13344, + "time_per_iteration": 2.5207297801971436 + }, + { + "auxiliary_loss_clip": 0.01098747, + "auxiliary_loss_mlp": 0.01028724, + "balance_loss_clip": 1.03723824, + "balance_loss_mlp": 1.01635051, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 1.641314985536557, + "language_loss": 0.70367861, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72495335, + "num_input_tokens_seen": 287992540, + "step": 13345, + "time_per_iteration": 2.5743141174316406 + }, + { + "auxiliary_loss_clip": 0.01014634, + "auxiliary_loss_mlp": 0.01007478, + "balance_loss_clip": 1.00915062, + "balance_loss_mlp": 1.00625575, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 0.8676963215998835, + "language_loss": 0.6297307, + "learning_rate": 3.956717879334059e-07, + "loss": 0.64995182, + "num_input_tokens_seen": 288052810, + "step": 13346, + "time_per_iteration": 3.1625568866729736 + }, + { + "auxiliary_loss_clip": 0.01088783, + "auxiliary_loss_mlp": 0.01027371, + "balance_loss_clip": 1.04235888, + "balance_loss_mlp": 1.015522, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 1.485805586776943, + "language_loss": 0.72779542, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.74895698, + "num_input_tokens_seen": 288073045, + "step": 13347, + "time_per_iteration": 2.616982936859131 + }, + { + "auxiliary_loss_clip": 0.01099515, + "auxiliary_loss_mlp": 0.01029135, + "balance_loss_clip": 1.03719962, + "balance_loss_mlp": 1.01623654, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 1.9392925545265682, + "language_loss": 0.72572792, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.7470144, + "num_input_tokens_seen": 288091165, + "step": 13348, + "time_per_iteration": 2.464625835418701 + }, + { + "auxiliary_loss_clip": 0.01082088, + "auxiliary_loss_mlp": 0.01029399, + "balance_loss_clip": 1.03839135, + "balance_loss_mlp": 1.01691186, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 1.7510681263552001, + "language_loss": 0.76006603, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.78118086, + "num_input_tokens_seen": 288110595, + "step": 13349, + "time_per_iteration": 2.536046266555786 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.0383507, + "balance_loss_mlp": 1.02199888, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 2.0913518899259134, + "language_loss": 0.83329773, + "learning_rate": 3.947420787800755e-07, + "loss": 0.85473001, + "num_input_tokens_seen": 288128995, + "step": 13350, + "time_per_iteration": 2.4198310375213623 + }, + { + "auxiliary_loss_clip": 0.01099913, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.03738546, + "balance_loss_mlp": 1.02009773, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 1.8369892756187498, + "language_loss": 0.71263933, + "learning_rate": 3.945098036485679e-07, + "loss": 0.73395693, + "num_input_tokens_seen": 288149265, + "step": 13351, + "time_per_iteration": 4.077024698257446 + }, + { + "auxiliary_loss_clip": 0.01070861, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.03366506, + "balance_loss_mlp": 1.01985955, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 1.5196222767162397, + "language_loss": 0.61548698, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.63651586, + "num_input_tokens_seen": 288170745, + "step": 13352, + "time_per_iteration": 2.613435983657837 + }, + { + "auxiliary_loss_clip": 0.01096644, + "auxiliary_loss_mlp": 0.0103709, + "balance_loss_clip": 1.03698087, + "balance_loss_mlp": 1.02536023, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 1.7252621003169988, + "language_loss": 0.76733601, + "learning_rate": 3.940454360354046e-07, + "loss": 0.7886734, + "num_input_tokens_seen": 288189415, + "step": 13353, + "time_per_iteration": 2.4690589904785156 + }, + { + "auxiliary_loss_clip": 0.01054702, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.03754365, + "balance_loss_mlp": 1.0170964, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 2.1415421708703337, + "language_loss": 0.73945308, + "learning_rate": 3.938133435713582e-07, + "loss": 0.76030767, + "num_input_tokens_seen": 288206900, + "step": 13354, + "time_per_iteration": 2.6185615062713623 + }, + { + "auxiliary_loss_clip": 0.01074557, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.03837824, + "balance_loss_mlp": 1.02529526, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 2.0931514292543607, + "language_loss": 0.66239572, + "learning_rate": 3.935813120140714e-07, + "loss": 0.6835252, + "num_input_tokens_seen": 288224800, + "step": 13355, + "time_per_iteration": 2.572523355484009 + }, + { + "auxiliary_loss_clip": 0.0107546, + "auxiliary_loss_mlp": 0.01033848, + "balance_loss_clip": 1.03142881, + "balance_loss_mlp": 1.01949501, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 2.3574968057485655, + "language_loss": 0.69265914, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.71375221, + "num_input_tokens_seen": 288249400, + "step": 13356, + "time_per_iteration": 2.8313727378845215 + }, + { + "auxiliary_loss_clip": 0.01069599, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.04102921, + "balance_loss_mlp": 1.01735902, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 1.5704549543970234, + "language_loss": 0.77659154, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79758465, + "num_input_tokens_seen": 288268780, + "step": 13357, + "time_per_iteration": 2.6392550468444824 + }, + { + "auxiliary_loss_clip": 0.01075617, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.03330874, + "balance_loss_mlp": 1.01618195, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 1.558005096130597, + "language_loss": 0.76808631, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.78913736, + "num_input_tokens_seen": 288290830, + "step": 13358, + "time_per_iteration": 2.6412713527679443 + }, + { + "auxiliary_loss_clip": 0.01097206, + "auxiliary_loss_mlp": 0.01029416, + "balance_loss_clip": 1.03531742, + "balance_loss_mlp": 1.0177927, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 1.784220347103295, + "language_loss": 0.84807324, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.86933947, + "num_input_tokens_seen": 288308865, + "step": 13359, + "time_per_iteration": 2.551260232925415 + }, + { + "auxiliary_loss_clip": 0.01086251, + "auxiliary_loss_mlp": 0.01023881, + "balance_loss_clip": 1.04145229, + "balance_loss_mlp": 1.01253188, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 3.017532773245548, + "language_loss": 0.73396599, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75506735, + "num_input_tokens_seen": 288327325, + "step": 13360, + "time_per_iteration": 2.5760602951049805 + }, + { + "auxiliary_loss_clip": 0.01109427, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.03666222, + "balance_loss_mlp": 1.01755643, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 2.1391931417893733, + "language_loss": 0.70142931, + "learning_rate": 3.921904022048512e-07, + "loss": 0.72281587, + "num_input_tokens_seen": 288347285, + "step": 13361, + "time_per_iteration": 2.527945041656494 + }, + { + "auxiliary_loss_clip": 0.01112717, + "auxiliary_loss_mlp": 0.0103809, + "balance_loss_clip": 1.03762782, + "balance_loss_mlp": 1.02512562, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 1.6317202430524627, + "language_loss": 0.7013815, + "learning_rate": 3.919587972411098e-07, + "loss": 0.7228896, + "num_input_tokens_seen": 288367785, + "step": 13362, + "time_per_iteration": 2.4832992553710938 + }, + { + "auxiliary_loss_clip": 0.01115921, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.03799582, + "balance_loss_mlp": 1.02372861, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 2.6179745531589247, + "language_loss": 0.78069758, + "learning_rate": 3.91727253254452e-07, + "loss": 0.80223799, + "num_input_tokens_seen": 288384135, + "step": 13363, + "time_per_iteration": 2.437920093536377 + }, + { + "auxiliary_loss_clip": 0.01097462, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.03494751, + "balance_loss_mlp": 1.01767445, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 2.20486116084393, + "language_loss": 0.74497932, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.76625752, + "num_input_tokens_seen": 288403805, + "step": 13364, + "time_per_iteration": 2.5249743461608887 + }, + { + "auxiliary_loss_clip": 0.01100999, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.04407632, + "balance_loss_mlp": 1.0181911, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 2.0550585016164264, + "language_loss": 0.60714293, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.62845242, + "num_input_tokens_seen": 288424895, + "step": 13365, + "time_per_iteration": 2.5987462997436523 + }, + { + "auxiliary_loss_clip": 0.01088902, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.03540146, + "balance_loss_mlp": 1.02059031, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 1.800260380858131, + "language_loss": 0.66251361, + "learning_rate": 3.910329872447706e-07, + "loss": 0.68373346, + "num_input_tokens_seen": 288443865, + "step": 13366, + "time_per_iteration": 2.5079050064086914 + }, + { + "auxiliary_loss_clip": 0.01105183, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.03508317, + "balance_loss_mlp": 1.02054119, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 2.098704701144821, + "language_loss": 0.74980104, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77117562, + "num_input_tokens_seen": 288461065, + "step": 13367, + "time_per_iteration": 3.8982765674591064 + }, + { + "auxiliary_loss_clip": 0.01106646, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.03582549, + "balance_loss_mlp": 1.01520908, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 1.5761524002334886, + "language_loss": 0.73800302, + "learning_rate": 3.905704482846428e-07, + "loss": 0.75933677, + "num_input_tokens_seen": 288481865, + "step": 13368, + "time_per_iteration": 2.5015392303466797 + }, + { + "auxiliary_loss_clip": 0.01111094, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.03626132, + "balance_loss_mlp": 1.02013183, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 2.1805008477207757, + "language_loss": 0.7053892, + "learning_rate": 3.90339270344789e-07, + "loss": 0.72682565, + "num_input_tokens_seen": 288499345, + "step": 13369, + "time_per_iteration": 2.4310553073883057 + }, + { + "auxiliary_loss_clip": 0.01090239, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.03703332, + "balance_loss_mlp": 1.0190407, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 1.9413878592671423, + "language_loss": 0.73849618, + "learning_rate": 3.901081534434312e-07, + "loss": 0.75970101, + "num_input_tokens_seen": 288517660, + "step": 13370, + "time_per_iteration": 2.505253553390503 + }, + { + "auxiliary_loss_clip": 0.01090647, + "auxiliary_loss_mlp": 0.01032364, + "balance_loss_clip": 1.0362606, + "balance_loss_mlp": 1.01900065, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 2.5639352612044877, + "language_loss": 0.86682397, + "learning_rate": 3.898770975893342e-07, + "loss": 0.88805407, + "num_input_tokens_seen": 288534180, + "step": 13371, + "time_per_iteration": 2.4931159019470215 + }, + { + "auxiliary_loss_clip": 0.01101164, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.03475642, + "balance_loss_mlp": 1.0195725, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 1.9492728831025201, + "language_loss": 0.74906313, + "learning_rate": 3.89646102791259e-07, + "loss": 0.77040339, + "num_input_tokens_seen": 288553350, + "step": 13372, + "time_per_iteration": 2.47347354888916 + }, + { + "auxiliary_loss_clip": 0.01069396, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.03642082, + "balance_loss_mlp": 1.01804554, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 3.4072021252863856, + "language_loss": 0.79114246, + "learning_rate": 3.894151690579646e-07, + "loss": 0.81214845, + "num_input_tokens_seen": 288571325, + "step": 13373, + "time_per_iteration": 2.633662462234497 + }, + { + "auxiliary_loss_clip": 0.01083146, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.03360367, + "balance_loss_mlp": 1.02054024, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 1.8220297069618925, + "language_loss": 0.74583793, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76698935, + "num_input_tokens_seen": 288592100, + "step": 13374, + "time_per_iteration": 2.530850410461426 + }, + { + "auxiliary_loss_clip": 0.01062538, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.03133953, + "balance_loss_mlp": 1.02109134, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 2.202491737573181, + "language_loss": 0.69109756, + "learning_rate": 3.889534848207452e-07, + "loss": 0.71207541, + "num_input_tokens_seen": 288612305, + "step": 13375, + "time_per_iteration": 4.144866228103638 + }, + { + "auxiliary_loss_clip": 0.01013109, + "auxiliary_loss_mlp": 0.01000126, + "balance_loss_clip": 1.01787281, + "balance_loss_mlp": 0.99886245, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 0.726629587578568, + "language_loss": 0.55702615, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57715851, + "num_input_tokens_seen": 288676015, + "step": 13376, + "time_per_iteration": 3.2109570503234863 + }, + { + "auxiliary_loss_clip": 0.01058286, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.03202891, + "balance_loss_mlp": 1.01968098, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 1.6076884561180456, + "language_loss": 0.73040128, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75131762, + "num_input_tokens_seen": 288696455, + "step": 13377, + "time_per_iteration": 2.6454379558563232 + }, + { + "auxiliary_loss_clip": 0.01095705, + "auxiliary_loss_mlp": 0.01030437, + "balance_loss_clip": 1.03446198, + "balance_loss_mlp": 1.0179019, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 1.7154946197691223, + "language_loss": 0.70050257, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.72176397, + "num_input_tokens_seen": 288715560, + "step": 13378, + "time_per_iteration": 2.526319980621338 + }, + { + "auxiliary_loss_clip": 0.0110174, + "auxiliary_loss_mlp": 0.01027236, + "balance_loss_clip": 1.03657031, + "balance_loss_mlp": 1.01461744, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 1.569256588845558, + "language_loss": 0.69243968, + "learning_rate": 3.880308495088347e-07, + "loss": 0.7137295, + "num_input_tokens_seen": 288739485, + "step": 13379, + "time_per_iteration": 2.6830215454101562 + }, + { + "auxiliary_loss_clip": 0.01114296, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.03939319, + "balance_loss_mlp": 1.01817346, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 1.7345893473199405, + "language_loss": 0.76191163, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.78337252, + "num_input_tokens_seen": 288757420, + "step": 13380, + "time_per_iteration": 2.434553861618042 + }, + { + "auxiliary_loss_clip": 0.01063691, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.03353333, + "balance_loss_mlp": 1.01678717, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 4.866657184010001, + "language_loss": 0.69516468, + "learning_rate": 3.875698985740887e-07, + "loss": 0.71608698, + "num_input_tokens_seen": 288775535, + "step": 13381, + "time_per_iteration": 2.6406562328338623 + }, + { + "auxiliary_loss_clip": 0.01101259, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.03731704, + "balance_loss_mlp": 1.02229905, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 2.294098881508629, + "language_loss": 0.6377852, + "learning_rate": 3.873395148176135e-07, + "loss": 0.6591484, + "num_input_tokens_seen": 288795035, + "step": 13382, + "time_per_iteration": 3.9925715923309326 + }, + { + "auxiliary_loss_clip": 0.01088851, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.03832555, + "balance_loss_mlp": 1.02643442, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 3.0813808565859166, + "language_loss": 0.76191831, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.7831865, + "num_input_tokens_seen": 288816270, + "step": 13383, + "time_per_iteration": 2.567134141921997 + }, + { + "auxiliary_loss_clip": 0.01093771, + "auxiliary_loss_mlp": 0.01040138, + "balance_loss_clip": 1.03404975, + "balance_loss_mlp": 1.02706718, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 1.7024912112564528, + "language_loss": 0.69760984, + "learning_rate": 3.868789307701381e-07, + "loss": 0.71894896, + "num_input_tokens_seen": 288836050, + "step": 13384, + "time_per_iteration": 2.5269834995269775 + }, + { + "auxiliary_loss_clip": 0.01098098, + "auxiliary_loss_mlp": 0.0103769, + "balance_loss_clip": 1.03407955, + "balance_loss_mlp": 1.02450514, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 2.242096366044416, + "language_loss": 0.79597938, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.81733727, + "num_input_tokens_seen": 288852900, + "step": 13385, + "time_per_iteration": 2.446181297302246 + }, + { + "auxiliary_loss_clip": 0.01109051, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.03598738, + "balance_loss_mlp": 1.02043808, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 1.72344323843379, + "language_loss": 0.72067797, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74210209, + "num_input_tokens_seen": 288872625, + "step": 13386, + "time_per_iteration": 2.51619553565979 + }, + { + "auxiliary_loss_clip": 0.00999394, + "auxiliary_loss_mlp": 0.00997889, + "balance_loss_clip": 1.00520039, + "balance_loss_mlp": 0.99681586, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.6638100777970464, + "language_loss": 0.51266629, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53263909, + "num_input_tokens_seen": 288939180, + "step": 13387, + "time_per_iteration": 3.125929832458496 + }, + { + "auxiliary_loss_clip": 0.01108057, + "auxiliary_loss_mlp": 0.01033691, + "balance_loss_clip": 1.03532016, + "balance_loss_mlp": 1.02004743, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 1.7981947379551286, + "language_loss": 0.74112856, + "learning_rate": 3.859584967815559e-07, + "loss": 0.76254606, + "num_input_tokens_seen": 288958925, + "step": 13388, + "time_per_iteration": 2.4376797676086426 + }, + { + "auxiliary_loss_clip": 0.01080515, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.0390532, + "balance_loss_mlp": 1.01615095, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 1.5243237955812274, + "language_loss": 0.71245784, + "learning_rate": 3.857285412741411e-07, + "loss": 0.73354518, + "num_input_tokens_seen": 288980935, + "step": 13389, + "time_per_iteration": 2.591644048690796 + }, + { + "auxiliary_loss_clip": 0.01085502, + "auxiliary_loss_mlp": 0.01034995, + "balance_loss_clip": 1.0396862, + "balance_loss_mlp": 1.02290726, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 2.0327396654092045, + "language_loss": 0.82726264, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.84846765, + "num_input_tokens_seen": 288996780, + "step": 13390, + "time_per_iteration": 2.5019941329956055 + }, + { + "auxiliary_loss_clip": 0.01022618, + "auxiliary_loss_mlp": 0.01007077, + "balance_loss_clip": 1.00781202, + "balance_loss_mlp": 1.00584364, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.7758418224402246, + "language_loss": 0.55511129, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57540822, + "num_input_tokens_seen": 289057590, + "step": 13391, + "time_per_iteration": 4.499171495437622 + }, + { + "auxiliary_loss_clip": 0.01096953, + "auxiliary_loss_mlp": 0.01030824, + "balance_loss_clip": 1.03831625, + "balance_loss_mlp": 1.01905215, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 2.379225637339929, + "language_loss": 0.84534818, + "learning_rate": 3.850390420667762e-07, + "loss": 0.86662591, + "num_input_tokens_seen": 289076285, + "step": 13392, + "time_per_iteration": 2.431245803833008 + }, + { + "auxiliary_loss_clip": 0.01075487, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.03314853, + "balance_loss_mlp": 1.02122855, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.4404868537632556, + "language_loss": 0.70520782, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72629148, + "num_input_tokens_seen": 289097585, + "step": 13393, + "time_per_iteration": 2.5838892459869385 + }, + { + "auxiliary_loss_clip": 0.01099135, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.03627014, + "balance_loss_mlp": 1.0164957, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 2.5157323398926095, + "language_loss": 0.76280731, + "learning_rate": 3.84579682111414e-07, + "loss": 0.784091, + "num_input_tokens_seen": 289116890, + "step": 13394, + "time_per_iteration": 2.4581058025360107 + }, + { + "auxiliary_loss_clip": 0.0111174, + "auxiliary_loss_mlp": 0.01033798, + "balance_loss_clip": 1.03854239, + "balance_loss_mlp": 1.02147198, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 3.5618245413674883, + "language_loss": 0.64955777, + "learning_rate": 3.843500940147304e-07, + "loss": 0.67101318, + "num_input_tokens_seen": 289136670, + "step": 13395, + "time_per_iteration": 2.4391109943389893 + }, + { + "auxiliary_loss_clip": 0.01019228, + "auxiliary_loss_mlp": 0.0100557, + "balance_loss_clip": 1.00467515, + "balance_loss_mlp": 1.00428855, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.9638551856346489, + "language_loss": 0.57389683, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59414482, + "num_input_tokens_seen": 289200150, + "step": 13396, + "time_per_iteration": 3.150522232055664 + }, + { + "auxiliary_loss_clip": 0.0109941, + "auxiliary_loss_mlp": 0.01036645, + "balance_loss_clip": 1.03643739, + "balance_loss_mlp": 1.02362776, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 2.9288536218135963, + "language_loss": 0.77321243, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79457295, + "num_input_tokens_seen": 289218125, + "step": 13397, + "time_per_iteration": 2.4500927925109863 + }, + { + "auxiliary_loss_clip": 0.01095362, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.03965974, + "balance_loss_mlp": 1.01709783, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 1.7617032301889655, + "language_loss": 0.70343328, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72467488, + "num_input_tokens_seen": 289237115, + "step": 13398, + "time_per_iteration": 2.418368101119995 + }, + { + "auxiliary_loss_clip": 0.01087752, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.03500676, + "balance_loss_mlp": 1.02126753, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 2.3623003885951075, + "language_loss": 0.68836904, + "learning_rate": 3.834323543710805e-07, + "loss": 0.70957339, + "num_input_tokens_seen": 289253635, + "step": 13399, + "time_per_iteration": 2.469717502593994 + }, + { + "auxiliary_loss_clip": 0.01108297, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.03646815, + "balance_loss_mlp": 1.02005887, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 2.2773362730255515, + "language_loss": 0.72274435, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74414551, + "num_input_tokens_seen": 289270085, + "step": 13400, + "time_per_iteration": 2.4045515060424805 + }, + { + "auxiliary_loss_clip": 0.01095254, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.03258598, + "balance_loss_mlp": 1.02181339, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 1.8419053575555449, + "language_loss": 0.63960868, + "learning_rate": 3.829738523169037e-07, + "loss": 0.66090226, + "num_input_tokens_seen": 289289645, + "step": 13401, + "time_per_iteration": 2.476816177368164 + }, + { + "auxiliary_loss_clip": 0.0109864, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.03408015, + "balance_loss_mlp": 1.01954031, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 2.3674399398686665, + "language_loss": 0.83929515, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.86059946, + "num_input_tokens_seen": 289306630, + "step": 13402, + "time_per_iteration": 2.47965931892395 + }, + { + "auxiliary_loss_clip": 0.01064022, + "auxiliary_loss_mlp": 0.01034891, + "balance_loss_clip": 1.04153204, + "balance_loss_mlp": 1.02286863, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 1.9942272846049722, + "language_loss": 0.67731178, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.69830084, + "num_input_tokens_seen": 289324960, + "step": 13403, + "time_per_iteration": 2.57930064201355 + }, + { + "auxiliary_loss_clip": 0.0107145, + "auxiliary_loss_mlp": 0.00776906, + "balance_loss_clip": 1.03503823, + "balance_loss_mlp": 1.00054252, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 1.9450476402979546, + "language_loss": 0.84981942, + "learning_rate": 3.822865591408084e-07, + "loss": 0.868303, + "num_input_tokens_seen": 289344980, + "step": 13404, + "time_per_iteration": 2.630603313446045 + }, + { + "auxiliary_loss_clip": 0.01069382, + "auxiliary_loss_mlp": 0.01029158, + "balance_loss_clip": 1.0368526, + "balance_loss_mlp": 1.01768994, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 1.4891653877909496, + "language_loss": 0.70320982, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72419524, + "num_input_tokens_seen": 289367500, + "step": 13405, + "time_per_iteration": 2.676948308944702 + }, + { + "auxiliary_loss_clip": 0.01098714, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.03677344, + "balance_loss_mlp": 1.01472902, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 4.45106711535026, + "language_loss": 0.75799584, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77924126, + "num_input_tokens_seen": 289385930, + "step": 13406, + "time_per_iteration": 2.5023932456970215 + }, + { + "auxiliary_loss_clip": 0.01100403, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.03845072, + "balance_loss_mlp": 1.02069604, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 1.540802797632767, + "language_loss": 0.76203752, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78337634, + "num_input_tokens_seen": 289408025, + "step": 13407, + "time_per_iteration": 3.988009214401245 + }, + { + "auxiliary_loss_clip": 0.01081099, + "auxiliary_loss_mlp": 0.00778577, + "balance_loss_clip": 1.03361881, + "balance_loss_mlp": 1.00059605, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 2.327041210785719, + "language_loss": 0.73617339, + "learning_rate": 3.81371027093822e-07, + "loss": 0.75477016, + "num_input_tokens_seen": 289426575, + "step": 13408, + "time_per_iteration": 2.4669549465179443 + }, + { + "auxiliary_loss_clip": 0.0107889, + "auxiliary_loss_mlp": 0.01034633, + "balance_loss_clip": 1.03275609, + "balance_loss_mlp": 1.02163935, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 1.957369991221816, + "language_loss": 0.70551324, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72664845, + "num_input_tokens_seen": 289447760, + "step": 13409, + "time_per_iteration": 2.523332118988037 + }, + { + "auxiliary_loss_clip": 0.0110749, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.03484285, + "balance_loss_mlp": 1.01608419, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 2.1018429603385447, + "language_loss": 0.76740521, + "learning_rate": 3.809136293070545e-07, + "loss": 0.78876066, + "num_input_tokens_seen": 289463920, + "step": 13410, + "time_per_iteration": 2.4189279079437256 + }, + { + "auxiliary_loss_clip": 0.01096859, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.03571141, + "balance_loss_mlp": 1.01988316, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 1.7763701749086869, + "language_loss": 0.68493474, + "learning_rate": 3.806850225032117e-07, + "loss": 0.70622146, + "num_input_tokens_seen": 289482635, + "step": 13411, + "time_per_iteration": 2.4744465351104736 + }, + { + "auxiliary_loss_clip": 0.01076128, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.0347116, + "balance_loss_mlp": 1.01940203, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 1.5748473761204909, + "language_loss": 0.68347508, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70454973, + "num_input_tokens_seen": 289502040, + "step": 13412, + "time_per_iteration": 2.5185580253601074 + }, + { + "auxiliary_loss_clip": 0.01097227, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.03672278, + "balance_loss_mlp": 1.02283967, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 1.7481440775444057, + "language_loss": 0.81666285, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.83799839, + "num_input_tokens_seen": 289520740, + "step": 13413, + "time_per_iteration": 2.4881863594055176 + }, + { + "auxiliary_loss_clip": 0.01093118, + "auxiliary_loss_mlp": 0.01038513, + "balance_loss_clip": 1.03378749, + "balance_loss_mlp": 1.02553117, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 1.929458318279097, + "language_loss": 0.84865081, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.8699671, + "num_input_tokens_seen": 289535840, + "step": 13414, + "time_per_iteration": 2.443234443664551 + }, + { + "auxiliary_loss_clip": 0.01083303, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.03307819, + "balance_loss_mlp": 1.0181675, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 1.9978058871798985, + "language_loss": 0.66755229, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.68868315, + "num_input_tokens_seen": 289555205, + "step": 13415, + "time_per_iteration": 3.9878122806549072 + }, + { + "auxiliary_loss_clip": 0.01073303, + "auxiliary_loss_mlp": 0.0102568, + "balance_loss_clip": 1.03412509, + "balance_loss_mlp": 1.01432514, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 1.561484693811727, + "language_loss": 0.7625308, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.7835207, + "num_input_tokens_seen": 289573000, + "step": 13416, + "time_per_iteration": 2.5138494968414307 + }, + { + "auxiliary_loss_clip": 0.01095323, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.03376067, + "balance_loss_mlp": 1.02367115, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 1.621022352306956, + "language_loss": 0.65992534, + "learning_rate": 3.793146714797086e-07, + "loss": 0.68123543, + "num_input_tokens_seen": 289592625, + "step": 13417, + "time_per_iteration": 2.4876010417938232 + }, + { + "auxiliary_loss_clip": 0.01077406, + "auxiliary_loss_mlp": 0.01049142, + "balance_loss_clip": 1.03377056, + "balance_loss_mlp": 1.03535557, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 1.5265346517549818, + "language_loss": 0.80370116, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.82496661, + "num_input_tokens_seen": 289610780, + "step": 13418, + "time_per_iteration": 2.545227289199829 + }, + { + "auxiliary_loss_clip": 0.01089829, + "auxiliary_loss_mlp": 0.0102931, + "balance_loss_clip": 1.03786361, + "balance_loss_mlp": 1.01660848, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 1.6238031730954106, + "language_loss": 0.84895086, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.87014228, + "num_input_tokens_seen": 289628890, + "step": 13419, + "time_per_iteration": 2.486560106277466 + }, + { + "auxiliary_loss_clip": 0.01072414, + "auxiliary_loss_mlp": 0.007781, + "balance_loss_clip": 1.034603, + "balance_loss_mlp": 1.00064623, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 1.6902884217752592, + "language_loss": 0.75865656, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.77716166, + "num_input_tokens_seen": 289647220, + "step": 13420, + "time_per_iteration": 2.604886293411255 + }, + { + "auxiliary_loss_clip": 0.01089905, + "auxiliary_loss_mlp": 0.00777078, + "balance_loss_clip": 1.03207707, + "balance_loss_mlp": 1.00057077, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 1.843735262255573, + "language_loss": 0.78313464, + "learning_rate": 3.784023331462207e-07, + "loss": 0.80180442, + "num_input_tokens_seen": 289665800, + "step": 13421, + "time_per_iteration": 2.4873836040496826 + }, + { + "auxiliary_loss_clip": 0.01079583, + "auxiliary_loss_mlp": 0.01025715, + "balance_loss_clip": 1.03858829, + "balance_loss_mlp": 1.01330519, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 1.65977551907839, + "language_loss": 0.79511005, + "learning_rate": 3.78174402269098e-07, + "loss": 0.816163, + "num_input_tokens_seen": 289682705, + "step": 13422, + "time_per_iteration": 3.9543395042419434 + }, + { + "auxiliary_loss_clip": 0.01106272, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.03489757, + "balance_loss_mlp": 1.0189327, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 1.836733435874617, + "language_loss": 0.67929047, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.70066011, + "num_input_tokens_seen": 289702920, + "step": 13423, + "time_per_iteration": 2.441617250442505 + }, + { + "auxiliary_loss_clip": 0.01085498, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.03877151, + "balance_loss_mlp": 1.02359581, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 3.118741465172739, + "language_loss": 0.80104613, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82226539, + "num_input_tokens_seen": 289723280, + "step": 13424, + "time_per_iteration": 2.5523593425750732 + }, + { + "auxiliary_loss_clip": 0.01098763, + "auxiliary_loss_mlp": 0.0102693, + "balance_loss_clip": 1.03519654, + "balance_loss_mlp": 1.01511645, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 1.6050695917983884, + "language_loss": 0.78752005, + "learning_rate": 3.774909786710232e-07, + "loss": 0.80877697, + "num_input_tokens_seen": 289743475, + "step": 13425, + "time_per_iteration": 2.4903435707092285 + }, + { + "auxiliary_loss_clip": 0.01076846, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.03265345, + "balance_loss_mlp": 1.02319932, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 2.5677015445069973, + "language_loss": 0.75221157, + "learning_rate": 3.772632938448923e-07, + "loss": 0.77332926, + "num_input_tokens_seen": 289761400, + "step": 13426, + "time_per_iteration": 2.4735403060913086 + }, + { + "auxiliary_loss_clip": 0.01096904, + "auxiliary_loss_mlp": 0.01025243, + "balance_loss_clip": 1.03552711, + "balance_loss_mlp": 1.01362062, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 1.7602177229963911, + "language_loss": 0.73369402, + "learning_rate": 3.770356705530997e-07, + "loss": 0.75491548, + "num_input_tokens_seen": 289781025, + "step": 13427, + "time_per_iteration": 2.526500940322876 + }, + { + "auxiliary_loss_clip": 0.01061125, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.03766036, + "balance_loss_mlp": 1.02088618, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 1.696265934994422, + "language_loss": 0.70058811, + "learning_rate": 3.768081088042774e-07, + "loss": 0.72153699, + "num_input_tokens_seen": 289798380, + "step": 13428, + "time_per_iteration": 2.608339786529541 + }, + { + "auxiliary_loss_clip": 0.0108843, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.03465199, + "balance_loss_mlp": 1.01781416, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 1.9618322911103316, + "language_loss": 0.74677271, + "learning_rate": 3.765806086070544e-07, + "loss": 0.76794589, + "num_input_tokens_seen": 289814515, + "step": 13429, + "time_per_iteration": 2.4855895042419434 + }, + { + "auxiliary_loss_clip": 0.01094507, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.03547645, + "balance_loss_mlp": 1.02014792, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 3.304954014925532, + "language_loss": 0.66944993, + "learning_rate": 3.763531699700568e-07, + "loss": 0.69071037, + "num_input_tokens_seen": 289834315, + "step": 13430, + "time_per_iteration": 3.9724209308624268 + }, + { + "auxiliary_loss_clip": 0.01074806, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.0376159, + "balance_loss_mlp": 1.0159328, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 1.7200639151820023, + "language_loss": 0.80286378, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82389134, + "num_input_tokens_seen": 289853770, + "step": 13431, + "time_per_iteration": 2.536569118499756 + }, + { + "auxiliary_loss_clip": 0.01087113, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.03842974, + "balance_loss_mlp": 1.01571035, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 2.1581903901968733, + "language_loss": 0.80339634, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82455236, + "num_input_tokens_seen": 289870480, + "step": 13432, + "time_per_iteration": 2.4962055683135986 + }, + { + "auxiliary_loss_clip": 0.01083115, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.03804076, + "balance_loss_mlp": 1.02131176, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 2.0371724199364887, + "language_loss": 0.70623755, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.72740704, + "num_input_tokens_seen": 289888275, + "step": 13433, + "time_per_iteration": 2.4926376342773438 + }, + { + "auxiliary_loss_clip": 0.01087236, + "auxiliary_loss_mlp": 0.01027162, + "balance_loss_clip": 1.04055524, + "balance_loss_mlp": 1.01578915, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 1.6070097995231896, + "language_loss": 0.72301221, + "learning_rate": 3.754440311967828e-07, + "loss": 0.74415624, + "num_input_tokens_seen": 289911495, + "step": 13434, + "time_per_iteration": 2.7013847827911377 + }, + { + "auxiliary_loss_clip": 0.01071582, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.0368135, + "balance_loss_mlp": 1.01656783, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 2.241343636839041, + "language_loss": 0.68392706, + "learning_rate": 3.752169004902361e-07, + "loss": 0.70493191, + "num_input_tokens_seen": 289930045, + "step": 13435, + "time_per_iteration": 2.526409149169922 + }, + { + "auxiliary_loss_clip": 0.01066944, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.03614187, + "balance_loss_mlp": 1.01723003, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 1.5982605141394555, + "language_loss": 0.74964321, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77062118, + "num_input_tokens_seen": 289950815, + "step": 13436, + "time_per_iteration": 2.6085245609283447 + }, + { + "auxiliary_loss_clip": 0.01103594, + "auxiliary_loss_mlp": 0.01031898, + "balance_loss_clip": 1.03393853, + "balance_loss_mlp": 1.01917791, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 1.6668973716300852, + "language_loss": 0.70162785, + "learning_rate": 3.747628239215674e-07, + "loss": 0.72298276, + "num_input_tokens_seen": 289971730, + "step": 13437, + "time_per_iteration": 2.4825658798217773 + }, + { + "auxiliary_loss_clip": 0.01081896, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.03771389, + "balance_loss_mlp": 1.02128243, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 1.8940562040842237, + "language_loss": 0.72956145, + "learning_rate": 3.745358780766636e-07, + "loss": 0.75070524, + "num_input_tokens_seen": 289992995, + "step": 13438, + "time_per_iteration": 2.570777177810669 + }, + { + "auxiliary_loss_clip": 0.01087662, + "auxiliary_loss_mlp": 0.01031738, + "balance_loss_clip": 1.03880191, + "balance_loss_mlp": 1.02003789, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 2.4972607591648255, + "language_loss": 0.76987064, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79106468, + "num_input_tokens_seen": 290009405, + "step": 13439, + "time_per_iteration": 2.514404535293579 + }, + { + "auxiliary_loss_clip": 0.01107457, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.03608346, + "balance_loss_mlp": 1.02026629, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 1.450998916643007, + "language_loss": 0.78758419, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.80898178, + "num_input_tokens_seen": 290031085, + "step": 13440, + "time_per_iteration": 2.483388900756836 + }, + { + "auxiliary_loss_clip": 0.01088371, + "auxiliary_loss_mlp": 0.0077831, + "balance_loss_clip": 1.03632617, + "balance_loss_mlp": 1.00059819, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 1.6275917375493485, + "language_loss": 0.59004515, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.60871202, + "num_input_tokens_seen": 290048670, + "step": 13441, + "time_per_iteration": 2.507133722305298 + }, + { + "auxiliary_loss_clip": 0.01097132, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.03536272, + "balance_loss_mlp": 1.01818705, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 1.9872431982778642, + "language_loss": 0.75940883, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.78068656, + "num_input_tokens_seen": 290064085, + "step": 13442, + "time_per_iteration": 2.4926164150238037 + }, + { + "auxiliary_loss_clip": 0.01087093, + "auxiliary_loss_mlp": 0.01030253, + "balance_loss_clip": 1.03707516, + "balance_loss_mlp": 1.01799226, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 1.5847224185158129, + "language_loss": 0.70750904, + "learning_rate": 3.734020735906169e-07, + "loss": 0.72868258, + "num_input_tokens_seen": 290086255, + "step": 13443, + "time_per_iteration": 2.6357998847961426 + }, + { + "auxiliary_loss_clip": 0.01064864, + "auxiliary_loss_mlp": 0.01039683, + "balance_loss_clip": 1.03478777, + "balance_loss_mlp": 1.02820325, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 3.577413747985643, + "language_loss": 0.82254124, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.84358674, + "num_input_tokens_seen": 290103995, + "step": 13444, + "time_per_iteration": 2.4966604709625244 + }, + { + "auxiliary_loss_clip": 0.00999094, + "auxiliary_loss_mlp": 0.00753771, + "balance_loss_clip": 1.01200926, + "balance_loss_mlp": 1.00032127, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 0.8147244296989309, + "language_loss": 0.5361501, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55367875, + "num_input_tokens_seen": 290157245, + "step": 13445, + "time_per_iteration": 2.9516122341156006 + }, + { + "auxiliary_loss_clip": 0.01070717, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.03459704, + "balance_loss_mlp": 1.01715517, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 2.4997312883178204, + "language_loss": 0.72131437, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.74232399, + "num_input_tokens_seen": 290174970, + "step": 13446, + "time_per_iteration": 4.062325954437256 + }, + { + "auxiliary_loss_clip": 0.01086124, + "auxiliary_loss_mlp": 0.01037623, + "balance_loss_clip": 1.03581965, + "balance_loss_mlp": 1.02352071, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 2.5535613464602003, + "language_loss": 0.71534395, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73658144, + "num_input_tokens_seen": 290194395, + "step": 13447, + "time_per_iteration": 2.542205810546875 + }, + { + "auxiliary_loss_clip": 0.01048708, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.03588676, + "balance_loss_mlp": 1.02033603, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 2.6365711420164577, + "language_loss": 0.75320613, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.77403545, + "num_input_tokens_seen": 290209200, + "step": 13448, + "time_per_iteration": 2.575057029724121 + }, + { + "auxiliary_loss_clip": 0.0102935, + "auxiliary_loss_mlp": 0.01000747, + "balance_loss_clip": 1.00609112, + "balance_loss_mlp": 0.99954319, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 0.7536430753161809, + "language_loss": 0.63851851, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.6588195, + "num_input_tokens_seen": 290274565, + "step": 13449, + "time_per_iteration": 3.039763927459717 + }, + { + "auxiliary_loss_clip": 0.01099793, + "auxiliary_loss_mlp": 0.01025779, + "balance_loss_clip": 1.03684378, + "balance_loss_mlp": 1.01364946, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 1.6330882435341643, + "language_loss": 0.73965263, + "learning_rate": 3.718173381422105e-07, + "loss": 0.76090837, + "num_input_tokens_seen": 290293630, + "step": 13450, + "time_per_iteration": 2.4706804752349854 + }, + { + "auxiliary_loss_clip": 0.01085149, + "auxiliary_loss_mlp": 0.00776951, + "balance_loss_clip": 1.03489494, + "balance_loss_mlp": 1.00059938, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 1.7171443330329121, + "language_loss": 0.74053586, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.75915682, + "num_input_tokens_seen": 290311450, + "step": 13451, + "time_per_iteration": 2.473745107650757 + }, + { + "auxiliary_loss_clip": 0.01086262, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.03352642, + "balance_loss_mlp": 1.0185523, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 1.7379044467050075, + "language_loss": 0.79864967, + "learning_rate": 3.713651121244543e-07, + "loss": 0.81984204, + "num_input_tokens_seen": 290330165, + "step": 13452, + "time_per_iteration": 2.504141330718994 + }, + { + "auxiliary_loss_clip": 0.01099108, + "auxiliary_loss_mlp": 0.0103639, + "balance_loss_clip": 1.03619146, + "balance_loss_mlp": 1.02465951, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 2.8503068498102344, + "language_loss": 0.7851159, + "learning_rate": 3.711390917482875e-07, + "loss": 0.80647087, + "num_input_tokens_seen": 290350815, + "step": 13453, + "time_per_iteration": 2.5469765663146973 + }, + { + "auxiliary_loss_clip": 0.01062445, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.03465652, + "balance_loss_mlp": 1.01751733, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 2.51134424022831, + "language_loss": 0.77337021, + "learning_rate": 3.709131331386892e-07, + "loss": 0.79429793, + "num_input_tokens_seen": 290367380, + "step": 13454, + "time_per_iteration": 4.055288076400757 + }, + { + "auxiliary_loss_clip": 0.01078865, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.03784359, + "balance_loss_mlp": 1.02028847, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 2.27420296230468, + "language_loss": 0.76795763, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.78906858, + "num_input_tokens_seen": 290387965, + "step": 13455, + "time_per_iteration": 2.598536491394043 + }, + { + "auxiliary_loss_clip": 0.01083257, + "auxiliary_loss_mlp": 0.01034851, + "balance_loss_clip": 1.03347254, + "balance_loss_mlp": 1.02173781, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 1.8327664107432216, + "language_loss": 0.78588033, + "learning_rate": 3.70461401253471e-07, + "loss": 0.80706143, + "num_input_tokens_seen": 290404150, + "step": 13456, + "time_per_iteration": 2.4684667587280273 + }, + { + "auxiliary_loss_clip": 0.01107937, + "auxiliary_loss_mlp": 0.01034849, + "balance_loss_clip": 1.03794408, + "balance_loss_mlp": 1.02347672, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 2.1127560211546945, + "language_loss": 0.72237337, + "learning_rate": 3.702356279949801e-07, + "loss": 0.74380124, + "num_input_tokens_seen": 290422370, + "step": 13457, + "time_per_iteration": 2.455754280090332 + }, + { + "auxiliary_loss_clip": 0.01085508, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.03412867, + "balance_loss_mlp": 1.01881254, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 1.7802230919066357, + "language_loss": 0.72853458, + "learning_rate": 3.700099165373176e-07, + "loss": 0.74968851, + "num_input_tokens_seen": 290442645, + "step": 13458, + "time_per_iteration": 2.493436336517334 + }, + { + "auxiliary_loss_clip": 0.01098599, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.03710854, + "balance_loss_mlp": 1.02288651, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 2.335084582257281, + "language_loss": 0.78863525, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.80997384, + "num_input_tokens_seen": 290458520, + "step": 13459, + "time_per_iteration": 2.4355432987213135 + }, + { + "auxiliary_loss_clip": 0.01083975, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.03728354, + "balance_loss_mlp": 1.01530325, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 1.9677348092462252, + "language_loss": 0.8006146, + "learning_rate": 3.695586790587113e-07, + "loss": 0.82173419, + "num_input_tokens_seen": 290474465, + "step": 13460, + "time_per_iteration": 2.479468822479248 + }, + { + "auxiliary_loss_clip": 0.01087184, + "auxiliary_loss_mlp": 0.01032896, + "balance_loss_clip": 1.03355324, + "balance_loss_mlp": 1.01971745, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 1.7164272384189063, + "language_loss": 0.84850478, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86970556, + "num_input_tokens_seen": 290492060, + "step": 13461, + "time_per_iteration": 3.8454959392547607 + }, + { + "auxiliary_loss_clip": 0.01101974, + "auxiliary_loss_mlp": 0.01035174, + "balance_loss_clip": 1.03901482, + "balance_loss_mlp": 1.02272844, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 1.8888986560723358, + "language_loss": 0.76359242, + "learning_rate": 3.69107688886096e-07, + "loss": 0.78496385, + "num_input_tokens_seen": 290511510, + "step": 13462, + "time_per_iteration": 2.4961297512054443 + }, + { + "auxiliary_loss_clip": 0.01088009, + "auxiliary_loss_mlp": 0.01037461, + "balance_loss_clip": 1.03754044, + "balance_loss_mlp": 1.02472353, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 3.2851434310469054, + "language_loss": 0.83427179, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.85552645, + "num_input_tokens_seen": 290530035, + "step": 13463, + "time_per_iteration": 2.5181329250335693 + }, + { + "auxiliary_loss_clip": 0.01106853, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.036268, + "balance_loss_mlp": 1.02148485, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 3.1876166274629893, + "language_loss": 0.6238637, + "learning_rate": 3.686569460878779e-07, + "loss": 0.64525986, + "num_input_tokens_seen": 290548245, + "step": 13464, + "time_per_iteration": 2.6491658687591553 + }, + { + "auxiliary_loss_clip": 0.01106554, + "auxiliary_loss_mlp": 0.01029799, + "balance_loss_clip": 1.03638554, + "balance_loss_mlp": 1.01842594, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 1.501227114131674, + "language_loss": 0.6154139, + "learning_rate": 3.684316674755341e-07, + "loss": 0.6367774, + "num_input_tokens_seen": 290568625, + "step": 13465, + "time_per_iteration": 2.436654806137085 + }, + { + "auxiliary_loss_clip": 0.01097967, + "auxiliary_loss_mlp": 0.01034851, + "balance_loss_clip": 1.03770208, + "balance_loss_mlp": 1.02290606, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 1.8319042323674095, + "language_loss": 0.81932831, + "learning_rate": 3.682064507324256e-07, + "loss": 0.84065658, + "num_input_tokens_seen": 290586575, + "step": 13466, + "time_per_iteration": 2.4672672748565674 + }, + { + "auxiliary_loss_clip": 0.01095039, + "auxiliary_loss_mlp": 0.00777657, + "balance_loss_clip": 1.03832316, + "balance_loss_mlp": 1.00063777, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 1.927017372818704, + "language_loss": 0.76344299, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.78216994, + "num_input_tokens_seen": 290606790, + "step": 13467, + "time_per_iteration": 2.551927089691162 + }, + { + "auxiliary_loss_clip": 0.0107413, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.03086483, + "balance_loss_mlp": 1.01808941, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 1.753691905282709, + "language_loss": 0.79457903, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81562531, + "num_input_tokens_seen": 290625525, + "step": 13468, + "time_per_iteration": 2.5441462993621826 + }, + { + "auxiliary_loss_clip": 0.01093965, + "auxiliary_loss_mlp": 0.01029817, + "balance_loss_clip": 1.03423405, + "balance_loss_mlp": 1.01862884, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 1.7454279399454953, + "language_loss": 0.68047738, + "learning_rate": 3.675311718038978e-07, + "loss": 0.70171523, + "num_input_tokens_seen": 290644935, + "step": 13469, + "time_per_iteration": 2.4543793201446533 + }, + { + "auxiliary_loss_clip": 0.01004831, + "auxiliary_loss_mlp": 0.01002331, + "balance_loss_clip": 1.01026189, + "balance_loss_mlp": 1.0008229, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.6890647540996215, + "language_loss": 0.54637384, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56644547, + "num_input_tokens_seen": 290710735, + "step": 13470, + "time_per_iteration": 4.684020757675171 + }, + { + "auxiliary_loss_clip": 0.01107279, + "auxiliary_loss_mlp": 0.01027219, + "balance_loss_clip": 1.03544497, + "balance_loss_mlp": 1.01611507, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 1.9760945863535153, + "language_loss": 0.69563043, + "learning_rate": 3.670812953542279e-07, + "loss": 0.71697533, + "num_input_tokens_seen": 290729565, + "step": 13471, + "time_per_iteration": 2.4093995094299316 + }, + { + "auxiliary_loss_clip": 0.01099925, + "auxiliary_loss_mlp": 0.0102643, + "balance_loss_clip": 1.03808153, + "balance_loss_mlp": 1.01489067, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 1.7217047472223095, + "language_loss": 0.79679096, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.81805456, + "num_input_tokens_seen": 290749360, + "step": 13472, + "time_per_iteration": 2.5012366771698 + }, + { + "auxiliary_loss_clip": 0.0102268, + "auxiliary_loss_mlp": 0.0100506, + "balance_loss_clip": 1.00768089, + "balance_loss_mlp": 1.00388002, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.7464005548623692, + "language_loss": 0.5778985, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59817588, + "num_input_tokens_seen": 290812145, + "step": 13473, + "time_per_iteration": 2.992363214492798 + }, + { + "auxiliary_loss_clip": 0.01061207, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.03748417, + "balance_loss_mlp": 1.01643407, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 1.8397217539063313, + "language_loss": 0.73692268, + "learning_rate": 3.664069451043399e-07, + "loss": 0.75782132, + "num_input_tokens_seen": 290829845, + "step": 13474, + "time_per_iteration": 2.533141613006592 + }, + { + "auxiliary_loss_clip": 0.01101551, + "auxiliary_loss_mlp": 0.01037532, + "balance_loss_clip": 1.0418824, + "balance_loss_mlp": 1.02543187, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 1.7385175726165, + "language_loss": 0.78621638, + "learning_rate": 3.661822855683723e-07, + "loss": 0.80760717, + "num_input_tokens_seen": 290848815, + "step": 13475, + "time_per_iteration": 2.4691162109375 + }, + { + "auxiliary_loss_clip": 0.01095985, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.03581715, + "balance_loss_mlp": 1.022035, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 1.5622729121263559, + "language_loss": 0.74953938, + "learning_rate": 3.659576879869364e-07, + "loss": 0.77083635, + "num_input_tokens_seen": 290868580, + "step": 13476, + "time_per_iteration": 2.490283966064453 + }, + { + "auxiliary_loss_clip": 0.0109166, + "auxiliary_loss_mlp": 0.01034997, + "balance_loss_clip": 1.03522682, + "balance_loss_mlp": 1.02193797, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 3.5140615519438088, + "language_loss": 0.73933762, + "learning_rate": 3.657331523685485e-07, + "loss": 0.76060414, + "num_input_tokens_seen": 290883540, + "step": 13477, + "time_per_iteration": 2.4628429412841797 + }, + { + "auxiliary_loss_clip": 0.01082928, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.03555214, + "balance_loss_mlp": 1.02177477, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 2.4239601859081503, + "language_loss": 0.69931722, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.72048056, + "num_input_tokens_seen": 290901560, + "step": 13478, + "time_per_iteration": 2.4893553256988525 + }, + { + "auxiliary_loss_clip": 0.01029066, + "auxiliary_loss_mlp": 0.01003817, + "balance_loss_clip": 1.00584316, + "balance_loss_mlp": 1.00264919, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.698395563051902, + "language_loss": 0.52160805, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54193687, + "num_input_tokens_seen": 290959185, + "step": 13479, + "time_per_iteration": 2.9543395042419434 + }, + { + "auxiliary_loss_clip": 0.01060612, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.03404748, + "balance_loss_mlp": 1.02050567, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 1.6836230688444245, + "language_loss": 0.71102077, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73195136, + "num_input_tokens_seen": 290979585, + "step": 13480, + "time_per_iteration": 2.5871846675872803 + }, + { + "auxiliary_loss_clip": 0.01108013, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.03614414, + "balance_loss_mlp": 1.02000809, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 1.8049819866257846, + "language_loss": 0.79897904, + "learning_rate": 3.648356296957327e-07, + "loss": 0.82037556, + "num_input_tokens_seen": 291000865, + "step": 13481, + "time_per_iteration": 2.487644672393799 + }, + { + "auxiliary_loss_clip": 0.01085661, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.03537536, + "balance_loss_mlp": 1.01912475, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 1.8452975137174332, + "language_loss": 0.72486162, + "learning_rate": 3.646114040202548e-07, + "loss": 0.74602824, + "num_input_tokens_seen": 291018285, + "step": 13482, + "time_per_iteration": 2.5330135822296143 + }, + { + "auxiliary_loss_clip": 0.01049829, + "auxiliary_loss_mlp": 0.01026268, + "balance_loss_clip": 1.03709221, + "balance_loss_mlp": 1.01425767, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 2.1959858493416102, + "language_loss": 0.65655738, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.67731833, + "num_input_tokens_seen": 291035745, + "step": 13483, + "time_per_iteration": 2.587857484817505 + }, + { + "auxiliary_loss_clip": 0.01082047, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.03285956, + "balance_loss_mlp": 1.01782966, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 1.706862342468541, + "language_loss": 0.76372248, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78484857, + "num_input_tokens_seen": 291053280, + "step": 13484, + "time_per_iteration": 2.534745454788208 + }, + { + "auxiliary_loss_clip": 0.01093311, + "auxiliary_loss_mlp": 0.01033276, + "balance_loss_clip": 1.03741693, + "balance_loss_mlp": 1.02000785, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 1.6560136591646886, + "language_loss": 0.72187257, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74313843, + "num_input_tokens_seen": 291072855, + "step": 13485, + "time_per_iteration": 3.967771053314209 + }, + { + "auxiliary_loss_clip": 0.01062736, + "auxiliary_loss_mlp": 0.01037608, + "balance_loss_clip": 1.02848625, + "balance_loss_mlp": 1.0241791, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 1.9207968514707965, + "language_loss": 0.75895762, + "learning_rate": 3.637151215443308e-07, + "loss": 0.77996111, + "num_input_tokens_seen": 291090285, + "step": 13486, + "time_per_iteration": 2.5377328395843506 + }, + { + "auxiliary_loss_clip": 0.01091203, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.03838408, + "balance_loss_mlp": 1.020509, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 3.287204467390626, + "language_loss": 0.72332889, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74457061, + "num_input_tokens_seen": 291107675, + "step": 13487, + "time_per_iteration": 2.5596156120300293 + }, + { + "auxiliary_loss_clip": 0.01049327, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.03784406, + "balance_loss_mlp": 1.01764667, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 1.603744158178553, + "language_loss": 0.84285688, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86364084, + "num_input_tokens_seen": 291126900, + "step": 13488, + "time_per_iteration": 2.696671485900879 + }, + { + "auxiliary_loss_clip": 0.01110438, + "auxiliary_loss_mlp": 0.01033825, + "balance_loss_clip": 1.03742981, + "balance_loss_mlp": 1.02133167, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 1.8472470165566246, + "language_loss": 0.7383641, + "learning_rate": 3.630435611625502e-07, + "loss": 0.75980669, + "num_input_tokens_seen": 291145285, + "step": 13489, + "time_per_iteration": 2.4690053462982178 + }, + { + "auxiliary_loss_clip": 0.01065146, + "auxiliary_loss_mlp": 0.00775844, + "balance_loss_clip": 1.03644085, + "balance_loss_mlp": 1.00057936, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 1.574445345289296, + "language_loss": 0.71549356, + "learning_rate": 3.628198318377453e-07, + "loss": 0.73390347, + "num_input_tokens_seen": 291163485, + "step": 13490, + "time_per_iteration": 2.5844309329986572 + }, + { + "auxiliary_loss_clip": 0.01076441, + "auxiliary_loss_mlp": 0.01043829, + "balance_loss_clip": 1.03432989, + "balance_loss_mlp": 1.02939272, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 3.233665806615931, + "language_loss": 0.71739191, + "learning_rate": 3.625961645949762e-07, + "loss": 0.73859459, + "num_input_tokens_seen": 291182215, + "step": 13491, + "time_per_iteration": 2.5445644855499268 + }, + { + "auxiliary_loss_clip": 0.01108966, + "auxiliary_loss_mlp": 0.01027794, + "balance_loss_clip": 1.03618908, + "balance_loss_mlp": 1.01602817, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 1.5327564098207789, + "language_loss": 0.67859197, + "learning_rate": 3.623725594427245e-07, + "loss": 0.69995964, + "num_input_tokens_seen": 291203145, + "step": 13492, + "time_per_iteration": 2.450568199157715 + }, + { + "auxiliary_loss_clip": 0.0106429, + "auxiliary_loss_mlp": 0.01030964, + "balance_loss_clip": 1.03508043, + "balance_loss_mlp": 1.01898336, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 1.5996296287000704, + "language_loss": 0.72150373, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.7424562, + "num_input_tokens_seen": 291220600, + "step": 13493, + "time_per_iteration": 4.0758256912231445 + }, + { + "auxiliary_loss_clip": 0.01092373, + "auxiliary_loss_mlp": 0.01044734, + "balance_loss_clip": 1.03277779, + "balance_loss_mlp": 1.0314486, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 1.9979541988422884, + "language_loss": 0.70624936, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72762036, + "num_input_tokens_seen": 291241195, + "step": 13494, + "time_per_iteration": 2.5425474643707275 + }, + { + "auxiliary_loss_clip": 0.01101067, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.03713489, + "balance_loss_mlp": 1.01875544, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 2.613407453165923, + "language_loss": 0.77010274, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.79143429, + "num_input_tokens_seen": 291258715, + "step": 13495, + "time_per_iteration": 2.474790096282959 + }, + { + "auxiliary_loss_clip": 0.01089821, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.03688431, + "balance_loss_mlp": 1.02097535, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 2.40321800502539, + "language_loss": 0.79756498, + "learning_rate": 3.614787599084417e-07, + "loss": 0.81879675, + "num_input_tokens_seen": 291278030, + "step": 13496, + "time_per_iteration": 2.5578978061676025 + }, + { + "auxiliary_loss_clip": 0.01098129, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.03475595, + "balance_loss_mlp": 1.01801515, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 1.937502420526503, + "language_loss": 0.7125181, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73381221, + "num_input_tokens_seen": 291296740, + "step": 13497, + "time_per_iteration": 2.4502193927764893 + }, + { + "auxiliary_loss_clip": 0.01076107, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.03406119, + "balance_loss_mlp": 1.02128696, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 1.6884344702035368, + "language_loss": 0.77314687, + "learning_rate": 3.610322329047508e-07, + "loss": 0.79423386, + "num_input_tokens_seen": 291318730, + "step": 13498, + "time_per_iteration": 2.5680348873138428 + }, + { + "auxiliary_loss_clip": 0.01108213, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.03564775, + "balance_loss_mlp": 1.02493334, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 1.9131088910303622, + "language_loss": 0.83880246, + "learning_rate": 3.608090626234055e-07, + "loss": 0.86026192, + "num_input_tokens_seen": 291336755, + "step": 13499, + "time_per_iteration": 2.388885498046875 + }, + { + "auxiliary_loss_clip": 0.01079145, + "auxiliary_loss_mlp": 0.01032811, + "balance_loss_clip": 1.03762925, + "balance_loss_mlp": 1.01950121, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 1.610621505688939, + "language_loss": 0.76524031, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.78635991, + "num_input_tokens_seen": 291356795, + "step": 13500, + "time_per_iteration": 3.9994020462036133 + }, + { + "auxiliary_loss_clip": 0.01013093, + "auxiliary_loss_mlp": 0.00999227, + "balance_loss_clip": 1.00879335, + "balance_loss_mlp": 0.99804658, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.8359245385260947, + "language_loss": 0.59861898, + "learning_rate": 3.603629085440303e-07, + "loss": 0.61874223, + "num_input_tokens_seen": 291416005, + "step": 13501, + "time_per_iteration": 3.116579532623291 + }, + { + "auxiliary_loss_clip": 0.01096503, + "auxiliary_loss_mlp": 0.01028515, + "balance_loss_clip": 1.03771114, + "balance_loss_mlp": 1.01692188, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 2.6585831911792477, + "language_loss": 0.79031968, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.81156981, + "num_input_tokens_seen": 291434870, + "step": 13502, + "time_per_iteration": 2.5162603855133057 + }, + { + "auxiliary_loss_clip": 0.01083934, + "auxiliary_loss_mlp": 0.0103804, + "balance_loss_clip": 1.03306437, + "balance_loss_mlp": 1.02451563, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 1.8970078787949434, + "language_loss": 0.7160002, + "learning_rate": 3.599170031654635e-07, + "loss": 0.73721993, + "num_input_tokens_seen": 291452230, + "step": 13503, + "time_per_iteration": 2.5016214847564697 + }, + { + "auxiliary_loss_clip": 0.01079125, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.03347874, + "balance_loss_mlp": 1.01787472, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 2.9938912776201647, + "language_loss": 0.67897278, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.70008183, + "num_input_tokens_seen": 291477425, + "step": 13504, + "time_per_iteration": 2.7397444248199463 + }, + { + "auxiliary_loss_clip": 0.01082543, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.03548312, + "balance_loss_mlp": 1.01897454, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 1.8485913212427019, + "language_loss": 0.74511647, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76626164, + "num_input_tokens_seen": 291501070, + "step": 13505, + "time_per_iteration": 2.772442102432251 + }, + { + "auxiliary_loss_clip": 0.01087747, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.03651738, + "balance_loss_mlp": 1.01558185, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 2.524911252316589, + "language_loss": 0.72612119, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.74728787, + "num_input_tokens_seen": 291524945, + "step": 13506, + "time_per_iteration": 2.6633217334747314 + }, + { + "auxiliary_loss_clip": 0.0111442, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.0371244, + "balance_loss_mlp": 1.02017009, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 2.677527359163688, + "language_loss": 0.7628901, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78436315, + "num_input_tokens_seen": 291544605, + "step": 13507, + "time_per_iteration": 2.478671073913574 + }, + { + "auxiliary_loss_clip": 0.0110965, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.03517485, + "balance_loss_mlp": 1.01732397, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 1.7689273425994545, + "language_loss": 0.70285833, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.72424883, + "num_input_tokens_seen": 291563850, + "step": 13508, + "time_per_iteration": 2.4446065425872803 + }, + { + "auxiliary_loss_clip": 0.01097365, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.03677404, + "balance_loss_mlp": 1.02050114, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 1.762610197425049, + "language_loss": 0.76305151, + "learning_rate": 3.585807799107785e-07, + "loss": 0.78434986, + "num_input_tokens_seen": 291581730, + "step": 13509, + "time_per_iteration": 4.022018194198608 + }, + { + "auxiliary_loss_clip": 0.01110833, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.03723502, + "balance_loss_mlp": 1.02106428, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 1.9705865936259483, + "language_loss": 0.77394062, + "learning_rate": 3.58358293835491e-07, + "loss": 0.79538321, + "num_input_tokens_seen": 291601225, + "step": 13510, + "time_per_iteration": 2.5079877376556396 + }, + { + "auxiliary_loss_clip": 0.01101906, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.03587174, + "balance_loss_mlp": 1.02096415, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 1.7472810746238836, + "language_loss": 0.70228022, + "learning_rate": 3.581358700114212e-07, + "loss": 0.72363865, + "num_input_tokens_seen": 291616995, + "step": 13511, + "time_per_iteration": 2.443046808242798 + }, + { + "auxiliary_loss_clip": 0.0109266, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.03841555, + "balance_loss_mlp": 1.02335024, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 1.7798588315843549, + "language_loss": 0.79390168, + "learning_rate": 3.57913508447004e-07, + "loss": 0.81518322, + "num_input_tokens_seen": 291636145, + "step": 13512, + "time_per_iteration": 2.527153730392456 + }, + { + "auxiliary_loss_clip": 0.01096517, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.03528059, + "balance_loss_mlp": 1.01935816, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 2.0860424194374527, + "language_loss": 0.63731265, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65858889, + "num_input_tokens_seen": 291662440, + "step": 13513, + "time_per_iteration": 2.8727946281433105 + }, + { + "auxiliary_loss_clip": 0.01066767, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.03844309, + "balance_loss_mlp": 1.0207026, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 1.7430724492362408, + "language_loss": 0.71333843, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.73433644, + "num_input_tokens_seen": 291680950, + "step": 13514, + "time_per_iteration": 2.646124839782715 + }, + { + "auxiliary_loss_clip": 0.01073949, + "auxiliary_loss_mlp": 0.01029061, + "balance_loss_clip": 1.03624821, + "balance_loss_mlp": 1.0168364, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 1.646173141378393, + "language_loss": 0.63064814, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.65167826, + "num_input_tokens_seen": 291702395, + "step": 13515, + "time_per_iteration": 2.62550950050354 + }, + { + "auxiliary_loss_clip": 0.01101616, + "auxiliary_loss_mlp": 0.00776557, + "balance_loss_clip": 1.03399754, + "balance_loss_mlp": 1.00059104, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 1.5772862973054596, + "language_loss": 0.7512989, + "learning_rate": 3.570246849544616e-07, + "loss": 0.77008069, + "num_input_tokens_seen": 291721135, + "step": 13516, + "time_per_iteration": 2.4913761615753174 + }, + { + "auxiliary_loss_clip": 0.0106487, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.0385747, + "balance_loss_mlp": 1.01821923, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 1.9435323081276856, + "language_loss": 0.91241181, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.93336093, + "num_input_tokens_seen": 291741235, + "step": 13517, + "time_per_iteration": 2.6044440269470215 + }, + { + "auxiliary_loss_clip": 0.01101193, + "auxiliary_loss_mlp": 0.00777525, + "balance_loss_clip": 1.03944349, + "balance_loss_mlp": 1.00068688, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 9.595830724602196, + "language_loss": 0.78667188, + "learning_rate": 3.565806469852244e-07, + "loss": 0.80545902, + "num_input_tokens_seen": 291761430, + "step": 13518, + "time_per_iteration": 2.5402586460113525 + }, + { + "auxiliary_loss_clip": 0.01097339, + "auxiliary_loss_mlp": 0.0102876, + "balance_loss_clip": 1.03623486, + "balance_loss_mlp": 1.01812088, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 1.6748783264782685, + "language_loss": 0.7903831, + "learning_rate": 3.56358721474336e-07, + "loss": 0.81164414, + "num_input_tokens_seen": 291781755, + "step": 13519, + "time_per_iteration": 2.5582144260406494 + }, + { + "auxiliary_loss_clip": 0.01109977, + "auxiliary_loss_mlp": 0.01039618, + "balance_loss_clip": 1.03674412, + "balance_loss_mlp": 1.02714276, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 1.879213001881997, + "language_loss": 0.70399308, + "learning_rate": 3.561368582904905e-07, + "loss": 0.72548902, + "num_input_tokens_seen": 291804410, + "step": 13520, + "time_per_iteration": 2.5704896450042725 + }, + { + "auxiliary_loss_clip": 0.01093405, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.03754961, + "balance_loss_mlp": 1.02033055, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 1.5361248523379674, + "language_loss": 0.72788453, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.74914408, + "num_input_tokens_seen": 291823285, + "step": 13521, + "time_per_iteration": 2.560305595397949 + }, + { + "auxiliary_loss_clip": 0.01101199, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.03628206, + "balance_loss_mlp": 1.01576829, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 1.8500820505982567, + "language_loss": 0.69869339, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.71999145, + "num_input_tokens_seen": 291845305, + "step": 13522, + "time_per_iteration": 2.5612285137176514 + }, + { + "auxiliary_loss_clip": 0.0109331, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.03581703, + "balance_loss_mlp": 1.02167857, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 1.488483727089643, + "language_loss": 0.70360231, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72486252, + "num_input_tokens_seen": 291863715, + "step": 13523, + "time_per_iteration": 2.4752659797668457 + }, + { + "auxiliary_loss_clip": 0.01097001, + "auxiliary_loss_mlp": 0.01030522, + "balance_loss_clip": 1.03403497, + "balance_loss_mlp": 1.01817799, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 2.0764681009414776, + "language_loss": 0.71357274, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.7348479, + "num_input_tokens_seen": 291880735, + "step": 13524, + "time_per_iteration": 2.4605906009674072 + }, + { + "auxiliary_loss_clip": 0.01096339, + "auxiliary_loss_mlp": 0.01030925, + "balance_loss_clip": 1.0347017, + "balance_loss_mlp": 1.01894414, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 1.6607829462219714, + "language_loss": 0.62156653, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64283919, + "num_input_tokens_seen": 291900535, + "step": 13525, + "time_per_iteration": 4.04605770111084 + }, + { + "auxiliary_loss_clip": 0.01081448, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.03745842, + "balance_loss_mlp": 1.02240241, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 1.544808049551931, + "language_loss": 0.65510178, + "learning_rate": 3.548069885262628e-07, + "loss": 0.67625618, + "num_input_tokens_seen": 291919760, + "step": 13526, + "time_per_iteration": 2.639396905899048 + }, + { + "auxiliary_loss_clip": 0.01090072, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.04082942, + "balance_loss_mlp": 1.0202409, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 1.6495233836027523, + "language_loss": 0.75212789, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.77334565, + "num_input_tokens_seen": 291938915, + "step": 13527, + "time_per_iteration": 2.624049425125122 + }, + { + "auxiliary_loss_clip": 0.01107101, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.03589058, + "balance_loss_mlp": 1.01570284, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 2.612627779098234, + "language_loss": 0.71005738, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.73140466, + "num_input_tokens_seen": 291958145, + "step": 13528, + "time_per_iteration": 2.4952433109283447 + }, + { + "auxiliary_loss_clip": 0.01108186, + "auxiliary_loss_mlp": 0.01030336, + "balance_loss_clip": 1.03525281, + "balance_loss_mlp": 1.01841474, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 2.3413071595337716, + "language_loss": 0.68900514, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.71039033, + "num_input_tokens_seen": 291976860, + "step": 13529, + "time_per_iteration": 2.4384381771087646 + }, + { + "auxiliary_loss_clip": 0.01089375, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.0349592, + "balance_loss_mlp": 1.01925611, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 1.2614340043796137, + "language_loss": 0.77323997, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.7944417, + "num_input_tokens_seen": 291998085, + "step": 13530, + "time_per_iteration": 2.512713670730591 + }, + { + "auxiliary_loss_clip": 0.01098308, + "auxiliary_loss_mlp": 0.01033739, + "balance_loss_clip": 1.03735769, + "balance_loss_mlp": 1.02123451, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 3.9741725292201457, + "language_loss": 0.82279968, + "learning_rate": 3.537004792574052e-07, + "loss": 0.84412014, + "num_input_tokens_seen": 292016585, + "step": 13531, + "time_per_iteration": 2.4390201568603516 + }, + { + "auxiliary_loss_clip": 0.0108612, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.03336966, + "balance_loss_mlp": 1.01914287, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 2.069161261066592, + "language_loss": 0.71820116, + "learning_rate": 3.534793646536065e-07, + "loss": 0.73939341, + "num_input_tokens_seen": 292033255, + "step": 13532, + "time_per_iteration": 3.9895782470703125 + }, + { + "auxiliary_loss_clip": 0.01075627, + "auxiliary_loss_mlp": 0.01028767, + "balance_loss_clip": 1.03384638, + "balance_loss_mlp": 1.01719785, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 7.389016637073016, + "language_loss": 0.76126367, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.78230762, + "num_input_tokens_seen": 292051800, + "step": 13533, + "time_per_iteration": 2.536344528198242 + }, + { + "auxiliary_loss_clip": 0.0111349, + "auxiliary_loss_mlp": 0.00778025, + "balance_loss_clip": 1.03701639, + "balance_loss_mlp": 1.00068212, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 1.9713967600377227, + "language_loss": 0.76442027, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.78333533, + "num_input_tokens_seen": 292072215, + "step": 13534, + "time_per_iteration": 2.4487133026123047 + }, + { + "auxiliary_loss_clip": 0.01092259, + "auxiliary_loss_mlp": 0.01028377, + "balance_loss_clip": 1.03619647, + "balance_loss_mlp": 1.01746368, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 2.24681090007541, + "language_loss": 0.93330598, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.95451236, + "num_input_tokens_seen": 292088830, + "step": 13535, + "time_per_iteration": 2.4110217094421387 + }, + { + "auxiliary_loss_clip": 0.01071731, + "auxiliary_loss_mlp": 0.0102698, + "balance_loss_clip": 1.03973508, + "balance_loss_mlp": 1.01535082, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 1.765869403524134, + "language_loss": 0.70447421, + "learning_rate": 3.52595530684499e-07, + "loss": 0.7254613, + "num_input_tokens_seen": 292109225, + "step": 13536, + "time_per_iteration": 2.5857527256011963 + }, + { + "auxiliary_loss_clip": 0.01071648, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.03300703, + "balance_loss_mlp": 1.01759505, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 1.6612444904677066, + "language_loss": 0.7571826, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.77819967, + "num_input_tokens_seen": 292129660, + "step": 13537, + "time_per_iteration": 2.5921685695648193 + }, + { + "auxiliary_loss_clip": 0.01084374, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_clip": 1.03465009, + "balance_loss_mlp": 1.02306926, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 1.886933041924495, + "language_loss": 0.76389676, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.7851035, + "num_input_tokens_seen": 292149090, + "step": 13538, + "time_per_iteration": 2.5358030796051025 + }, + { + "auxiliary_loss_clip": 0.01096125, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.03901136, + "balance_loss_mlp": 1.01743698, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 1.620537731939249, + "language_loss": 0.7792874, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.800542, + "num_input_tokens_seen": 292169260, + "step": 13539, + "time_per_iteration": 2.5467417240142822 + }, + { + "auxiliary_loss_clip": 0.0107786, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.04885221, + "balance_loss_mlp": 1.01858759, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 3.4623184102618936, + "language_loss": 0.66158342, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.68266463, + "num_input_tokens_seen": 292188145, + "step": 13540, + "time_per_iteration": 4.12460994720459 + }, + { + "auxiliary_loss_clip": 0.01100605, + "auxiliary_loss_mlp": 0.01031899, + "balance_loss_clip": 1.03748584, + "balance_loss_mlp": 1.02050209, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 1.4545768333368085, + "language_loss": 0.67643964, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69776464, + "num_input_tokens_seen": 292212135, + "step": 13541, + "time_per_iteration": 2.599822998046875 + }, + { + "auxiliary_loss_clip": 0.0110721, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.03563166, + "balance_loss_mlp": 1.02254367, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 2.4271718059647776, + "language_loss": 0.68656564, + "learning_rate": 3.512716539904355e-07, + "loss": 0.70798779, + "num_input_tokens_seen": 292230645, + "step": 13542, + "time_per_iteration": 2.4582669734954834 + }, + { + "auxiliary_loss_clip": 0.01113247, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.03702426, + "balance_loss_mlp": 1.02056193, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 2.7613096448980685, + "language_loss": 0.80210984, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.82357669, + "num_input_tokens_seen": 292243540, + "step": 13543, + "time_per_iteration": 2.3851547241210938 + }, + { + "auxiliary_loss_clip": 0.01085908, + "auxiliary_loss_mlp": 0.01039933, + "balance_loss_clip": 1.04233527, + "balance_loss_mlp": 1.0268023, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 2.5919433114096457, + "language_loss": 0.77827841, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.79953682, + "num_input_tokens_seen": 292261715, + "step": 13544, + "time_per_iteration": 2.602959394454956 + }, + { + "auxiliary_loss_clip": 0.01117174, + "auxiliary_loss_mlp": 0.01033881, + "balance_loss_clip": 1.0391562, + "balance_loss_mlp": 1.01942062, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 3.671911230612482, + "language_loss": 0.7369166, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.75842714, + "num_input_tokens_seen": 292275080, + "step": 13545, + "time_per_iteration": 2.517982244491577 + }, + { + "auxiliary_loss_clip": 0.0109714, + "auxiliary_loss_mlp": 0.01028984, + "balance_loss_clip": 1.03647709, + "balance_loss_mlp": 1.01739681, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 1.655385621526153, + "language_loss": 0.76747823, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.78873944, + "num_input_tokens_seen": 292294635, + "step": 13546, + "time_per_iteration": 2.5199713706970215 + }, + { + "auxiliary_loss_clip": 0.01100036, + "auxiliary_loss_mlp": 0.01027696, + "balance_loss_clip": 1.03791511, + "balance_loss_mlp": 1.01638341, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 2.4506929416142103, + "language_loss": 0.70506275, + "learning_rate": 3.501701426337178e-07, + "loss": 0.72634012, + "num_input_tokens_seen": 292312695, + "step": 13547, + "time_per_iteration": 2.469581127166748 + }, + { + "auxiliary_loss_clip": 0.01112364, + "auxiliary_loss_mlp": 0.01035353, + "balance_loss_clip": 1.0378437, + "balance_loss_mlp": 1.02214479, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 1.9126043894424036, + "language_loss": 0.7036919, + "learning_rate": 3.49950028014111e-07, + "loss": 0.72516906, + "num_input_tokens_seen": 292332005, + "step": 13548, + "time_per_iteration": 2.5182042121887207 + }, + { + "auxiliary_loss_clip": 0.01098784, + "auxiliary_loss_mlp": 0.010366, + "balance_loss_clip": 1.0382297, + "balance_loss_mlp": 1.02341545, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 2.200145716178774, + "language_loss": 0.76534599, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.78669977, + "num_input_tokens_seen": 292348365, + "step": 13549, + "time_per_iteration": 4.029219627380371 + }, + { + "auxiliary_loss_clip": 0.01111493, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.03892028, + "balance_loss_mlp": 1.01662529, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 1.863618634633416, + "language_loss": 0.71007514, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.73147899, + "num_input_tokens_seen": 292368050, + "step": 13550, + "time_per_iteration": 2.432886838912964 + }, + { + "auxiliary_loss_clip": 0.01093715, + "auxiliary_loss_mlp": 0.01028224, + "balance_loss_clip": 1.03625858, + "balance_loss_mlp": 1.01653564, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 2.251673317671598, + "language_loss": 0.71836197, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.73958135, + "num_input_tokens_seen": 292385315, + "step": 13551, + "time_per_iteration": 2.4810662269592285 + }, + { + "auxiliary_loss_clip": 0.01073423, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.04026794, + "balance_loss_mlp": 1.01630712, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 1.917444404520787, + "language_loss": 0.68669641, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.70772636, + "num_input_tokens_seen": 292403375, + "step": 13552, + "time_per_iteration": 2.5794730186462402 + }, + { + "auxiliary_loss_clip": 0.01107236, + "auxiliary_loss_mlp": 0.01041961, + "balance_loss_clip": 1.03541255, + "balance_loss_mlp": 1.02998042, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 1.9545452485667898, + "language_loss": 0.82429528, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84578729, + "num_input_tokens_seen": 292419260, + "step": 13553, + "time_per_iteration": 2.4045515060424805 + }, + { + "auxiliary_loss_clip": 0.01096973, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.0342803, + "balance_loss_mlp": 1.01834369, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 1.7145985964863888, + "language_loss": 0.67683852, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.69811571, + "num_input_tokens_seen": 292436095, + "step": 13554, + "time_per_iteration": 2.421292304992676 + }, + { + "auxiliary_loss_clip": 0.01082734, + "auxiliary_loss_mlp": 0.01040408, + "balance_loss_clip": 1.03365636, + "balance_loss_mlp": 1.0261451, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 1.668747184824778, + "language_loss": 0.66603571, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68726712, + "num_input_tokens_seen": 292457190, + "step": 13555, + "time_per_iteration": 2.574944496154785 + }, + { + "auxiliary_loss_clip": 0.01102375, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.03552508, + "balance_loss_mlp": 1.02225971, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 2.2423458566708536, + "language_loss": 0.72991741, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75129282, + "num_input_tokens_seen": 292474300, + "step": 13556, + "time_per_iteration": 2.425467014312744 + }, + { + "auxiliary_loss_clip": 0.01099229, + "auxiliary_loss_mlp": 0.0102655, + "balance_loss_clip": 1.03901482, + "balance_loss_mlp": 1.0156548, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 1.6064603043622665, + "language_loss": 0.80382824, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.825086, + "num_input_tokens_seen": 292492420, + "step": 13557, + "time_per_iteration": 2.4408371448516846 + }, + { + "auxiliary_loss_clip": 0.01091173, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.03539085, + "balance_loss_mlp": 1.01736271, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 1.7286379384210921, + "language_loss": 0.65991181, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.6811235, + "num_input_tokens_seen": 292512895, + "step": 13558, + "time_per_iteration": 2.5292632579803467 + }, + { + "auxiliary_loss_clip": 0.01027829, + "auxiliary_loss_mlp": 0.0100178, + "balance_loss_clip": 1.00473392, + "balance_loss_mlp": 1.00053394, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 0.7999665203903571, + "language_loss": 0.5690037, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.5892998, + "num_input_tokens_seen": 292566580, + "step": 13559, + "time_per_iteration": 2.9259395599365234 + }, + { + "auxiliary_loss_clip": 0.010112, + "auxiliary_loss_mlp": 0.01004997, + "balance_loss_clip": 1.00528049, + "balance_loss_mlp": 1.00361431, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.6850113782057198, + "language_loss": 0.55278015, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57294214, + "num_input_tokens_seen": 292621490, + "step": 13560, + "time_per_iteration": 2.922744035720825 + }, + { + "auxiliary_loss_clip": 0.01087702, + "auxiliary_loss_mlp": 0.01029266, + "balance_loss_clip": 1.03453541, + "balance_loss_mlp": 1.01769066, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 2.1996266036512604, + "language_loss": 0.6746614, + "learning_rate": 3.470942348696948e-07, + "loss": 0.69583112, + "num_input_tokens_seen": 292638660, + "step": 13561, + "time_per_iteration": 2.474348306655884 + }, + { + "auxiliary_loss_clip": 0.01102828, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.03813064, + "balance_loss_mlp": 1.01863134, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 1.9437126711217583, + "language_loss": 0.81662703, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83796197, + "num_input_tokens_seen": 292658545, + "step": 13562, + "time_per_iteration": 2.514389991760254 + }, + { + "auxiliary_loss_clip": 0.01086626, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.03917313, + "balance_loss_mlp": 1.01793361, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 1.5147602449503812, + "language_loss": 0.71894407, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.74010789, + "num_input_tokens_seen": 292678460, + "step": 13563, + "time_per_iteration": 2.5333402156829834 + }, + { + "auxiliary_loss_clip": 0.01023708, + "auxiliary_loss_mlp": 0.01026485, + "balance_loss_clip": 1.03314435, + "balance_loss_mlp": 1.01361048, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 1.7070045834447123, + "language_loss": 0.70249689, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.72299874, + "num_input_tokens_seen": 292699815, + "step": 13564, + "time_per_iteration": 4.377970933914185 + }, + { + "auxiliary_loss_clip": 0.01086665, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.03954792, + "balance_loss_mlp": 1.01622903, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 2.0073676073364024, + "language_loss": 0.70066988, + "learning_rate": 3.462176595017854e-07, + "loss": 0.72182167, + "num_input_tokens_seen": 292717370, + "step": 13565, + "time_per_iteration": 3.0946919918060303 + }, + { + "auxiliary_loss_clip": 0.0109844, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.03666115, + "balance_loss_mlp": 1.02139688, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 1.7256759904373296, + "language_loss": 0.790694, + "learning_rate": 3.459986724180188e-07, + "loss": 0.81200933, + "num_input_tokens_seen": 292737110, + "step": 13566, + "time_per_iteration": 2.5489909648895264 + }, + { + "auxiliary_loss_clip": 0.01089697, + "auxiliary_loss_mlp": 0.01028777, + "balance_loss_clip": 1.03945422, + "balance_loss_mlp": 1.01798868, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 1.6161826523704672, + "language_loss": 0.82517558, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84636033, + "num_input_tokens_seen": 292756510, + "step": 13567, + "time_per_iteration": 2.539638042449951 + }, + { + "auxiliary_loss_clip": 0.01107062, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.03720832, + "balance_loss_mlp": 1.01756883, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 1.9806451438669919, + "language_loss": 0.79630965, + "learning_rate": 3.455608864184771e-07, + "loss": 0.8176589, + "num_input_tokens_seen": 292776710, + "step": 13568, + "time_per_iteration": 2.4681382179260254 + }, + { + "auxiliary_loss_clip": 0.01088498, + "auxiliary_loss_mlp": 0.01027938, + "balance_loss_clip": 1.03864956, + "balance_loss_mlp": 1.01685166, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 2.5174732553679497, + "language_loss": 0.77136123, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79252565, + "num_input_tokens_seen": 292794350, + "step": 13569, + "time_per_iteration": 2.5035605430603027 + }, + { + "auxiliary_loss_clip": 0.01105685, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.03582573, + "balance_loss_mlp": 1.02284718, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 2.1259956233816277, + "language_loss": 0.58081794, + "learning_rate": 3.451233513649199e-07, + "loss": 0.60221672, + "num_input_tokens_seen": 292814005, + "step": 13570, + "time_per_iteration": 2.46833872795105 + }, + { + "auxiliary_loss_clip": 0.01099581, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.03479707, + "balance_loss_mlp": 1.02441907, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 1.71867819637132, + "language_loss": 0.82540965, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84678137, + "num_input_tokens_seen": 292833485, + "step": 13571, + "time_per_iteration": 2.5344648361206055 + }, + { + "auxiliary_loss_clip": 0.01083238, + "auxiliary_loss_mlp": 0.01039141, + "balance_loss_clip": 1.03485918, + "balance_loss_mlp": 1.02616489, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 2.802017472871968, + "language_loss": 0.788239, + "learning_rate": 3.446860673237142e-07, + "loss": 0.80946279, + "num_input_tokens_seen": 292848045, + "step": 13572, + "time_per_iteration": 4.349792957305908 + }, + { + "auxiliary_loss_clip": 0.0111135, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.03769457, + "balance_loss_mlp": 1.02232742, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 1.468211685942249, + "language_loss": 0.64746982, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.66892493, + "num_input_tokens_seen": 292869965, + "step": 13573, + "time_per_iteration": 2.506176471710205 + }, + { + "auxiliary_loss_clip": 0.01074416, + "auxiliary_loss_mlp": 0.01029627, + "balance_loss_clip": 1.03664768, + "balance_loss_mlp": 1.01842773, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 1.5621489952909229, + "language_loss": 0.75283384, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77387428, + "num_input_tokens_seen": 292889680, + "step": 13574, + "time_per_iteration": 2.5880486965179443 + }, + { + "auxiliary_loss_clip": 0.01101095, + "auxiliary_loss_mlp": 0.01033169, + "balance_loss_clip": 1.03683829, + "balance_loss_mlp": 1.02054465, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 1.648509164230609, + "language_loss": 0.59493667, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.61627936, + "num_input_tokens_seen": 292912360, + "step": 13575, + "time_per_iteration": 2.5405473709106445 + }, + { + "auxiliary_loss_clip": 0.01033295, + "auxiliary_loss_mlp": 0.01036949, + "balance_loss_clip": 1.03224933, + "balance_loss_mlp": 1.02282286, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 2.0240795490142354, + "language_loss": 0.74296969, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.76367211, + "num_input_tokens_seen": 292928325, + "step": 13576, + "time_per_iteration": 2.626316785812378 + }, + { + "auxiliary_loss_clip": 0.0101309, + "auxiliary_loss_mlp": 0.01001678, + "balance_loss_clip": 1.00929999, + "balance_loss_mlp": 1.00046778, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.8247388456541492, + "language_loss": 0.58679223, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60693991, + "num_input_tokens_seen": 292992795, + "step": 13577, + "time_per_iteration": 3.067521572113037 + }, + { + "auxiliary_loss_clip": 0.01051273, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.03175855, + "balance_loss_mlp": 1.01929247, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 1.7156241139488142, + "language_loss": 0.71736622, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73819458, + "num_input_tokens_seen": 293011950, + "step": 13578, + "time_per_iteration": 2.580819845199585 + }, + { + "auxiliary_loss_clip": 0.01069346, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.0345304, + "balance_loss_mlp": 1.02034962, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 1.7153891224654836, + "language_loss": 0.73681641, + "learning_rate": 3.431575508590172e-07, + "loss": 0.75783306, + "num_input_tokens_seen": 293030175, + "step": 13579, + "time_per_iteration": 4.354538440704346 + }, + { + "auxiliary_loss_clip": 0.011102, + "auxiliary_loss_mlp": 0.01027687, + "balance_loss_clip": 1.03667378, + "balance_loss_mlp": 1.01584303, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 2.0534023633057603, + "language_loss": 0.79118168, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81256056, + "num_input_tokens_seen": 293047980, + "step": 13580, + "time_per_iteration": 2.43276047706604 + }, + { + "auxiliary_loss_clip": 0.01072267, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.03172302, + "balance_loss_mlp": 1.02112615, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 1.7367852833966873, + "language_loss": 0.68939948, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.71045995, + "num_input_tokens_seen": 293067030, + "step": 13581, + "time_per_iteration": 2.5272409915924072 + }, + { + "auxiliary_loss_clip": 0.01109119, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.03715885, + "balance_loss_mlp": 1.01683092, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 1.6520128411702346, + "language_loss": 0.59934878, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.62072855, + "num_input_tokens_seen": 293085575, + "step": 13582, + "time_per_iteration": 2.4537768363952637 + }, + { + "auxiliary_loss_clip": 0.01072736, + "auxiliary_loss_mlp": 0.00775868, + "balance_loss_clip": 1.0340879, + "balance_loss_mlp": 1.00052011, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 1.398253673573709, + "language_loss": 0.82033074, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.83881682, + "num_input_tokens_seen": 293108200, + "step": 13583, + "time_per_iteration": 2.707256317138672 + }, + { + "auxiliary_loss_clip": 0.01089165, + "auxiliary_loss_mlp": 0.01027627, + "balance_loss_clip": 1.0355165, + "balance_loss_mlp": 1.0162127, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 1.8539060697335286, + "language_loss": 0.74499846, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76616633, + "num_input_tokens_seen": 293126020, + "step": 13584, + "time_per_iteration": 2.52952241897583 + }, + { + "auxiliary_loss_clip": 0.01099757, + "auxiliary_loss_mlp": 0.0102835, + "balance_loss_clip": 1.0381161, + "balance_loss_mlp": 1.0158987, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 1.8425851505804967, + "language_loss": 0.74662495, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.76790601, + "num_input_tokens_seen": 293144620, + "step": 13585, + "time_per_iteration": 2.4792094230651855 + }, + { + "auxiliary_loss_clip": 0.01085736, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.04064286, + "balance_loss_mlp": 1.02154469, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 1.6613204925792935, + "language_loss": 0.6944797, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71568143, + "num_input_tokens_seen": 293162850, + "step": 13586, + "time_per_iteration": 2.4958341121673584 + }, + { + "auxiliary_loss_clip": 0.01050765, + "auxiliary_loss_mlp": 0.0104046, + "balance_loss_clip": 1.03299057, + "balance_loss_mlp": 1.02882493, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 1.4609972022463555, + "language_loss": 0.60738504, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.62829727, + "num_input_tokens_seen": 293181620, + "step": 13587, + "time_per_iteration": 2.581538677215576 + }, + { + "auxiliary_loss_clip": 0.01099635, + "auxiliary_loss_mlp": 0.01036442, + "balance_loss_clip": 1.03495073, + "balance_loss_mlp": 1.02401435, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 2.5179835040259935, + "language_loss": 0.69287086, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.71423161, + "num_input_tokens_seen": 293200270, + "step": 13588, + "time_per_iteration": 4.012706756591797 + }, + { + "auxiliary_loss_clip": 0.010865, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.03576124, + "balance_loss_mlp": 1.02595925, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 1.591620630116939, + "language_loss": 0.72929597, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.75056881, + "num_input_tokens_seen": 293218960, + "step": 13589, + "time_per_iteration": 2.5142414569854736 + }, + { + "auxiliary_loss_clip": 0.01092645, + "auxiliary_loss_mlp": 0.01030265, + "balance_loss_clip": 1.03554058, + "balance_loss_mlp": 1.01847541, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 1.7124283220400054, + "language_loss": 0.73625863, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75748777, + "num_input_tokens_seen": 293236450, + "step": 13590, + "time_per_iteration": 2.4886248111724854 + }, + { + "auxiliary_loss_clip": 0.01112115, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.03696942, + "balance_loss_mlp": 1.02108657, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 2.368742986338688, + "language_loss": 0.65082103, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67228615, + "num_input_tokens_seen": 293256480, + "step": 13591, + "time_per_iteration": 2.561983823776245 + }, + { + "auxiliary_loss_clip": 0.0111065, + "auxiliary_loss_mlp": 0.01035624, + "balance_loss_clip": 1.03603935, + "balance_loss_mlp": 1.02332783, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 4.16366642283247, + "language_loss": 0.68048096, + "learning_rate": 3.403270471641373e-07, + "loss": 0.70194376, + "num_input_tokens_seen": 293274960, + "step": 13592, + "time_per_iteration": 2.421205520629883 + }, + { + "auxiliary_loss_clip": 0.01087297, + "auxiliary_loss_mlp": 0.01025877, + "balance_loss_clip": 1.03509808, + "balance_loss_mlp": 1.01315713, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 1.8784053197692538, + "language_loss": 0.66653234, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68766403, + "num_input_tokens_seen": 293295945, + "step": 13593, + "time_per_iteration": 2.5325512886047363 + }, + { + "auxiliary_loss_clip": 0.01097467, + "auxiliary_loss_mlp": 0.01033135, + "balance_loss_clip": 1.03463686, + "balance_loss_mlp": 1.02177453, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 1.8461345374841045, + "language_loss": 0.6919539, + "learning_rate": 3.398925286280188e-07, + "loss": 0.71325988, + "num_input_tokens_seen": 293313300, + "step": 13594, + "time_per_iteration": 2.428692579269409 + }, + { + "auxiliary_loss_clip": 0.01109451, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.0357784, + "balance_loss_mlp": 1.0195483, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 3.3936845236921136, + "language_loss": 0.65839773, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.6798026, + "num_input_tokens_seen": 293333085, + "step": 13595, + "time_per_iteration": 2.4494612216949463 + }, + { + "auxiliary_loss_clip": 0.01074927, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.04129112, + "balance_loss_mlp": 1.01724994, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 1.7380904362154408, + "language_loss": 0.7864756, + "learning_rate": 3.394582618976658e-07, + "loss": 0.80752313, + "num_input_tokens_seen": 293351895, + "step": 13596, + "time_per_iteration": 2.587392568588257 + }, + { + "auxiliary_loss_clip": 0.01080555, + "auxiliary_loss_mlp": 0.01025659, + "balance_loss_clip": 1.03092074, + "balance_loss_mlp": 1.01410222, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 3.107443097453564, + "language_loss": 0.58013409, + "learning_rate": 3.392412229802362e-07, + "loss": 0.60119629, + "num_input_tokens_seen": 293371165, + "step": 13597, + "time_per_iteration": 2.504873752593994 + }, + { + "auxiliary_loss_clip": 0.01069976, + "auxiliary_loss_mlp": 0.01031241, + "balance_loss_clip": 1.04088604, + "balance_loss_mlp": 1.02004719, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 1.5462699559883506, + "language_loss": 0.82472074, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84573299, + "num_input_tokens_seen": 293391150, + "step": 13598, + "time_per_iteration": 2.625279664993286 + }, + { + "auxiliary_loss_clip": 0.01055077, + "auxiliary_loss_mlp": 0.010333, + "balance_loss_clip": 1.04134524, + "balance_loss_mlp": 1.0217669, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 2.0412365316949113, + "language_loss": 0.82468152, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.84556532, + "num_input_tokens_seen": 293409440, + "step": 13599, + "time_per_iteration": 2.7654545307159424 + }, + { + "auxiliary_loss_clip": 0.01060661, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.03228664, + "balance_loss_mlp": 1.02392924, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 1.794747297811072, + "language_loss": 0.84008157, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.86106056, + "num_input_tokens_seen": 293428995, + "step": 13600, + "time_per_iteration": 2.634401321411133 + }, + { + "auxiliary_loss_clip": 0.01081452, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.04131651, + "balance_loss_mlp": 1.01990151, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 1.966574251382527, + "language_loss": 0.74085772, + "learning_rate": 3.383736971541766e-07, + "loss": 0.76199067, + "num_input_tokens_seen": 293449155, + "step": 13601, + "time_per_iteration": 2.601653575897217 + }, + { + "auxiliary_loss_clip": 0.01077883, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.03625703, + "balance_loss_mlp": 1.01947618, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 2.8333851358602566, + "language_loss": 0.68236309, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.7034632, + "num_input_tokens_seen": 293466125, + "step": 13602, + "time_per_iteration": 2.5159010887145996 + }, + { + "auxiliary_loss_clip": 0.01067328, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.03177059, + "balance_loss_mlp": 1.02357554, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 2.2789527524933733, + "language_loss": 0.83352613, + "learning_rate": 3.379403122624718e-07, + "loss": 0.85455918, + "num_input_tokens_seen": 293481345, + "step": 13603, + "time_per_iteration": 2.4920334815979004 + }, + { + "auxiliary_loss_clip": 0.01062655, + "auxiliary_loss_mlp": 0.0102362, + "balance_loss_clip": 1.03862143, + "balance_loss_mlp": 1.01245594, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 2.0753504944211905, + "language_loss": 0.69282699, + "learning_rate": 3.377237143507159e-07, + "loss": 0.71368968, + "num_input_tokens_seen": 293502330, + "step": 13604, + "time_per_iteration": 4.1273438930511475 + }, + { + "auxiliary_loss_clip": 0.01083671, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.03964031, + "balance_loss_mlp": 1.02505875, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 1.7326880538368186, + "language_loss": 0.74139869, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76261014, + "num_input_tokens_seen": 293521415, + "step": 13605, + "time_per_iteration": 2.532910108566284 + }, + { + "auxiliary_loss_clip": 0.01071496, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.04012179, + "balance_loss_mlp": 1.02599287, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 1.8676780153786894, + "language_loss": 0.7384758, + "learning_rate": 3.372907076364666e-07, + "loss": 0.75957412, + "num_input_tokens_seen": 293539245, + "step": 13606, + "time_per_iteration": 2.6068570613861084 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.03694522, + "balance_loss_mlp": 1.02166367, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 2.2439915699120245, + "language_loss": 0.65323699, + "learning_rate": 3.370742988503916e-07, + "loss": 0.67464787, + "num_input_tokens_seen": 293560640, + "step": 13607, + "time_per_iteration": 2.5289804935455322 + }, + { + "auxiliary_loss_clip": 0.01091385, + "auxiliary_loss_mlp": 0.01032071, + "balance_loss_clip": 1.03966093, + "balance_loss_mlp": 1.01954842, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 1.8299324531211099, + "language_loss": 0.69760269, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.71883726, + "num_input_tokens_seen": 293579465, + "step": 13608, + "time_per_iteration": 2.5476884841918945 + }, + { + "auxiliary_loss_clip": 0.01093683, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.03307319, + "balance_loss_mlp": 1.02160311, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 2.338966825727095, + "language_loss": 0.79706979, + "learning_rate": 3.366416704613735e-07, + "loss": 0.81834221, + "num_input_tokens_seen": 293600540, + "step": 13609, + "time_per_iteration": 2.5491135120391846 + }, + { + "auxiliary_loss_clip": 0.01006395, + "auxiliary_loss_mlp": 0.01000939, + "balance_loss_clip": 1.01097393, + "balance_loss_mlp": 0.99955601, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.746814517255399, + "language_loss": 0.55889964, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.578973, + "num_input_tokens_seen": 293665160, + "step": 13610, + "time_per_iteration": 3.1936941146850586 + }, + { + "auxiliary_loss_clip": 0.01046247, + "auxiliary_loss_mlp": 0.00776746, + "balance_loss_clip": 1.02990294, + "balance_loss_mlp": 1.00054848, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 1.8979460543213604, + "language_loss": 0.77578735, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79401731, + "num_input_tokens_seen": 293683995, + "step": 13611, + "time_per_iteration": 2.6495401859283447 + }, + { + "auxiliary_loss_clip": 0.01080885, + "auxiliary_loss_mlp": 0.01031576, + "balance_loss_clip": 1.04066312, + "balance_loss_mlp": 1.01826668, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 2.123679441892573, + "language_loss": 0.77300513, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.79412979, + "num_input_tokens_seen": 293704115, + "step": 13612, + "time_per_iteration": 4.155451059341431 + }, + { + "auxiliary_loss_clip": 0.01071372, + "auxiliary_loss_mlp": 0.01025501, + "balance_loss_clip": 1.03296709, + "balance_loss_mlp": 1.01387262, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 2.064764901634696, + "language_loss": 0.86424655, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.88521528, + "num_input_tokens_seen": 293722225, + "step": 13613, + "time_per_iteration": 2.5149848461151123 + }, + { + "auxiliary_loss_clip": 0.01097337, + "auxiliary_loss_mlp": 0.01043195, + "balance_loss_clip": 1.03722715, + "balance_loss_mlp": 1.03184068, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 1.5160071870407834, + "language_loss": 0.728127, + "learning_rate": 3.355612034397746e-07, + "loss": 0.74953234, + "num_input_tokens_seen": 293743995, + "step": 13614, + "time_per_iteration": 2.5094122886657715 + }, + { + "auxiliary_loss_clip": 0.01087984, + "auxiliary_loss_mlp": 0.01040346, + "balance_loss_clip": 1.03588438, + "balance_loss_mlp": 1.02807975, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 1.7073950728296647, + "language_loss": 0.8087737, + "learning_rate": 3.353452993497479e-07, + "loss": 0.83005702, + "num_input_tokens_seen": 293764935, + "step": 13615, + "time_per_iteration": 2.549309253692627 + }, + { + "auxiliary_loss_clip": 0.01094562, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.0326004, + "balance_loss_mlp": 1.02124882, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 1.9529126905960015, + "language_loss": 0.75400078, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.77528578, + "num_input_tokens_seen": 293784035, + "step": 13616, + "time_per_iteration": 2.494130849838257 + }, + { + "auxiliary_loss_clip": 0.01069601, + "auxiliary_loss_mlp": 0.01041037, + "balance_loss_clip": 1.03130627, + "balance_loss_mlp": 1.02747667, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 1.8299111627677884, + "language_loss": 0.75261933, + "learning_rate": 3.349136805494979e-07, + "loss": 0.77372575, + "num_input_tokens_seen": 293803360, + "step": 13617, + "time_per_iteration": 2.542076826095581 + }, + { + "auxiliary_loss_clip": 0.01077644, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.0324918, + "balance_loss_mlp": 1.02056003, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 1.9353136578773096, + "language_loss": 0.68094015, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70203716, + "num_input_tokens_seen": 293821325, + "step": 13618, + "time_per_iteration": 4.245892763137817 + }, + { + "auxiliary_loss_clip": 0.01090525, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.04084587, + "balance_loss_mlp": 1.01964641, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 1.7192589645998118, + "language_loss": 0.69702405, + "learning_rate": 3.344823143102058e-07, + "loss": 0.71825409, + "num_input_tokens_seen": 293840315, + "step": 13619, + "time_per_iteration": 2.5730228424072266 + }, + { + "auxiliary_loss_clip": 0.01052155, + "auxiliary_loss_mlp": 0.01026276, + "balance_loss_clip": 1.04116368, + "balance_loss_mlp": 1.01399708, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 1.8136930420571513, + "language_loss": 0.74024379, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.76102805, + "num_input_tokens_seen": 293855685, + "step": 13620, + "time_per_iteration": 2.598871946334839 + }, + { + "auxiliary_loss_clip": 0.01079878, + "auxiliary_loss_mlp": 0.00780428, + "balance_loss_clip": 1.03186631, + "balance_loss_mlp": 1.00067031, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 1.5645908057878253, + "language_loss": 0.76309419, + "learning_rate": 3.340512006973011e-07, + "loss": 0.78169727, + "num_input_tokens_seen": 293875540, + "step": 13621, + "time_per_iteration": 2.536013603210449 + }, + { + "auxiliary_loss_clip": 0.0108566, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.03248012, + "balance_loss_mlp": 1.02025056, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 2.3058418157658624, + "language_loss": 0.65421999, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.67540991, + "num_input_tokens_seen": 293896570, + "step": 13622, + "time_per_iteration": 2.592665672302246 + }, + { + "auxiliary_loss_clip": 0.01111151, + "auxiliary_loss_mlp": 0.01029587, + "balance_loss_clip": 1.0383153, + "balance_loss_mlp": 1.0161283, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 1.8868394743100845, + "language_loss": 0.74997157, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.77137893, + "num_input_tokens_seen": 293914680, + "step": 13623, + "time_per_iteration": 2.4674899578094482 + }, + { + "auxiliary_loss_clip": 0.01088016, + "auxiliary_loss_mlp": 0.01038424, + "balance_loss_clip": 1.03457665, + "balance_loss_mlp": 1.02574646, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 2.840375911925893, + "language_loss": 0.62663341, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.64789784, + "num_input_tokens_seen": 293936480, + "step": 13624, + "time_per_iteration": 2.66467547416687 + }, + { + "auxiliary_loss_clip": 0.01106861, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.03636432, + "balance_loss_mlp": 1.02409053, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 1.4994089843923295, + "language_loss": 0.77911615, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80054402, + "num_input_tokens_seen": 293957815, + "step": 13625, + "time_per_iteration": 2.4883344173431396 + }, + { + "auxiliary_loss_clip": 0.0110426, + "auxiliary_loss_mlp": 0.00778783, + "balance_loss_clip": 1.03544009, + "balance_loss_mlp": 1.00081968, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 2.1516970208349333, + "language_loss": 0.75930011, + "learning_rate": 3.329745223345244e-07, + "loss": 0.77813053, + "num_input_tokens_seen": 293975440, + "step": 13626, + "time_per_iteration": 2.500694513320923 + }, + { + "auxiliary_loss_clip": 0.01096245, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.03581548, + "balance_loss_mlp": 1.02225029, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 1.5979795100286582, + "language_loss": 0.73611915, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.75741625, + "num_input_tokens_seen": 293997540, + "step": 13627, + "time_per_iteration": 2.516331434249878 + }, + { + "auxiliary_loss_clip": 0.01109248, + "auxiliary_loss_mlp": 0.0103406, + "balance_loss_clip": 1.03704906, + "balance_loss_mlp": 1.02150106, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 2.1289315222963068, + "language_loss": 0.69137734, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.7128104, + "num_input_tokens_seen": 294017030, + "step": 13628, + "time_per_iteration": 3.9350481033325195 + }, + { + "auxiliary_loss_clip": 0.01087385, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.03512311, + "balance_loss_mlp": 1.02746308, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 1.6198997138240971, + "language_loss": 0.85179967, + "learning_rate": 3.323292738168171e-07, + "loss": 0.87309027, + "num_input_tokens_seen": 294035700, + "step": 13629, + "time_per_iteration": 2.4716570377349854 + }, + { + "auxiliary_loss_clip": 0.01106247, + "auxiliary_loss_mlp": 0.01029108, + "balance_loss_clip": 1.03574514, + "balance_loss_mlp": 1.01669848, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 2.3313179520197473, + "language_loss": 0.73540699, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.75676054, + "num_input_tokens_seen": 294049730, + "step": 13630, + "time_per_iteration": 2.427889347076416 + }, + { + "auxiliary_loss_clip": 0.01097003, + "auxiliary_loss_mlp": 0.01038579, + "balance_loss_clip": 1.03588545, + "balance_loss_mlp": 1.02467275, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 1.9296584572733968, + "language_loss": 0.7262938, + "learning_rate": 3.31899424315957e-07, + "loss": 0.74764961, + "num_input_tokens_seen": 294066545, + "step": 13631, + "time_per_iteration": 2.476243734359741 + }, + { + "auxiliary_loss_clip": 0.01108483, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.03577161, + "balance_loss_mlp": 1.02206922, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 2.4150179660393936, + "language_loss": 0.76854551, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.78997022, + "num_input_tokens_seen": 294087455, + "step": 13632, + "time_per_iteration": 2.5392189025878906 + }, + { + "auxiliary_loss_clip": 0.01083626, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.03287888, + "balance_loss_mlp": 1.02378261, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 1.7979905679536357, + "language_loss": 0.66050482, + "learning_rate": 3.314698278332588e-07, + "loss": 0.68170083, + "num_input_tokens_seen": 294107480, + "step": 13633, + "time_per_iteration": 2.545492172241211 + }, + { + "auxiliary_loss_clip": 0.01092611, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.03419971, + "balance_loss_mlp": 1.02219105, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 1.5293543267398866, + "language_loss": 0.75944984, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.78071022, + "num_input_tokens_seen": 294130115, + "step": 13634, + "time_per_iteration": 2.518284797668457 + }, + { + "auxiliary_loss_clip": 0.01051429, + "auxiliary_loss_mlp": 0.00776716, + "balance_loss_clip": 1.03488302, + "balance_loss_mlp": 1.00059271, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 2.0129834022957755, + "language_loss": 0.81677794, + "learning_rate": 3.310404844338841e-07, + "loss": 0.8350594, + "num_input_tokens_seen": 294148495, + "step": 13635, + "time_per_iteration": 2.662980079650879 + }, + { + "auxiliary_loss_clip": 0.01094719, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.0324955, + "balance_loss_mlp": 1.019876, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 1.6446750237918066, + "language_loss": 0.76020586, + "learning_rate": 3.308259076607949e-07, + "loss": 0.78147745, + "num_input_tokens_seen": 294169595, + "step": 13636, + "time_per_iteration": 2.5606350898742676 + }, + { + "auxiliary_loss_clip": 0.01084794, + "auxiliary_loss_mlp": 0.0103212, + "balance_loss_clip": 1.041484, + "balance_loss_mlp": 1.01974559, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 2.360973185165071, + "language_loss": 0.81354213, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.83471125, + "num_input_tokens_seen": 294183885, + "step": 13637, + "time_per_iteration": 2.55364990234375 + }, + { + "auxiliary_loss_clip": 0.01098122, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.03776979, + "balance_loss_mlp": 1.01978302, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 2.1364175684394797, + "language_loss": 0.71181154, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.73311436, + "num_input_tokens_seen": 294200150, + "step": 13638, + "time_per_iteration": 2.559238910675049 + }, + { + "auxiliary_loss_clip": 0.01070009, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.03809714, + "balance_loss_mlp": 1.02323627, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 2.0061206968491305, + "language_loss": 0.79745603, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.81852889, + "num_input_tokens_seen": 294220385, + "step": 13639, + "time_per_iteration": 2.6291956901550293 + }, + { + "auxiliary_loss_clip": 0.01064026, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.03203797, + "balance_loss_mlp": 1.02041841, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 1.6676255735778596, + "language_loss": 0.7923227, + "learning_rate": 3.299682336022589e-07, + "loss": 0.81328988, + "num_input_tokens_seen": 294239355, + "step": 13640, + "time_per_iteration": 2.5564136505126953 + }, + { + "auxiliary_loss_clip": 0.0107756, + "auxiliary_loss_mlp": 0.01034247, + "balance_loss_clip": 1.03409612, + "balance_loss_mlp": 1.02131844, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 2.823228670482716, + "language_loss": 0.63291478, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65403283, + "num_input_tokens_seen": 294259395, + "step": 13641, + "time_per_iteration": 2.6831932067871094 + }, + { + "auxiliary_loss_clip": 0.01059222, + "auxiliary_loss_mlp": 0.01028939, + "balance_loss_clip": 1.03700709, + "balance_loss_mlp": 1.01584411, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 2.3672453682370826, + "language_loss": 0.73166227, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75254393, + "num_input_tokens_seen": 294277365, + "step": 13642, + "time_per_iteration": 2.5628323554992676 + }, + { + "auxiliary_loss_clip": 0.01083881, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.03768086, + "balance_loss_mlp": 1.02184796, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 1.9178867291749735, + "language_loss": 0.70461643, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72579086, + "num_input_tokens_seen": 294297555, + "step": 13643, + "time_per_iteration": 2.5900285243988037 + }, + { + "auxiliary_loss_clip": 0.01097837, + "auxiliary_loss_mlp": 0.0103195, + "balance_loss_clip": 1.03744733, + "balance_loss_mlp": 1.02032113, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 2.237470382142637, + "language_loss": 0.65572822, + "learning_rate": 3.291115727880256e-07, + "loss": 0.67702615, + "num_input_tokens_seen": 294317600, + "step": 13644, + "time_per_iteration": 3.9375407695770264 + }, + { + "auxiliary_loss_clip": 0.0107161, + "auxiliary_loss_mlp": 0.0103406, + "balance_loss_clip": 1.03536987, + "balance_loss_mlp": 1.02228189, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 2.3556790716595395, + "language_loss": 0.70869642, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.72975314, + "num_input_tokens_seen": 294340215, + "step": 13645, + "time_per_iteration": 2.5806467533111572 + }, + { + "auxiliary_loss_clip": 0.01084716, + "auxiliary_loss_mlp": 0.01030466, + "balance_loss_clip": 1.03670239, + "balance_loss_mlp": 1.01814556, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 1.8640631390242575, + "language_loss": 0.71318293, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73433477, + "num_input_tokens_seen": 294358590, + "step": 13646, + "time_per_iteration": 2.589707374572754 + }, + { + "auxiliary_loss_clip": 0.01091565, + "auxiliary_loss_mlp": 0.01033125, + "balance_loss_clip": 1.0370152, + "balance_loss_mlp": 1.02056599, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 2.100120522779655, + "language_loss": 0.78566402, + "learning_rate": 3.284697424316132e-07, + "loss": 0.80691099, + "num_input_tokens_seen": 294375825, + "step": 13647, + "time_per_iteration": 2.584193229675293 + }, + { + "auxiliary_loss_clip": 0.0110579, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.03752613, + "balance_loss_mlp": 1.02451706, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 1.5169081953417751, + "language_loss": 0.67893696, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.70035803, + "num_input_tokens_seen": 294398500, + "step": 13648, + "time_per_iteration": 2.4958505630493164 + }, + { + "auxiliary_loss_clip": 0.01083742, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.03345847, + "balance_loss_mlp": 1.01798666, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 1.965586878829589, + "language_loss": 0.80314779, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82429564, + "num_input_tokens_seen": 294418840, + "step": 13649, + "time_per_iteration": 2.5972023010253906 + }, + { + "auxiliary_loss_clip": 0.01095498, + "auxiliary_loss_mlp": 0.01035619, + "balance_loss_clip": 1.03925347, + "balance_loss_mlp": 1.02195716, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 1.7656426395567109, + "language_loss": 0.69155288, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71286398, + "num_input_tokens_seen": 294438215, + "step": 13650, + "time_per_iteration": 2.478461265563965 + }, + { + "auxiliary_loss_clip": 0.01093511, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.03911138, + "balance_loss_mlp": 1.01889157, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 2.569240875058462, + "language_loss": 0.60658962, + "learning_rate": 3.276148560452001e-07, + "loss": 0.62784374, + "num_input_tokens_seen": 294455260, + "step": 13651, + "time_per_iteration": 3.966463565826416 + }, + { + "auxiliary_loss_clip": 0.01073134, + "auxiliary_loss_mlp": 0.00782781, + "balance_loss_clip": 1.03574324, + "balance_loss_mlp": 1.0007472, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 2.9124111192299584, + "language_loss": 0.72184515, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.74040437, + "num_input_tokens_seen": 294473205, + "step": 13652, + "time_per_iteration": 2.6219544410705566 + }, + { + "auxiliary_loss_clip": 0.01081719, + "auxiliary_loss_mlp": 0.01027639, + "balance_loss_clip": 1.0347383, + "balance_loss_mlp": 1.01733351, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 1.874750840955212, + "language_loss": 0.7318669, + "learning_rate": 3.271877933216558e-07, + "loss": 0.7529605, + "num_input_tokens_seen": 294490645, + "step": 13653, + "time_per_iteration": 2.469113349914551 + }, + { + "auxiliary_loss_clip": 0.01073847, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.03730249, + "balance_loss_mlp": 1.01780891, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 1.7756343469722418, + "language_loss": 0.62569922, + "learning_rate": 3.269743571056451e-07, + "loss": 0.64675051, + "num_input_tokens_seen": 294513500, + "step": 13654, + "time_per_iteration": 2.730806350708008 + }, + { + "auxiliary_loss_clip": 0.01085252, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.03651357, + "balance_loss_mlp": 1.01729679, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 1.5881468550972948, + "language_loss": 0.7017175, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72286594, + "num_input_tokens_seen": 294535710, + "step": 13655, + "time_per_iteration": 2.6023573875427246 + }, + { + "auxiliary_loss_clip": 0.01084139, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.0358355, + "balance_loss_mlp": 1.02302217, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 2.289721799519659, + "language_loss": 0.82013047, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84132618, + "num_input_tokens_seen": 294554055, + "step": 13656, + "time_per_iteration": 2.5226070880889893 + }, + { + "auxiliary_loss_clip": 0.01084831, + "auxiliary_loss_mlp": 0.0103518, + "balance_loss_clip": 1.03972697, + "balance_loss_mlp": 1.02353358, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 2.3168937617587733, + "language_loss": 0.73747241, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.75867248, + "num_input_tokens_seen": 294570390, + "step": 13657, + "time_per_iteration": 3.9045422077178955 + }, + { + "auxiliary_loss_clip": 0.01080425, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.03705966, + "balance_loss_mlp": 1.02026665, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 1.9225521022197636, + "language_loss": 0.55448794, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.57561135, + "num_input_tokens_seen": 294593050, + "step": 13658, + "time_per_iteration": 2.614830255508423 + }, + { + "auxiliary_loss_clip": 0.01049493, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.03373218, + "balance_loss_mlp": 1.0192585, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 2.026029671243546, + "language_loss": 0.79241157, + "learning_rate": 3.259081278068805e-07, + "loss": 0.81322145, + "num_input_tokens_seen": 294608550, + "step": 13659, + "time_per_iteration": 2.581735849380493 + }, + { + "auxiliary_loss_clip": 0.01091612, + "auxiliary_loss_mlp": 0.01026182, + "balance_loss_clip": 1.0341208, + "balance_loss_mlp": 1.01535177, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 1.6726203534363508, + "language_loss": 0.59788138, + "learning_rate": 3.256950723599887e-07, + "loss": 0.61905932, + "num_input_tokens_seen": 294630380, + "step": 13660, + "time_per_iteration": 2.6657183170318604 + }, + { + "auxiliary_loss_clip": 0.01096819, + "auxiliary_loss_mlp": 0.01035468, + "balance_loss_clip": 1.03462577, + "balance_loss_mlp": 1.02208042, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 2.106274380199056, + "language_loss": 0.72266853, + "learning_rate": 3.254820804029075e-07, + "loss": 0.74399137, + "num_input_tokens_seen": 294648655, + "step": 13661, + "time_per_iteration": 2.4848814010620117 + }, + { + "auxiliary_loss_clip": 0.01094516, + "auxiliary_loss_mlp": 0.01032103, + "balance_loss_clip": 1.03521025, + "balance_loss_mlp": 1.01953268, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 2.3274091527328835, + "language_loss": 0.74903309, + "learning_rate": 3.252691519437143e-07, + "loss": 0.77029926, + "num_input_tokens_seen": 294666915, + "step": 13662, + "time_per_iteration": 2.4278557300567627 + }, + { + "auxiliary_loss_clip": 0.010279, + "auxiliary_loss_mlp": 0.00999503, + "balance_loss_clip": 1.0046196, + "balance_loss_mlp": 0.99822778, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.7597790999846177, + "language_loss": 0.53972995, + "learning_rate": 3.250562869904825e-07, + "loss": 0.560004, + "num_input_tokens_seen": 294731545, + "step": 13663, + "time_per_iteration": 3.1471641063690186 + }, + { + "auxiliary_loss_clip": 0.01066083, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.03295898, + "balance_loss_mlp": 1.01864338, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 2.10917764209055, + "language_loss": 0.65856338, + "learning_rate": 3.248434855512838e-07, + "loss": 0.67953515, + "num_input_tokens_seen": 294748745, + "step": 13664, + "time_per_iteration": 2.557219982147217 + }, + { + "auxiliary_loss_clip": 0.01083928, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.03477836, + "balance_loss_mlp": 1.01802778, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 1.628891007299944, + "language_loss": 0.75138474, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77251607, + "num_input_tokens_seen": 294768955, + "step": 13665, + "time_per_iteration": 2.5366551876068115 + }, + { + "auxiliary_loss_clip": 0.0109431, + "auxiliary_loss_mlp": 0.00777868, + "balance_loss_clip": 1.03765583, + "balance_loss_mlp": 1.00076246, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 2.1080212571887236, + "language_loss": 0.65763909, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.67636085, + "num_input_tokens_seen": 294789250, + "step": 13666, + "time_per_iteration": 2.585965394973755 + }, + { + "auxiliary_loss_clip": 0.01064687, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.04020846, + "balance_loss_mlp": 1.01827383, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 1.7348172382198457, + "language_loss": 0.76742399, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.78837192, + "num_input_tokens_seen": 294809760, + "step": 13667, + "time_per_iteration": 4.123215436935425 + }, + { + "auxiliary_loss_clip": 0.01076922, + "auxiliary_loss_mlp": 0.01031908, + "balance_loss_clip": 1.03708327, + "balance_loss_mlp": 1.0197072, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 1.8952355435726715, + "language_loss": 0.77274024, + "learning_rate": 3.239929150961773e-07, + "loss": 0.79382861, + "num_input_tokens_seen": 294826495, + "step": 13668, + "time_per_iteration": 2.5366714000701904 + }, + { + "auxiliary_loss_clip": 0.01068087, + "auxiliary_loss_mlp": 0.01035516, + "balance_loss_clip": 1.03921604, + "balance_loss_mlp": 1.02345824, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 2.0050810714234455, + "language_loss": 0.7328701, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.75390613, + "num_input_tokens_seen": 294845370, + "step": 13669, + "time_per_iteration": 2.5925164222717285 + }, + { + "auxiliary_loss_clip": 0.01096621, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.03593671, + "balance_loss_mlp": 1.01508987, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 2.296806402690047, + "language_loss": 0.78692806, + "learning_rate": 3.235680111625161e-07, + "loss": 0.80816138, + "num_input_tokens_seen": 294863740, + "step": 13670, + "time_per_iteration": 2.4623448848724365 + }, + { + "auxiliary_loss_clip": 0.01101818, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.03760457, + "balance_loss_mlp": 1.02539515, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 1.9409200422887072, + "language_loss": 0.74902231, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.77042633, + "num_input_tokens_seen": 294882815, + "step": 13671, + "time_per_iteration": 2.5285823345184326 + }, + { + "auxiliary_loss_clip": 0.01103858, + "auxiliary_loss_mlp": 0.01030577, + "balance_loss_clip": 1.03813553, + "balance_loss_mlp": 1.01783991, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 1.7943997124192772, + "language_loss": 0.76281458, + "learning_rate": 3.23143361510728e-07, + "loss": 0.78415895, + "num_input_tokens_seen": 294901985, + "step": 13672, + "time_per_iteration": 2.5180163383483887 + }, + { + "auxiliary_loss_clip": 0.01064531, + "auxiliary_loss_mlp": 0.01032429, + "balance_loss_clip": 1.03814435, + "balance_loss_mlp": 1.01869655, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 2.1346301913204395, + "language_loss": 0.74839503, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.76936466, + "num_input_tokens_seen": 294919705, + "step": 13673, + "time_per_iteration": 2.6002445220947266 + }, + { + "auxiliary_loss_clip": 0.01091259, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.03844011, + "balance_loss_mlp": 1.01997566, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 1.7277417178718848, + "language_loss": 0.79343784, + "learning_rate": 3.227189662052254e-07, + "loss": 0.81467474, + "num_input_tokens_seen": 294939900, + "step": 13674, + "time_per_iteration": 2.569420099258423 + }, + { + "auxiliary_loss_clip": 0.0108538, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.03427982, + "balance_loss_mlp": 1.02218938, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 2.771073878082766, + "language_loss": 0.70203346, + "learning_rate": 3.225068639524484e-07, + "loss": 0.7232303, + "num_input_tokens_seen": 294959110, + "step": 13675, + "time_per_iteration": 2.5135786533355713 + }, + { + "auxiliary_loss_clip": 0.01091283, + "auxiliary_loss_mlp": 0.01038155, + "balance_loss_clip": 1.03456676, + "balance_loss_mlp": 1.02576923, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 1.7556158098087207, + "language_loss": 0.74296188, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.7642563, + "num_input_tokens_seen": 294978660, + "step": 13676, + "time_per_iteration": 2.5323097705841064 + }, + { + "auxiliary_loss_clip": 0.01082569, + "auxiliary_loss_mlp": 0.01031246, + "balance_loss_clip": 1.03746736, + "balance_loss_mlp": 1.01987982, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 1.7315967313353338, + "language_loss": 0.80394351, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.82508165, + "num_input_tokens_seen": 294998075, + "step": 13677, + "time_per_iteration": 2.4968459606170654 + }, + { + "auxiliary_loss_clip": 0.01095541, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.03501546, + "balance_loss_mlp": 1.02090621, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 2.8650485185772117, + "language_loss": 0.69982183, + "learning_rate": 3.218709388905245e-07, + "loss": 0.72112, + "num_input_tokens_seen": 295015950, + "step": 13678, + "time_per_iteration": 2.4614338874816895 + }, + { + "auxiliary_loss_clip": 0.01106939, + "auxiliary_loss_mlp": 0.01038292, + "balance_loss_clip": 1.03547597, + "balance_loss_mlp": 1.02525103, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 1.5889387812402644, + "language_loss": 0.71390593, + "learning_rate": 3.216590911288133e-07, + "loss": 0.73535824, + "num_input_tokens_seen": 295036800, + "step": 13679, + "time_per_iteration": 2.504948854446411 + }, + { + "auxiliary_loss_clip": 0.01080591, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.03311443, + "balance_loss_mlp": 1.02184319, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 2.0781212377932214, + "language_loss": 0.70020413, + "learning_rate": 3.214473070099564e-07, + "loss": 0.72136343, + "num_input_tokens_seen": 295055300, + "step": 13680, + "time_per_iteration": 2.5164413452148438 + }, + { + "auxiliary_loss_clip": 0.01077718, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.03724968, + "balance_loss_mlp": 1.01902246, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 1.637198789379628, + "language_loss": 0.59883034, + "learning_rate": 3.21235586541986e-07, + "loss": 0.61990857, + "num_input_tokens_seen": 295076420, + "step": 13681, + "time_per_iteration": 2.5628514289855957 + }, + { + "auxiliary_loss_clip": 0.01086201, + "auxiliary_loss_mlp": 0.01040152, + "balance_loss_clip": 1.03469598, + "balance_loss_mlp": 1.02754557, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 1.7483609399206064, + "language_loss": 0.69807923, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.71934277, + "num_input_tokens_seen": 295100540, + "step": 13682, + "time_per_iteration": 2.668069362640381 + }, + { + "auxiliary_loss_clip": 0.01109745, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.03672719, + "balance_loss_mlp": 1.01980972, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 1.9312194000116527, + "language_loss": 0.79286158, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81429261, + "num_input_tokens_seen": 295120180, + "step": 13683, + "time_per_iteration": 3.9117352962493896 + }, + { + "auxiliary_loss_clip": 0.0110412, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.03567457, + "balance_loss_mlp": 1.02299893, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 2.0535285543576256, + "language_loss": 0.86628759, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88766718, + "num_input_tokens_seen": 295138530, + "step": 13684, + "time_per_iteration": 2.477357864379883 + }, + { + "auxiliary_loss_clip": 0.01105121, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.03554893, + "balance_loss_mlp": 1.01981306, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 1.62676641915984, + "language_loss": 0.79873443, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.82009673, + "num_input_tokens_seen": 295160260, + "step": 13685, + "time_per_iteration": 2.4865565299987793 + }, + { + "auxiliary_loss_clip": 0.01075926, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.03804445, + "balance_loss_mlp": 1.01999569, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 1.681990884402368, + "language_loss": 0.68871093, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.70978928, + "num_input_tokens_seen": 295177055, + "step": 13686, + "time_per_iteration": 2.5787839889526367 + }, + { + "auxiliary_loss_clip": 0.01077488, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.03671944, + "balance_loss_mlp": 1.02031767, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 1.8777752838455277, + "language_loss": 0.78524935, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80634838, + "num_input_tokens_seen": 295193870, + "step": 13687, + "time_per_iteration": 2.511662006378174 + }, + { + "auxiliary_loss_clip": 0.01099831, + "auxiliary_loss_mlp": 0.01030135, + "balance_loss_clip": 1.03733587, + "balance_loss_mlp": 1.01754057, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 2.6536789629761546, + "language_loss": 0.72463179, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.74593145, + "num_input_tokens_seen": 295211040, + "step": 13688, + "time_per_iteration": 2.441164016723633 + }, + { + "auxiliary_loss_clip": 0.01109541, + "auxiliary_loss_mlp": 0.00777585, + "balance_loss_clip": 1.03711462, + "balance_loss_mlp": 1.00068998, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 1.63470514856218, + "language_loss": 0.7316581, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.75052941, + "num_input_tokens_seen": 295231300, + "step": 13689, + "time_per_iteration": 2.473217487335205 + }, + { + "auxiliary_loss_clip": 0.01099632, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.03536069, + "balance_loss_mlp": 1.02091837, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 2.232792084922358, + "language_loss": 0.69153684, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.71286857, + "num_input_tokens_seen": 295251045, + "step": 13690, + "time_per_iteration": 2.47776460647583 + }, + { + "auxiliary_loss_clip": 0.01064374, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_clip": 1.03145194, + "balance_loss_mlp": 1.02894831, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 2.0836911361798944, + "language_loss": 0.85680151, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87787259, + "num_input_tokens_seen": 295270225, + "step": 13691, + "time_per_iteration": 4.0482635498046875 + }, + { + "auxiliary_loss_clip": 0.01101698, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.03738987, + "balance_loss_mlp": 1.0214715, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 1.815536209609821, + "language_loss": 0.76840007, + "learning_rate": 3.189108646472252e-07, + "loss": 0.78975105, + "num_input_tokens_seen": 295288950, + "step": 13692, + "time_per_iteration": 2.6175150871276855 + }, + { + "auxiliary_loss_clip": 0.01095259, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.03538013, + "balance_loss_mlp": 1.01603556, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.505566079203105, + "language_loss": 0.71703565, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73826903, + "num_input_tokens_seen": 295309405, + "step": 13693, + "time_per_iteration": 2.4992659091949463 + }, + { + "auxiliary_loss_clip": 0.01071622, + "auxiliary_loss_mlp": 0.01028994, + "balance_loss_clip": 1.03264236, + "balance_loss_mlp": 1.01815748, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 1.4244179540745063, + "language_loss": 0.83901298, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.86001915, + "num_input_tokens_seen": 295331115, + "step": 13694, + "time_per_iteration": 2.6474292278289795 + }, + { + "auxiliary_loss_clip": 0.01052946, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.04101753, + "balance_loss_mlp": 1.02016473, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 1.6986895970562879, + "language_loss": 0.76766765, + "learning_rate": 3.182781878250118e-07, + "loss": 0.78852183, + "num_input_tokens_seen": 295350495, + "step": 13695, + "time_per_iteration": 2.652653455734253 + }, + { + "auxiliary_loss_clip": 0.01087538, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.03636551, + "balance_loss_mlp": 1.02109337, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 1.998040394975347, + "language_loss": 0.80916095, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.83036256, + "num_input_tokens_seen": 295368225, + "step": 13696, + "time_per_iteration": 2.5226612091064453 + }, + { + "auxiliary_loss_clip": 0.0101916, + "auxiliary_loss_mlp": 0.00999648, + "balance_loss_clip": 1.0052731, + "balance_loss_mlp": 0.99842662, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.9815974448579957, + "language_loss": 0.63875222, + "learning_rate": 3.178567221188393e-07, + "loss": 0.65894032, + "num_input_tokens_seen": 295430035, + "step": 13697, + "time_per_iteration": 4.496326208114624 + }, + { + "auxiliary_loss_clip": 0.01071741, + "auxiliary_loss_mlp": 0.01024169, + "balance_loss_clip": 1.03588235, + "balance_loss_mlp": 1.01367807, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 1.759605382733866, + "language_loss": 0.73104262, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.75200176, + "num_input_tokens_seen": 295447765, + "step": 13698, + "time_per_iteration": 2.540936231613159 + }, + { + "auxiliary_loss_clip": 0.01065402, + "auxiliary_loss_mlp": 0.01040802, + "balance_loss_clip": 1.02861786, + "balance_loss_mlp": 1.0258528, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 1.9050739914130554, + "language_loss": 0.72076076, + "learning_rate": 3.174355115608305e-07, + "loss": 0.74182278, + "num_input_tokens_seen": 295464810, + "step": 13699, + "time_per_iteration": 2.545830488204956 + }, + { + "auxiliary_loss_clip": 0.01086917, + "auxiliary_loss_mlp": 0.01027709, + "balance_loss_clip": 1.03594613, + "balance_loss_mlp": 1.01589561, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 2.4925102787424147, + "language_loss": 0.81701183, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.83815813, + "num_input_tokens_seen": 295482605, + "step": 13700, + "time_per_iteration": 2.5306060314178467 + }, + { + "auxiliary_loss_clip": 0.01086666, + "auxiliary_loss_mlp": 0.01033591, + "balance_loss_clip": 1.03605723, + "balance_loss_mlp": 1.02246273, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 2.492374801967981, + "language_loss": 0.7289868, + "learning_rate": 3.170145562148763e-07, + "loss": 0.7501893, + "num_input_tokens_seen": 295503780, + "step": 13701, + "time_per_iteration": 2.547218084335327 + }, + { + "auxiliary_loss_clip": 0.0109716, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.03367186, + "balance_loss_mlp": 1.02306414, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 5.893751065814095, + "language_loss": 0.69045752, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.7117855, + "num_input_tokens_seen": 295522035, + "step": 13702, + "time_per_iteration": 2.515786647796631 + }, + { + "auxiliary_loss_clip": 0.01062822, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.03519368, + "balance_loss_mlp": 1.0199542, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 2.051735080567884, + "language_loss": 0.75002271, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.7709738, + "num_input_tokens_seen": 295541190, + "step": 13703, + "time_per_iteration": 2.613833427429199 + }, + { + "auxiliary_loss_clip": 0.01112512, + "auxiliary_loss_mlp": 0.01041384, + "balance_loss_clip": 1.03623629, + "balance_loss_mlp": 1.02775848, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 1.9315422955314145, + "language_loss": 0.69485456, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.71639353, + "num_input_tokens_seen": 295558860, + "step": 13704, + "time_per_iteration": 2.4821741580963135 + }, + { + "auxiliary_loss_clip": 0.01105638, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.03510165, + "balance_loss_mlp": 1.01881123, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 1.6356292556159, + "language_loss": 0.63879979, + "learning_rate": 3.161734114144916e-07, + "loss": 0.66015983, + "num_input_tokens_seen": 295578155, + "step": 13705, + "time_per_iteration": 2.5103936195373535 + }, + { + "auxiliary_loss_clip": 0.01109231, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.03556085, + "balance_loss_mlp": 1.01877093, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 2.3751194510274507, + "language_loss": 0.69743818, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.7188462, + "num_input_tokens_seen": 295599170, + "step": 13706, + "time_per_iteration": 2.4483449459075928 + }, + { + "auxiliary_loss_clip": 0.01086219, + "auxiliary_loss_mlp": 0.01039822, + "balance_loss_clip": 1.03813148, + "balance_loss_mlp": 1.02661991, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 1.849523735039121, + "language_loss": 0.69694972, + "learning_rate": 3.157532220876475e-07, + "loss": 0.71821016, + "num_input_tokens_seen": 295617465, + "step": 13707, + "time_per_iteration": 3.9931821823120117 + }, + { + "auxiliary_loss_clip": 0.01074752, + "auxiliary_loss_mlp": 0.01037661, + "balance_loss_clip": 1.03359842, + "balance_loss_mlp": 1.02431571, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 2.306253195647908, + "language_loss": 0.79077363, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81189775, + "num_input_tokens_seen": 295634960, + "step": 13708, + "time_per_iteration": 2.6292059421539307 + }, + { + "auxiliary_loss_clip": 0.01093721, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.03373909, + "balance_loss_mlp": 1.02004457, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 2.3558017327824414, + "language_loss": 0.68463981, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.70591235, + "num_input_tokens_seen": 295652725, + "step": 13709, + "time_per_iteration": 2.453721284866333 + }, + { + "auxiliary_loss_clip": 0.01068295, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.03277469, + "balance_loss_mlp": 1.0218277, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 2.106848389945312, + "language_loss": 0.82627368, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84729254, + "num_input_tokens_seen": 295671195, + "step": 13710, + "time_per_iteration": 2.6011788845062256 + }, + { + "auxiliary_loss_clip": 0.01097137, + "auxiliary_loss_mlp": 0.01031217, + "balance_loss_clip": 1.03525972, + "balance_loss_mlp": 1.01862216, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 2.053999913365346, + "language_loss": 0.78396142, + "learning_rate": 3.149136098993257e-07, + "loss": 0.80524504, + "num_input_tokens_seen": 295689130, + "step": 13711, + "time_per_iteration": 2.4972004890441895 + }, + { + "auxiliary_loss_clip": 0.01078235, + "auxiliary_loss_mlp": 0.01029647, + "balance_loss_clip": 1.03555346, + "balance_loss_mlp": 1.0168736, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 2.164496027009473, + "language_loss": 0.65935457, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.68043339, + "num_input_tokens_seen": 295706385, + "step": 13712, + "time_per_iteration": 2.5622925758361816 + }, + { + "auxiliary_loss_clip": 0.01091394, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.03537548, + "balance_loss_mlp": 1.01785994, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 1.697170099828665, + "language_loss": 0.7392354, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76044428, + "num_input_tokens_seen": 295727925, + "step": 13713, + "time_per_iteration": 2.5326590538024902 + }, + { + "auxiliary_loss_clip": 0.01097, + "auxiliary_loss_mlp": 0.01026812, + "balance_loss_clip": 1.03457081, + "balance_loss_mlp": 1.01498079, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 6.288497767006589, + "language_loss": 0.81329727, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83453536, + "num_input_tokens_seen": 295744420, + "step": 13714, + "time_per_iteration": 2.4327354431152344 + }, + { + "auxiliary_loss_clip": 0.01099657, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.03860426, + "balance_loss_mlp": 1.01756847, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 1.9606269972394388, + "language_loss": 0.66045219, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.6817556, + "num_input_tokens_seen": 295765105, + "step": 13715, + "time_per_iteration": 2.547430992126465 + }, + { + "auxiliary_loss_clip": 0.01082473, + "auxiliary_loss_mlp": 0.01030091, + "balance_loss_clip": 1.03764307, + "balance_loss_mlp": 1.01758027, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 1.7939039899381088, + "language_loss": 0.74894059, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.77006626, + "num_input_tokens_seen": 295784200, + "step": 13716, + "time_per_iteration": 2.577219247817993 + }, + { + "auxiliary_loss_clip": 0.00990988, + "auxiliary_loss_mlp": 0.01001388, + "balance_loss_clip": 1.00962496, + "balance_loss_mlp": 1.00009441, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7164486581593086, + "language_loss": 0.58974564, + "learning_rate": 3.136561087351175e-07, + "loss": 0.60966933, + "num_input_tokens_seen": 295846555, + "step": 13717, + "time_per_iteration": 3.2233264446258545 + }, + { + "auxiliary_loss_clip": 0.01095072, + "auxiliary_loss_mlp": 0.00778027, + "balance_loss_clip": 1.03532791, + "balance_loss_mlp": 1.00063992, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 2.185907545874779, + "language_loss": 0.79747891, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.81620991, + "num_input_tokens_seen": 295863425, + "step": 13718, + "time_per_iteration": 2.458808183670044 + }, + { + "auxiliary_loss_clip": 0.01087768, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.03456092, + "balance_loss_mlp": 1.02239418, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 1.5572132473834128, + "language_loss": 0.68706214, + "learning_rate": 3.132374531662778e-07, + "loss": 0.70827925, + "num_input_tokens_seen": 295880925, + "step": 13719, + "time_per_iteration": 2.4386141300201416 + }, + { + "auxiliary_loss_clip": 0.0108351, + "auxiliary_loss_mlp": 0.01036337, + "balance_loss_clip": 1.03628778, + "balance_loss_mlp": 1.02136421, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 2.551305967908974, + "language_loss": 0.69940662, + "learning_rate": 3.13028221321197e-07, + "loss": 0.72060513, + "num_input_tokens_seen": 295898205, + "step": 13720, + "time_per_iteration": 2.4995322227478027 + }, + { + "auxiliary_loss_clip": 0.01039128, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.0384382, + "balance_loss_mlp": 1.01430893, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 1.6295929867346446, + "language_loss": 0.76086402, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.78152263, + "num_input_tokens_seen": 295918130, + "step": 13721, + "time_per_iteration": 2.9126391410827637 + }, + { + "auxiliary_loss_clip": 0.01061277, + "auxiliary_loss_mlp": 0.01025362, + "balance_loss_clip": 1.03908038, + "balance_loss_mlp": 1.01391232, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 1.8523657445937598, + "language_loss": 0.78494501, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.80581141, + "num_input_tokens_seen": 295937760, + "step": 13722, + "time_per_iteration": 4.457479476928711 + }, + { + "auxiliary_loss_clip": 0.01107591, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.03689814, + "balance_loss_mlp": 1.01952398, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 1.7934606211090922, + "language_loss": 0.62363297, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.64502239, + "num_input_tokens_seen": 295957585, + "step": 13723, + "time_per_iteration": 2.542165517807007 + }, + { + "auxiliary_loss_clip": 0.01108911, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.03642893, + "balance_loss_mlp": 1.01901007, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 1.48733298688274, + "language_loss": 0.74529099, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76669377, + "num_input_tokens_seen": 295977135, + "step": 13724, + "time_per_iteration": 2.4786221981048584 + }, + { + "auxiliary_loss_clip": 0.01075584, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.03341532, + "balance_loss_mlp": 1.0195446, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 1.8706589608156712, + "language_loss": 0.63923007, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.66031146, + "num_input_tokens_seen": 295996265, + "step": 13725, + "time_per_iteration": 2.577704906463623 + }, + { + "auxiliary_loss_clip": 0.01082725, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.03313017, + "balance_loss_mlp": 1.02057362, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 2.1426606794370713, + "language_loss": 0.81982648, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.84098172, + "num_input_tokens_seen": 296014745, + "step": 13726, + "time_per_iteration": 2.521754264831543 + }, + { + "auxiliary_loss_clip": 0.01090615, + "auxiliary_loss_mlp": 0.01034536, + "balance_loss_clip": 1.03144908, + "balance_loss_mlp": 1.02283597, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 1.755483500678716, + "language_loss": 0.70345902, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72471058, + "num_input_tokens_seen": 296036960, + "step": 13727, + "time_per_iteration": 2.557219982147217 + }, + { + "auxiliary_loss_clip": 0.01100872, + "auxiliary_loss_mlp": 0.01028386, + "balance_loss_clip": 1.0369041, + "balance_loss_mlp": 1.01514757, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 2.333019914725169, + "language_loss": 0.62147075, + "learning_rate": 3.113566701515036e-07, + "loss": 0.64276332, + "num_input_tokens_seen": 296056540, + "step": 13728, + "time_per_iteration": 2.4632670879364014 + }, + { + "auxiliary_loss_clip": 0.01093668, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.03879857, + "balance_loss_mlp": 1.01966298, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 1.9132350245236622, + "language_loss": 0.71160358, + "learning_rate": 3.111480143230092e-07, + "loss": 0.73286301, + "num_input_tokens_seen": 296077950, + "step": 13729, + "time_per_iteration": 2.5796499252319336 + }, + { + "auxiliary_loss_clip": 0.010129, + "auxiliary_loss_mlp": 0.01000273, + "balance_loss_clip": 1.00805986, + "balance_loss_mlp": 0.9991585, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 0.8823711653218334, + "language_loss": 0.62700289, + "learning_rate": 3.109394225359514e-07, + "loss": 0.6471346, + "num_input_tokens_seen": 296127060, + "step": 13730, + "time_per_iteration": 4.379768371582031 + }, + { + "auxiliary_loss_clip": 0.01060769, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.03565049, + "balance_loss_mlp": 1.02248871, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 2.0371312225129317, + "language_loss": 0.6295203, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65047574, + "num_input_tokens_seen": 296147775, + "step": 13731, + "time_per_iteration": 2.784756898880005 + }, + { + "auxiliary_loss_clip": 0.01074442, + "auxiliary_loss_mlp": 0.00783343, + "balance_loss_clip": 1.03096724, + "balance_loss_mlp": 1.00071251, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 2.434325535283837, + "language_loss": 0.69570738, + "learning_rate": 3.105224311177812e-07, + "loss": 0.71428525, + "num_input_tokens_seen": 296163560, + "step": 13732, + "time_per_iteration": 2.511544704437256 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01034886, + "balance_loss_clip": 1.03591847, + "balance_loss_mlp": 1.02189243, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 2.572018174076636, + "language_loss": 0.70809138, + "learning_rate": 3.103140315024817e-07, + "loss": 0.72944391, + "num_input_tokens_seen": 296178730, + "step": 13733, + "time_per_iteration": 2.4398906230926514 + }, + { + "auxiliary_loss_clip": 0.01104613, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.03359473, + "balance_loss_mlp": 1.01876903, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 1.5193914089893406, + "language_loss": 0.82508898, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84644818, + "num_input_tokens_seen": 296200175, + "step": 13734, + "time_per_iteration": 2.477613687515259 + }, + { + "auxiliary_loss_clip": 0.01079507, + "auxiliary_loss_mlp": 0.01031365, + "balance_loss_clip": 1.03439498, + "balance_loss_mlp": 1.01853192, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 1.7780554217814355, + "language_loss": 0.82943332, + "learning_rate": 3.098974244989676e-07, + "loss": 0.85054207, + "num_input_tokens_seen": 296219305, + "step": 13735, + "time_per_iteration": 2.481330394744873 + }, + { + "auxiliary_loss_clip": 0.01100582, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.03765762, + "balance_loss_mlp": 1.02026355, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 2.034752815373509, + "language_loss": 0.7093631, + "learning_rate": 3.096892171265497e-07, + "loss": 0.73068392, + "num_input_tokens_seen": 296236945, + "step": 13736, + "time_per_iteration": 3.8513238430023193 + }, + { + "auxiliary_loss_clip": 0.01021513, + "auxiliary_loss_mlp": 0.01000209, + "balance_loss_clip": 1.0079689, + "balance_loss_mlp": 0.99901694, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.8569599841644069, + "language_loss": 0.6835804, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.70379758, + "num_input_tokens_seen": 296294685, + "step": 13737, + "time_per_iteration": 3.041006326675415 + }, + { + "auxiliary_loss_clip": 0.01086478, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.03482628, + "balance_loss_mlp": 1.01951313, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 1.7508139919509278, + "language_loss": 0.69807786, + "learning_rate": 3.0927299467987e-07, + "loss": 0.7192564, + "num_input_tokens_seen": 296314790, + "step": 13738, + "time_per_iteration": 2.5075736045837402 + }, + { + "auxiliary_loss_clip": 0.01094202, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.03946352, + "balance_loss_mlp": 1.01698506, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 2.2296339114883326, + "language_loss": 0.63546908, + "learning_rate": 3.090649796213911e-07, + "loss": 0.65672195, + "num_input_tokens_seen": 296335355, + "step": 13739, + "time_per_iteration": 2.6540048122406006 + }, + { + "auxiliary_loss_clip": 0.0101422, + "auxiliary_loss_mlp": 0.00999928, + "balance_loss_clip": 1.00962901, + "balance_loss_mlp": 0.99870032, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.8187668791140529, + "language_loss": 0.5926882, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61282969, + "num_input_tokens_seen": 296399885, + "step": 13740, + "time_per_iteration": 3.1182470321655273 + }, + { + "auxiliary_loss_clip": 0.01113157, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.03718591, + "balance_loss_mlp": 1.0190748, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 1.8863945478504756, + "language_loss": 0.750386, + "learning_rate": 3.086491418735959e-07, + "loss": 0.77184176, + "num_input_tokens_seen": 296417660, + "step": 13741, + "time_per_iteration": 2.4470882415771484 + }, + { + "auxiliary_loss_clip": 0.01097148, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.03534067, + "balance_loss_mlp": 1.02011919, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 1.88393957673311, + "language_loss": 0.62236625, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.64365727, + "num_input_tokens_seen": 296438255, + "step": 13742, + "time_per_iteration": 2.6265149116516113 + }, + { + "auxiliary_loss_clip": 0.01075931, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.04241061, + "balance_loss_mlp": 1.02023244, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 5.692249653043173, + "language_loss": 0.65892243, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.68003225, + "num_input_tokens_seen": 296454485, + "step": 13743, + "time_per_iteration": 2.5545547008514404 + }, + { + "auxiliary_loss_clip": 0.01091325, + "auxiliary_loss_mlp": 0.01036569, + "balance_loss_clip": 1.03664064, + "balance_loss_mlp": 1.02398634, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 2.119172634438332, + "language_loss": 0.6698947, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.69117367, + "num_input_tokens_seen": 296473740, + "step": 13744, + "time_per_iteration": 2.5093114376068115 + }, + { + "auxiliary_loss_clip": 0.01077545, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.0387733, + "balance_loss_mlp": 1.01783299, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 3.1075003872819598, + "language_loss": 0.7530272, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77409482, + "num_input_tokens_seen": 296493355, + "step": 13745, + "time_per_iteration": 2.558595657348633 + }, + { + "auxiliary_loss_clip": 0.01077186, + "auxiliary_loss_mlp": 0.00781206, + "balance_loss_clip": 1.03299975, + "balance_loss_mlp": 1.0005703, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 1.7180277365024108, + "language_loss": 0.78671801, + "learning_rate": 3.076106700253709e-07, + "loss": 0.8053019, + "num_input_tokens_seen": 296510520, + "step": 13746, + "time_per_iteration": 4.4501793384552 + }, + { + "auxiliary_loss_clip": 0.01102533, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_clip": 1.03835678, + "balance_loss_mlp": 1.02360547, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 2.586926777094591, + "language_loss": 0.68453062, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70592177, + "num_input_tokens_seen": 296528265, + "step": 13747, + "time_per_iteration": 2.44743013381958 + }, + { + "auxiliary_loss_clip": 0.01096127, + "auxiliary_loss_mlp": 0.010404, + "balance_loss_clip": 1.0344969, + "balance_loss_mlp": 1.02563632, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 2.0203122492663157, + "language_loss": 0.75158679, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.77295208, + "num_input_tokens_seen": 296547810, + "step": 13748, + "time_per_iteration": 2.4663102626800537 + }, + { + "auxiliary_loss_clip": 0.01069929, + "auxiliary_loss_mlp": 0.01029017, + "balance_loss_clip": 1.0372175, + "balance_loss_mlp": 1.01855063, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 2.021376679497948, + "language_loss": 0.64024556, + "learning_rate": 3.069883569603102e-07, + "loss": 0.66123497, + "num_input_tokens_seen": 296565940, + "step": 13749, + "time_per_iteration": 2.5389111042022705 + }, + { + "auxiliary_loss_clip": 0.01083573, + "auxiliary_loss_mlp": 0.01027957, + "balance_loss_clip": 1.03273273, + "balance_loss_mlp": 1.01624489, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 1.7178208704454787, + "language_loss": 0.7369051, + "learning_rate": 3.067810476598132e-07, + "loss": 0.7580204, + "num_input_tokens_seen": 296585090, + "step": 13750, + "time_per_iteration": 2.5138659477233887 + }, + { + "auxiliary_loss_clip": 0.01100026, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.03730464, + "balance_loss_mlp": 1.02382302, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 2.1088455919716536, + "language_loss": 0.65822148, + "learning_rate": 3.065738025663496e-07, + "loss": 0.67959106, + "num_input_tokens_seen": 296604950, + "step": 13751, + "time_per_iteration": 2.4997715950012207 + }, + { + "auxiliary_loss_clip": 0.01081614, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.03193951, + "balance_loss_mlp": 1.01824856, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 1.4582864603007921, + "language_loss": 0.60879678, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.62990713, + "num_input_tokens_seen": 296627780, + "step": 13752, + "time_per_iteration": 2.657379150390625 + }, + { + "auxiliary_loss_clip": 0.0102067, + "auxiliary_loss_mlp": 0.01000532, + "balance_loss_clip": 1.00732815, + "balance_loss_mlp": 0.99946517, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 1.1416589957400738, + "language_loss": 0.57430995, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.594522, + "num_input_tokens_seen": 296683850, + "step": 13753, + "time_per_iteration": 3.1039695739746094 + }, + { + "auxiliary_loss_clip": 0.00990193, + "auxiliary_loss_mlp": 0.00753, + "balance_loss_clip": 1.01338673, + "balance_loss_mlp": 1.00023544, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.6979322116034037, + "language_loss": 0.54899925, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.56643111, + "num_input_tokens_seen": 296741420, + "step": 13754, + "time_per_iteration": 3.2492055892944336 + }, + { + "auxiliary_loss_clip": 0.01067103, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.03279769, + "balance_loss_mlp": 1.02588511, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 2.043399320018195, + "language_loss": 0.69061655, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.71165699, + "num_input_tokens_seen": 296759620, + "step": 13755, + "time_per_iteration": 2.5463156700134277 + }, + { + "auxiliary_loss_clip": 0.01062823, + "auxiliary_loss_mlp": 0.01029635, + "balance_loss_clip": 1.03406906, + "balance_loss_mlp": 1.01896596, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 1.976120235068532, + "language_loss": 0.701388, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.72231257, + "num_input_tokens_seen": 296777275, + "step": 13756, + "time_per_iteration": 2.5522027015686035 + }, + { + "auxiliary_loss_clip": 0.01100409, + "auxiliary_loss_mlp": 0.01033055, + "balance_loss_clip": 1.03947902, + "balance_loss_mlp": 1.02103877, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 1.961156416978002, + "language_loss": 0.72500563, + "learning_rate": 3.053316807931623e-07, + "loss": 0.74634027, + "num_input_tokens_seen": 296796655, + "step": 13757, + "time_per_iteration": 2.4733285903930664 + }, + { + "auxiliary_loss_clip": 0.01100321, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.03569984, + "balance_loss_mlp": 1.0204308, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 2.14059506484121, + "language_loss": 0.68996596, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.71131706, + "num_input_tokens_seen": 296813705, + "step": 13758, + "time_per_iteration": 2.43093204498291 + }, + { + "auxiliary_loss_clip": 0.01086365, + "auxiliary_loss_mlp": 0.01029416, + "balance_loss_clip": 1.03723717, + "balance_loss_mlp": 1.01766837, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 1.687603635183645, + "language_loss": 0.69632757, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.71748531, + "num_input_tokens_seen": 296833985, + "step": 13759, + "time_per_iteration": 2.5398168563842773 + }, + { + "auxiliary_loss_clip": 0.01085694, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.03496158, + "balance_loss_mlp": 1.01821673, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 51.47769435085769, + "language_loss": 0.7116797, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73284233, + "num_input_tokens_seen": 296850150, + "step": 13760, + "time_per_iteration": 2.4838459491729736 + }, + { + "auxiliary_loss_clip": 0.01068732, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.03421724, + "balance_loss_mlp": 1.02009189, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 1.9903946389449854, + "language_loss": 0.77591377, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.796924, + "num_input_tokens_seen": 296869585, + "step": 13761, + "time_per_iteration": 2.5300092697143555 + }, + { + "auxiliary_loss_clip": 0.01075355, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.03735578, + "balance_loss_mlp": 1.01856744, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 1.7424292534261596, + "language_loss": 0.69822818, + "learning_rate": 3.042983464482387e-07, + "loss": 0.71927965, + "num_input_tokens_seen": 296887710, + "step": 13762, + "time_per_iteration": 3.9709603786468506 + }, + { + "auxiliary_loss_clip": 0.01059104, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.03669631, + "balance_loss_mlp": 1.02308607, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 2.0505259617200386, + "language_loss": 0.69999802, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.72094321, + "num_input_tokens_seen": 296906265, + "step": 13763, + "time_per_iteration": 2.5735890865325928 + }, + { + "auxiliary_loss_clip": 0.01011963, + "auxiliary_loss_mlp": 0.01004019, + "balance_loss_clip": 1.01308191, + "balance_loss_mlp": 1.00268984, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 1.027350303532243, + "language_loss": 0.6511609, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67132074, + "num_input_tokens_seen": 296971290, + "step": 13764, + "time_per_iteration": 3.174957752227783 + }, + { + "auxiliary_loss_clip": 0.01100042, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.03803217, + "balance_loss_mlp": 1.02024806, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 2.066938398190891, + "language_loss": 0.77773547, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.79906654, + "num_input_tokens_seen": 296989060, + "step": 13765, + "time_per_iteration": 2.4620354175567627 + }, + { + "auxiliary_loss_clip": 0.01060346, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.03684521, + "balance_loss_mlp": 1.01668513, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 1.6351253113141073, + "language_loss": 0.62529916, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64620113, + "num_input_tokens_seen": 297011300, + "step": 13766, + "time_per_iteration": 2.6862692832946777 + }, + { + "auxiliary_loss_clip": 0.01073961, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.03494644, + "balance_loss_mlp": 1.01890326, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 1.5901860794383467, + "language_loss": 0.82606852, + "learning_rate": 3.03266619632609e-07, + "loss": 0.84712243, + "num_input_tokens_seen": 297030350, + "step": 13767, + "time_per_iteration": 2.542940616607666 + }, + { + "auxiliary_loss_clip": 0.01082697, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.0393616, + "balance_loss_mlp": 1.01722729, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 1.9720746600702408, + "language_loss": 0.69075119, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71187246, + "num_input_tokens_seen": 297049710, + "step": 13768, + "time_per_iteration": 2.5591540336608887 + }, + { + "auxiliary_loss_clip": 0.01043251, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.02878118, + "balance_loss_mlp": 1.02226555, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 2.5592442170278886, + "language_loss": 0.74209303, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76287276, + "num_input_tokens_seen": 297070510, + "step": 13769, + "time_per_iteration": 2.643176555633545 + }, + { + "auxiliary_loss_clip": 0.01084404, + "auxiliary_loss_mlp": 0.01030015, + "balance_loss_clip": 1.03415561, + "balance_loss_mlp": 1.0177722, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 2.0020117309418795, + "language_loss": 0.74095023, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76209438, + "num_input_tokens_seen": 297092585, + "step": 13770, + "time_per_iteration": 4.223522424697876 + }, + { + "auxiliary_loss_clip": 0.01081666, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.03759944, + "balance_loss_mlp": 1.01683497, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 1.667063857723141, + "language_loss": 0.75773239, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.77884257, + "num_input_tokens_seen": 297110055, + "step": 13771, + "time_per_iteration": 2.5417633056640625 + }, + { + "auxiliary_loss_clip": 0.01108432, + "auxiliary_loss_mlp": 0.01031709, + "balance_loss_clip": 1.0361414, + "balance_loss_mlp": 1.01979947, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 1.5938014204884994, + "language_loss": 0.72735298, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.74875438, + "num_input_tokens_seen": 297132170, + "step": 13772, + "time_per_iteration": 2.5520689487457275 + }, + { + "auxiliary_loss_clip": 0.01088735, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.03879833, + "balance_loss_mlp": 1.01607323, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 2.3252113195223827, + "language_loss": 0.74376881, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.76494163, + "num_input_tokens_seen": 297149515, + "step": 13773, + "time_per_iteration": 2.501347541809082 + }, + { + "auxiliary_loss_clip": 0.01062064, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.03549016, + "balance_loss_mlp": 1.02319598, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 2.44767741427967, + "language_loss": 0.75869513, + "learning_rate": 3.01824904601915e-07, + "loss": 0.77966583, + "num_input_tokens_seen": 297170320, + "step": 13774, + "time_per_iteration": 2.632398843765259 + }, + { + "auxiliary_loss_clip": 0.01082201, + "auxiliary_loss_mlp": 0.00778332, + "balance_loss_clip": 1.03965974, + "balance_loss_mlp": 1.00064659, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 1.7298353440134715, + "language_loss": 0.75316679, + "learning_rate": 3.01619202829249e-07, + "loss": 0.77177215, + "num_input_tokens_seen": 297189935, + "step": 13775, + "time_per_iteration": 3.9462037086486816 + }, + { + "auxiliary_loss_clip": 0.01112317, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.03650582, + "balance_loss_mlp": 1.01931214, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 1.8679385331529603, + "language_loss": 0.74088663, + "learning_rate": 3.01413565459353e-07, + "loss": 0.7623356, + "num_input_tokens_seen": 297210885, + "step": 13776, + "time_per_iteration": 2.4918510913848877 + }, + { + "auxiliary_loss_clip": 0.01056543, + "auxiliary_loss_mlp": 0.01039035, + "balance_loss_clip": 1.02847743, + "balance_loss_mlp": 1.02498639, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 2.801985135927176, + "language_loss": 0.77810705, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.79906273, + "num_input_tokens_seen": 297228500, + "step": 13777, + "time_per_iteration": 2.5274548530578613 + }, + { + "auxiliary_loss_clip": 0.0109557, + "auxiliary_loss_mlp": 0.01025755, + "balance_loss_clip": 1.036304, + "balance_loss_mlp": 1.01473427, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 1.4812470801298239, + "language_loss": 0.82355833, + "learning_rate": 3.010024839590604e-07, + "loss": 0.84477156, + "num_input_tokens_seen": 297249470, + "step": 13778, + "time_per_iteration": 2.5018844604492188 + }, + { + "auxiliary_loss_clip": 0.01091452, + "auxiliary_loss_mlp": 0.01024989, + "balance_loss_clip": 1.03250229, + "balance_loss_mlp": 1.01255596, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 1.8889629408771613, + "language_loss": 0.74285895, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.76402336, + "num_input_tokens_seen": 297265970, + "step": 13779, + "time_per_iteration": 2.4320952892303467 + }, + { + "auxiliary_loss_clip": 0.01006361, + "auxiliary_loss_mlp": 0.01002464, + "balance_loss_clip": 1.01004374, + "balance_loss_mlp": 1.00118256, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.764358269670012, + "language_loss": 0.56727272, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58736098, + "num_input_tokens_seen": 297325525, + "step": 13780, + "time_per_iteration": 3.1269943714141846 + }, + { + "auxiliary_loss_clip": 0.0106899, + "auxiliary_loss_mlp": 0.01026762, + "balance_loss_clip": 1.03398132, + "balance_loss_mlp": 1.01388144, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 1.8751226223034878, + "language_loss": 0.80008286, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.82104039, + "num_input_tokens_seen": 297345025, + "step": 13781, + "time_per_iteration": 2.528364896774292 + }, + { + "auxiliary_loss_clip": 0.01067541, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.03739548, + "balance_loss_mlp": 1.01798522, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 2.2807397526582034, + "language_loss": 0.75560558, + "learning_rate": 3.001810941346543e-07, + "loss": 0.77659667, + "num_input_tokens_seen": 297363570, + "step": 13782, + "time_per_iteration": 2.591473340988159 + }, + { + "auxiliary_loss_clip": 0.01095589, + "auxiliary_loss_mlp": 0.01029896, + "balance_loss_clip": 1.03399587, + "balance_loss_mlp": 1.01746869, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 1.5765762306141573, + "language_loss": 0.76074219, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78199708, + "num_input_tokens_seen": 297385385, + "step": 13783, + "time_per_iteration": 2.5166752338409424 + }, + { + "auxiliary_loss_clip": 0.01108271, + "auxiliary_loss_mlp": 0.01026411, + "balance_loss_clip": 1.03616858, + "balance_loss_mlp": 1.01445436, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 1.6892940256071627, + "language_loss": 0.7362535, + "learning_rate": 2.997707859351304e-07, + "loss": 0.75760031, + "num_input_tokens_seen": 297403950, + "step": 13784, + "time_per_iteration": 2.4469854831695557 + }, + { + "auxiliary_loss_clip": 0.01100575, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.03519416, + "balance_loss_mlp": 1.02130461, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 1.5677494679200796, + "language_loss": 0.69803464, + "learning_rate": 2.99565728540772e-07, + "loss": 0.71938616, + "num_input_tokens_seen": 297424565, + "step": 13785, + "time_per_iteration": 4.176361322402954 + }, + { + "auxiliary_loss_clip": 0.01091167, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.04395497, + "balance_loss_mlp": 1.02135587, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 1.6321861799703985, + "language_loss": 0.68524927, + "learning_rate": 2.993607356270516e-07, + "loss": 0.70649958, + "num_input_tokens_seen": 297445180, + "step": 13786, + "time_per_iteration": 2.549801826477051 + }, + { + "auxiliary_loss_clip": 0.01070358, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.03771913, + "balance_loss_mlp": 1.02156711, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 1.8916238287921594, + "language_loss": 0.76879728, + "learning_rate": 2.991558072017426e-07, + "loss": 0.78984118, + "num_input_tokens_seen": 297463790, + "step": 13787, + "time_per_iteration": 2.5776147842407227 + }, + { + "auxiliary_loss_clip": 0.01091823, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.03601861, + "balance_loss_mlp": 1.0207845, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 1.724722878134112, + "language_loss": 0.80591357, + "learning_rate": 2.989509432726163e-07, + "loss": 0.82715529, + "num_input_tokens_seen": 297480100, + "step": 13788, + "time_per_iteration": 2.439352035522461 + }, + { + "auxiliary_loss_clip": 0.0108131, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.03546631, + "balance_loss_mlp": 1.01763964, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 16.20283189987216, + "language_loss": 0.71503878, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73614758, + "num_input_tokens_seen": 297499890, + "step": 13789, + "time_per_iteration": 2.59932279586792 + }, + { + "auxiliary_loss_clip": 0.01078286, + "auxiliary_loss_mlp": 0.01029267, + "balance_loss_clip": 1.03218985, + "balance_loss_mlp": 1.01673818, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 2.0740928840984596, + "language_loss": 0.67720902, + "learning_rate": 2.985414089339813e-07, + "loss": 0.69828463, + "num_input_tokens_seen": 297521440, + "step": 13790, + "time_per_iteration": 2.673841953277588 + }, + { + "auxiliary_loss_clip": 0.01098443, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.0350883, + "balance_loss_mlp": 1.01946092, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 1.5590514643809463, + "language_loss": 0.77311158, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.79442072, + "num_input_tokens_seen": 297539920, + "step": 13791, + "time_per_iteration": 2.4704079627990723 + }, + { + "auxiliary_loss_clip": 0.0108559, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.03709483, + "balance_loss_mlp": 1.01767206, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 1.429463504966266, + "language_loss": 0.69858998, + "learning_rate": 2.981321326732651e-07, + "loss": 0.71975613, + "num_input_tokens_seen": 297560000, + "step": 13792, + "time_per_iteration": 2.5423901081085205 + }, + { + "auxiliary_loss_clip": 0.01088745, + "auxiliary_loss_mlp": 0.01035476, + "balance_loss_clip": 1.03654337, + "balance_loss_mlp": 1.02246416, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 1.5758373259033782, + "language_loss": 0.64961004, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.67085224, + "num_input_tokens_seen": 297579300, + "step": 13793, + "time_per_iteration": 2.550651788711548 + }, + { + "auxiliary_loss_clip": 0.01055833, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.03276491, + "balance_loss_mlp": 1.02561915, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 2.2733529617157577, + "language_loss": 0.66695315, + "learning_rate": 2.977231145525461e-07, + "loss": 0.6879108, + "num_input_tokens_seen": 297598095, + "step": 13794, + "time_per_iteration": 2.590306043624878 + }, + { + "auxiliary_loss_clip": 0.01106732, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.03420186, + "balance_loss_mlp": 1.02148366, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 2.05423025372221, + "language_loss": 0.66184342, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68325454, + "num_input_tokens_seen": 297615955, + "step": 13795, + "time_per_iteration": 2.4486279487609863 + }, + { + "auxiliary_loss_clip": 0.01018522, + "auxiliary_loss_mlp": 0.01041161, + "balance_loss_clip": 1.03059375, + "balance_loss_mlp": 1.02714753, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 1.84423026250646, + "language_loss": 0.66700602, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68760288, + "num_input_tokens_seen": 297636285, + "step": 13796, + "time_per_iteration": 2.9133341312408447 + }, + { + "auxiliary_loss_clip": 0.01062258, + "auxiliary_loss_mlp": 0.01038822, + "balance_loss_clip": 1.03195381, + "balance_loss_mlp": 1.02502334, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 1.5698315284393902, + "language_loss": 0.71800733, + "learning_rate": 2.971100715196666e-07, + "loss": 0.73901808, + "num_input_tokens_seen": 297653315, + "step": 13797, + "time_per_iteration": 3.0633487701416016 + }, + { + "auxiliary_loss_clip": 0.01049594, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.04176152, + "balance_loss_mlp": 1.0197531, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 3.525119847033456, + "language_loss": 0.72286177, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74366713, + "num_input_tokens_seen": 297673480, + "step": 13798, + "time_per_iteration": 2.740540027618408 + }, + { + "auxiliary_loss_clip": 0.01070188, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.03053153, + "balance_loss_mlp": 1.01928234, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 1.7370143518889167, + "language_loss": 0.76187432, + "learning_rate": 2.967016990202822e-07, + "loss": 0.7828905, + "num_input_tokens_seen": 297693250, + "step": 13799, + "time_per_iteration": 2.913956642150879 + }, + { + "auxiliary_loss_clip": 0.01109335, + "auxiliary_loss_mlp": 0.01037732, + "balance_loss_clip": 1.03761387, + "balance_loss_mlp": 1.02534652, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 2.0159988293453, + "language_loss": 0.67747951, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.69895017, + "num_input_tokens_seen": 297710975, + "step": 13800, + "time_per_iteration": 2.528920888900757 + }, + { + "auxiliary_loss_clip": 0.01078445, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.04107809, + "balance_loss_mlp": 1.02001047, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 1.7695413340914832, + "language_loss": 0.7428624, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.763982, + "num_input_tokens_seen": 297730860, + "step": 13801, + "time_per_iteration": 4.260084390640259 + }, + { + "auxiliary_loss_clip": 0.01063099, + "auxiliary_loss_mlp": 0.01030208, + "balance_loss_clip": 1.03306961, + "balance_loss_mlp": 1.01894903, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 3.4301364227686078, + "language_loss": 0.73582977, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.75676286, + "num_input_tokens_seen": 297749765, + "step": 13802, + "time_per_iteration": 2.586644172668457 + }, + { + "auxiliary_loss_clip": 0.0108901, + "auxiliary_loss_mlp": 0.01030126, + "balance_loss_clip": 1.03415704, + "balance_loss_mlp": 1.01755595, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 1.5337576324788968, + "language_loss": 0.74558562, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.76677698, + "num_input_tokens_seen": 297770380, + "step": 13803, + "time_per_iteration": 2.5334713459014893 + }, + { + "auxiliary_loss_clip": 0.01098102, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.03751659, + "balance_loss_mlp": 1.02147067, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 1.6630904594832052, + "language_loss": 0.79076755, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81208384, + "num_input_tokens_seen": 297789440, + "step": 13804, + "time_per_iteration": 2.4690165519714355 + }, + { + "auxiliary_loss_clip": 0.01108494, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.03715885, + "balance_loss_mlp": 1.01933575, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 1.6781217726297089, + "language_loss": 0.73158586, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75297946, + "num_input_tokens_seen": 297810425, + "step": 13805, + "time_per_iteration": 2.496307134628296 + }, + { + "auxiliary_loss_clip": 0.01100828, + "auxiliary_loss_mlp": 0.007782, + "balance_loss_clip": 1.03716421, + "balance_loss_mlp": 1.0007031, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 2.036482372933214, + "language_loss": 0.77994215, + "learning_rate": 2.952744302396906e-07, + "loss": 0.7987324, + "num_input_tokens_seen": 297827680, + "step": 13806, + "time_per_iteration": 2.465487480163574 + }, + { + "auxiliary_loss_clip": 0.01101114, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.0369395, + "balance_loss_mlp": 1.01854205, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 1.7209922878403534, + "language_loss": 0.63203681, + "learning_rate": 2.950707932112444e-07, + "loss": 0.6533674, + "num_input_tokens_seen": 297848005, + "step": 13807, + "time_per_iteration": 2.4701428413391113 + }, + { + "auxiliary_loss_clip": 0.01096382, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.03831911, + "balance_loss_mlp": 1.01917529, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 2.387936490014387, + "language_loss": 0.73179352, + "learning_rate": 2.948672208338847e-07, + "loss": 0.75307268, + "num_input_tokens_seen": 297866730, + "step": 13808, + "time_per_iteration": 2.452131748199463 + }, + { + "auxiliary_loss_clip": 0.01092697, + "auxiliary_loss_mlp": 0.01040771, + "balance_loss_clip": 1.03735769, + "balance_loss_mlp": 1.02724028, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 1.9760443332939877, + "language_loss": 0.6656599, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.68699461, + "num_input_tokens_seen": 297886390, + "step": 13809, + "time_per_iteration": 2.565657138824463 + }, + { + "auxiliary_loss_clip": 0.01109825, + "auxiliary_loss_mlp": 0.01022575, + "balance_loss_clip": 1.03645766, + "balance_loss_mlp": 1.01089835, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 2.4185302132638085, + "language_loss": 0.74329966, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76462364, + "num_input_tokens_seen": 297905110, + "step": 13810, + "time_per_iteration": 3.8844504356384277 + }, + { + "auxiliary_loss_clip": 0.0107783, + "auxiliary_loss_mlp": 0.0103359, + "balance_loss_clip": 1.03736103, + "balance_loss_mlp": 1.02250957, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 1.5741919817575434, + "language_loss": 0.8113932, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83250737, + "num_input_tokens_seen": 297925460, + "step": 13811, + "time_per_iteration": 2.575895309448242 + }, + { + "auxiliary_loss_clip": 0.01076518, + "auxiliary_loss_mlp": 0.01040212, + "balance_loss_clip": 1.03509617, + "balance_loss_mlp": 1.02727795, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 1.8490528403170456, + "language_loss": 0.73274505, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.75391245, + "num_input_tokens_seen": 297941760, + "step": 13812, + "time_per_iteration": 2.526355743408203 + }, + { + "auxiliary_loss_clip": 0.01086388, + "auxiliary_loss_mlp": 0.01028, + "balance_loss_clip": 1.04144394, + "balance_loss_mlp": 1.01603782, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 1.6322674202312544, + "language_loss": 0.78218299, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.8033269, + "num_input_tokens_seen": 297959745, + "step": 13813, + "time_per_iteration": 2.5462653636932373 + }, + { + "auxiliary_loss_clip": 0.01052687, + "auxiliary_loss_mlp": 0.00777997, + "balance_loss_clip": 1.03250146, + "balance_loss_mlp": 1.0006367, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 2.77989965721843, + "language_loss": 0.71105701, + "learning_rate": 2.93647144674658e-07, + "loss": 0.72936386, + "num_input_tokens_seen": 297977665, + "step": 13814, + "time_per_iteration": 4.019163131713867 + }, + { + "auxiliary_loss_clip": 0.01117383, + "auxiliary_loss_mlp": 0.01040997, + "balance_loss_clip": 1.03802538, + "balance_loss_mlp": 1.02633429, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 2.4994666264175986, + "language_loss": 0.67664367, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.69822741, + "num_input_tokens_seen": 297993525, + "step": 13815, + "time_per_iteration": 2.510101795196533 + }, + { + "auxiliary_loss_clip": 0.01096109, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.03795409, + "balance_loss_mlp": 1.0187968, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 2.087929064827621, + "language_loss": 0.76163656, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.78291124, + "num_input_tokens_seen": 298012920, + "step": 13816, + "time_per_iteration": 2.4529147148132324 + }, + { + "auxiliary_loss_clip": 0.01073773, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.03231525, + "balance_loss_mlp": 1.02322018, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 1.703762702329428, + "language_loss": 0.81185454, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83293277, + "num_input_tokens_seen": 298033310, + "step": 13817, + "time_per_iteration": 2.5801196098327637 + }, + { + "auxiliary_loss_clip": 0.01101459, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.03928399, + "balance_loss_mlp": 1.02386105, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 1.5979681326134343, + "language_loss": 0.78298473, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80436814, + "num_input_tokens_seen": 298053530, + "step": 13818, + "time_per_iteration": 2.494306802749634 + }, + { + "auxiliary_loss_clip": 0.01092914, + "auxiliary_loss_mlp": 0.01035914, + "balance_loss_clip": 1.03803539, + "balance_loss_mlp": 1.02370143, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 1.746406537453368, + "language_loss": 0.81944889, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84073716, + "num_input_tokens_seen": 298069305, + "step": 13819, + "time_per_iteration": 2.4899537563323975 + }, + { + "auxiliary_loss_clip": 0.01021174, + "auxiliary_loss_mlp": 0.01003942, + "balance_loss_clip": 1.00785375, + "balance_loss_mlp": 1.00285137, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.762998669019061, + "language_loss": 0.56209272, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58234394, + "num_input_tokens_seen": 298125830, + "step": 13820, + "time_per_iteration": 3.0713765621185303 + }, + { + "auxiliary_loss_clip": 0.01095625, + "auxiliary_loss_mlp": 0.01025156, + "balance_loss_clip": 1.03471267, + "balance_loss_mlp": 1.01319969, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 1.8487111034244506, + "language_loss": 0.68297064, + "learning_rate": 2.922266666860831e-07, + "loss": 0.70417839, + "num_input_tokens_seen": 298142320, + "step": 13821, + "time_per_iteration": 2.4330828189849854 + }, + { + "auxiliary_loss_clip": 0.01047972, + "auxiliary_loss_mlp": 0.01033796, + "balance_loss_clip": 1.03163528, + "balance_loss_mlp": 1.02075481, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 1.7887222120552846, + "language_loss": 0.68981397, + "learning_rate": 2.920240002333625e-07, + "loss": 0.71063161, + "num_input_tokens_seen": 298161845, + "step": 13822, + "time_per_iteration": 2.6628665924072266 + }, + { + "auxiliary_loss_clip": 0.0106092, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.03527451, + "balance_loss_mlp": 1.02265048, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 1.9185395459330172, + "language_loss": 0.6239205, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64487058, + "num_input_tokens_seen": 298184165, + "step": 13823, + "time_per_iteration": 2.6435248851776123 + }, + { + "auxiliary_loss_clip": 0.01011151, + "auxiliary_loss_mlp": 0.00999713, + "balance_loss_clip": 1.00670147, + "balance_loss_mlp": 0.99856228, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 0.859198036028798, + "language_loss": 0.61993623, + "learning_rate": 2.916188616354669e-07, + "loss": 0.64004493, + "num_input_tokens_seen": 298251720, + "step": 13824, + "time_per_iteration": 4.693988800048828 + }, + { + "auxiliary_loss_clip": 0.01109207, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.03744149, + "balance_loss_mlp": 1.02174926, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 1.6036646839708593, + "language_loss": 0.73807567, + "learning_rate": 2.914163895056552e-07, + "loss": 0.75950712, + "num_input_tokens_seen": 298271910, + "step": 13825, + "time_per_iteration": 2.4501824378967285 + }, + { + "auxiliary_loss_clip": 0.01062976, + "auxiliary_loss_mlp": 0.00778086, + "balance_loss_clip": 1.03550434, + "balance_loss_mlp": 1.00063586, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 2.2893061620935637, + "language_loss": 0.80592251, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.82433313, + "num_input_tokens_seen": 298288105, + "step": 13826, + "time_per_iteration": 2.539867877960205 + }, + { + "auxiliary_loss_clip": 0.01109307, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.03702235, + "balance_loss_mlp": 1.01780558, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 1.8107071668561567, + "language_loss": 0.6786077, + "learning_rate": 2.910116396226914e-07, + "loss": 0.70000803, + "num_input_tokens_seen": 298307600, + "step": 13827, + "time_per_iteration": 2.461042881011963 + }, + { + "auxiliary_loss_clip": 0.01098223, + "auxiliary_loss_mlp": 0.01027431, + "balance_loss_clip": 1.03551936, + "balance_loss_mlp": 1.01647568, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 1.9701702674574844, + "language_loss": 0.74013567, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.76139224, + "num_input_tokens_seen": 298323055, + "step": 13828, + "time_per_iteration": 2.440355062484741 + }, + { + "auxiliary_loss_clip": 0.01076521, + "auxiliary_loss_mlp": 0.01037462, + "balance_loss_clip": 1.03407383, + "balance_loss_mlp": 1.02496314, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 1.5832822848402281, + "language_loss": 0.6735419, + "learning_rate": 2.906071489597657e-07, + "loss": 0.69468176, + "num_input_tokens_seen": 298346950, + "step": 13829, + "time_per_iteration": 2.7413811683654785 + }, + { + "auxiliary_loss_clip": 0.01092032, + "auxiliary_loss_mlp": 0.01028101, + "balance_loss_clip": 1.0424211, + "balance_loss_mlp": 1.01507783, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 1.5132331080959929, + "language_loss": 0.82738847, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.84858978, + "num_input_tokens_seen": 298366315, + "step": 13830, + "time_per_iteration": 2.549471616744995 + }, + { + "auxiliary_loss_clip": 0.01097149, + "auxiliary_loss_mlp": 0.01033177, + "balance_loss_clip": 1.03566813, + "balance_loss_mlp": 1.02082658, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 2.1955192518608415, + "language_loss": 0.73784912, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.75915241, + "num_input_tokens_seen": 298385185, + "step": 13831, + "time_per_iteration": 2.448683977127075 + }, + { + "auxiliary_loss_clip": 0.0111069, + "auxiliary_loss_mlp": 0.0103588, + "balance_loss_clip": 1.03803396, + "balance_loss_mlp": 1.02252221, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 1.690824880289708, + "language_loss": 0.71287596, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73434162, + "num_input_tokens_seen": 298402335, + "step": 13832, + "time_per_iteration": 2.4185192584991455 + }, + { + "auxiliary_loss_clip": 0.01085057, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.03747785, + "balance_loss_mlp": 1.01990628, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 1.6507300724761924, + "language_loss": 0.84293181, + "learning_rate": 2.897989455393979e-07, + "loss": 0.8641023, + "num_input_tokens_seen": 298423370, + "step": 13833, + "time_per_iteration": 2.537928342819214 + }, + { + "auxiliary_loss_clip": 0.01091819, + "auxiliary_loss_mlp": 0.01037131, + "balance_loss_clip": 1.03747213, + "balance_loss_mlp": 1.0240593, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 1.561700192382248, + "language_loss": 0.76442677, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.7857163, + "num_input_tokens_seen": 298444835, + "step": 13834, + "time_per_iteration": 2.5547704696655273 + }, + { + "auxiliary_loss_clip": 0.01104966, + "auxiliary_loss_mlp": 0.0077696, + "balance_loss_clip": 1.03494942, + "balance_loss_mlp": 1.00056362, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 2.3447848922192955, + "language_loss": 0.79780757, + "learning_rate": 2.893952329045459e-07, + "loss": 0.81662679, + "num_input_tokens_seen": 298461845, + "step": 13835, + "time_per_iteration": 2.4009859561920166 + }, + { + "auxiliary_loss_clip": 0.01103706, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.03848779, + "balance_loss_mlp": 1.0254215, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 2.170862022776341, + "language_loss": 0.80841142, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.82984412, + "num_input_tokens_seen": 298479095, + "step": 13836, + "time_per_iteration": 2.4357833862304688 + }, + { + "auxiliary_loss_clip": 0.01086945, + "auxiliary_loss_mlp": 0.01029068, + "balance_loss_clip": 1.03741288, + "balance_loss_mlp": 1.0176177, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 1.8517140241949217, + "language_loss": 0.77413416, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.79529428, + "num_input_tokens_seen": 298494475, + "step": 13837, + "time_per_iteration": 2.4666953086853027 + }, + { + "auxiliary_loss_clip": 0.01112728, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.03628993, + "balance_loss_mlp": 1.01877856, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 1.760911238266722, + "language_loss": 0.83512092, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85657269, + "num_input_tokens_seen": 298513185, + "step": 13838, + "time_per_iteration": 2.423919439315796 + }, + { + "auxiliary_loss_clip": 0.01081935, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.03174973, + "balance_loss_mlp": 1.02314508, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 2.1354179130050412, + "language_loss": 0.74309826, + "learning_rate": 2.885885860916795e-07, + "loss": 0.76428479, + "num_input_tokens_seen": 298531885, + "step": 13839, + "time_per_iteration": 2.4882242679595947 + }, + { + "auxiliary_loss_clip": 0.01095009, + "auxiliary_loss_mlp": 0.01034093, + "balance_loss_clip": 1.03387594, + "balance_loss_mlp": 1.02142656, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 1.8610286837882004, + "language_loss": 0.67817891, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.69946992, + "num_input_tokens_seen": 298554905, + "step": 13840, + "time_per_iteration": 2.555046796798706 + }, + { + "auxiliary_loss_clip": 0.01056343, + "auxiliary_loss_mlp": 0.01039663, + "balance_loss_clip": 1.03234935, + "balance_loss_mlp": 1.02572191, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 2.0516303402865423, + "language_loss": 0.79359043, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81455052, + "num_input_tokens_seen": 298571185, + "step": 13841, + "time_per_iteration": 3.9672658443450928 + }, + { + "auxiliary_loss_clip": 0.01068361, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.04333985, + "balance_loss_mlp": 1.01725698, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 1.7623275184605915, + "language_loss": 0.68488431, + "learning_rate": 2.879842823726262e-07, + "loss": 0.7058664, + "num_input_tokens_seen": 298588505, + "step": 13842, + "time_per_iteration": 2.6163551807403564 + }, + { + "auxiliary_loss_clip": 0.01089212, + "auxiliary_loss_mlp": 0.01032408, + "balance_loss_clip": 1.03649127, + "balance_loss_mlp": 1.01958704, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 2.7222330468230247, + "language_loss": 0.73148227, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75269842, + "num_input_tokens_seen": 298609295, + "step": 13843, + "time_per_iteration": 2.5753414630889893 + }, + { + "auxiliary_loss_clip": 0.01077948, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.03581846, + "balance_loss_mlp": 1.01655889, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 2.2243056613668037, + "language_loss": 0.78001237, + "learning_rate": 2.875817378128975e-07, + "loss": 0.80108142, + "num_input_tokens_seen": 298625765, + "step": 13844, + "time_per_iteration": 2.523460865020752 + }, + { + "auxiliary_loss_clip": 0.0101176, + "auxiliary_loss_mlp": 0.010034, + "balance_loss_clip": 1.00921965, + "balance_loss_mlp": 1.00215459, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.7772292954681222, + "language_loss": 0.55250186, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57265347, + "num_input_tokens_seen": 298683005, + "step": 13845, + "time_per_iteration": 2.994393825531006 + }, + { + "auxiliary_loss_clip": 0.01102333, + "auxiliary_loss_mlp": 0.01047473, + "balance_loss_clip": 1.03775048, + "balance_loss_mlp": 1.03445554, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 2.2748164011785175, + "language_loss": 0.75562847, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77712655, + "num_input_tokens_seen": 298703060, + "step": 13846, + "time_per_iteration": 2.545133352279663 + }, + { + "auxiliary_loss_clip": 0.01058273, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.0315938, + "balance_loss_mlp": 1.01913404, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 1.636843414348723, + "language_loss": 0.78947443, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.81039798, + "num_input_tokens_seen": 298721765, + "step": 13847, + "time_per_iteration": 2.6114749908447266 + }, + { + "auxiliary_loss_clip": 0.01058563, + "auxiliary_loss_mlp": 0.01025893, + "balance_loss_clip": 1.04179227, + "balance_loss_mlp": 1.01488388, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 1.6779308079226922, + "language_loss": 0.74450016, + "learning_rate": 2.867774279753175e-07, + "loss": 0.76534474, + "num_input_tokens_seen": 298740825, + "step": 13848, + "time_per_iteration": 2.6379454135894775 + }, + { + "auxiliary_loss_clip": 0.0109868, + "auxiliary_loss_mlp": 0.01025876, + "balance_loss_clip": 1.03753877, + "balance_loss_mlp": 1.01394963, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 1.8165994926931985, + "language_loss": 0.63130188, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.65254736, + "num_input_tokens_seen": 298758515, + "step": 13849, + "time_per_iteration": 3.933964729309082 + }, + { + "auxiliary_loss_clip": 0.01086238, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.03197563, + "balance_loss_mlp": 1.02031887, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 4.194066369344647, + "language_loss": 0.79378688, + "learning_rate": 2.863756628194638e-07, + "loss": 0.81497967, + "num_input_tokens_seen": 298776375, + "step": 13850, + "time_per_iteration": 2.6053900718688965 + }, + { + "auxiliary_loss_clip": 0.01066468, + "auxiliary_loss_mlp": 0.01032789, + "balance_loss_clip": 1.03178334, + "balance_loss_mlp": 1.02213717, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 1.5597922335172816, + "language_loss": 0.7850852, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80607772, + "num_input_tokens_seen": 298795135, + "step": 13851, + "time_per_iteration": 2.586702346801758 + }, + { + "auxiliary_loss_clip": 0.01021347, + "auxiliary_loss_mlp": 0.01001378, + "balance_loss_clip": 1.00764656, + "balance_loss_mlp": 1.00006068, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.7584589595376564, + "language_loss": 0.55832326, + "learning_rate": 2.859741575868344e-07, + "loss": 0.57855058, + "num_input_tokens_seen": 298855475, + "step": 13852, + "time_per_iteration": 3.049415349960327 + }, + { + "auxiliary_loss_clip": 0.01097188, + "auxiliary_loss_mlp": 0.01027821, + "balance_loss_clip": 1.03602254, + "balance_loss_mlp": 1.01557231, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 1.6646130138972695, + "language_loss": 0.66998136, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69123149, + "num_input_tokens_seen": 298875875, + "step": 13853, + "time_per_iteration": 4.105189561843872 + }, + { + "auxiliary_loss_clip": 0.01084747, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.03772974, + "balance_loss_mlp": 1.02119851, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 1.5489627193403013, + "language_loss": 0.784235, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80542207, + "num_input_tokens_seen": 298895950, + "step": 13854, + "time_per_iteration": 2.5347275733947754 + }, + { + "auxiliary_loss_clip": 0.01027471, + "auxiliary_loss_mlp": 0.00999004, + "balance_loss_clip": 1.00421631, + "balance_loss_mlp": 0.99775201, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.749909046398112, + "language_loss": 0.58641988, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60668463, + "num_input_tokens_seen": 298955770, + "step": 13855, + "time_per_iteration": 2.8859167098999023 + }, + { + "auxiliary_loss_clip": 0.01096134, + "auxiliary_loss_mlp": 0.01025463, + "balance_loss_clip": 1.03555727, + "balance_loss_mlp": 1.01321447, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 2.1620515706099717, + "language_loss": 0.72002667, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.74124265, + "num_input_tokens_seen": 298976545, + "step": 13856, + "time_per_iteration": 2.458904981613159 + }, + { + "auxiliary_loss_clip": 0.01099455, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.03667569, + "balance_loss_mlp": 1.02001977, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 1.683433063913931, + "language_loss": 0.75414264, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77545953, + "num_input_tokens_seen": 298996750, + "step": 13857, + "time_per_iteration": 2.5116350650787354 + }, + { + "auxiliary_loss_clip": 0.01058917, + "auxiliary_loss_mlp": 0.01025797, + "balance_loss_clip": 1.0358181, + "balance_loss_mlp": 1.01525903, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 1.8441091100975762, + "language_loss": 0.73538637, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75623357, + "num_input_tokens_seen": 299014895, + "step": 13858, + "time_per_iteration": 2.5747883319854736 + }, + { + "auxiliary_loss_clip": 0.01111858, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.03590989, + "balance_loss_mlp": 1.0221349, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 1.699990191532843, + "language_loss": 0.73534524, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75681472, + "num_input_tokens_seen": 299032855, + "step": 13859, + "time_per_iteration": 2.399690866470337 + }, + { + "auxiliary_loss_clip": 0.01094295, + "auxiliary_loss_mlp": 0.01030148, + "balance_loss_clip": 1.03634405, + "balance_loss_mlp": 1.01853657, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 2.021007951070084, + "language_loss": 0.79242229, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.81366676, + "num_input_tokens_seen": 299052055, + "step": 13860, + "time_per_iteration": 2.4877376556396484 + }, + { + "auxiliary_loss_clip": 0.01040751, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.0386095, + "balance_loss_mlp": 1.01871598, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 1.4192854716852195, + "language_loss": 0.82343554, + "learning_rate": 2.841706022218644e-07, + "loss": 0.84414595, + "num_input_tokens_seen": 299075285, + "step": 13861, + "time_per_iteration": 2.824028253555298 + }, + { + "auxiliary_loss_clip": 0.01111156, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.0384078, + "balance_loss_mlp": 1.01943588, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 1.9201175566810458, + "language_loss": 0.79011953, + "learning_rate": 2.839705324021806e-07, + "loss": 0.8115496, + "num_input_tokens_seen": 299092520, + "step": 13862, + "time_per_iteration": 2.7318239212036133 + }, + { + "auxiliary_loss_clip": 0.01099769, + "auxiliary_loss_mlp": 0.01037584, + "balance_loss_clip": 1.03543723, + "balance_loss_mlp": 1.02539444, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 1.815204191509605, + "language_loss": 0.75132763, + "learning_rate": 2.83770527654505e-07, + "loss": 0.77270114, + "num_input_tokens_seen": 299109450, + "step": 13863, + "time_per_iteration": 2.4822850227355957 + }, + { + "auxiliary_loss_clip": 0.0105013, + "auxiliary_loss_mlp": 0.00779698, + "balance_loss_clip": 1.03104496, + "balance_loss_mlp": 1.00067043, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 2.673310927597113, + "language_loss": 0.75446939, + "learning_rate": 2.835705879864232e-07, + "loss": 0.77276772, + "num_input_tokens_seen": 299129540, + "step": 13864, + "time_per_iteration": 4.126272201538086 + }, + { + "auxiliary_loss_clip": 0.01085441, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.03475761, + "balance_loss_mlp": 1.0209918, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 1.8007499603950556, + "language_loss": 0.69455618, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71575063, + "num_input_tokens_seen": 299148670, + "step": 13865, + "time_per_iteration": 2.5393006801605225 + }, + { + "auxiliary_loss_clip": 0.01098964, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.03744733, + "balance_loss_mlp": 1.02027535, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 1.9325943979570086, + "language_loss": 0.75287157, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77418506, + "num_input_tokens_seen": 299169330, + "step": 13866, + "time_per_iteration": 2.6153154373168945 + }, + { + "auxiliary_loss_clip": 0.01009501, + "auxiliary_loss_mlp": 0.01006404, + "balance_loss_clip": 1.00725794, + "balance_loss_mlp": 1.00494945, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.8945489711408118, + "language_loss": 0.63095117, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65111029, + "num_input_tokens_seen": 299220980, + "step": 13867, + "time_per_iteration": 2.989569902420044 + }, + { + "auxiliary_loss_clip": 0.01085267, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.03754938, + "balance_loss_mlp": 1.02224827, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 1.6514215523058189, + "language_loss": 0.72318029, + "learning_rate": 2.827714802616301e-07, + "loss": 0.74436939, + "num_input_tokens_seen": 299240130, + "step": 13868, + "time_per_iteration": 2.537585496902466 + }, + { + "auxiliary_loss_clip": 0.01089329, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.04359412, + "balance_loss_mlp": 1.01904869, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 1.406928302174282, + "language_loss": 0.80472314, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82592738, + "num_input_tokens_seen": 299260705, + "step": 13869, + "time_per_iteration": 2.566117286682129 + }, + { + "auxiliary_loss_clip": 0.01100393, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.03831649, + "balance_loss_mlp": 1.02115464, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 1.5258025032833242, + "language_loss": 0.82767481, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84900796, + "num_input_tokens_seen": 299278925, + "step": 13870, + "time_per_iteration": 2.4917447566986084 + }, + { + "auxiliary_loss_clip": 0.0107971, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.0362947, + "balance_loss_mlp": 1.01916313, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 2.6456547844559832, + "language_loss": 0.70589387, + "learning_rate": 2.821728331750264e-07, + "loss": 0.72701156, + "num_input_tokens_seen": 299291580, + "step": 13871, + "time_per_iteration": 2.4490814208984375 + }, + { + "auxiliary_loss_clip": 0.01095457, + "auxiliary_loss_mlp": 0.01034047, + "balance_loss_clip": 1.03634036, + "balance_loss_mlp": 1.02185202, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 1.7795540284007259, + "language_loss": 0.69149667, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.71279174, + "num_input_tokens_seen": 299310385, + "step": 13872, + "time_per_iteration": 2.4719300270080566 + }, + { + "auxiliary_loss_clip": 0.01087129, + "auxiliary_loss_mlp": 0.01026634, + "balance_loss_clip": 1.0365479, + "balance_loss_mlp": 1.01480258, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 1.762699713002024, + "language_loss": 0.73176032, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75289798, + "num_input_tokens_seen": 299327660, + "step": 13873, + "time_per_iteration": 2.5161521434783936 + }, + { + "auxiliary_loss_clip": 0.01087757, + "auxiliary_loss_mlp": 0.01034525, + "balance_loss_clip": 1.0360775, + "balance_loss_mlp": 1.02020228, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 2.275804393592588, + "language_loss": 0.75463432, + "learning_rate": 2.81574772350013e-07, + "loss": 0.77585715, + "num_input_tokens_seen": 299343685, + "step": 13874, + "time_per_iteration": 2.4762487411499023 + }, + { + "auxiliary_loss_clip": 0.01082162, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.03599799, + "balance_loss_mlp": 1.01732874, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 1.9142783052940273, + "language_loss": 0.6595484, + "learning_rate": 2.813755490573118e-07, + "loss": 0.6806618, + "num_input_tokens_seen": 299363305, + "step": 13875, + "time_per_iteration": 2.5860579013824463 + }, + { + "auxiliary_loss_clip": 0.01066681, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.03373933, + "balance_loss_mlp": 1.02283168, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 1.905565197711128, + "language_loss": 0.79807365, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.81908721, + "num_input_tokens_seen": 299382630, + "step": 13876, + "time_per_iteration": 2.5799429416656494 + }, + { + "auxiliary_loss_clip": 0.01093348, + "auxiliary_loss_mlp": 0.0103979, + "balance_loss_clip": 1.03414392, + "balance_loss_mlp": 1.02627146, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 2.059746855669268, + "language_loss": 0.87301904, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89435041, + "num_input_tokens_seen": 299402385, + "step": 13877, + "time_per_iteration": 2.501877546310425 + }, + { + "auxiliary_loss_clip": 0.01066048, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.03283906, + "balance_loss_mlp": 1.01910329, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 2.0611107273378493, + "language_loss": 0.69440854, + "learning_rate": 2.807782702318828e-07, + "loss": 0.71537757, + "num_input_tokens_seen": 299419820, + "step": 13878, + "time_per_iteration": 2.512448310852051 + }, + { + "auxiliary_loss_clip": 0.0108648, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.0366689, + "balance_loss_mlp": 1.01793909, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 1.8063252243055516, + "language_loss": 0.79258358, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81374562, + "num_input_tokens_seen": 299436265, + "step": 13879, + "time_per_iteration": 2.492436408996582 + }, + { + "auxiliary_loss_clip": 0.01063792, + "auxiliary_loss_mlp": 0.01030236, + "balance_loss_clip": 1.04001391, + "balance_loss_mlp": 1.01926279, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 2.2410461510765405, + "language_loss": 0.83538872, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85632896, + "num_input_tokens_seen": 299451660, + "step": 13880, + "time_per_iteration": 3.9859752655029297 + }, + { + "auxiliary_loss_clip": 0.01090803, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.03660893, + "balance_loss_mlp": 1.01721251, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 2.880919506975409, + "language_loss": 0.78064632, + "learning_rate": 2.80181578143982e-07, + "loss": 0.8018468, + "num_input_tokens_seen": 299472070, + "step": 13881, + "time_per_iteration": 2.545332670211792 + }, + { + "auxiliary_loss_clip": 0.01068666, + "auxiliary_loss_mlp": 0.01027481, + "balance_loss_clip": 1.03773665, + "balance_loss_mlp": 1.01660264, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 2.618883632159943, + "language_loss": 0.78115749, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.8021189, + "num_input_tokens_seen": 299486725, + "step": 13882, + "time_per_iteration": 2.5532052516937256 + }, + { + "auxiliary_loss_clip": 0.01072252, + "auxiliary_loss_mlp": 0.01052134, + "balance_loss_clip": 1.03248024, + "balance_loss_mlp": 1.03844929, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 1.80796689963432, + "language_loss": 0.80280232, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82404625, + "num_input_tokens_seen": 299505435, + "step": 13883, + "time_per_iteration": 2.566401481628418 + }, + { + "auxiliary_loss_clip": 0.01096108, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.03512836, + "balance_loss_mlp": 1.02033305, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 1.9877244824246594, + "language_loss": 0.73990369, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76119757, + "num_input_tokens_seen": 299523555, + "step": 13884, + "time_per_iteration": 2.4552125930786133 + }, + { + "auxiliary_loss_clip": 0.01096535, + "auxiliary_loss_mlp": 0.01038343, + "balance_loss_clip": 1.04003382, + "balance_loss_mlp": 1.02445495, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 1.6958670503647397, + "language_loss": 0.70364535, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72499406, + "num_input_tokens_seen": 299541660, + "step": 13885, + "time_per_iteration": 2.558746814727783 + }, + { + "auxiliary_loss_clip": 0.0107945, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.03746486, + "balance_loss_mlp": 1.01919484, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 1.7296258456204905, + "language_loss": 0.70342791, + "learning_rate": 2.791883957449912e-07, + "loss": 0.72454011, + "num_input_tokens_seen": 299562465, + "step": 13886, + "time_per_iteration": 2.6687378883361816 + }, + { + "auxiliary_loss_clip": 0.01072862, + "auxiliary_loss_mlp": 0.01033071, + "balance_loss_clip": 1.03369021, + "balance_loss_mlp": 1.02024364, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 1.5243100037985147, + "language_loss": 0.78989089, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81095022, + "num_input_tokens_seen": 299582700, + "step": 13887, + "time_per_iteration": 2.5763447284698486 + }, + { + "auxiliary_loss_clip": 0.0109212, + "auxiliary_loss_mlp": 0.00777535, + "balance_loss_clip": 1.03903246, + "balance_loss_mlp": 1.00068688, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 6.78512408378387, + "language_loss": 0.64564043, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.66433704, + "num_input_tokens_seen": 299600310, + "step": 13888, + "time_per_iteration": 2.557758331298828 + }, + { + "auxiliary_loss_clip": 0.01088594, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.03663158, + "balance_loss_mlp": 1.0166831, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 4.179770108777466, + "language_loss": 0.67289639, + "learning_rate": 2.785932692855244e-07, + "loss": 0.69407022, + "num_input_tokens_seen": 299617025, + "step": 13889, + "time_per_iteration": 4.051962614059448 + }, + { + "auxiliary_loss_clip": 0.01089604, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.03184974, + "balance_loss_mlp": 1.01661599, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 1.814917131846773, + "language_loss": 0.68498802, + "learning_rate": 2.783950243408399e-07, + "loss": 0.7061705, + "num_input_tokens_seen": 299633050, + "step": 13890, + "time_per_iteration": 2.489074945449829 + }, + { + "auxiliary_loss_clip": 0.01088523, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.037112, + "balance_loss_mlp": 1.02273297, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 2.464048980884234, + "language_loss": 0.59404492, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.61528313, + "num_input_tokens_seen": 299646445, + "step": 13891, + "time_per_iteration": 2.465569019317627 + }, + { + "auxiliary_loss_clip": 0.01098655, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.03684783, + "balance_loss_mlp": 1.01850975, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 1.72991061298617, + "language_loss": 0.71738136, + "learning_rate": 2.779987303092846e-07, + "loss": 0.73867035, + "num_input_tokens_seen": 299662665, + "step": 13892, + "time_per_iteration": 3.7519516944885254 + }, + { + "auxiliary_loss_clip": 0.01105324, + "auxiliary_loss_mlp": 0.01027582, + "balance_loss_clip": 1.03532434, + "balance_loss_mlp": 1.01564884, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 3.534164544663106, + "language_loss": 0.65842491, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.67975402, + "num_input_tokens_seen": 299683585, + "step": 13893, + "time_per_iteration": 2.4595186710357666 + }, + { + "auxiliary_loss_clip": 0.0107991, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.03371358, + "balance_loss_mlp": 1.01540101, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 2.3865932629634403, + "language_loss": 0.78097427, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.80204797, + "num_input_tokens_seen": 299702680, + "step": 13894, + "time_per_iteration": 2.4642817974090576 + }, + { + "auxiliary_loss_clip": 0.01091261, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.03588688, + "balance_loss_mlp": 1.01922381, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 1.622967996308795, + "language_loss": 0.72812647, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74935102, + "num_input_tokens_seen": 299721050, + "step": 13895, + "time_per_iteration": 2.458012104034424 + }, + { + "auxiliary_loss_clip": 0.01095089, + "auxiliary_loss_mlp": 0.01041449, + "balance_loss_clip": 1.03562617, + "balance_loss_mlp": 1.02794266, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 1.9386666844089255, + "language_loss": 0.71940601, + "learning_rate": 2.772069258877667e-07, + "loss": 0.74077129, + "num_input_tokens_seen": 299738255, + "step": 13896, + "time_per_iteration": 2.4388060569763184 + }, + { + "auxiliary_loss_clip": 0.01095383, + "auxiliary_loss_mlp": 0.01028356, + "balance_loss_clip": 1.03417659, + "balance_loss_mlp": 1.01628613, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 2.3673990148816917, + "language_loss": 0.59025383, + "learning_rate": 2.770091380848423e-07, + "loss": 0.6114912, + "num_input_tokens_seen": 299761315, + "step": 13897, + "time_per_iteration": 2.713930368423462 + }, + { + "auxiliary_loss_clip": 0.01027934, + "auxiliary_loss_mlp": 0.00753071, + "balance_loss_clip": 1.00457191, + "balance_loss_mlp": 1.00029778, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.7004685181699802, + "language_loss": 0.57687938, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59468949, + "num_input_tokens_seen": 299828735, + "step": 13898, + "time_per_iteration": 3.0682668685913086 + }, + { + "auxiliary_loss_clip": 0.0109482, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.03681374, + "balance_loss_mlp": 1.02178752, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 2.2227831576808352, + "language_loss": 0.80332601, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.82462496, + "num_input_tokens_seen": 299848395, + "step": 13899, + "time_per_iteration": 2.454169988632202 + }, + { + "auxiliary_loss_clip": 0.01109455, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.03675675, + "balance_loss_mlp": 1.01905465, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 1.7735951389622917, + "language_loss": 0.68895924, + "learning_rate": 2.764161667219749e-07, + "loss": 0.71035838, + "num_input_tokens_seen": 299871665, + "step": 13900, + "time_per_iteration": 2.621676445007324 + }, + { + "auxiliary_loss_clip": 0.01086314, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.03725123, + "balance_loss_mlp": 1.01781619, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 1.5428266267571373, + "language_loss": 0.71318352, + "learning_rate": 2.762186403079716e-07, + "loss": 0.73434126, + "num_input_tokens_seen": 299891960, + "step": 13901, + "time_per_iteration": 2.560500144958496 + }, + { + "auxiliary_loss_clip": 0.01065718, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.03204298, + "balance_loss_mlp": 1.02425766, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 2.647444391975119, + "language_loss": 0.79962498, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82065368, + "num_input_tokens_seen": 299905070, + "step": 13902, + "time_per_iteration": 2.531040906906128 + }, + { + "auxiliary_loss_clip": 0.01094094, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.03353763, + "balance_loss_mlp": 1.02096105, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 2.1503037809109276, + "language_loss": 0.62625659, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64752609, + "num_input_tokens_seen": 299925130, + "step": 13903, + "time_per_iteration": 2.4528965950012207 + }, + { + "auxiliary_loss_clip": 0.01085381, + "auxiliary_loss_mlp": 0.0103701, + "balance_loss_clip": 1.03420866, + "balance_loss_mlp": 1.02390242, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 1.8742757178055989, + "language_loss": 0.74165875, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.76288265, + "num_input_tokens_seen": 299943845, + "step": 13904, + "time_per_iteration": 4.09048056602478 + }, + { + "auxiliary_loss_clip": 0.01082334, + "auxiliary_loss_mlp": 0.01030387, + "balance_loss_clip": 1.0331229, + "balance_loss_mlp": 1.01785254, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 1.8154340955944854, + "language_loss": 0.7249372, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.74606442, + "num_input_tokens_seen": 299961620, + "step": 13905, + "time_per_iteration": 2.492879629135132 + }, + { + "auxiliary_loss_clip": 0.01097149, + "auxiliary_loss_mlp": 0.01040484, + "balance_loss_clip": 1.03682256, + "balance_loss_mlp": 1.02961814, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 1.6940068900581635, + "language_loss": 0.66374147, + "learning_rate": 2.752319888771e-07, + "loss": 0.68511778, + "num_input_tokens_seen": 299982170, + "step": 13906, + "time_per_iteration": 2.474208354949951 + }, + { + "auxiliary_loss_clip": 0.0109714, + "auxiliary_loss_mlp": 0.01028074, + "balance_loss_clip": 1.036116, + "balance_loss_mlp": 1.01580787, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 1.5709197404710828, + "language_loss": 0.7421481, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76340026, + "num_input_tokens_seen": 300001330, + "step": 13907, + "time_per_iteration": 2.475522994995117 + }, + { + "auxiliary_loss_clip": 0.01077777, + "auxiliary_loss_mlp": 0.01035862, + "balance_loss_clip": 1.03498352, + "balance_loss_mlp": 1.02382803, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 1.7800308041111446, + "language_loss": 0.74984258, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.77097893, + "num_input_tokens_seen": 300020645, + "step": 13908, + "time_per_iteration": 2.579158306121826 + }, + { + "auxiliary_loss_clip": 0.01099474, + "auxiliary_loss_mlp": 0.0102954, + "balance_loss_clip": 1.0365026, + "balance_loss_mlp": 1.01665914, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 7.220623926861039, + "language_loss": 0.71832502, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.7396152, + "num_input_tokens_seen": 300039945, + "step": 13909, + "time_per_iteration": 2.5131568908691406 + }, + { + "auxiliary_loss_clip": 0.01111438, + "auxiliary_loss_mlp": 0.00778676, + "balance_loss_clip": 1.0369122, + "balance_loss_mlp": 1.00072384, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 2.100211938327038, + "language_loss": 0.73744071, + "learning_rate": 2.744438449482338e-07, + "loss": 0.75634193, + "num_input_tokens_seen": 300058260, + "step": 13910, + "time_per_iteration": 2.4549734592437744 + }, + { + "auxiliary_loss_clip": 0.0109507, + "auxiliary_loss_mlp": 0.00780887, + "balance_loss_clip": 1.03447127, + "balance_loss_mlp": 1.00066864, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 1.765308351708916, + "language_loss": 0.73279774, + "learning_rate": 2.742469725305001e-07, + "loss": 0.75155735, + "num_input_tokens_seen": 300076720, + "step": 13911, + "time_per_iteration": 2.497239112854004 + }, + { + "auxiliary_loss_clip": 0.01093141, + "auxiliary_loss_mlp": 0.01037816, + "balance_loss_clip": 1.03809369, + "balance_loss_mlp": 1.02552605, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 1.9860508816288431, + "language_loss": 0.79004693, + "learning_rate": 2.740501655534946e-07, + "loss": 0.81135654, + "num_input_tokens_seen": 300092950, + "step": 13912, + "time_per_iteration": 2.537838935852051 + }, + { + "auxiliary_loss_clip": 0.01100476, + "auxiliary_loss_mlp": 0.0103308, + "balance_loss_clip": 1.03710747, + "balance_loss_mlp": 1.02159429, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 1.6708602212322212, + "language_loss": 0.78690654, + "learning_rate": 2.738534240246797e-07, + "loss": 0.8082422, + "num_input_tokens_seen": 300110950, + "step": 13913, + "time_per_iteration": 2.528087854385376 + }, + { + "auxiliary_loss_clip": 0.010996, + "auxiliary_loss_mlp": 0.01032789, + "balance_loss_clip": 1.03699684, + "balance_loss_mlp": 1.02005172, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 2.1412846144004205, + "language_loss": 0.73592097, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75724483, + "num_input_tokens_seen": 300128705, + "step": 13914, + "time_per_iteration": 2.489499807357788 + }, + { + "auxiliary_loss_clip": 0.01066258, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.0407207, + "balance_loss_mlp": 1.02094579, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 1.710666716715013, + "language_loss": 0.71489203, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73588997, + "num_input_tokens_seen": 300148635, + "step": 13915, + "time_per_iteration": 2.6021220684051514 + }, + { + "auxiliary_loss_clip": 0.0107989, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.03743291, + "balance_loss_mlp": 1.0180881, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 1.9157025549176958, + "language_loss": 0.72169411, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.74278903, + "num_input_tokens_seen": 300165490, + "step": 13916, + "time_per_iteration": 2.5606160163879395 + }, + { + "auxiliary_loss_clip": 0.0107773, + "auxiliary_loss_mlp": 0.00777824, + "balance_loss_clip": 1.03662622, + "balance_loss_mlp": 1.0005672, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 1.859450557361339, + "language_loss": 0.7492699, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.76782548, + "num_input_tokens_seen": 300182130, + "step": 13917, + "time_per_iteration": 2.5021910667419434 + }, + { + "auxiliary_loss_clip": 0.01105295, + "auxiliary_loss_mlp": 0.01033448, + "balance_loss_clip": 1.03754008, + "balance_loss_mlp": 1.02184319, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 1.5208608219302215, + "language_loss": 0.78834713, + "learning_rate": 2.728706983644933e-07, + "loss": 0.80973452, + "num_input_tokens_seen": 300203050, + "step": 13918, + "time_per_iteration": 2.456256628036499 + }, + { + "auxiliary_loss_clip": 0.01068936, + "auxiliary_loss_mlp": 0.01035727, + "balance_loss_clip": 1.03894591, + "balance_loss_mlp": 1.02355528, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 1.7647888729619956, + "language_loss": 0.68029743, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70134401, + "num_input_tokens_seen": 300224380, + "step": 13919, + "time_per_iteration": 4.151904106140137 + }, + { + "auxiliary_loss_clip": 0.01091255, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.03241491, + "balance_loss_mlp": 1.02215767, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 2.2766935761185043, + "language_loss": 0.73706913, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.75833488, + "num_input_tokens_seen": 300242915, + "step": 13920, + "time_per_iteration": 2.471489667892456 + }, + { + "auxiliary_loss_clip": 0.01088742, + "auxiliary_loss_mlp": 0.01031801, + "balance_loss_clip": 1.03391922, + "balance_loss_mlp": 1.01953459, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 1.9999810490768826, + "language_loss": 0.69188058, + "learning_rate": 2.722818488237566e-07, + "loss": 0.71308601, + "num_input_tokens_seen": 300261905, + "step": 13921, + "time_per_iteration": 2.4981658458709717 + }, + { + "auxiliary_loss_clip": 0.01101145, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.03595185, + "balance_loss_mlp": 1.02032769, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 1.888613090763298, + "language_loss": 0.85670209, + "learning_rate": 2.720856966640801e-07, + "loss": 0.87803811, + "num_input_tokens_seen": 300281145, + "step": 13922, + "time_per_iteration": 2.462218999862671 + }, + { + "auxiliary_loss_clip": 0.01068088, + "auxiliary_loss_mlp": 0.00776249, + "balance_loss_clip": 1.03537631, + "balance_loss_mlp": 1.00059819, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 1.6597070182269842, + "language_loss": 0.71905595, + "learning_rate": 2.71889610027088e-07, + "loss": 0.7374993, + "num_input_tokens_seen": 300301610, + "step": 13923, + "time_per_iteration": 2.5705161094665527 + }, + { + "auxiliary_loss_clip": 0.01087095, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.04055524, + "balance_loss_mlp": 1.01782417, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 2.603863087865776, + "language_loss": 0.76029968, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.7814821, + "num_input_tokens_seen": 300319420, + "step": 13924, + "time_per_iteration": 2.5907649993896484 + }, + { + "auxiliary_loss_clip": 0.01084719, + "auxiliary_loss_mlp": 0.01027646, + "balance_loss_clip": 1.03320777, + "balance_loss_mlp": 1.01571298, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 1.5512846753666705, + "language_loss": 0.64417499, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.66529858, + "num_input_tokens_seen": 300341325, + "step": 13925, + "time_per_iteration": 2.5619847774505615 + }, + { + "auxiliary_loss_clip": 0.0109093, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.04139853, + "balance_loss_mlp": 1.01847982, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 1.575412325638463, + "language_loss": 0.74492705, + "learning_rate": 2.713017433265543e-07, + "loss": 0.76614082, + "num_input_tokens_seen": 300361620, + "step": 13926, + "time_per_iteration": 2.592374086380005 + }, + { + "auxiliary_loss_clip": 0.01099589, + "auxiliary_loss_mlp": 0.0103657, + "balance_loss_clip": 1.03828442, + "balance_loss_mlp": 1.02346277, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 2.926534894295139, + "language_loss": 0.71107596, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73243755, + "num_input_tokens_seen": 300378675, + "step": 13927, + "time_per_iteration": 2.4946141242980957 + }, + { + "auxiliary_loss_clip": 0.01000754, + "auxiliary_loss_mlp": 0.01001221, + "balance_loss_clip": 1.00641847, + "balance_loss_mlp": 0.99998158, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 0.7029296850260259, + "language_loss": 0.58779252, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.60781229, + "num_input_tokens_seen": 300449740, + "step": 13928, + "time_per_iteration": 4.706389427185059 + }, + { + "auxiliary_loss_clip": 0.01072437, + "auxiliary_loss_mlp": 0.0103837, + "balance_loss_clip": 1.03625762, + "balance_loss_mlp": 1.02455354, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 1.6884805346037572, + "language_loss": 0.69755989, + "learning_rate": 2.707144665977068e-07, + "loss": 0.71866792, + "num_input_tokens_seen": 300470000, + "step": 13929, + "time_per_iteration": 2.5409350395202637 + }, + { + "auxiliary_loss_clip": 0.01098376, + "auxiliary_loss_mlp": 0.0102543, + "balance_loss_clip": 1.03550839, + "balance_loss_mlp": 1.01251972, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 1.7628965405300783, + "language_loss": 0.67502332, + "learning_rate": 2.705188388275574e-07, + "loss": 0.69626141, + "num_input_tokens_seen": 300494975, + "step": 13930, + "time_per_iteration": 2.668339967727661 + }, + { + "auxiliary_loss_clip": 0.0107082, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.04129779, + "balance_loss_mlp": 1.01639044, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 1.6450851806026585, + "language_loss": 0.71505904, + "learning_rate": 2.703232766395067e-07, + "loss": 0.73605639, + "num_input_tokens_seen": 300513175, + "step": 13931, + "time_per_iteration": 4.047189474105835 + }, + { + "auxiliary_loss_clip": 0.01075629, + "auxiliary_loss_mlp": 0.01032208, + "balance_loss_clip": 1.03376639, + "balance_loss_mlp": 1.01969695, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 1.5420866506714057, + "language_loss": 0.717067, + "learning_rate": 2.701277800409705e-07, + "loss": 0.73814535, + "num_input_tokens_seen": 300533770, + "step": 13932, + "time_per_iteration": 2.5511035919189453 + }, + { + "auxiliary_loss_clip": 0.01047243, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.0355891, + "balance_loss_mlp": 1.02156687, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 2.9072729519798415, + "language_loss": 0.66644204, + "learning_rate": 2.699323490393628e-07, + "loss": 0.68723869, + "num_input_tokens_seen": 300552995, + "step": 13933, + "time_per_iteration": 2.7394607067108154 + }, + { + "auxiliary_loss_clip": 0.01079365, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.03457189, + "balance_loss_mlp": 1.02965939, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 1.8699815898518284, + "language_loss": 0.76520145, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78641486, + "num_input_tokens_seen": 300570275, + "step": 13934, + "time_per_iteration": 2.4987316131591797 + }, + { + "auxiliary_loss_clip": 0.01101039, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.04139352, + "balance_loss_mlp": 1.01766288, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 1.7942257020017127, + "language_loss": 0.77556717, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79687643, + "num_input_tokens_seen": 300590875, + "step": 13935, + "time_per_iteration": 2.5042166709899902 + }, + { + "auxiliary_loss_clip": 0.0106793, + "auxiliary_loss_mlp": 0.01029811, + "balance_loss_clip": 1.03638411, + "balance_loss_mlp": 1.017658, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 4.302437997883048, + "language_loss": 0.56016064, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.58113807, + "num_input_tokens_seen": 300607490, + "step": 13936, + "time_per_iteration": 2.481079578399658 + }, + { + "auxiliary_loss_clip": 0.01092626, + "auxiliary_loss_mlp": 0.01037337, + "balance_loss_clip": 1.03297281, + "balance_loss_mlp": 1.02500534, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 1.752250237574584, + "language_loss": 0.89576244, + "learning_rate": 2.691512811503882e-07, + "loss": 0.91706204, + "num_input_tokens_seen": 300623635, + "step": 13937, + "time_per_iteration": 2.4372947216033936 + }, + { + "auxiliary_loss_clip": 0.01101352, + "auxiliary_loss_mlp": 0.01029, + "balance_loss_clip": 1.03807783, + "balance_loss_mlp": 1.01691222, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 2.193080886321526, + "language_loss": 0.81616092, + "learning_rate": 2.689561782445313e-07, + "loss": 0.83746445, + "num_input_tokens_seen": 300643835, + "step": 13938, + "time_per_iteration": 2.491910219192505 + }, + { + "auxiliary_loss_clip": 0.01103384, + "auxiliary_loss_mlp": 0.01034274, + "balance_loss_clip": 1.03865933, + "balance_loss_mlp": 1.02098799, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 1.6381522494124252, + "language_loss": 0.70510334, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.72647989, + "num_input_tokens_seen": 300662500, + "step": 13939, + "time_per_iteration": 2.4396235942840576 + }, + { + "auxiliary_loss_clip": 0.01078947, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.03578055, + "balance_loss_mlp": 1.02252936, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 1.6940400150451957, + "language_loss": 0.76071286, + "learning_rate": 2.6856616936428e-07, + "loss": 0.78186095, + "num_input_tokens_seen": 300681480, + "step": 13940, + "time_per_iteration": 2.5680794715881348 + }, + { + "auxiliary_loss_clip": 0.01092928, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.03389609, + "balance_loss_mlp": 1.02022672, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 1.6985971188168065, + "language_loss": 0.76243794, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.78369164, + "num_input_tokens_seen": 300699165, + "step": 13941, + "time_per_iteration": 2.4592087268829346 + }, + { + "auxiliary_loss_clip": 0.01069571, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.03496122, + "balance_loss_mlp": 1.01806903, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 3.287892130106497, + "language_loss": 0.73053557, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75153822, + "num_input_tokens_seen": 300714615, + "step": 13942, + "time_per_iteration": 4.067942380905151 + }, + { + "auxiliary_loss_clip": 0.01069508, + "auxiliary_loss_mlp": 0.01036645, + "balance_loss_clip": 1.03681934, + "balance_loss_mlp": 1.02257848, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 1.5738326986035118, + "language_loss": 0.79803348, + "learning_rate": 2.679816484834554e-07, + "loss": 0.81909502, + "num_input_tokens_seen": 300734860, + "step": 13943, + "time_per_iteration": 2.595604419708252 + }, + { + "auxiliary_loss_clip": 0.01062742, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.03543091, + "balance_loss_mlp": 1.01868749, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 1.9738376743304904, + "language_loss": 0.85128558, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.8722226, + "num_input_tokens_seen": 300752735, + "step": 13944, + "time_per_iteration": 2.549778938293457 + }, + { + "auxiliary_loss_clip": 0.01009231, + "auxiliary_loss_mlp": 0.00753126, + "balance_loss_clip": 1.00440478, + "balance_loss_mlp": 1.00013065, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.6402596480110013, + "language_loss": 0.50281167, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52043527, + "num_input_tokens_seen": 300820760, + "step": 13945, + "time_per_iteration": 3.1510844230651855 + }, + { + "auxiliary_loss_clip": 0.01067434, + "auxiliary_loss_mlp": 0.01028492, + "balance_loss_clip": 1.03774607, + "balance_loss_mlp": 1.01664281, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 3.297518495371827, + "language_loss": 0.65042841, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67138767, + "num_input_tokens_seen": 300840025, + "step": 13946, + "time_per_iteration": 2.5827555656433105 + }, + { + "auxiliary_loss_clip": 0.01061974, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.03421795, + "balance_loss_mlp": 1.01939273, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 2.048604151175758, + "language_loss": 0.67448759, + "learning_rate": 2.672032068397829e-07, + "loss": 0.69543183, + "num_input_tokens_seen": 300860380, + "step": 13947, + "time_per_iteration": 2.6220273971557617 + }, + { + "auxiliary_loss_clip": 0.0108405, + "auxiliary_loss_mlp": 0.01032382, + "balance_loss_clip": 1.03531313, + "balance_loss_mlp": 1.01909566, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 1.4252733683039647, + "language_loss": 0.70104218, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.72220647, + "num_input_tokens_seen": 300881895, + "step": 13948, + "time_per_iteration": 2.6047492027282715 + }, + { + "auxiliary_loss_clip": 0.01081962, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.0343492, + "balance_loss_mlp": 1.01993942, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 2.357484151446276, + "language_loss": 0.8524, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.87352556, + "num_input_tokens_seen": 300901575, + "step": 13949, + "time_per_iteration": 2.5270352363586426 + }, + { + "auxiliary_loss_clip": 0.01082082, + "auxiliary_loss_mlp": 0.01030675, + "balance_loss_clip": 1.0378654, + "balance_loss_mlp": 1.01844394, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 1.9232677613734925, + "language_loss": 0.70322573, + "learning_rate": 2.66620065513385e-07, + "loss": 0.72435331, + "num_input_tokens_seen": 300919735, + "step": 13950, + "time_per_iteration": 2.4971957206726074 + }, + { + "auxiliary_loss_clip": 0.01092045, + "auxiliary_loss_mlp": 0.01028164, + "balance_loss_clip": 1.03621233, + "balance_loss_mlp": 1.0157845, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 2.0533457213879887, + "language_loss": 0.64748466, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.66868675, + "num_input_tokens_seen": 300939150, + "step": 13951, + "time_per_iteration": 2.4298603534698486 + }, + { + "auxiliary_loss_clip": 0.0109994, + "auxiliary_loss_mlp": 0.01032471, + "balance_loss_clip": 1.03843617, + "balance_loss_mlp": 1.0206871, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 1.5747277944813864, + "language_loss": 0.69789928, + "learning_rate": 2.662316332665393e-07, + "loss": 0.71922338, + "num_input_tokens_seen": 300959730, + "step": 13952, + "time_per_iteration": 2.5088987350463867 + }, + { + "auxiliary_loss_clip": 0.0109724, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.03798485, + "balance_loss_mlp": 1.01833272, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 1.8903177626063592, + "language_loss": 0.72780478, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.74907774, + "num_input_tokens_seen": 300976120, + "step": 13953, + "time_per_iteration": 2.4661026000976562 + }, + { + "auxiliary_loss_clip": 0.01039557, + "auxiliary_loss_mlp": 0.01033544, + "balance_loss_clip": 1.03148341, + "balance_loss_mlp": 1.01972198, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 1.9829224803211745, + "language_loss": 0.68305457, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70378554, + "num_input_tokens_seen": 300995080, + "step": 13954, + "time_per_iteration": 2.5668869018554688 + }, + { + "auxiliary_loss_clip": 0.01087747, + "auxiliary_loss_mlp": 0.01029015, + "balance_loss_clip": 1.037642, + "balance_loss_mlp": 1.01813734, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 1.8072603357345, + "language_loss": 0.73593736, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75710499, + "num_input_tokens_seen": 301012920, + "step": 13955, + "time_per_iteration": 2.457007884979248 + }, + { + "auxiliary_loss_clip": 0.01049548, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.03546453, + "balance_loss_mlp": 1.01874042, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 2.862220148620076, + "language_loss": 0.66481704, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.68562531, + "num_input_tokens_seen": 301028875, + "step": 13956, + "time_per_iteration": 2.605323076248169 + }, + { + "auxiliary_loss_clip": 0.01099296, + "auxiliary_loss_mlp": 0.01032809, + "balance_loss_clip": 1.03703952, + "balance_loss_mlp": 1.01979136, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 1.8827544015510025, + "language_loss": 0.79448736, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.81580842, + "num_input_tokens_seen": 301050115, + "step": 13957, + "time_per_iteration": 2.499605655670166 + }, + { + "auxiliary_loss_clip": 0.00992187, + "auxiliary_loss_mlp": 0.01005645, + "balance_loss_clip": 1.01927066, + "balance_loss_mlp": 1.0041312, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.7630257518311993, + "language_loss": 0.53371757, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55369586, + "num_input_tokens_seen": 301114155, + "step": 13958, + "time_per_iteration": 4.70576286315918 + }, + { + "auxiliary_loss_clip": 0.01096445, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.0362252, + "balance_loss_mlp": 1.02319384, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 1.7984873706056217, + "language_loss": 0.73067665, + "learning_rate": 2.648741917459574e-07, + "loss": 0.75200093, + "num_input_tokens_seen": 301133150, + "step": 13959, + "time_per_iteration": 2.433985710144043 + }, + { + "auxiliary_loss_clip": 0.01073286, + "auxiliary_loss_mlp": 0.010249, + "balance_loss_clip": 1.0363915, + "balance_loss_mlp": 1.01361656, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 2.633298112037376, + "language_loss": 0.55276674, + "learning_rate": 2.646805346545169e-07, + "loss": 0.57374859, + "num_input_tokens_seen": 301153600, + "step": 13960, + "time_per_iteration": 2.5738093852996826 + }, + { + "auxiliary_loss_clip": 0.01002248, + "auxiliary_loss_mlp": 0.00999861, + "balance_loss_clip": 1.00633597, + "balance_loss_mlp": 0.99858588, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 0.7688074851894849, + "language_loss": 0.60720682, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.6272279, + "num_input_tokens_seen": 301214335, + "step": 13961, + "time_per_iteration": 3.1592092514038086 + }, + { + "auxiliary_loss_clip": 0.01058357, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.03311813, + "balance_loss_mlp": 1.02665472, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 2.398010966916521, + "language_loss": 0.6817615, + "learning_rate": 2.642934178894405e-07, + "loss": 0.7027449, + "num_input_tokens_seen": 301228960, + "step": 13962, + "time_per_iteration": 2.5128841400146484 + }, + { + "auxiliary_loss_clip": 0.01078979, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.03591204, + "balance_loss_mlp": 1.01535153, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 1.9161571314895687, + "language_loss": 0.7330181, + "learning_rate": 2.640999582304841e-07, + "loss": 0.75407851, + "num_input_tokens_seen": 301245875, + "step": 13963, + "time_per_iteration": 2.4931082725524902 + }, + { + "auxiliary_loss_clip": 0.0108858, + "auxiliary_loss_mlp": 0.01037575, + "balance_loss_clip": 1.03621948, + "balance_loss_mlp": 1.02557695, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 1.5173403871673898, + "language_loss": 0.76377904, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78504062, + "num_input_tokens_seen": 301265550, + "step": 13964, + "time_per_iteration": 2.551651954650879 + }, + { + "auxiliary_loss_clip": 0.01090565, + "auxiliary_loss_mlp": 0.01038519, + "balance_loss_clip": 1.03740716, + "balance_loss_mlp": 1.02477419, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 2.266290937812173, + "language_loss": 0.78256971, + "learning_rate": 2.637132363964161e-07, + "loss": 0.80386055, + "num_input_tokens_seen": 301282035, + "step": 13965, + "time_per_iteration": 2.5224661827087402 + }, + { + "auxiliary_loss_clip": 0.01096312, + "auxiliary_loss_mlp": 0.01031108, + "balance_loss_clip": 1.03962123, + "balance_loss_mlp": 1.01964045, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 1.4790427729661468, + "language_loss": 0.65499747, + "learning_rate": 2.635199742359684e-07, + "loss": 0.67627168, + "num_input_tokens_seen": 301305210, + "step": 13966, + "time_per_iteration": 2.6232869625091553 + }, + { + "auxiliary_loss_clip": 0.01085888, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.03635824, + "balance_loss_mlp": 1.02161431, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 1.7592555363439473, + "language_loss": 0.74349403, + "learning_rate": 2.633267779230177e-07, + "loss": 0.76468801, + "num_input_tokens_seen": 301324885, + "step": 13967, + "time_per_iteration": 4.114130258560181 + }, + { + "auxiliary_loss_clip": 0.01088163, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.04283619, + "balance_loss_mlp": 1.02179229, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 2.129066993164891, + "language_loss": 0.83160973, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85283065, + "num_input_tokens_seen": 301343070, + "step": 13968, + "time_per_iteration": 2.4875710010528564 + }, + { + "auxiliary_loss_clip": 0.0108279, + "auxiliary_loss_mlp": 0.01030444, + "balance_loss_clip": 1.03595567, + "balance_loss_mlp": 1.0185647, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 2.4172785239643835, + "language_loss": 0.77654696, + "learning_rate": 2.629405828689075e-07, + "loss": 0.79767925, + "num_input_tokens_seen": 301359280, + "step": 13969, + "time_per_iteration": 2.436208486557007 + }, + { + "auxiliary_loss_clip": 0.01091785, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.03690565, + "balance_loss_mlp": 1.01775944, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 3.245966262296129, + "language_loss": 0.77691734, + "learning_rate": 2.627475841423923e-07, + "loss": 0.79813838, + "num_input_tokens_seen": 301376465, + "step": 13970, + "time_per_iteration": 2.4865102767944336 + }, + { + "auxiliary_loss_clip": 0.0108847, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.03687942, + "balance_loss_mlp": 1.02324069, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 1.8912276396370409, + "language_loss": 0.72074234, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74197966, + "num_input_tokens_seen": 301396000, + "step": 13971, + "time_per_iteration": 3.7006876468658447 + }, + { + "auxiliary_loss_clip": 0.01087828, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.03514242, + "balance_loss_mlp": 1.0191381, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 1.886858830467799, + "language_loss": 0.77871996, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79991734, + "num_input_tokens_seen": 301413160, + "step": 13972, + "time_per_iteration": 2.531372547149658 + }, + { + "auxiliary_loss_clip": 0.01042351, + "auxiliary_loss_mlp": 0.01038479, + "balance_loss_clip": 1.03067398, + "balance_loss_mlp": 1.02537143, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 1.4348669785966932, + "language_loss": 0.68507749, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70588577, + "num_input_tokens_seen": 301433325, + "step": 13973, + "time_per_iteration": 2.5811305046081543 + }, + { + "auxiliary_loss_clip": 0.0108964, + "auxiliary_loss_mlp": 0.01028122, + "balance_loss_clip": 1.03725266, + "balance_loss_mlp": 1.01573598, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 1.9947099114309723, + "language_loss": 0.7795859, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80076355, + "num_input_tokens_seen": 301450265, + "step": 13974, + "time_per_iteration": 2.4566972255706787 + }, + { + "auxiliary_loss_clip": 0.01094982, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.03790212, + "balance_loss_mlp": 1.01984406, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1.61149034854049, + "language_loss": 0.72625291, + "learning_rate": 2.617835788078868e-07, + "loss": 0.74752021, + "num_input_tokens_seen": 301470760, + "step": 13975, + "time_per_iteration": 2.4727439880371094 + }, + { + "auxiliary_loss_clip": 0.01090136, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.03674126, + "balance_loss_mlp": 1.01715696, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 1.653297035386954, + "language_loss": 0.72248816, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74368715, + "num_input_tokens_seen": 301489425, + "step": 13976, + "time_per_iteration": 2.4824187755584717 + }, + { + "auxiliary_loss_clip": 0.01107223, + "auxiliary_loss_mlp": 0.00776924, + "balance_loss_clip": 1.0360347, + "balance_loss_mlp": 1.00063586, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 1.9155720495639645, + "language_loss": 0.72004431, + "learning_rate": 2.61398438016311e-07, + "loss": 0.73888576, + "num_input_tokens_seen": 301508885, + "step": 13977, + "time_per_iteration": 2.424844980239868 + }, + { + "auxiliary_loss_clip": 0.01094125, + "auxiliary_loss_mlp": 0.01028704, + "balance_loss_clip": 1.0326798, + "balance_loss_mlp": 1.01714051, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 1.4370441686925473, + "language_loss": 0.68599278, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70722109, + "num_input_tokens_seen": 301533780, + "step": 13978, + "time_per_iteration": 2.5730576515197754 + }, + { + "auxiliary_loss_clip": 0.01074627, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.0344981, + "balance_loss_mlp": 1.01612973, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 1.800456776817963, + "language_loss": 0.78258848, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80361426, + "num_input_tokens_seen": 301551775, + "step": 13979, + "time_per_iteration": 2.494450807571411 + }, + { + "auxiliary_loss_clip": 0.01095032, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.03955364, + "balance_loss_mlp": 1.01730776, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 2.0968619931955126, + "language_loss": 0.77902079, + "learning_rate": 2.60821221306778e-07, + "loss": 0.8002646, + "num_input_tokens_seen": 301570495, + "step": 13980, + "time_per_iteration": 2.428741931915283 + }, + { + "auxiliary_loss_clip": 0.01073411, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.03533411, + "balance_loss_mlp": 1.0174737, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 1.7410794964844898, + "language_loss": 0.86721873, + "learning_rate": 2.606289476268757e-07, + "loss": 0.8882404, + "num_input_tokens_seen": 301591705, + "step": 13981, + "time_per_iteration": 2.576323986053467 + }, + { + "auxiliary_loss_clip": 0.01097559, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.03555727, + "balance_loss_mlp": 1.02408373, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 2.099226820695159, + "language_loss": 0.67901641, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.70035768, + "num_input_tokens_seen": 301611670, + "step": 13982, + "time_per_iteration": 3.888542413711548 + }, + { + "auxiliary_loss_clip": 0.01060679, + "auxiliary_loss_mlp": 0.0104137, + "balance_loss_clip": 1.0344758, + "balance_loss_mlp": 1.0274049, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 1.8806203927036864, + "language_loss": 0.68225706, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70327753, + "num_input_tokens_seen": 301632540, + "step": 13983, + "time_per_iteration": 2.615046262741089 + }, + { + "auxiliary_loss_clip": 0.0107166, + "auxiliary_loss_mlp": 0.01035408, + "balance_loss_clip": 1.03069866, + "balance_loss_mlp": 1.02259898, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 1.6764379988275224, + "language_loss": 0.78816944, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.8092401, + "num_input_tokens_seen": 301651480, + "step": 13984, + "time_per_iteration": 2.5699453353881836 + }, + { + "auxiliary_loss_clip": 0.01093689, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.03303742, + "balance_loss_mlp": 1.02075088, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 2.066306786314562, + "language_loss": 0.60079002, + "learning_rate": 2.598605125513842e-07, + "loss": 0.62205154, + "num_input_tokens_seen": 301670010, + "step": 13985, + "time_per_iteration": 2.4792191982269287 + }, + { + "auxiliary_loss_clip": 0.0107154, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.03542638, + "balance_loss_mlp": 1.01750231, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 1.5736591799738886, + "language_loss": 0.81886971, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.83988827, + "num_input_tokens_seen": 301689785, + "step": 13986, + "time_per_iteration": 2.58809494972229 + }, + { + "auxiliary_loss_clip": 0.01087068, + "auxiliary_loss_mlp": 0.00777475, + "balance_loss_clip": 1.03823495, + "balance_loss_mlp": 1.00057065, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 2.874647966988225, + "language_loss": 0.66002846, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.67867386, + "num_input_tokens_seen": 301712225, + "step": 13987, + "time_per_iteration": 2.580526113510132 + }, + { + "auxiliary_loss_clip": 0.0110911, + "auxiliary_loss_mlp": 0.00778399, + "balance_loss_clip": 1.03686893, + "balance_loss_mlp": 1.00073934, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 2.0066566247067246, + "language_loss": 0.67621231, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69508743, + "num_input_tokens_seen": 301730955, + "step": 13988, + "time_per_iteration": 2.4686763286590576 + }, + { + "auxiliary_loss_clip": 0.01100636, + "auxiliary_loss_mlp": 0.01040493, + "balance_loss_clip": 1.04139137, + "balance_loss_mlp": 1.02744567, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 2.1138573113609835, + "language_loss": 0.81350601, + "learning_rate": 2.590931332560622e-07, + "loss": 0.83491731, + "num_input_tokens_seen": 301746930, + "step": 13989, + "time_per_iteration": 2.429511308670044 + }, + { + "auxiliary_loss_clip": 0.01098423, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.0344758, + "balance_loss_mlp": 1.01793957, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 1.7470848689038594, + "language_loss": 0.75356287, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.77485394, + "num_input_tokens_seen": 301766945, + "step": 13990, + "time_per_iteration": 2.5178415775299072 + }, + { + "auxiliary_loss_clip": 0.01092985, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.03502858, + "balance_loss_mlp": 1.02469087, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 1.6576063261559586, + "language_loss": 0.80782926, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.82912958, + "num_input_tokens_seen": 301785460, + "step": 13991, + "time_per_iteration": 2.4633517265319824 + }, + { + "auxiliary_loss_clip": 0.01073165, + "auxiliary_loss_mlp": 0.01035063, + "balance_loss_clip": 1.03858304, + "balance_loss_mlp": 1.02332699, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 2.1632850919586035, + "language_loss": 0.71122795, + "learning_rate": 2.585182919204105e-07, + "loss": 0.73231024, + "num_input_tokens_seen": 301804180, + "step": 13992, + "time_per_iteration": 2.5399181842803955 + }, + { + "auxiliary_loss_clip": 0.01075956, + "auxiliary_loss_mlp": 0.01028888, + "balance_loss_clip": 1.03345776, + "balance_loss_mlp": 1.01677036, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 1.8116182070903128, + "language_loss": 0.76519525, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78624368, + "num_input_tokens_seen": 301823670, + "step": 13993, + "time_per_iteration": 2.547896146774292 + }, + { + "auxiliary_loss_clip": 0.01103692, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.03490889, + "balance_loss_mlp": 1.02154756, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 3.7006298674800235, + "language_loss": 0.74079251, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76219016, + "num_input_tokens_seen": 301845890, + "step": 13994, + "time_per_iteration": 2.4993386268615723 + }, + { + "auxiliary_loss_clip": 0.01094915, + "auxiliary_loss_mlp": 0.01030396, + "balance_loss_clip": 1.0343852, + "balance_loss_mlp": 1.01875496, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 1.5682088067836046, + "language_loss": 0.59256405, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.6138171, + "num_input_tokens_seen": 301863985, + "step": 13995, + "time_per_iteration": 2.4521284103393555 + }, + { + "auxiliary_loss_clip": 0.01096816, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.03593755, + "balance_loss_mlp": 1.01854658, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 1.6671081755475186, + "language_loss": 0.71876818, + "learning_rate": 2.577527613603163e-07, + "loss": 0.74005079, + "num_input_tokens_seen": 301882765, + "step": 13996, + "time_per_iteration": 2.475069522857666 + }, + { + "auxiliary_loss_clip": 0.01083279, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.0332222, + "balance_loss_mlp": 1.02231956, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 1.822541688639068, + "language_loss": 0.64517874, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66635448, + "num_input_tokens_seen": 301902720, + "step": 13997, + "time_per_iteration": 4.0147528648376465 + }, + { + "auxiliary_loss_clip": 0.01087918, + "auxiliary_loss_mlp": 0.01040033, + "balance_loss_clip": 1.03947926, + "balance_loss_mlp": 1.02655661, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 3.0129027227280507, + "language_loss": 0.82412732, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.84540677, + "num_input_tokens_seen": 301921245, + "step": 13998, + "time_per_iteration": 2.465674877166748 + }, + { + "auxiliary_loss_clip": 0.01102134, + "auxiliary_loss_mlp": 0.00777888, + "balance_loss_clip": 1.03991008, + "balance_loss_mlp": 1.00071824, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 2.2280961875037164, + "language_loss": 0.80320513, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.82200533, + "num_input_tokens_seen": 301942320, + "step": 13999, + "time_per_iteration": 2.4972922801971436 + }, + { + "auxiliary_loss_clip": 0.01099793, + "auxiliary_loss_mlp": 0.01034068, + "balance_loss_clip": 1.03557372, + "balance_loss_mlp": 1.02101493, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 1.994141220826222, + "language_loss": 0.67110777, + "learning_rate": 2.569882878592096e-07, + "loss": 0.69244635, + "num_input_tokens_seen": 301963110, + "step": 14000, + "time_per_iteration": 2.4870340824127197 + }, + { + "auxiliary_loss_clip": 0.01104442, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.03850067, + "balance_loss_mlp": 1.01524639, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 1.9297917356795096, + "language_loss": 0.79638374, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81771135, + "num_input_tokens_seen": 301984915, + "step": 14001, + "time_per_iteration": 2.4897708892822266 + }, + { + "auxiliary_loss_clip": 0.01048827, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.03664327, + "balance_loss_mlp": 1.01792669, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 2.3092277856242514, + "language_loss": 0.78711957, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80790067, + "num_input_tokens_seen": 302004095, + "step": 14002, + "time_per_iteration": 2.583247184753418 + }, + { + "auxiliary_loss_clip": 0.01063089, + "auxiliary_loss_mlp": 0.00777039, + "balance_loss_clip": 1.03520989, + "balance_loss_mlp": 1.00058746, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 1.4420050212203779, + "language_loss": 0.78352284, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.80192411, + "num_input_tokens_seen": 302027250, + "step": 14003, + "time_per_iteration": 2.637146472930908 + }, + { + "auxiliary_loss_clip": 0.01083896, + "auxiliary_loss_mlp": 0.01031573, + "balance_loss_clip": 1.03864312, + "balance_loss_mlp": 1.0193063, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 1.6122126305713944, + "language_loss": 0.65463495, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.67578965, + "num_input_tokens_seen": 302046950, + "step": 14004, + "time_per_iteration": 2.4717483520507812 + }, + { + "auxiliary_loss_clip": 0.01100175, + "auxiliary_loss_mlp": 0.01036275, + "balance_loss_clip": 1.03584516, + "balance_loss_mlp": 1.0227983, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 2.0448152682305296, + "language_loss": 0.7560724, + "learning_rate": 2.560341831785724e-07, + "loss": 0.77743685, + "num_input_tokens_seen": 302065470, + "step": 14005, + "time_per_iteration": 2.5214972496032715 + }, + { + "auxiliary_loss_clip": 0.01074446, + "auxiliary_loss_mlp": 0.00778594, + "balance_loss_clip": 1.03358698, + "balance_loss_mlp": 1.00068927, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 2.043034418889446, + "language_loss": 0.77814221, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79667264, + "num_input_tokens_seen": 302083190, + "step": 14006, + "time_per_iteration": 4.071217775344849 + }, + { + "auxiliary_loss_clip": 0.01097565, + "auxiliary_loss_mlp": 0.01036363, + "balance_loss_clip": 1.03569293, + "balance_loss_mlp": 1.02410209, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 1.88680463485615, + "language_loss": 0.76920485, + "learning_rate": 2.556530041751932e-07, + "loss": 0.79054415, + "num_input_tokens_seen": 302098820, + "step": 14007, + "time_per_iteration": 2.438549757003784 + }, + { + "auxiliary_loss_clip": 0.0108129, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.03474021, + "balance_loss_mlp": 1.0210073, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 1.8974489916232953, + "language_loss": 0.65751588, + "learning_rate": 2.554625138886102e-07, + "loss": 0.67866373, + "num_input_tokens_seen": 302117075, + "step": 14008, + "time_per_iteration": 2.5705068111419678 + }, + { + "auxiliary_loss_clip": 0.01020551, + "auxiliary_loss_mlp": 0.01005768, + "balance_loss_clip": 1.00712347, + "balance_loss_mlp": 1.00452185, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.7175353364337871, + "language_loss": 0.56944817, + "learning_rate": 2.552720897550631e-07, + "loss": 0.58971137, + "num_input_tokens_seen": 302179735, + "step": 14009, + "time_per_iteration": 3.1287195682525635 + }, + { + "auxiliary_loss_clip": 0.01041346, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.03233922, + "balance_loss_mlp": 1.02260852, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 1.2209594891217184, + "language_loss": 0.78061932, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80137098, + "num_input_tokens_seen": 302202055, + "step": 14010, + "time_per_iteration": 3.9984240531921387 + }, + { + "auxiliary_loss_clip": 0.01113616, + "auxiliary_loss_mlp": 0.0104115, + "balance_loss_clip": 1.03864646, + "balance_loss_mlp": 1.02769172, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 2.0153543988478697, + "language_loss": 0.7226665, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74421412, + "num_input_tokens_seen": 302221360, + "step": 14011, + "time_per_iteration": 2.410905122756958 + }, + { + "auxiliary_loss_clip": 0.01092889, + "auxiliary_loss_mlp": 0.01034997, + "balance_loss_clip": 1.03648043, + "balance_loss_mlp": 1.02323115, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 2.3164086868251044, + "language_loss": 0.84695268, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86823153, + "num_input_tokens_seen": 302240715, + "step": 14012, + "time_per_iteration": 2.504730701446533 + }, + { + "auxiliary_loss_clip": 0.01100133, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.03423452, + "balance_loss_mlp": 1.02220392, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 1.5247049713749121, + "language_loss": 0.67795229, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.69927895, + "num_input_tokens_seen": 302260950, + "step": 14013, + "time_per_iteration": 2.4901082515716553 + }, + { + "auxiliary_loss_clip": 0.01114, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.0379957, + "balance_loss_mlp": 1.01691055, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 6.102354973458748, + "language_loss": 0.78860676, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.81004357, + "num_input_tokens_seen": 302277500, + "step": 14014, + "time_per_iteration": 2.3754172325134277 + }, + { + "auxiliary_loss_clip": 0.01076995, + "auxiliary_loss_mlp": 0.00777218, + "balance_loss_clip": 1.03343725, + "balance_loss_mlp": 1.00067019, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 1.7864430686161974, + "language_loss": 0.67346478, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.69200695, + "num_input_tokens_seen": 302297930, + "step": 14015, + "time_per_iteration": 2.5474700927734375 + }, + { + "auxiliary_loss_clip": 0.01109611, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.03774667, + "balance_loss_mlp": 1.01685405, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 2.105334912289432, + "language_loss": 0.75778133, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.77917767, + "num_input_tokens_seen": 302315735, + "step": 14016, + "time_per_iteration": 2.3957409858703613 + }, + { + "auxiliary_loss_clip": 0.01086231, + "auxiliary_loss_mlp": 0.01032174, + "balance_loss_clip": 1.03481281, + "balance_loss_mlp": 1.01978803, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 1.9540219351527623, + "language_loss": 0.79669642, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81788051, + "num_input_tokens_seen": 302332790, + "step": 14017, + "time_per_iteration": 2.47308087348938 + }, + { + "auxiliary_loss_clip": 0.01088874, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.03924716, + "balance_loss_mlp": 1.01894307, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 2.1322884766655807, + "language_loss": 0.62912464, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.65032035, + "num_input_tokens_seen": 302346490, + "step": 14018, + "time_per_iteration": 2.496659755706787 + }, + { + "auxiliary_loss_clip": 0.01095041, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.03631628, + "balance_loss_mlp": 1.01968336, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 2.2515921243613004, + "language_loss": 0.79107857, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81234276, + "num_input_tokens_seen": 302363235, + "step": 14019, + "time_per_iteration": 2.453749656677246 + }, + { + "auxiliary_loss_clip": 0.01068737, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.03189671, + "balance_loss_mlp": 1.02766776, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 2.353572284435037, + "language_loss": 0.78551441, + "learning_rate": 2.531817924498265e-07, + "loss": 0.80662173, + "num_input_tokens_seen": 302383270, + "step": 14020, + "time_per_iteration": 2.5972585678100586 + }, + { + "auxiliary_loss_clip": 0.01098039, + "auxiliary_loss_mlp": 0.01026589, + "balance_loss_clip": 1.04012704, + "balance_loss_mlp": 1.01464975, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 1.641968106449965, + "language_loss": 0.71416986, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.73541611, + "num_input_tokens_seen": 302401355, + "step": 14021, + "time_per_iteration": 3.986010789871216 + }, + { + "auxiliary_loss_clip": 0.01079926, + "auxiliary_loss_mlp": 0.01039134, + "balance_loss_clip": 1.03553557, + "balance_loss_mlp": 1.02625346, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 1.9526000687560898, + "language_loss": 0.70075721, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.72194785, + "num_input_tokens_seen": 302419515, + "step": 14022, + "time_per_iteration": 2.5242271423339844 + }, + { + "auxiliary_loss_clip": 0.01057973, + "auxiliary_loss_mlp": 0.01037021, + "balance_loss_clip": 1.03702676, + "balance_loss_mlp": 1.02380085, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 1.8748907988283399, + "language_loss": 0.72311038, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74406028, + "num_input_tokens_seen": 302438280, + "step": 14023, + "time_per_iteration": 2.617586135864258 + }, + { + "auxiliary_loss_clip": 0.01098358, + "auxiliary_loss_mlp": 0.01037359, + "balance_loss_clip": 1.03750992, + "balance_loss_mlp": 1.02463388, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 1.7562748710484983, + "language_loss": 0.66692364, + "learning_rate": 2.524236710204559e-07, + "loss": 0.68828076, + "num_input_tokens_seen": 302460860, + "step": 14024, + "time_per_iteration": 2.5143699645996094 + }, + { + "auxiliary_loss_clip": 0.01093232, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.03566551, + "balance_loss_mlp": 1.02015662, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 1.895048918330767, + "language_loss": 0.80887938, + "learning_rate": 2.522343063158261e-07, + "loss": 0.83013916, + "num_input_tokens_seen": 302476980, + "step": 14025, + "time_per_iteration": 2.458869695663452 + }, + { + "auxiliary_loss_clip": 0.01093267, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.03562164, + "balance_loss_mlp": 1.02338362, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 1.4973492874256493, + "language_loss": 0.77898151, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.80025315, + "num_input_tokens_seen": 302496380, + "step": 14026, + "time_per_iteration": 2.477203845977783 + }, + { + "auxiliary_loss_clip": 0.01085565, + "auxiliary_loss_mlp": 0.01035966, + "balance_loss_clip": 1.03351688, + "balance_loss_mlp": 1.02330565, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 1.634723791676736, + "language_loss": 0.8305068, + "learning_rate": 2.518557757400945e-07, + "loss": 0.85172206, + "num_input_tokens_seen": 302516845, + "step": 14027, + "time_per_iteration": 2.528935194015503 + }, + { + "auxiliary_loss_clip": 0.01088247, + "auxiliary_loss_mlp": 0.01032327, + "balance_loss_clip": 1.04024541, + "balance_loss_mlp": 1.0205313, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 1.4154375863535857, + "language_loss": 0.56505913, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58626485, + "num_input_tokens_seen": 302538865, + "step": 14028, + "time_per_iteration": 2.6677587032318115 + }, + { + "auxiliary_loss_clip": 0.01083836, + "auxiliary_loss_mlp": 0.01027635, + "balance_loss_clip": 1.0342381, + "balance_loss_mlp": 1.01595855, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 2.012166977222852, + "language_loss": 0.64240915, + "learning_rate": 2.51477510323578e-07, + "loss": 0.66352385, + "num_input_tokens_seen": 302557970, + "step": 14029, + "time_per_iteration": 2.526191473007202 + }, + { + "auxiliary_loss_clip": 0.01105169, + "auxiliary_loss_mlp": 0.01030546, + "balance_loss_clip": 1.03730774, + "balance_loss_mlp": 1.01960874, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 1.5772154294175014, + "language_loss": 0.75151956, + "learning_rate": 2.51288477067956e-07, + "loss": 0.77287668, + "num_input_tokens_seen": 302578915, + "step": 14030, + "time_per_iteration": 2.4766950607299805 + }, + { + "auxiliary_loss_clip": 0.01088032, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.03678703, + "balance_loss_mlp": 1.02053976, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 1.7597279426427377, + "language_loss": 0.82972717, + "learning_rate": 2.510995101236502e-07, + "loss": 0.85093307, + "num_input_tokens_seen": 302596300, + "step": 14031, + "time_per_iteration": 2.5193655490875244 + }, + { + "auxiliary_loss_clip": 0.01085627, + "auxiliary_loss_mlp": 0.01027766, + "balance_loss_clip": 1.03497505, + "balance_loss_mlp": 1.01613152, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 1.7466874208165726, + "language_loss": 0.80395329, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82508725, + "num_input_tokens_seen": 302614975, + "step": 14032, + "time_per_iteration": 2.570891857147217 + }, + { + "auxiliary_loss_clip": 0.01072805, + "auxiliary_loss_mlp": 0.01035665, + "balance_loss_clip": 1.03189027, + "balance_loss_mlp": 1.02144337, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 1.8707235236349307, + "language_loss": 0.75307167, + "learning_rate": 2.507217751976478e-07, + "loss": 0.77415639, + "num_input_tokens_seen": 302636415, + "step": 14033, + "time_per_iteration": 2.5829343795776367 + }, + { + "auxiliary_loss_clip": 0.01070405, + "auxiliary_loss_mlp": 0.01035787, + "balance_loss_clip": 1.0334208, + "balance_loss_mlp": 1.0246166, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 1.8431265418315477, + "language_loss": 0.83447987, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85554183, + "num_input_tokens_seen": 302653605, + "step": 14034, + "time_per_iteration": 2.5450150966644287 + }, + { + "auxiliary_loss_clip": 0.01075295, + "auxiliary_loss_mlp": 0.01035433, + "balance_loss_clip": 1.03412414, + "balance_loss_mlp": 1.02143741, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 1.5400780703659709, + "language_loss": 0.78300571, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80411303, + "num_input_tokens_seen": 302673965, + "step": 14035, + "time_per_iteration": 2.6179473400115967 + }, + { + "auxiliary_loss_clip": 0.0109428, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.03410149, + "balance_loss_mlp": 1.0178287, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 1.3714251266666535, + "language_loss": 0.72145689, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74270111, + "num_input_tokens_seen": 302695560, + "step": 14036, + "time_per_iteration": 4.067933559417725 + }, + { + "auxiliary_loss_clip": 0.01104113, + "auxiliary_loss_mlp": 0.01028812, + "balance_loss_clip": 1.03689957, + "balance_loss_mlp": 1.01866102, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 1.7941054184531158, + "language_loss": 0.69792539, + "learning_rate": 2.49967101396557e-07, + "loss": 0.71925467, + "num_input_tokens_seen": 302713480, + "step": 14037, + "time_per_iteration": 2.464078426361084 + }, + { + "auxiliary_loss_clip": 0.01107529, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.03591526, + "balance_loss_mlp": 1.01815534, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 1.9353535597065148, + "language_loss": 0.6925621, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.7139374, + "num_input_tokens_seen": 302736860, + "step": 14038, + "time_per_iteration": 2.5292420387268066 + }, + { + "auxiliary_loss_clip": 0.01054436, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.03087664, + "balance_loss_mlp": 1.02665746, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 1.5398363779464543, + "language_loss": 0.76842403, + "learning_rate": 2.49590162635938e-07, + "loss": 0.78936481, + "num_input_tokens_seen": 302757745, + "step": 14039, + "time_per_iteration": 2.605436086654663 + }, + { + "auxiliary_loss_clip": 0.01114593, + "auxiliary_loss_mlp": 0.0103412, + "balance_loss_clip": 1.03859854, + "balance_loss_mlp": 1.02165687, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 2.474909954462368, + "language_loss": 0.79228079, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81376791, + "num_input_tokens_seen": 302774885, + "step": 14040, + "time_per_iteration": 2.4903244972229004 + }, + { + "auxiliary_loss_clip": 0.01078694, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_clip": 1.03698754, + "balance_loss_mlp": 1.02230644, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 2.4552381369490313, + "language_loss": 0.6947937, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71592796, + "num_input_tokens_seen": 302791035, + "step": 14041, + "time_per_iteration": 2.5675761699676514 + }, + { + "auxiliary_loss_clip": 0.01089921, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.03470564, + "balance_loss_mlp": 1.02593148, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 2.163352684491531, + "language_loss": 0.69247222, + "learning_rate": 2.490252523307341e-07, + "loss": 0.71375722, + "num_input_tokens_seen": 302808650, + "step": 14042, + "time_per_iteration": 2.4886796474456787 + }, + { + "auxiliary_loss_clip": 0.01085133, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.03659248, + "balance_loss_mlp": 1.01919818, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 1.7168670102644412, + "language_loss": 0.74843335, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.76959252, + "num_input_tokens_seen": 302824605, + "step": 14043, + "time_per_iteration": 2.504481792449951 + }, + { + "auxiliary_loss_clip": 0.01106152, + "auxiliary_loss_mlp": 0.00777836, + "balance_loss_clip": 1.03556168, + "balance_loss_mlp": 1.00059628, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 4.537171022441386, + "language_loss": 0.72039652, + "learning_rate": 2.486489774343865e-07, + "loss": 0.73923635, + "num_input_tokens_seen": 302840170, + "step": 14044, + "time_per_iteration": 2.4184811115264893 + }, + { + "auxiliary_loss_clip": 0.0108387, + "auxiliary_loss_mlp": 0.0102599, + "balance_loss_clip": 1.0347079, + "balance_loss_mlp": 1.01418841, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 1.600513887940703, + "language_loss": 0.74890125, + "learning_rate": 2.484609395997559e-07, + "loss": 0.7699998, + "num_input_tokens_seen": 302858320, + "step": 14045, + "time_per_iteration": 3.952432632446289 + }, + { + "auxiliary_loss_clip": 0.01087569, + "auxiliary_loss_mlp": 0.00777501, + "balance_loss_clip": 1.03334904, + "balance_loss_mlp": 1.00067139, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 2.033430249926969, + "language_loss": 0.78570652, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80435717, + "num_input_tokens_seen": 302875255, + "step": 14046, + "time_per_iteration": 2.4766671657562256 + }, + { + "auxiliary_loss_clip": 0.01082889, + "auxiliary_loss_mlp": 0.01037684, + "balance_loss_clip": 1.03417134, + "balance_loss_mlp": 1.02312875, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 2.2351728950640632, + "language_loss": 0.78477567, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.8059814, + "num_input_tokens_seen": 302894690, + "step": 14047, + "time_per_iteration": 2.4993793964385986 + }, + { + "auxiliary_loss_clip": 0.01090258, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.04270601, + "balance_loss_mlp": 1.01836467, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 3.33337776240551, + "language_loss": 0.72133064, + "learning_rate": 2.478972246355935e-07, + "loss": 0.74253839, + "num_input_tokens_seen": 302912405, + "step": 14048, + "time_per_iteration": 2.588521957397461 + }, + { + "auxiliary_loss_clip": 0.01031768, + "auxiliary_loss_mlp": 0.01033535, + "balance_loss_clip": 1.03855085, + "balance_loss_mlp": 1.02148843, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 1.5887359439108748, + "language_loss": 0.73397374, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75462675, + "num_input_tokens_seen": 302932525, + "step": 14049, + "time_per_iteration": 4.0513105392456055 + }, + { + "auxiliary_loss_clip": 0.01020508, + "auxiliary_loss_mlp": 0.00753337, + "balance_loss_clip": 1.00684118, + "balance_loss_mlp": 1.0003252, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 0.9385964546782417, + "language_loss": 0.60644335, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62418181, + "num_input_tokens_seen": 302991285, + "step": 14050, + "time_per_iteration": 3.2454371452331543 + }, + { + "auxiliary_loss_clip": 0.01083183, + "auxiliary_loss_mlp": 0.00782074, + "balance_loss_clip": 1.03195393, + "balance_loss_mlp": 1.00066972, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 2.5283149295971596, + "language_loss": 0.7285403, + "learning_rate": 2.473341076306303e-07, + "loss": 0.74719292, + "num_input_tokens_seen": 303009515, + "step": 14051, + "time_per_iteration": 2.568160057067871 + }, + { + "auxiliary_loss_clip": 0.01095917, + "auxiliary_loss_mlp": 0.01026155, + "balance_loss_clip": 1.03614128, + "balance_loss_mlp": 1.01480675, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 1.8371202029880456, + "language_loss": 0.74703473, + "learning_rate": 2.471465348753547e-07, + "loss": 0.76825547, + "num_input_tokens_seen": 303026905, + "step": 14052, + "time_per_iteration": 2.592542886734009 + }, + { + "auxiliary_loss_clip": 0.01078872, + "auxiliary_loss_mlp": 0.01027522, + "balance_loss_clip": 1.03750837, + "balance_loss_mlp": 1.01670337, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 1.6546206374826522, + "language_loss": 0.73844659, + "learning_rate": 2.469590285884575e-07, + "loss": 0.75951052, + "num_input_tokens_seen": 303045245, + "step": 14053, + "time_per_iteration": 2.511460781097412 + }, + { + "auxiliary_loss_clip": 0.01091802, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.03436184, + "balance_loss_mlp": 1.01731908, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 3.1258831239381712, + "language_loss": 0.7419436, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76315868, + "num_input_tokens_seen": 303065205, + "step": 14054, + "time_per_iteration": 2.4851882457733154 + }, + { + "auxiliary_loss_clip": 0.01103217, + "auxiliary_loss_mlp": 0.01031599, + "balance_loss_clip": 1.03730714, + "balance_loss_mlp": 1.01960683, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 1.6171425297026694, + "language_loss": 0.78535688, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.806705, + "num_input_tokens_seen": 303088250, + "step": 14055, + "time_per_iteration": 2.554638624191284 + }, + { + "auxiliary_loss_clip": 0.01095028, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.03526115, + "balance_loss_mlp": 1.0213517, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 1.6690229502286322, + "language_loss": 0.73103333, + "learning_rate": 2.463969086091302e-07, + "loss": 0.75231457, + "num_input_tokens_seen": 303109280, + "step": 14056, + "time_per_iteration": 2.4872586727142334 + }, + { + "auxiliary_loss_clip": 0.01103116, + "auxiliary_loss_mlp": 0.01038212, + "balance_loss_clip": 1.0374434, + "balance_loss_mlp": 1.02530766, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 2.010437973172004, + "language_loss": 0.67416251, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.69557583, + "num_input_tokens_seen": 303126075, + "step": 14057, + "time_per_iteration": 2.4700520038604736 + }, + { + "auxiliary_loss_clip": 0.01062254, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.03363705, + "balance_loss_mlp": 1.01727581, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 1.8154982943739173, + "language_loss": 0.77850592, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79942459, + "num_input_tokens_seen": 303146920, + "step": 14058, + "time_per_iteration": 2.6351239681243896 + }, + { + "auxiliary_loss_clip": 0.01110155, + "auxiliary_loss_mlp": 0.01039116, + "balance_loss_clip": 1.03698397, + "balance_loss_mlp": 1.02728462, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 1.6213616405331215, + "language_loss": 0.69684762, + "learning_rate": 2.45835387101033e-07, + "loss": 0.71834028, + "num_input_tokens_seen": 303167885, + "step": 14059, + "time_per_iteration": 2.4519221782684326 + }, + { + "auxiliary_loss_clip": 0.01113246, + "auxiliary_loss_mlp": 0.01036176, + "balance_loss_clip": 1.0377636, + "balance_loss_mlp": 1.02310455, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 1.9529027577451392, + "language_loss": 0.57694149, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.5984357, + "num_input_tokens_seen": 303185000, + "step": 14060, + "time_per_iteration": 2.3816747665405273 + }, + { + "auxiliary_loss_clip": 0.01090434, + "auxiliary_loss_mlp": 0.01038039, + "balance_loss_clip": 1.03382158, + "balance_loss_mlp": 1.02422833, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 1.4973371223504055, + "language_loss": 0.75540721, + "learning_rate": 2.454613720076277e-07, + "loss": 0.77669197, + "num_input_tokens_seen": 303205210, + "step": 14061, + "time_per_iteration": 4.222871541976929 + }, + { + "auxiliary_loss_clip": 0.0108438, + "auxiliary_loss_mlp": 0.01028167, + "balance_loss_clip": 1.03435016, + "balance_loss_mlp": 1.0149045, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 2.0561618034653453, + "language_loss": 0.70621109, + "learning_rate": 2.452744642558013e-07, + "loss": 0.72733659, + "num_input_tokens_seen": 303224655, + "step": 14062, + "time_per_iteration": 2.4932796955108643 + }, + { + "auxiliary_loss_clip": 0.00999775, + "auxiliary_loss_mlp": 0.01008068, + "balance_loss_clip": 1.02040112, + "balance_loss_mlp": 1.00672138, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6375701175980532, + "language_loss": 0.52656829, + "learning_rate": 2.450876230433432e-07, + "loss": 0.54664671, + "num_input_tokens_seen": 303289645, + "step": 14063, + "time_per_iteration": 3.2014622688293457 + }, + { + "auxiliary_loss_clip": 0.01065817, + "auxiliary_loss_mlp": 0.01026641, + "balance_loss_clip": 1.0357399, + "balance_loss_mlp": 1.01556015, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 1.9288522358558269, + "language_loss": 0.82257962, + "learning_rate": 2.449008483773378e-07, + "loss": 0.84350419, + "num_input_tokens_seen": 303308350, + "step": 14064, + "time_per_iteration": 2.519226551055908 + }, + { + "auxiliary_loss_clip": 0.01103278, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.04018283, + "balance_loss_mlp": 1.02016783, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 1.8458758463297054, + "language_loss": 0.72289234, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74425876, + "num_input_tokens_seen": 303325230, + "step": 14065, + "time_per_iteration": 2.4739415645599365 + }, + { + "auxiliary_loss_clip": 0.01075062, + "auxiliary_loss_mlp": 0.01029595, + "balance_loss_clip": 1.03581238, + "balance_loss_mlp": 1.01804376, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 2.8095640404018614, + "language_loss": 0.77521837, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79626495, + "num_input_tokens_seen": 303345810, + "step": 14066, + "time_per_iteration": 2.6010260581970215 + }, + { + "auxiliary_loss_clip": 0.01074014, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.03718984, + "balance_loss_mlp": 1.01857829, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 1.5105547430190247, + "language_loss": 0.70027804, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72132534, + "num_input_tokens_seen": 303365140, + "step": 14067, + "time_per_iteration": 2.5414206981658936 + }, + { + "auxiliary_loss_clip": 0.01070788, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.03120363, + "balance_loss_mlp": 1.01639032, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 1.8265087875330246, + "language_loss": 0.70750546, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.72849524, + "num_input_tokens_seen": 303386150, + "step": 14068, + "time_per_iteration": 2.6538848876953125 + }, + { + "auxiliary_loss_clip": 0.00993432, + "auxiliary_loss_mlp": 0.01003542, + "balance_loss_clip": 1.0076313, + "balance_loss_mlp": 1.00236213, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.6972421797033077, + "language_loss": 0.60506964, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62503934, + "num_input_tokens_seen": 303453770, + "step": 14069, + "time_per_iteration": 3.228825807571411 + }, + { + "auxiliary_loss_clip": 0.01086184, + "auxiliary_loss_mlp": 0.01031356, + "balance_loss_clip": 1.03818345, + "balance_loss_mlp": 1.01947117, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 1.7941122235722773, + "language_loss": 0.74296081, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.7641362, + "num_input_tokens_seen": 303474520, + "step": 14070, + "time_per_iteration": 2.534329414367676 + }, + { + "auxiliary_loss_clip": 0.01065115, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.03589201, + "balance_loss_mlp": 1.01966989, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 1.489642458248851, + "language_loss": 0.67096573, + "learning_rate": 2.435952896106039e-07, + "loss": 0.69193757, + "num_input_tokens_seen": 303497345, + "step": 14071, + "time_per_iteration": 2.7194528579711914 + }, + { + "auxiliary_loss_clip": 0.01020803, + "auxiliary_loss_mlp": 0.00753262, + "balance_loss_clip": 1.00759804, + "balance_loss_mlp": 1.00024974, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7321141978011698, + "language_loss": 0.60991532, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.62765592, + "num_input_tokens_seen": 303554890, + "step": 14072, + "time_per_iteration": 2.9168529510498047 + }, + { + "auxiliary_loss_clip": 0.01077049, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.04306149, + "balance_loss_mlp": 1.01841998, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 1.802992668309039, + "language_loss": 0.72510767, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.74620104, + "num_input_tokens_seen": 303574380, + "step": 14073, + "time_per_iteration": 2.553856134414673 + }, + { + "auxiliary_loss_clip": 0.01093565, + "auxiliary_loss_mlp": 0.01037157, + "balance_loss_clip": 1.03838563, + "balance_loss_mlp": 1.02350783, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 2.0500328029920634, + "language_loss": 0.7813707, + "learning_rate": 2.430367633291155e-07, + "loss": 0.80267787, + "num_input_tokens_seen": 303594910, + "step": 14074, + "time_per_iteration": 2.635051727294922 + }, + { + "auxiliary_loss_clip": 0.0110138, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.03979897, + "balance_loss_mlp": 1.02229333, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 4.0243808779393975, + "language_loss": 0.75228524, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.77364743, + "num_input_tokens_seen": 303613520, + "step": 14075, + "time_per_iteration": 2.519411087036133 + }, + { + "auxiliary_loss_clip": 0.01085865, + "auxiliary_loss_mlp": 0.01029025, + "balance_loss_clip": 1.03655243, + "balance_loss_mlp": 1.01637077, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 3.240926949667553, + "language_loss": 0.73126459, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.75241351, + "num_input_tokens_seen": 303631225, + "step": 14076, + "time_per_iteration": 3.960109233856201 + }, + { + "auxiliary_loss_clip": 0.01092134, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.03617239, + "balance_loss_mlp": 1.02365601, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 1.8363290554559437, + "language_loss": 0.77381647, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.79509741, + "num_input_tokens_seen": 303649175, + "step": 14077, + "time_per_iteration": 2.487576484680176 + }, + { + "auxiliary_loss_clip": 0.01080805, + "auxiliary_loss_mlp": 0.010346, + "balance_loss_clip": 1.03862798, + "balance_loss_mlp": 1.02222586, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 2.4045185410150305, + "language_loss": 0.75173187, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77288592, + "num_input_tokens_seen": 303665915, + "step": 14078, + "time_per_iteration": 2.4879369735717773 + }, + { + "auxiliary_loss_clip": 0.01070379, + "auxiliary_loss_mlp": 0.0102881, + "balance_loss_clip": 1.0376538, + "balance_loss_mlp": 1.01657963, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 2.5577581861083902, + "language_loss": 0.85223985, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.87323177, + "num_input_tokens_seen": 303679985, + "step": 14079, + "time_per_iteration": 2.5890772342681885 + }, + { + "auxiliary_loss_clip": 0.01085425, + "auxiliary_loss_mlp": 0.01036822, + "balance_loss_clip": 1.03620625, + "balance_loss_mlp": 1.02314889, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 2.0479295076763657, + "language_loss": 0.58696425, + "learning_rate": 2.419215098104965e-07, + "loss": 0.60818666, + "num_input_tokens_seen": 303698470, + "step": 14080, + "time_per_iteration": 2.535344123840332 + }, + { + "auxiliary_loss_clip": 0.01084783, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.03623796, + "balance_loss_mlp": 1.02066791, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 4.48176826952205, + "language_loss": 0.66722345, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.6884017, + "num_input_tokens_seen": 303716415, + "step": 14081, + "time_per_iteration": 2.4685070514678955 + }, + { + "auxiliary_loss_clip": 0.01096844, + "auxiliary_loss_mlp": 0.01041275, + "balance_loss_clip": 1.03461766, + "balance_loss_mlp": 1.0289433, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 1.738638781531532, + "language_loss": 0.7309767, + "learning_rate": 2.41550291894576e-07, + "loss": 0.7523579, + "num_input_tokens_seen": 303734490, + "step": 14082, + "time_per_iteration": 2.4906246662139893 + }, + { + "auxiliary_loss_clip": 0.01063856, + "auxiliary_loss_mlp": 0.01038288, + "balance_loss_clip": 1.03670204, + "balance_loss_mlp": 1.02435803, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 1.9070734465153514, + "language_loss": 0.76266962, + "learning_rate": 2.413647829539809e-07, + "loss": 0.78369105, + "num_input_tokens_seen": 303752310, + "step": 14083, + "time_per_iteration": 2.5634751319885254 + }, + { + "auxiliary_loss_clip": 0.01064399, + "auxiliary_loss_mlp": 0.010358, + "balance_loss_clip": 1.03405917, + "balance_loss_mlp": 1.02145886, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 1.8609786960192292, + "language_loss": 0.65932792, + "learning_rate": 2.411793407010092e-07, + "loss": 0.68032992, + "num_input_tokens_seen": 303776065, + "step": 14084, + "time_per_iteration": 2.636211395263672 + }, + { + "auxiliary_loss_clip": 0.01068884, + "auxiliary_loss_mlp": 0.01033154, + "balance_loss_clip": 1.03800857, + "balance_loss_mlp": 1.02103662, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 2.6095508268500365, + "language_loss": 0.69627786, + "learning_rate": 2.409939651426938e-07, + "loss": 0.71729827, + "num_input_tokens_seen": 303793500, + "step": 14085, + "time_per_iteration": 4.053870439529419 + }, + { + "auxiliary_loss_clip": 0.01066298, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.03369021, + "balance_loss_mlp": 1.01783395, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 1.6752138588620895, + "language_loss": 0.71070302, + "learning_rate": 2.408086562860634e-07, + "loss": 0.73165739, + "num_input_tokens_seen": 303814835, + "step": 14086, + "time_per_iteration": 2.633641242980957 + }, + { + "auxiliary_loss_clip": 0.01093378, + "auxiliary_loss_mlp": 0.01036092, + "balance_loss_clip": 1.03490305, + "balance_loss_mlp": 1.0237658, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 1.6682939536046124, + "language_loss": 0.7434172, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.76471186, + "num_input_tokens_seen": 303834505, + "step": 14087, + "time_per_iteration": 2.4490725994110107 + }, + { + "auxiliary_loss_clip": 0.01080997, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_clip": 1.03703833, + "balance_loss_mlp": 1.01479006, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 1.4065186256725326, + "language_loss": 0.73901093, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.76009071, + "num_input_tokens_seen": 303855050, + "step": 14088, + "time_per_iteration": 2.498105525970459 + }, + { + "auxiliary_loss_clip": 0.01099931, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.03704262, + "balance_loss_mlp": 1.02206397, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 2.195704257917902, + "language_loss": 0.72255844, + "learning_rate": 2.402531299965387e-07, + "loss": 0.74390382, + "num_input_tokens_seen": 303875635, + "step": 14089, + "time_per_iteration": 3.8037683963775635 + }, + { + "auxiliary_loss_clip": 0.01106324, + "auxiliary_loss_mlp": 0.01031059, + "balance_loss_clip": 1.03720963, + "balance_loss_mlp": 1.01937616, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 1.3928967349622834, + "language_loss": 0.79216683, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81354058, + "num_input_tokens_seen": 303896750, + "step": 14090, + "time_per_iteration": 2.4444637298583984 + }, + { + "auxiliary_loss_clip": 0.01058805, + "auxiliary_loss_mlp": 0.01041384, + "balance_loss_clip": 1.03456163, + "balance_loss_mlp": 1.02735877, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 2.090686587049704, + "language_loss": 0.76879203, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.78979397, + "num_input_tokens_seen": 303915435, + "step": 14091, + "time_per_iteration": 2.5778682231903076 + }, + { + "auxiliary_loss_clip": 0.01027482, + "auxiliary_loss_mlp": 0.01001954, + "balance_loss_clip": 1.00421321, + "balance_loss_mlp": 1.00061285, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.9142799750040888, + "language_loss": 0.59381455, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61410886, + "num_input_tokens_seen": 303977245, + "step": 14092, + "time_per_iteration": 3.0553946495056152 + }, + { + "auxiliary_loss_clip": 0.01083432, + "auxiliary_loss_mlp": 0.01036497, + "balance_loss_clip": 1.03244817, + "balance_loss_mlp": 1.02277029, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 2.1323088388184597, + "language_loss": 0.70379466, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72499394, + "num_input_tokens_seen": 303996055, + "step": 14093, + "time_per_iteration": 2.490346670150757 + }, + { + "auxiliary_loss_clip": 0.01106977, + "auxiliary_loss_mlp": 0.01028022, + "balance_loss_clip": 1.03723335, + "balance_loss_mlp": 1.0165776, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 2.5557663946715574, + "language_loss": 0.83115458, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.85250461, + "num_input_tokens_seen": 304012205, + "step": 14094, + "time_per_iteration": 2.3859639167785645 + }, + { + "auxiliary_loss_clip": 0.01089795, + "auxiliary_loss_mlp": 0.01034358, + "balance_loss_clip": 1.03692555, + "balance_loss_mlp": 1.02255619, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 2.6713987519394333, + "language_loss": 0.71137416, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73261571, + "num_input_tokens_seen": 304033475, + "step": 14095, + "time_per_iteration": 2.490596294403076 + }, + { + "auxiliary_loss_clip": 0.0109466, + "auxiliary_loss_mlp": 0.00777691, + "balance_loss_clip": 1.03531146, + "balance_loss_mlp": 1.00070596, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 1.7421147087712485, + "language_loss": 0.80367279, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.82239634, + "num_input_tokens_seen": 304051845, + "step": 14096, + "time_per_iteration": 2.4868738651275635 + }, + { + "auxiliary_loss_clip": 0.01100647, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.03697705, + "balance_loss_mlp": 1.01795745, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 1.9566156549804488, + "language_loss": 0.77562273, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79693973, + "num_input_tokens_seen": 304069965, + "step": 14097, + "time_per_iteration": 2.500771999359131 + }, + { + "auxiliary_loss_clip": 0.01079221, + "auxiliary_loss_mlp": 0.01025942, + "balance_loss_clip": 1.03568685, + "balance_loss_mlp": 1.01429486, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 1.9824837497534973, + "language_loss": 0.80001879, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82107043, + "num_input_tokens_seen": 304086805, + "step": 14098, + "time_per_iteration": 2.5049688816070557 + }, + { + "auxiliary_loss_clip": 0.01093063, + "auxiliary_loss_mlp": 0.0077828, + "balance_loss_clip": 1.03491962, + "balance_loss_mlp": 1.00057757, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 2.2848168403377214, + "language_loss": 0.72047895, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.73919237, + "num_input_tokens_seen": 304105865, + "step": 14099, + "time_per_iteration": 2.4466488361358643 + }, + { + "auxiliary_loss_clip": 0.01098538, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.03530216, + "balance_loss_mlp": 1.01609004, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 2.0034851752743603, + "language_loss": 0.636531, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.65781343, + "num_input_tokens_seen": 304128300, + "step": 14100, + "time_per_iteration": 3.9975779056549072 + }, + { + "auxiliary_loss_clip": 0.01100192, + "auxiliary_loss_mlp": 0.01031972, + "balance_loss_clip": 1.03646755, + "balance_loss_mlp": 1.0192045, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 2.1232927624346094, + "language_loss": 0.74253607, + "learning_rate": 2.380370324111085e-07, + "loss": 0.76385772, + "num_input_tokens_seen": 304143695, + "step": 14101, + "time_per_iteration": 2.460116386413574 + }, + { + "auxiliary_loss_clip": 0.010978, + "auxiliary_loss_mlp": 0.01029891, + "balance_loss_clip": 1.03410912, + "balance_loss_mlp": 1.01796377, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 1.7228529810614475, + "language_loss": 0.70892453, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73020148, + "num_input_tokens_seen": 304165800, + "step": 14102, + "time_per_iteration": 2.492177963256836 + }, + { + "auxiliary_loss_clip": 0.01085745, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.03561592, + "balance_loss_mlp": 1.01948571, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 2.2351054921485676, + "language_loss": 0.82054549, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.84173125, + "num_input_tokens_seen": 304182910, + "step": 14103, + "time_per_iteration": 2.453202486038208 + }, + { + "auxiliary_loss_clip": 0.01110564, + "auxiliary_loss_mlp": 0.01036107, + "balance_loss_clip": 1.03954148, + "balance_loss_mlp": 1.02413201, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 11.942178173803034, + "language_loss": 0.78803039, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80949706, + "num_input_tokens_seen": 304200175, + "step": 14104, + "time_per_iteration": 2.4032886028289795 + }, + { + "auxiliary_loss_clip": 0.01102413, + "auxiliary_loss_mlp": 0.01037253, + "balance_loss_clip": 1.03933036, + "balance_loss_mlp": 1.02421761, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 2.4787165425962017, + "language_loss": 0.78905863, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.81045532, + "num_input_tokens_seen": 304217775, + "step": 14105, + "time_per_iteration": 2.4535744190216064 + }, + { + "auxiliary_loss_clip": 0.0108856, + "auxiliary_loss_mlp": 0.01041825, + "balance_loss_clip": 1.03845346, + "balance_loss_mlp": 1.02705538, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 4.237863047843283, + "language_loss": 0.50093591, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.5222398, + "num_input_tokens_seen": 304235760, + "step": 14106, + "time_per_iteration": 2.494011878967285 + }, + { + "auxiliary_loss_clip": 0.01076657, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.03623033, + "balance_loss_mlp": 1.02401423, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 2.1035329470187936, + "language_loss": 0.75734758, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.77847248, + "num_input_tokens_seen": 304253985, + "step": 14107, + "time_per_iteration": 2.5118777751922607 + }, + { + "auxiliary_loss_clip": 0.01078288, + "auxiliary_loss_mlp": 0.01027762, + "balance_loss_clip": 1.03619671, + "balance_loss_mlp": 1.01556134, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 1.5217996391660171, + "language_loss": 0.73383427, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.75489473, + "num_input_tokens_seen": 304276785, + "step": 14108, + "time_per_iteration": 2.6502106189727783 + }, + { + "auxiliary_loss_clip": 0.01105821, + "auxiliary_loss_mlp": 0.01028256, + "balance_loss_clip": 1.03620243, + "balance_loss_mlp": 1.01527464, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 1.6077066019795885, + "language_loss": 0.7234205, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74476129, + "num_input_tokens_seen": 304296310, + "step": 14109, + "time_per_iteration": 2.4989185333251953 + }, + { + "auxiliary_loss_clip": 0.01042077, + "auxiliary_loss_mlp": 0.01032938, + "balance_loss_clip": 1.03346372, + "balance_loss_mlp": 1.02052855, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 2.116886497866449, + "language_loss": 0.73840833, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.75915849, + "num_input_tokens_seen": 304311715, + "step": 14110, + "time_per_iteration": 2.5947763919830322 + }, + { + "auxiliary_loss_clip": 0.01058018, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.04141903, + "balance_loss_mlp": 1.02251267, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 1.724788526260712, + "language_loss": 0.7625891, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78351283, + "num_input_tokens_seen": 304331910, + "step": 14111, + "time_per_iteration": 2.6254186630249023 + }, + { + "auxiliary_loss_clip": 0.011068, + "auxiliary_loss_mlp": 0.01027247, + "balance_loss_clip": 1.03668463, + "balance_loss_mlp": 1.01633906, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 1.6887891514793527, + "language_loss": 0.67536813, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.69670862, + "num_input_tokens_seen": 304351405, + "step": 14112, + "time_per_iteration": 2.4627225399017334 + }, + { + "auxiliary_loss_clip": 0.01095185, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.03294134, + "balance_loss_mlp": 1.02094507, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 1.4021035453469892, + "language_loss": 0.73780572, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75908756, + "num_input_tokens_seen": 304372935, + "step": 14113, + "time_per_iteration": 2.50034761428833 + }, + { + "auxiliary_loss_clip": 0.01074701, + "auxiliary_loss_mlp": 0.0103212, + "balance_loss_clip": 1.04063869, + "balance_loss_mlp": 1.02011538, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 2.030456896780558, + "language_loss": 0.67020071, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.69126892, + "num_input_tokens_seen": 304393070, + "step": 14114, + "time_per_iteration": 2.5649783611297607 + }, + { + "auxiliary_loss_clip": 0.01112537, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.0387466, + "balance_loss_mlp": 1.02103543, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 1.591170484314318, + "language_loss": 0.79000789, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.81147301, + "num_input_tokens_seen": 304411195, + "step": 14115, + "time_per_iteration": 3.8616690635681152 + }, + { + "auxiliary_loss_clip": 0.0110944, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.03658295, + "balance_loss_mlp": 1.01836002, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 2.0038985262020548, + "language_loss": 0.79197979, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.81337571, + "num_input_tokens_seen": 304429425, + "step": 14116, + "time_per_iteration": 2.3933091163635254 + }, + { + "auxiliary_loss_clip": 0.01101235, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.03737032, + "balance_loss_mlp": 1.01949871, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 1.8204946776013307, + "language_loss": 0.6830548, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.70439178, + "num_input_tokens_seen": 304447460, + "step": 14117, + "time_per_iteration": 2.545346736907959 + }, + { + "auxiliary_loss_clip": 0.01090961, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.03397071, + "balance_loss_mlp": 1.01625764, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 2.2535286012790263, + "language_loss": 0.65347362, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.67467415, + "num_input_tokens_seen": 304468230, + "step": 14118, + "time_per_iteration": 2.514188766479492 + }, + { + "auxiliary_loss_clip": 0.01071481, + "auxiliary_loss_mlp": 0.01031957, + "balance_loss_clip": 1.0374819, + "balance_loss_mlp": 1.02010798, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 1.705104500964132, + "language_loss": 0.73205268, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75308704, + "num_input_tokens_seen": 304484860, + "step": 14119, + "time_per_iteration": 2.4830923080444336 + }, + { + "auxiliary_loss_clip": 0.01078, + "auxiliary_loss_mlp": 0.0103095, + "balance_loss_clip": 1.03443384, + "balance_loss_mlp": 1.0174017, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 2.3821193458009975, + "language_loss": 0.77697033, + "learning_rate": 2.345478926864446e-07, + "loss": 0.79805982, + "num_input_tokens_seen": 304503575, + "step": 14120, + "time_per_iteration": 2.499025821685791 + }, + { + "auxiliary_loss_clip": 0.01096547, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.03915167, + "balance_loss_mlp": 1.01941395, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 2.5191135001981695, + "language_loss": 0.75518268, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.77646822, + "num_input_tokens_seen": 304525005, + "step": 14121, + "time_per_iteration": 2.496584177017212 + }, + { + "auxiliary_loss_clip": 0.00996102, + "auxiliary_loss_mlp": 0.0100035, + "balance_loss_clip": 1.00995207, + "balance_loss_mlp": 0.99908078, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 0.8053446933765985, + "language_loss": 0.60125321, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.62121773, + "num_input_tokens_seen": 304585220, + "step": 14122, + "time_per_iteration": 3.1037728786468506 + }, + { + "auxiliary_loss_clip": 0.01098465, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.03648305, + "balance_loss_mlp": 1.01881003, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 1.8583309737327782, + "language_loss": 0.79754937, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.81884289, + "num_input_tokens_seen": 304604665, + "step": 14123, + "time_per_iteration": 2.4700119495391846 + }, + { + "auxiliary_loss_clip": 0.010938, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.03656745, + "balance_loss_mlp": 1.01893497, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 2.112449521543174, + "language_loss": 0.83530408, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.85654974, + "num_input_tokens_seen": 304620600, + "step": 14124, + "time_per_iteration": 2.4554264545440674 + }, + { + "auxiliary_loss_clip": 0.01065204, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.03684139, + "balance_loss_mlp": 1.02027047, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 2.069086608086428, + "language_loss": 0.71623766, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.73721921, + "num_input_tokens_seen": 304639540, + "step": 14125, + "time_per_iteration": 4.030200958251953 + }, + { + "auxiliary_loss_clip": 0.01114356, + "auxiliary_loss_mlp": 0.01040026, + "balance_loss_clip": 1.03807878, + "balance_loss_mlp": 1.02606118, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 1.75319432881046, + "language_loss": 0.73692799, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.75847185, + "num_input_tokens_seen": 304660595, + "step": 14126, + "time_per_iteration": 2.4322614669799805 + }, + { + "auxiliary_loss_clip": 0.01065612, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.03616214, + "balance_loss_mlp": 1.02388406, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 1.448011265355675, + "language_loss": 0.67745578, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69847584, + "num_input_tokens_seen": 304679580, + "step": 14127, + "time_per_iteration": 2.5481579303741455 + }, + { + "auxiliary_loss_clip": 0.01078249, + "auxiliary_loss_mlp": 0.00779945, + "balance_loss_clip": 1.03397274, + "balance_loss_mlp": 1.00072598, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 1.7075195884937235, + "language_loss": 0.68976247, + "learning_rate": 2.330860086502211e-07, + "loss": 0.7083444, + "num_input_tokens_seen": 304698385, + "step": 14128, + "time_per_iteration": 3.8528900146484375 + }, + { + "auxiliary_loss_clip": 0.0108417, + "auxiliary_loss_mlp": 0.01034676, + "balance_loss_clip": 1.03460634, + "balance_loss_mlp": 1.02149701, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 1.9777989037230943, + "language_loss": 0.78060669, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.80179513, + "num_input_tokens_seen": 304715430, + "step": 14129, + "time_per_iteration": 2.4624173641204834 + }, + { + "auxiliary_loss_clip": 0.01050807, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.04059768, + "balance_loss_mlp": 1.02233601, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 2.7343859021783645, + "language_loss": 0.67476946, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.69562441, + "num_input_tokens_seen": 304734345, + "step": 14130, + "time_per_iteration": 2.612783908843994 + }, + { + "auxiliary_loss_clip": 0.01100511, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.03787327, + "balance_loss_mlp": 1.020419, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 2.918853334728992, + "language_loss": 0.70991409, + "learning_rate": 2.3253890747186e-07, + "loss": 0.73124963, + "num_input_tokens_seen": 304755030, + "step": 14131, + "time_per_iteration": 2.502190589904785 + }, + { + "auxiliary_loss_clip": 0.01077827, + "auxiliary_loss_mlp": 0.01028653, + "balance_loss_clip": 1.03609753, + "balance_loss_mlp": 1.01675606, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 1.8435430302254614, + "language_loss": 0.68583983, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.70690465, + "num_input_tokens_seen": 304774320, + "step": 14132, + "time_per_iteration": 2.6047778129577637 + }, + { + "auxiliary_loss_clip": 0.01104764, + "auxiliary_loss_mlp": 0.01037176, + "balance_loss_clip": 1.0338316, + "balance_loss_mlp": 1.02607751, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 1.7114644050255219, + "language_loss": 0.70032716, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72174656, + "num_input_tokens_seen": 304795355, + "step": 14133, + "time_per_iteration": 2.4463891983032227 + }, + { + "auxiliary_loss_clip": 0.01004799, + "auxiliary_loss_mlp": 0.00753154, + "balance_loss_clip": 1.01688886, + "balance_loss_mlp": 1.00009966, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.9489843743826074, + "language_loss": 0.57576561, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59334505, + "num_input_tokens_seen": 304863915, + "step": 14134, + "time_per_iteration": 3.304640531539917 + }, + { + "auxiliary_loss_clip": 0.0107846, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.03961444, + "balance_loss_mlp": 1.01717806, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 2.1896023591802494, + "language_loss": 0.78943658, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.81052029, + "num_input_tokens_seen": 304881555, + "step": 14135, + "time_per_iteration": 2.5389347076416016 + }, + { + "auxiliary_loss_clip": 0.01101224, + "auxiliary_loss_mlp": 0.01029532, + "balance_loss_clip": 1.04152048, + "balance_loss_mlp": 1.01672912, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 1.9183547393021827, + "language_loss": 0.63255763, + "learning_rate": 2.316284127127044e-07, + "loss": 0.65386516, + "num_input_tokens_seen": 304898760, + "step": 14136, + "time_per_iteration": 2.435288906097412 + }, + { + "auxiliary_loss_clip": 0.01101393, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.03676581, + "balance_loss_mlp": 1.01742971, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 2.3167292691720816, + "language_loss": 0.83775097, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.85907531, + "num_input_tokens_seen": 304915465, + "step": 14137, + "time_per_iteration": 2.430833339691162 + }, + { + "auxiliary_loss_clip": 0.01082266, + "auxiliary_loss_mlp": 0.01026368, + "balance_loss_clip": 1.03947818, + "balance_loss_mlp": 1.01540649, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 2.043398293397802, + "language_loss": 0.78559828, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.80668467, + "num_input_tokens_seen": 304933190, + "step": 14138, + "time_per_iteration": 2.538715362548828 + }, + { + "auxiliary_loss_clip": 0.01098983, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.03751981, + "balance_loss_mlp": 1.01683593, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 1.6067648783809527, + "language_loss": 0.64646184, + "learning_rate": 2.310829204839073e-07, + "loss": 0.6677404, + "num_input_tokens_seen": 304951110, + "step": 14139, + "time_per_iteration": 3.907390832901001 + }, + { + "auxiliary_loss_clip": 0.0106573, + "auxiliary_loss_mlp": 0.0102865, + "balance_loss_clip": 1.03495157, + "balance_loss_mlp": 1.01709867, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 1.5549885676391175, + "language_loss": 0.70825481, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.72919858, + "num_input_tokens_seen": 304969095, + "step": 14140, + "time_per_iteration": 2.4729418754577637 + }, + { + "auxiliary_loss_clip": 0.0107311, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.03600001, + "balance_loss_mlp": 1.0221746, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 2.984956952864105, + "language_loss": 0.64492434, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.6660074, + "num_input_tokens_seen": 304989315, + "step": 14141, + "time_per_iteration": 2.547236919403076 + }, + { + "auxiliary_loss_clip": 0.01082148, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.03962207, + "balance_loss_mlp": 1.02032256, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 1.4928656960792757, + "language_loss": 0.70752835, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.72867823, + "num_input_tokens_seen": 305011020, + "step": 14142, + "time_per_iteration": 2.6605045795440674 + }, + { + "auxiliary_loss_clip": 0.01062177, + "auxiliary_loss_mlp": 0.0102925, + "balance_loss_clip": 1.03297377, + "balance_loss_mlp": 1.01756144, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 1.5500141849026146, + "language_loss": 0.65285408, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.67376834, + "num_input_tokens_seen": 305033550, + "step": 14143, + "time_per_iteration": 2.6591379642486572 + }, + { + "auxiliary_loss_clip": 0.01080618, + "auxiliary_loss_mlp": 0.00777922, + "balance_loss_clip": 1.03491068, + "balance_loss_mlp": 1.00062263, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 3.4815995313661188, + "language_loss": 0.67713296, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.69571841, + "num_input_tokens_seen": 305052885, + "step": 14144, + "time_per_iteration": 2.540536880493164 + }, + { + "auxiliary_loss_clip": 0.0104331, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.03205502, + "balance_loss_mlp": 1.01952386, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 2.4159027284555212, + "language_loss": 0.64943147, + "learning_rate": 2.299937473050777e-07, + "loss": 0.67019272, + "num_input_tokens_seen": 305071995, + "step": 14145, + "time_per_iteration": 2.5516469478607178 + }, + { + "auxiliary_loss_clip": 0.01089463, + "auxiliary_loss_mlp": 0.01033796, + "balance_loss_clip": 1.03470635, + "balance_loss_mlp": 1.02079642, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 1.9031531918773514, + "language_loss": 0.8562395, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.87747204, + "num_input_tokens_seen": 305090190, + "step": 14146, + "time_per_iteration": 2.5194449424743652 + }, + { + "auxiliary_loss_clip": 0.01105926, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.03404307, + "balance_loss_mlp": 1.01944435, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 1.5644362078467382, + "language_loss": 0.8393296, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.86070317, + "num_input_tokens_seen": 305109355, + "step": 14147, + "time_per_iteration": 2.4649298191070557 + }, + { + "auxiliary_loss_clip": 0.01099551, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.03510892, + "balance_loss_mlp": 1.01981449, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 3.0997478307213973, + "language_loss": 0.85460442, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.87592912, + "num_input_tokens_seen": 305124165, + "step": 14148, + "time_per_iteration": 2.4537320137023926 + }, + { + "auxiliary_loss_clip": 0.01086583, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.03456748, + "balance_loss_mlp": 1.01983428, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 1.6293716816588237, + "language_loss": 0.71818435, + "learning_rate": 2.292689741370204e-07, + "loss": 0.73937511, + "num_input_tokens_seen": 305143940, + "step": 14149, + "time_per_iteration": 2.526400566101074 + }, + { + "auxiliary_loss_clip": 0.01087797, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.03652763, + "balance_loss_mlp": 1.01694822, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 1.7490710449651896, + "language_loss": 0.76111448, + "learning_rate": 2.290879486935804e-07, + "loss": 0.78228152, + "num_input_tokens_seen": 305163505, + "step": 14150, + "time_per_iteration": 2.500675916671753 + }, + { + "auxiliary_loss_clip": 0.01072122, + "auxiliary_loss_mlp": 0.0104025, + "balance_loss_clip": 1.03512943, + "balance_loss_mlp": 1.02795398, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 1.7617419612734049, + "language_loss": 0.72395515, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74507886, + "num_input_tokens_seen": 305182325, + "step": 14151, + "time_per_iteration": 2.536041736602783 + }, + { + "auxiliary_loss_clip": 0.00998136, + "auxiliary_loss_mlp": 0.01002271, + "balance_loss_clip": 1.02160764, + "balance_loss_mlp": 1.00108457, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.8793096809365272, + "language_loss": 0.595891, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61589503, + "num_input_tokens_seen": 305230775, + "step": 14152, + "time_per_iteration": 2.9580416679382324 + }, + { + "auxiliary_loss_clip": 0.01013415, + "auxiliary_loss_mlp": 0.01004151, + "balance_loss_clip": 1.0087781, + "balance_loss_mlp": 1.00294101, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.8049679765588578, + "language_loss": 0.61158758, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63176328, + "num_input_tokens_seen": 305296000, + "step": 14153, + "time_per_iteration": 3.0838146209716797 + }, + { + "auxiliary_loss_clip": 0.01100286, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.03746605, + "balance_loss_mlp": 1.01657295, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 1.6368970328089056, + "language_loss": 0.80759209, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.82888734, + "num_input_tokens_seen": 305314705, + "step": 14154, + "time_per_iteration": 2.5152087211608887 + }, + { + "auxiliary_loss_clip": 0.01070527, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.03360963, + "balance_loss_mlp": 1.02037489, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 1.7611676775307625, + "language_loss": 0.79699856, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81801474, + "num_input_tokens_seen": 305333870, + "step": 14155, + "time_per_iteration": 4.315592050552368 + }, + { + "auxiliary_loss_clip": 0.01073306, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.03240287, + "balance_loss_mlp": 1.02136123, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 1.830354141843575, + "language_loss": 0.70638353, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.72745883, + "num_input_tokens_seen": 305352780, + "step": 14156, + "time_per_iteration": 2.552086353302002 + }, + { + "auxiliary_loss_clip": 0.01071288, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.03998303, + "balance_loss_mlp": 1.0205611, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 2.550145809497846, + "language_loss": 0.7386111, + "learning_rate": 2.278226512621386e-07, + "loss": 0.75965136, + "num_input_tokens_seen": 305371370, + "step": 14157, + "time_per_iteration": 2.55131459236145 + }, + { + "auxiliary_loss_clip": 0.01043273, + "auxiliary_loss_mlp": 0.010241, + "balance_loss_clip": 1.03624415, + "balance_loss_mlp": 1.01306176, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 6.409005505296528, + "language_loss": 0.79372132, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.81439507, + "num_input_tokens_seen": 305387955, + "step": 14158, + "time_per_iteration": 2.6513266563415527 + }, + { + "auxiliary_loss_clip": 0.01095452, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.03516412, + "balance_loss_mlp": 1.02637577, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 2.317474415856008, + "language_loss": 0.79088789, + "learning_rate": 2.27461742417828e-07, + "loss": 0.81224394, + "num_input_tokens_seen": 305406285, + "step": 14159, + "time_per_iteration": 2.470705270767212 + }, + { + "auxiliary_loss_clip": 0.01088899, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.03558862, + "balance_loss_mlp": 1.02356982, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 2.8848419146107376, + "language_loss": 0.71349406, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73474085, + "num_input_tokens_seen": 305424500, + "step": 14160, + "time_per_iteration": 2.484856367111206 + }, + { + "auxiliary_loss_clip": 0.01106888, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.03911376, + "balance_loss_mlp": 1.01570177, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 2.636534077178448, + "language_loss": 0.70180023, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72315955, + "num_input_tokens_seen": 305442990, + "step": 14161, + "time_per_iteration": 2.557176351547241 + }, + { + "auxiliary_loss_clip": 0.01100176, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.03369522, + "balance_loss_mlp": 1.02132416, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 2.1412380209036788, + "language_loss": 0.7837534, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.80509204, + "num_input_tokens_seen": 305463065, + "step": 14162, + "time_per_iteration": 2.513101816177368 + }, + { + "auxiliary_loss_clip": 0.01101194, + "auxiliary_loss_mlp": 0.01036578, + "balance_loss_clip": 1.03796589, + "balance_loss_mlp": 1.02412665, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 4.220941363038507, + "language_loss": 0.76468086, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.7860586, + "num_input_tokens_seen": 305489070, + "step": 14163, + "time_per_iteration": 2.577789068222046 + }, + { + "auxiliary_loss_clip": 0.01015416, + "auxiliary_loss_mlp": 0.01001059, + "balance_loss_clip": 1.01150513, + "balance_loss_mlp": 0.99972934, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.6951553139787153, + "language_loss": 0.54994595, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57011068, + "num_input_tokens_seen": 305551490, + "step": 14164, + "time_per_iteration": 3.118671417236328 + }, + { + "auxiliary_loss_clip": 0.01101207, + "auxiliary_loss_mlp": 0.01035832, + "balance_loss_clip": 1.03721249, + "balance_loss_mlp": 1.02327323, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 1.7376092029382828, + "language_loss": 0.72539884, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.74676919, + "num_input_tokens_seen": 305570535, + "step": 14165, + "time_per_iteration": 4.033723592758179 + }, + { + "auxiliary_loss_clip": 0.01071943, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.03680682, + "balance_loss_mlp": 1.01841307, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 1.5262457384194141, + "language_loss": 0.67540491, + "learning_rate": 2.26200679088697e-07, + "loss": 0.69643474, + "num_input_tokens_seen": 305590800, + "step": 14166, + "time_per_iteration": 2.581315755844116 + }, + { + "auxiliary_loss_clip": 0.01086687, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.03331232, + "balance_loss_mlp": 1.02319598, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 1.9283197277329391, + "language_loss": 0.73405141, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75526798, + "num_input_tokens_seen": 305609495, + "step": 14167, + "time_per_iteration": 3.8311126232147217 + }, + { + "auxiliary_loss_clip": 0.01109482, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.03748059, + "balance_loss_mlp": 1.01853716, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 1.6798041316367496, + "language_loss": 0.80855227, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82994974, + "num_input_tokens_seen": 305629420, + "step": 14168, + "time_per_iteration": 2.4620566368103027 + }, + { + "auxiliary_loss_clip": 0.01108255, + "auxiliary_loss_mlp": 0.0102907, + "balance_loss_clip": 1.03550267, + "balance_loss_mlp": 1.01721478, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 2.0621589953255723, + "language_loss": 0.75859386, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.77996713, + "num_input_tokens_seen": 305649835, + "step": 14169, + "time_per_iteration": 2.4827611446380615 + }, + { + "auxiliary_loss_clip": 0.01111454, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.03710651, + "balance_loss_mlp": 1.01847935, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 1.6968290919629152, + "language_loss": 0.63605237, + "learning_rate": 2.254815511000452e-07, + "loss": 0.65748024, + "num_input_tokens_seen": 305668840, + "step": 14170, + "time_per_iteration": 2.408073902130127 + }, + { + "auxiliary_loss_clip": 0.01090599, + "auxiliary_loss_mlp": 0.01026435, + "balance_loss_clip": 1.03383303, + "balance_loss_mlp": 1.01460981, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 2.302681256926311, + "language_loss": 0.86579835, + "learning_rate": 2.253019373106384e-07, + "loss": 0.88696867, + "num_input_tokens_seen": 305686955, + "step": 14171, + "time_per_iteration": 2.3910999298095703 + }, + { + "auxiliary_loss_clip": 0.01095354, + "auxiliary_loss_mlp": 0.01038709, + "balance_loss_clip": 1.03581941, + "balance_loss_mlp": 1.02588201, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 1.7134643544716113, + "language_loss": 0.55007827, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.57141888, + "num_input_tokens_seen": 305706290, + "step": 14172, + "time_per_iteration": 2.539764404296875 + }, + { + "auxiliary_loss_clip": 0.01086608, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.03645563, + "balance_loss_mlp": 1.01865113, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 3.1115068267726578, + "language_loss": 0.69622636, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.71738523, + "num_input_tokens_seen": 305723835, + "step": 14173, + "time_per_iteration": 2.4683680534362793 + }, + { + "auxiliary_loss_clip": 0.01087522, + "auxiliary_loss_mlp": 0.00778379, + "balance_loss_clip": 1.03491414, + "balance_loss_mlp": 1.0006485, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 7.7722204984746845, + "language_loss": 0.76085198, + "learning_rate": 2.247634997500205e-07, + "loss": 0.77951097, + "num_input_tokens_seen": 305741655, + "step": 14174, + "time_per_iteration": 2.5063490867614746 + }, + { + "auxiliary_loss_clip": 0.01075183, + "auxiliary_loss_mlp": 0.00780102, + "balance_loss_clip": 1.03215718, + "balance_loss_mlp": 1.00073695, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 1.623368002986106, + "language_loss": 0.81785429, + "learning_rate": 2.245841551883676e-07, + "loss": 0.83640718, + "num_input_tokens_seen": 305761890, + "step": 14175, + "time_per_iteration": 2.584390878677368 + }, + { + "auxiliary_loss_clip": 0.01113899, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.03864419, + "balance_loss_mlp": 1.0209887, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 2.8958130853316124, + "language_loss": 0.65425968, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.67574, + "num_input_tokens_seen": 305779190, + "step": 14176, + "time_per_iteration": 2.4073243141174316 + }, + { + "auxiliary_loss_clip": 0.01087788, + "auxiliary_loss_mlp": 0.00777533, + "balance_loss_clip": 1.03636956, + "balance_loss_mlp": 1.00063396, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 1.723968110161879, + "language_loss": 0.78665078, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.80530393, + "num_input_tokens_seen": 305799870, + "step": 14177, + "time_per_iteration": 2.569305181503296 + }, + { + "auxiliary_loss_clip": 0.0108666, + "auxiliary_loss_mlp": 0.01027871, + "balance_loss_clip": 1.03566563, + "balance_loss_mlp": 1.01568818, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 1.5621745155913742, + "language_loss": 0.73544651, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.7565918, + "num_input_tokens_seen": 305819695, + "step": 14178, + "time_per_iteration": 2.573690891265869 + }, + { + "auxiliary_loss_clip": 0.01078114, + "auxiliary_loss_mlp": 0.01036387, + "balance_loss_clip": 1.0411253, + "balance_loss_mlp": 1.02423334, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 1.7868024046139896, + "language_loss": 0.75226355, + "learning_rate": 2.238674502491935e-07, + "loss": 0.77340853, + "num_input_tokens_seen": 305837270, + "step": 14179, + "time_per_iteration": 3.9358770847320557 + }, + { + "auxiliary_loss_clip": 0.01107208, + "auxiliary_loss_mlp": 0.01030241, + "balance_loss_clip": 1.03689337, + "balance_loss_mlp": 1.01818299, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 2.089266486928837, + "language_loss": 0.81689215, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.83826661, + "num_input_tokens_seen": 305855250, + "step": 14180, + "time_per_iteration": 2.438627004623413 + }, + { + "auxiliary_loss_clip": 0.01055044, + "auxiliary_loss_mlp": 0.01033544, + "balance_loss_clip": 1.03440857, + "balance_loss_mlp": 1.02058601, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 2.6977046081083085, + "language_loss": 0.61023569, + "learning_rate": 2.235095018591815e-07, + "loss": 0.63112152, + "num_input_tokens_seen": 305875660, + "step": 14181, + "time_per_iteration": 2.578801393508911 + }, + { + "auxiliary_loss_clip": 0.01109009, + "auxiliary_loss_mlp": 0.01034428, + "balance_loss_clip": 1.03813171, + "balance_loss_mlp": 1.02273321, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 2.346888876929593, + "language_loss": 0.72654665, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74798095, + "num_input_tokens_seen": 305892415, + "step": 14182, + "time_per_iteration": 2.3991646766662598 + }, + { + "auxiliary_loss_clip": 0.01059023, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.03628838, + "balance_loss_mlp": 1.02172112, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 2.1375717145820956, + "language_loss": 0.70623034, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.727157, + "num_input_tokens_seen": 305912665, + "step": 14183, + "time_per_iteration": 2.582430124282837 + }, + { + "auxiliary_loss_clip": 0.01083265, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.03848457, + "balance_loss_mlp": 1.01882529, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 1.8895803230852986, + "language_loss": 0.72741288, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.7485463, + "num_input_tokens_seen": 305931515, + "step": 14184, + "time_per_iteration": 2.488400459289551 + }, + { + "auxiliary_loss_clip": 0.01110031, + "auxiliary_loss_mlp": 0.01031372, + "balance_loss_clip": 1.0379312, + "balance_loss_mlp": 1.01880121, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 1.809804335276981, + "language_loss": 0.76609176, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.78750587, + "num_input_tokens_seen": 305949965, + "step": 14185, + "time_per_iteration": 2.436227560043335 + }, + { + "auxiliary_loss_clip": 0.01070085, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.03366685, + "balance_loss_mlp": 1.01904798, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 1.9007623776684006, + "language_loss": 0.79540259, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.81643713, + "num_input_tokens_seen": 305967820, + "step": 14186, + "time_per_iteration": 2.512160062789917 + }, + { + "auxiliary_loss_clip": 0.01086852, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.03534329, + "balance_loss_mlp": 1.01618576, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 1.7968624261297304, + "language_loss": 0.62833738, + "learning_rate": 2.224372736588449e-07, + "loss": 0.64950001, + "num_input_tokens_seen": 305985505, + "step": 14187, + "time_per_iteration": 2.469153642654419 + }, + { + "auxiliary_loss_clip": 0.01058125, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.03122544, + "balance_loss_mlp": 1.01996136, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 1.5492149715910737, + "language_loss": 0.7654382, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.78635639, + "num_input_tokens_seen": 306005220, + "step": 14188, + "time_per_iteration": 2.635577917098999 + }, + { + "auxiliary_loss_clip": 0.01101021, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.03748131, + "balance_loss_mlp": 1.01875937, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 1.5090459865754584, + "language_loss": 0.78431714, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.80565035, + "num_input_tokens_seen": 306023785, + "step": 14189, + "time_per_iteration": 2.501668691635132 + }, + { + "auxiliary_loss_clip": 0.01087323, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.03431344, + "balance_loss_mlp": 1.0195756, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 2.5806884205174656, + "language_loss": 0.79723394, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.81843084, + "num_input_tokens_seen": 306041600, + "step": 14190, + "time_per_iteration": 2.50959849357605 + }, + { + "auxiliary_loss_clip": 0.01063043, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.03885996, + "balance_loss_mlp": 1.01906884, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 2.376727467794731, + "language_loss": 0.75885963, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.77980304, + "num_input_tokens_seen": 306060345, + "step": 14191, + "time_per_iteration": 2.553809642791748 + }, + { + "auxiliary_loss_clip": 0.01098506, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.03732872, + "balance_loss_mlp": 1.01886833, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 1.9399060105914716, + "language_loss": 0.69106895, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.71236622, + "num_input_tokens_seen": 306078285, + "step": 14192, + "time_per_iteration": 2.443610906600952 + }, + { + "auxiliary_loss_clip": 0.01101751, + "auxiliary_loss_mlp": 0.01038403, + "balance_loss_clip": 1.03776932, + "balance_loss_mlp": 1.024086, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 2.224043952513153, + "language_loss": 0.62647498, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.6478765, + "num_input_tokens_seen": 306093760, + "step": 14193, + "time_per_iteration": 2.448953628540039 + }, + { + "auxiliary_loss_clip": 0.01085381, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.03695178, + "balance_loss_mlp": 1.01846814, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 2.111507781857792, + "language_loss": 0.76772487, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78888297, + "num_input_tokens_seen": 306112595, + "step": 14194, + "time_per_iteration": 3.9526684284210205 + }, + { + "auxiliary_loss_clip": 0.01110375, + "auxiliary_loss_mlp": 0.01030221, + "balance_loss_clip": 1.03648818, + "balance_loss_mlp": 1.01843119, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 2.0696773829653488, + "language_loss": 0.69403404, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.71544003, + "num_input_tokens_seen": 306131800, + "step": 14195, + "time_per_iteration": 2.428098678588867 + }, + { + "auxiliary_loss_clip": 0.01084405, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.03732347, + "balance_loss_mlp": 1.02178836, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 2.003891559689262, + "language_loss": 0.85984981, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.88103986, + "num_input_tokens_seen": 306150590, + "step": 14196, + "time_per_iteration": 2.4927408695220947 + }, + { + "auxiliary_loss_clip": 0.01012864, + "auxiliary_loss_mlp": 0.01000487, + "balance_loss_clip": 1.00842774, + "balance_loss_mlp": 0.99943221, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.8211702350529582, + "language_loss": 0.55064821, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57078171, + "num_input_tokens_seen": 306205850, + "step": 14197, + "time_per_iteration": 3.010129451751709 + }, + { + "auxiliary_loss_clip": 0.01074267, + "auxiliary_loss_mlp": 0.00777598, + "balance_loss_clip": 1.03442299, + "balance_loss_mlp": 1.00058866, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 1.6587393398891723, + "language_loss": 0.8111378, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.82965648, + "num_input_tokens_seen": 306225220, + "step": 14198, + "time_per_iteration": 2.5622856616973877 + }, + { + "auxiliary_loss_clip": 0.01108752, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.03693128, + "balance_loss_mlp": 1.02069855, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 1.5609005382692553, + "language_loss": 0.68386257, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70526582, + "num_input_tokens_seen": 306249865, + "step": 14199, + "time_per_iteration": 2.668931484222412 + }, + { + "auxiliary_loss_clip": 0.01071985, + "auxiliary_loss_mlp": 0.01029853, + "balance_loss_clip": 1.03344953, + "balance_loss_mlp": 1.01899278, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 1.6257259705105136, + "language_loss": 0.8639183, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88493663, + "num_input_tokens_seen": 306270215, + "step": 14200, + "time_per_iteration": 2.546086072921753 + }, + { + "auxiliary_loss_clip": 0.0108027, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.03731751, + "balance_loss_mlp": 1.01648259, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 1.7980128218396636, + "language_loss": 0.77796239, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.79904622, + "num_input_tokens_seen": 306288960, + "step": 14201, + "time_per_iteration": 2.54500675201416 + }, + { + "auxiliary_loss_clip": 0.01086795, + "auxiliary_loss_mlp": 0.01026616, + "balance_loss_clip": 1.04073822, + "balance_loss_mlp": 1.01495719, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 2.1717882907092996, + "language_loss": 0.69155532, + "learning_rate": 2.19767322694256e-07, + "loss": 0.7126894, + "num_input_tokens_seen": 306308735, + "step": 14202, + "time_per_iteration": 2.5167059898376465 + }, + { + "auxiliary_loss_clip": 0.01099337, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.03729844, + "balance_loss_mlp": 1.02271032, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 1.5397921118991824, + "language_loss": 0.80277753, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82412326, + "num_input_tokens_seen": 306329015, + "step": 14203, + "time_per_iteration": 2.5641021728515625 + }, + { + "auxiliary_loss_clip": 0.01093154, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.03600407, + "balance_loss_mlp": 1.01832807, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 4.569982530986661, + "language_loss": 0.65694445, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.6781953, + "num_input_tokens_seen": 306349085, + "step": 14204, + "time_per_iteration": 4.077742576599121 + }, + { + "auxiliary_loss_clip": 0.01111663, + "auxiliary_loss_mlp": 0.01033587, + "balance_loss_clip": 1.03739369, + "balance_loss_mlp": 1.02071214, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 2.9274410323809, + "language_loss": 0.59773016, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.61918259, + "num_input_tokens_seen": 306365385, + "step": 14205, + "time_per_iteration": 2.4238393306732178 + }, + { + "auxiliary_loss_clip": 0.01088103, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.03795481, + "balance_loss_mlp": 1.01703691, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 1.9812838695396977, + "language_loss": 0.72302169, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74420369, + "num_input_tokens_seen": 306384585, + "step": 14206, + "time_per_iteration": 3.866408109664917 + }, + { + "auxiliary_loss_clip": 0.01100257, + "auxiliary_loss_mlp": 0.01028563, + "balance_loss_clip": 1.03739023, + "balance_loss_mlp": 1.01618862, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 2.6539932255018113, + "language_loss": 0.76075786, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.78204608, + "num_input_tokens_seen": 306401565, + "step": 14207, + "time_per_iteration": 2.4509899616241455 + }, + { + "auxiliary_loss_clip": 0.01111458, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.03738928, + "balance_loss_mlp": 1.01804566, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 1.977684781410492, + "language_loss": 0.84995449, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87138128, + "num_input_tokens_seen": 306419995, + "step": 14208, + "time_per_iteration": 2.4051735401153564 + }, + { + "auxiliary_loss_clip": 0.01092936, + "auxiliary_loss_mlp": 0.01032908, + "balance_loss_clip": 1.03763497, + "balance_loss_mlp": 1.02096963, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 1.4831229997063016, + "language_loss": 0.65756804, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.67882651, + "num_input_tokens_seen": 306439240, + "step": 14209, + "time_per_iteration": 2.464343309402466 + }, + { + "auxiliary_loss_clip": 0.01067536, + "auxiliary_loss_mlp": 0.01026761, + "balance_loss_clip": 1.03652918, + "balance_loss_mlp": 1.01477468, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 1.8285065268206124, + "language_loss": 0.70702165, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.72796464, + "num_input_tokens_seen": 306458425, + "step": 14210, + "time_per_iteration": 2.6064116954803467 + }, + { + "auxiliary_loss_clip": 0.01083484, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.03427482, + "balance_loss_mlp": 1.01928651, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 1.343628210904469, + "language_loss": 0.70211053, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72325742, + "num_input_tokens_seen": 306477210, + "step": 14211, + "time_per_iteration": 2.5167148113250732 + }, + { + "auxiliary_loss_clip": 0.01090489, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.03632808, + "balance_loss_mlp": 1.01816165, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 2.1830242269638926, + "language_loss": 0.81097102, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.83218348, + "num_input_tokens_seen": 306495820, + "step": 14212, + "time_per_iteration": 2.521880865097046 + }, + { + "auxiliary_loss_clip": 0.01077645, + "auxiliary_loss_mlp": 0.01038395, + "balance_loss_clip": 1.0332886, + "balance_loss_mlp": 1.02459621, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 2.014908771559903, + "language_loss": 0.66532683, + "learning_rate": 2.178190108088105e-07, + "loss": 0.6864872, + "num_input_tokens_seen": 306516420, + "step": 14213, + "time_per_iteration": 2.7042102813720703 + }, + { + "auxiliary_loss_clip": 0.01106766, + "auxiliary_loss_mlp": 0.01026823, + "balance_loss_clip": 1.03557324, + "balance_loss_mlp": 1.01474702, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 1.8401444263389968, + "language_loss": 0.78117549, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80251139, + "num_input_tokens_seen": 306534785, + "step": 14214, + "time_per_iteration": 2.429597854614258 + }, + { + "auxiliary_loss_clip": 0.01092792, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.036731, + "balance_loss_mlp": 1.0145992, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 4.343204127359461, + "language_loss": 0.66772866, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.68894064, + "num_input_tokens_seen": 306552440, + "step": 14215, + "time_per_iteration": 2.4762001037597656 + }, + { + "auxiliary_loss_clip": 0.01108573, + "auxiliary_loss_mlp": 0.01032102, + "balance_loss_clip": 1.0369333, + "balance_loss_mlp": 1.01993716, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 2.482088489347137, + "language_loss": 0.62660801, + "learning_rate": 2.172890718362279e-07, + "loss": 0.64801478, + "num_input_tokens_seen": 306573600, + "step": 14216, + "time_per_iteration": 2.5753049850463867 + }, + { + "auxiliary_loss_clip": 0.01074348, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.0323925, + "balance_loss_mlp": 1.02265, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 1.8302526930008047, + "language_loss": 0.65359777, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67469239, + "num_input_tokens_seen": 306592840, + "step": 14217, + "time_per_iteration": 2.49993896484375 + }, + { + "auxiliary_loss_clip": 0.01095231, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.03757155, + "balance_loss_mlp": 1.01951957, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 1.5203576143218709, + "language_loss": 0.65043736, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.67170548, + "num_input_tokens_seen": 306613210, + "step": 14218, + "time_per_iteration": 3.8823390007019043 + }, + { + "auxiliary_loss_clip": 0.01095133, + "auxiliary_loss_mlp": 0.01036178, + "balance_loss_clip": 1.03327489, + "balance_loss_mlp": 1.02313077, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 2.1015312843937086, + "language_loss": 0.70090508, + "learning_rate": 2.167597412688238e-07, + "loss": 0.72221828, + "num_input_tokens_seen": 306631620, + "step": 14219, + "time_per_iteration": 2.441509485244751 + }, + { + "auxiliary_loss_clip": 0.01089328, + "auxiliary_loss_mlp": 0.01037703, + "balance_loss_clip": 1.03506458, + "balance_loss_mlp": 1.02507281, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 4.270600742098344, + "language_loss": 0.67264378, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.69391412, + "num_input_tokens_seen": 306646695, + "step": 14220, + "time_per_iteration": 2.45879864692688 + }, + { + "auxiliary_loss_clip": 0.01104914, + "auxiliary_loss_mlp": 0.0102796, + "balance_loss_clip": 1.03556359, + "balance_loss_mlp": 1.01645029, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 2.0435718936541756, + "language_loss": 0.71295083, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73427951, + "num_input_tokens_seen": 306665465, + "step": 14221, + "time_per_iteration": 2.3991785049438477 + }, + { + "auxiliary_loss_clip": 0.01078999, + "auxiliary_loss_mlp": 0.01038921, + "balance_loss_clip": 1.03560591, + "balance_loss_mlp": 1.02671409, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 2.1927397126833608, + "language_loss": 0.59712297, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.61830211, + "num_input_tokens_seen": 306685950, + "step": 14222, + "time_per_iteration": 2.569089889526367 + }, + { + "auxiliary_loss_clip": 0.01081074, + "auxiliary_loss_mlp": 0.01037381, + "balance_loss_clip": 1.03188515, + "balance_loss_mlp": 1.02472067, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 1.585948572265393, + "language_loss": 0.83829653, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.85948104, + "num_input_tokens_seen": 306705740, + "step": 14223, + "time_per_iteration": 2.5465240478515625 + }, + { + "auxiliary_loss_clip": 0.01097297, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.03724027, + "balance_loss_mlp": 1.02043605, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 1.5178677517606036, + "language_loss": 0.74425507, + "learning_rate": 2.158788761585515e-07, + "loss": 0.7655549, + "num_input_tokens_seen": 306725065, + "step": 14224, + "time_per_iteration": 2.4673469066619873 + }, + { + "auxiliary_loss_clip": 0.01083423, + "auxiliary_loss_mlp": 0.00781539, + "balance_loss_clip": 1.0339011, + "balance_loss_mlp": 1.00069797, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 1.8753695053738086, + "language_loss": 0.75341141, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77206099, + "num_input_tokens_seen": 306743630, + "step": 14225, + "time_per_iteration": 2.559873342514038 + }, + { + "auxiliary_loss_clip": 0.01054068, + "auxiliary_loss_mlp": 0.01038533, + "balance_loss_clip": 1.0376606, + "balance_loss_mlp": 1.02646923, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 1.838525321635449, + "language_loss": 0.77186072, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.79278672, + "num_input_tokens_seen": 306763105, + "step": 14226, + "time_per_iteration": 2.6593637466430664 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01035353, + "balance_loss_clip": 1.03767133, + "balance_loss_mlp": 1.02243686, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 2.126408515873523, + "language_loss": 0.54631943, + "learning_rate": 2.153511688875702e-07, + "loss": 0.56780291, + "num_input_tokens_seen": 306779875, + "step": 14227, + "time_per_iteration": 2.401559591293335 + }, + { + "auxiliary_loss_clip": 0.01082077, + "auxiliary_loss_mlp": 0.00779073, + "balance_loss_clip": 1.03870654, + "balance_loss_mlp": 1.00057864, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 1.9049238919575737, + "language_loss": 0.65281093, + "learning_rate": 2.151754018031442e-07, + "loss": 0.67142242, + "num_input_tokens_seen": 306800015, + "step": 14228, + "time_per_iteration": 2.49814510345459 + }, + { + "auxiliary_loss_clip": 0.01078151, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.03693032, + "balance_loss_mlp": 1.02003455, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 2.5917769273343567, + "language_loss": 0.74199176, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.76309705, + "num_input_tokens_seen": 306814160, + "step": 14229, + "time_per_iteration": 2.5201199054718018 + }, + { + "auxiliary_loss_clip": 0.01096381, + "auxiliary_loss_mlp": 0.01028223, + "balance_loss_clip": 1.03531349, + "balance_loss_mlp": 1.01729178, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 2.579384046706395, + "language_loss": 0.72724581, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.74849188, + "num_input_tokens_seen": 306833310, + "step": 14230, + "time_per_iteration": 2.4422647953033447 + }, + { + "auxiliary_loss_clip": 0.01096401, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.03480625, + "balance_loss_mlp": 1.0179323, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 1.7870665955640852, + "language_loss": 0.82553864, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.84680796, + "num_input_tokens_seen": 306851345, + "step": 14231, + "time_per_iteration": 2.4499917030334473 + }, + { + "auxiliary_loss_clip": 0.01101958, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.03767323, + "balance_loss_mlp": 1.02356696, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 1.83128072513769, + "language_loss": 0.68006015, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.70144749, + "num_input_tokens_seen": 306871040, + "step": 14232, + "time_per_iteration": 2.4614953994750977 + }, + { + "auxiliary_loss_clip": 0.01091709, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.03932977, + "balance_loss_mlp": 1.01856804, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 1.4944115047124564, + "language_loss": 0.66978478, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.69101924, + "num_input_tokens_seen": 306891625, + "step": 14233, + "time_per_iteration": 2.539872884750366 + }, + { + "auxiliary_loss_clip": 0.01096595, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.03584671, + "balance_loss_mlp": 1.01880026, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 1.7208956890811145, + "language_loss": 0.76675117, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.78802156, + "num_input_tokens_seen": 306910020, + "step": 14234, + "time_per_iteration": 3.903895378112793 + }, + { + "auxiliary_loss_clip": 0.01001821, + "auxiliary_loss_mlp": 0.01009966, + "balance_loss_clip": 1.00460446, + "balance_loss_mlp": 1.00838029, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.9280971745236297, + "language_loss": 0.57985389, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.59997177, + "num_input_tokens_seen": 306969505, + "step": 14235, + "time_per_iteration": 3.135634183883667 + }, + { + "auxiliary_loss_clip": 0.0101258, + "auxiliary_loss_mlp": 0.01003275, + "balance_loss_clip": 1.00777555, + "balance_loss_mlp": 1.00204682, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.7938734694115605, + "language_loss": 0.56668353, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58684212, + "num_input_tokens_seen": 307027710, + "step": 14236, + "time_per_iteration": 2.986508846282959 + }, + { + "auxiliary_loss_clip": 0.01088188, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.0365473, + "balance_loss_mlp": 1.02103138, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 1.7150678562933825, + "language_loss": 0.7016114, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72282922, + "num_input_tokens_seen": 307045515, + "step": 14237, + "time_per_iteration": 2.493372917175293 + }, + { + "auxiliary_loss_clip": 0.01085756, + "auxiliary_loss_mlp": 0.01026576, + "balance_loss_clip": 1.03347993, + "balance_loss_mlp": 1.01513171, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 2.755547423415841, + "language_loss": 0.63960177, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.660725, + "num_input_tokens_seen": 307064470, + "step": 14238, + "time_per_iteration": 2.527151346206665 + }, + { + "auxiliary_loss_clip": 0.01104348, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.03574681, + "balance_loss_mlp": 1.02317679, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 2.178284971082298, + "language_loss": 0.69375503, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.71513402, + "num_input_tokens_seen": 307083900, + "step": 14239, + "time_per_iteration": 2.413757562637329 + }, + { + "auxiliary_loss_clip": 0.01110597, + "auxiliary_loss_mlp": 0.0103665, + "balance_loss_clip": 1.03557563, + "balance_loss_mlp": 1.02394843, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 2.051487264029555, + "language_loss": 0.66521943, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68669188, + "num_input_tokens_seen": 307104590, + "step": 14240, + "time_per_iteration": 2.48732328414917 + }, + { + "auxiliary_loss_clip": 0.01075563, + "auxiliary_loss_mlp": 0.01040554, + "balance_loss_clip": 1.03891158, + "balance_loss_mlp": 1.02653515, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 2.281186064317209, + "language_loss": 0.62240177, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64356291, + "num_input_tokens_seen": 307125580, + "step": 14241, + "time_per_iteration": 2.678907632827759 + }, + { + "auxiliary_loss_clip": 0.01113978, + "auxiliary_loss_mlp": 0.01035815, + "balance_loss_clip": 1.03709555, + "balance_loss_mlp": 1.02247572, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 1.5677148613321414, + "language_loss": 0.74342018, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76491809, + "num_input_tokens_seen": 307147625, + "step": 14242, + "time_per_iteration": 2.5098071098327637 + }, + { + "auxiliary_loss_clip": 0.01043626, + "auxiliary_loss_mlp": 0.0104695, + "balance_loss_clip": 1.03688157, + "balance_loss_mlp": 1.03319347, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 2.04472497207114, + "language_loss": 0.76629561, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.7872014, + "num_input_tokens_seen": 307164665, + "step": 14243, + "time_per_iteration": 2.738896131515503 + }, + { + "auxiliary_loss_clip": 0.01088595, + "auxiliary_loss_mlp": 0.00776378, + "balance_loss_clip": 1.03753734, + "balance_loss_mlp": 1.0007323, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 1.6960770900259108, + "language_loss": 0.68036437, + "learning_rate": 2.123723375556974e-07, + "loss": 0.69901413, + "num_input_tokens_seen": 307182530, + "step": 14244, + "time_per_iteration": 4.3570756912231445 + }, + { + "auxiliary_loss_clip": 0.01019661, + "auxiliary_loss_mlp": 0.01004136, + "balance_loss_clip": 1.00468588, + "balance_loss_mlp": 1.00297391, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.7553086632870928, + "language_loss": 0.5843057, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60454369, + "num_input_tokens_seen": 307241240, + "step": 14245, + "time_per_iteration": 4.2250096797943115 + }, + { + "auxiliary_loss_clip": 0.01104344, + "auxiliary_loss_mlp": 0.01029032, + "balance_loss_clip": 1.03950357, + "balance_loss_mlp": 1.01594889, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 1.8265671712589229, + "language_loss": 0.77883184, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.80016565, + "num_input_tokens_seen": 307261485, + "step": 14246, + "time_per_iteration": 2.561725378036499 + }, + { + "auxiliary_loss_clip": 0.01084815, + "auxiliary_loss_mlp": 0.01028101, + "balance_loss_clip": 1.03080034, + "balance_loss_mlp": 1.01518512, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 1.8026662871141532, + "language_loss": 0.81310731, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.8342365, + "num_input_tokens_seen": 307279160, + "step": 14247, + "time_per_iteration": 2.5057077407836914 + }, + { + "auxiliary_loss_clip": 0.01089584, + "auxiliary_loss_mlp": 0.01031933, + "balance_loss_clip": 1.0369817, + "balance_loss_mlp": 1.0192548, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 1.68563955774876, + "language_loss": 0.77465022, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.79586536, + "num_input_tokens_seen": 307297920, + "step": 14248, + "time_per_iteration": 2.5453529357910156 + }, + { + "auxiliary_loss_clip": 0.01062458, + "auxiliary_loss_mlp": 0.01041725, + "balance_loss_clip": 1.03187919, + "balance_loss_mlp": 1.02719331, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 2.034577254990374, + "language_loss": 0.77931541, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.80035722, + "num_input_tokens_seen": 307318320, + "step": 14249, + "time_per_iteration": 2.611406087875366 + }, + { + "auxiliary_loss_clip": 0.01085774, + "auxiliary_loss_mlp": 0.01036192, + "balance_loss_clip": 1.03507233, + "balance_loss_mlp": 1.02377057, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 1.7295000905917677, + "language_loss": 0.78063601, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80185562, + "num_input_tokens_seen": 307336720, + "step": 14250, + "time_per_iteration": 2.545663833618164 + }, + { + "auxiliary_loss_clip": 0.0108631, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.03683972, + "balance_loss_mlp": 1.01944971, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 2.195041058097785, + "language_loss": 0.80226278, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.8234328, + "num_input_tokens_seen": 307354120, + "step": 14251, + "time_per_iteration": 2.531496524810791 + }, + { + "auxiliary_loss_clip": 0.01075578, + "auxiliary_loss_mlp": 0.01030786, + "balance_loss_clip": 1.04021978, + "balance_loss_mlp": 1.01888907, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 1.950891151049919, + "language_loss": 0.61745548, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.63851905, + "num_input_tokens_seen": 307373165, + "step": 14252, + "time_per_iteration": 2.5933609008789062 + }, + { + "auxiliary_loss_clip": 0.01089219, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.04068136, + "balance_loss_mlp": 1.0215627, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 1.831289210393909, + "language_loss": 0.70100641, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.72224498, + "num_input_tokens_seen": 307391000, + "step": 14253, + "time_per_iteration": 2.4717729091644287 + }, + { + "auxiliary_loss_clip": 0.01015225, + "auxiliary_loss_mlp": 0.01001003, + "balance_loss_clip": 1.01089621, + "balance_loss_mlp": 0.99973911, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.7877905971212481, + "language_loss": 0.5919441, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61210638, + "num_input_tokens_seen": 307452865, + "step": 14254, + "time_per_iteration": 3.1327528953552246 + }, + { + "auxiliary_loss_clip": 0.0108658, + "auxiliary_loss_mlp": 0.01033612, + "balance_loss_clip": 1.03648925, + "balance_loss_mlp": 1.02018297, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 1.8226991666102061, + "language_loss": 0.80911297, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83031487, + "num_input_tokens_seen": 307471940, + "step": 14255, + "time_per_iteration": 2.5341219902038574 + }, + { + "auxiliary_loss_clip": 0.0110803, + "auxiliary_loss_mlp": 0.01027825, + "balance_loss_clip": 1.03728473, + "balance_loss_mlp": 1.01541543, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 2.7906984539434143, + "language_loss": 0.67347538, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69483399, + "num_input_tokens_seen": 307488745, + "step": 14256, + "time_per_iteration": 2.5637001991271973 + }, + { + "auxiliary_loss_clip": 0.01098675, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.03658843, + "balance_loss_mlp": 1.02034807, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 1.5349228105649821, + "language_loss": 0.69978195, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.72109228, + "num_input_tokens_seen": 307506855, + "step": 14257, + "time_per_iteration": 3.9393770694732666 + }, + { + "auxiliary_loss_clip": 0.01075541, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.0335077, + "balance_loss_mlp": 1.02011955, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 2.3094633485211986, + "language_loss": 0.76977992, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.7908659, + "num_input_tokens_seen": 307526115, + "step": 14258, + "time_per_iteration": 2.622504234313965 + }, + { + "auxiliary_loss_clip": 0.01097467, + "auxiliary_loss_mlp": 0.0077745, + "balance_loss_clip": 1.03572989, + "balance_loss_mlp": 1.00071514, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 1.7695635816573123, + "language_loss": 0.68111181, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.69986099, + "num_input_tokens_seen": 307545230, + "step": 14259, + "time_per_iteration": 2.47591233253479 + }, + { + "auxiliary_loss_clip": 0.01095672, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.03320217, + "balance_loss_mlp": 1.0206151, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 1.7769533606433001, + "language_loss": 0.77136743, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.792656, + "num_input_tokens_seen": 307564900, + "step": 14260, + "time_per_iteration": 2.503706932067871 + }, + { + "auxiliary_loss_clip": 0.01084881, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.03356338, + "balance_loss_mlp": 1.01611543, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 2.0929226400613588, + "language_loss": 0.74174774, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76288819, + "num_input_tokens_seen": 307583500, + "step": 14261, + "time_per_iteration": 2.495924949645996 + }, + { + "auxiliary_loss_clip": 0.01097531, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.03760636, + "balance_loss_mlp": 1.02264833, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 1.7076445862995595, + "language_loss": 0.78877532, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.81011033, + "num_input_tokens_seen": 307601430, + "step": 14262, + "time_per_iteration": 2.42496395111084 + }, + { + "auxiliary_loss_clip": 0.01077501, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.03752136, + "balance_loss_mlp": 1.02529991, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 1.765854403399211, + "language_loss": 0.67978787, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.70093024, + "num_input_tokens_seen": 307621495, + "step": 14263, + "time_per_iteration": 2.5423154830932617 + }, + { + "auxiliary_loss_clip": 0.01073426, + "auxiliary_loss_mlp": 0.00779839, + "balance_loss_clip": 1.0353179, + "balance_loss_mlp": 1.00069857, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 1.6051429401413444, + "language_loss": 0.79799438, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81652701, + "num_input_tokens_seen": 307640840, + "step": 14264, + "time_per_iteration": 2.5348031520843506 + }, + { + "auxiliary_loss_clip": 0.01073076, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.03262925, + "balance_loss_mlp": 1.02205181, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 3.3296672896824915, + "language_loss": 0.69673896, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.7178179, + "num_input_tokens_seen": 307663820, + "step": 14265, + "time_per_iteration": 2.6316089630126953 + }, + { + "auxiliary_loss_clip": 0.01104307, + "auxiliary_loss_mlp": 0.01025156, + "balance_loss_clip": 1.03543282, + "balance_loss_mlp": 1.01356268, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 1.882221323128108, + "language_loss": 0.66019273, + "learning_rate": 2.085464646918027e-07, + "loss": 0.68148732, + "num_input_tokens_seen": 307682385, + "step": 14266, + "time_per_iteration": 2.4232683181762695 + }, + { + "auxiliary_loss_clip": 0.01087588, + "auxiliary_loss_mlp": 0.01029832, + "balance_loss_clip": 1.03654981, + "balance_loss_mlp": 1.01783931, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 1.7540263822192366, + "language_loss": 0.75345546, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77462959, + "num_input_tokens_seen": 307704680, + "step": 14267, + "time_per_iteration": 2.534271001815796 + }, + { + "auxiliary_loss_clip": 0.01097002, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.03725219, + "balance_loss_mlp": 1.02062964, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 1.7308291767019384, + "language_loss": 0.87975073, + "learning_rate": 2.082002873852946e-07, + "loss": 0.90104413, + "num_input_tokens_seen": 307723245, + "step": 14268, + "time_per_iteration": 2.446735143661499 + }, + { + "auxiliary_loss_clip": 0.01099801, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.03700018, + "balance_loss_mlp": 1.0231303, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 2.1575821555141053, + "language_loss": 0.72862196, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.74997628, + "num_input_tokens_seen": 307742510, + "step": 14269, + "time_per_iteration": 2.480602979660034 + }, + { + "auxiliary_loss_clip": 0.01099532, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.03583837, + "balance_loss_mlp": 1.0178144, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 1.5716146764405858, + "language_loss": 0.66213155, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68342793, + "num_input_tokens_seen": 307766030, + "step": 14270, + "time_per_iteration": 2.5796408653259277 + }, + { + "auxiliary_loss_clip": 0.01085497, + "auxiliary_loss_mlp": 0.01030715, + "balance_loss_clip": 1.03271616, + "balance_loss_mlp": 1.01781631, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 1.5497545559760253, + "language_loss": 0.73969501, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.76085711, + "num_input_tokens_seen": 307785800, + "step": 14271, + "time_per_iteration": 2.504256010055542 + }, + { + "auxiliary_loss_clip": 0.00993999, + "auxiliary_loss_mlp": 0.00753795, + "balance_loss_clip": 1.00897145, + "balance_loss_mlp": 1.00028503, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.7950030177864038, + "language_loss": 0.59374523, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.6112231, + "num_input_tokens_seen": 307850995, + "step": 14272, + "time_per_iteration": 3.186401844024658 + }, + { + "auxiliary_loss_clip": 0.01087972, + "auxiliary_loss_mlp": 0.01039333, + "balance_loss_clip": 1.03528476, + "balance_loss_mlp": 1.02494502, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 3.1593485894641558, + "language_loss": 0.75486851, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77614158, + "num_input_tokens_seen": 307868585, + "step": 14273, + "time_per_iteration": 3.9420573711395264 + }, + { + "auxiliary_loss_clip": 0.01098528, + "auxiliary_loss_mlp": 0.01032023, + "balance_loss_clip": 1.03638768, + "balance_loss_mlp": 1.01907718, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 1.7939594027504004, + "language_loss": 0.82085812, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84216368, + "num_input_tokens_seen": 307886820, + "step": 14274, + "time_per_iteration": 2.478163480758667 + }, + { + "auxiliary_loss_clip": 0.01017757, + "auxiliary_loss_mlp": 0.0100257, + "balance_loss_clip": 1.00393724, + "balance_loss_mlp": 1.00137794, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.7966217489641075, + "language_loss": 0.60765493, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.62785816, + "num_input_tokens_seen": 307944020, + "step": 14275, + "time_per_iteration": 3.100780725479126 + }, + { + "auxiliary_loss_clip": 0.01098159, + "auxiliary_loss_mlp": 0.01028732, + "balance_loss_clip": 1.03962553, + "balance_loss_mlp": 1.01532066, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 4.392579518608745, + "language_loss": 0.59037387, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.61164278, + "num_input_tokens_seen": 307961055, + "step": 14276, + "time_per_iteration": 2.517861843109131 + }, + { + "auxiliary_loss_clip": 0.01088766, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.03732824, + "balance_loss_mlp": 1.01844549, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 1.8757966795151706, + "language_loss": 0.76090074, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.78209555, + "num_input_tokens_seen": 307978690, + "step": 14277, + "time_per_iteration": 2.485663414001465 + }, + { + "auxiliary_loss_clip": 0.01086013, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.0345602, + "balance_loss_mlp": 1.01814139, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 1.6255936818640206, + "language_loss": 0.83501595, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.85618573, + "num_input_tokens_seen": 307995870, + "step": 14278, + "time_per_iteration": 2.4558730125427246 + }, + { + "auxiliary_loss_clip": 0.01088206, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.02054787, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 2.0268797539469463, + "language_loss": 0.74761146, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.76883119, + "num_input_tokens_seen": 308013645, + "step": 14279, + "time_per_iteration": 2.467280626296997 + }, + { + "auxiliary_loss_clip": 0.01109679, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.03810346, + "balance_loss_mlp": 1.0222044, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 4.852317197261575, + "language_loss": 0.66554344, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.68698502, + "num_input_tokens_seen": 308032490, + "step": 14280, + "time_per_iteration": 2.471355676651001 + }, + { + "auxiliary_loss_clip": 0.01094875, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.03562832, + "balance_loss_mlp": 1.01983035, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 2.060438154835789, + "language_loss": 0.6264773, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64774173, + "num_input_tokens_seen": 308052110, + "step": 14281, + "time_per_iteration": 2.4618635177612305 + }, + { + "auxiliary_loss_clip": 0.01089412, + "auxiliary_loss_mlp": 0.00777664, + "balance_loss_clip": 1.04003775, + "balance_loss_mlp": 1.00072336, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 1.7757269025816784, + "language_loss": 0.73221242, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.75088316, + "num_input_tokens_seen": 308070660, + "step": 14282, + "time_per_iteration": 2.490752935409546 + }, + { + "auxiliary_loss_clip": 0.01079989, + "auxiliary_loss_mlp": 0.01026206, + "balance_loss_clip": 1.03091216, + "balance_loss_mlp": 1.0149647, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 1.6493190220693354, + "language_loss": 0.75571442, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77677637, + "num_input_tokens_seen": 308089520, + "step": 14283, + "time_per_iteration": 3.95457124710083 + }, + { + "auxiliary_loss_clip": 0.01094928, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.03368032, + "balance_loss_mlp": 1.01912689, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 2.998434097904825, + "language_loss": 0.60327685, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.62454104, + "num_input_tokens_seen": 308111545, + "step": 14284, + "time_per_iteration": 2.5448999404907227 + }, + { + "auxiliary_loss_clip": 0.01086284, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.04084563, + "balance_loss_mlp": 1.0203495, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 1.777592728004262, + "language_loss": 0.75818205, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.77937067, + "num_input_tokens_seen": 308129690, + "step": 14285, + "time_per_iteration": 3.7971031665802 + }, + { + "auxiliary_loss_clip": 0.01097147, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.0401597, + "balance_loss_mlp": 1.01979709, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 1.9737375125675607, + "language_loss": 0.74113572, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.76243335, + "num_input_tokens_seen": 308147410, + "step": 14286, + "time_per_iteration": 2.432227611541748 + }, + { + "auxiliary_loss_clip": 0.01010381, + "auxiliary_loss_mlp": 0.00752851, + "balance_loss_clip": 1.00758743, + "balance_loss_mlp": 1.00015306, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 0.7742087027870205, + "language_loss": 0.49480709, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51243937, + "num_input_tokens_seen": 308204875, + "step": 14287, + "time_per_iteration": 3.0366084575653076 + }, + { + "auxiliary_loss_clip": 0.0110051, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.03805256, + "balance_loss_mlp": 1.0193305, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 1.7308634618942853, + "language_loss": 0.7894128, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81073058, + "num_input_tokens_seen": 308225690, + "step": 14288, + "time_per_iteration": 2.5234878063201904 + }, + { + "auxiliary_loss_clip": 0.01075662, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.04302239, + "balance_loss_mlp": 1.01914036, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 2.149758358490005, + "language_loss": 0.80875605, + "learning_rate": 2.045818444528553e-07, + "loss": 0.82983625, + "num_input_tokens_seen": 308245255, + "step": 14289, + "time_per_iteration": 2.5743727684020996 + }, + { + "auxiliary_loss_clip": 0.0110069, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.03697944, + "balance_loss_mlp": 1.02061284, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 1.6607212022438824, + "language_loss": 0.65255463, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.67389125, + "num_input_tokens_seen": 308261755, + "step": 14290, + "time_per_iteration": 2.436208486557007 + }, + { + "auxiliary_loss_clip": 0.01089326, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.03637421, + "balance_loss_mlp": 1.01806319, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 1.9001113488573622, + "language_loss": 0.54952192, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.57071984, + "num_input_tokens_seen": 308285145, + "step": 14291, + "time_per_iteration": 2.595203399658203 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01031704, + "balance_loss_clip": 1.03864396, + "balance_loss_mlp": 1.01965141, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 2.8231406283521303, + "language_loss": 0.71115905, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.73250532, + "num_input_tokens_seen": 308304130, + "step": 14292, + "time_per_iteration": 2.4401419162750244 + }, + { + "auxiliary_loss_clip": 0.01098672, + "auxiliary_loss_mlp": 0.01033262, + "balance_loss_clip": 1.03573775, + "balance_loss_mlp": 1.02100086, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 1.469419041084522, + "language_loss": 0.71458149, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73590082, + "num_input_tokens_seen": 308324670, + "step": 14293, + "time_per_iteration": 2.507662534713745 + }, + { + "auxiliary_loss_clip": 0.01080857, + "auxiliary_loss_mlp": 0.01032855, + "balance_loss_clip": 1.03743327, + "balance_loss_mlp": 1.02078497, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 1.7147371961746949, + "language_loss": 0.68390858, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.70504576, + "num_input_tokens_seen": 308344215, + "step": 14294, + "time_per_iteration": 2.4815073013305664 + }, + { + "auxiliary_loss_clip": 0.0110651, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.0359658, + "balance_loss_mlp": 1.02110791, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 2.2341718504474883, + "language_loss": 0.77828974, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.79968667, + "num_input_tokens_seen": 308360520, + "step": 14295, + "time_per_iteration": 2.428420305252075 + }, + { + "auxiliary_loss_clip": 0.01087011, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.03513861, + "balance_loss_mlp": 1.02624774, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 3.287138272332335, + "language_loss": 0.69253969, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.71382308, + "num_input_tokens_seen": 308376865, + "step": 14296, + "time_per_iteration": 3.8121933937072754 + }, + { + "auxiliary_loss_clip": 0.01082681, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.03385842, + "balance_loss_mlp": 1.02105844, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 2.4633358259365714, + "language_loss": 0.79506719, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.81623363, + "num_input_tokens_seen": 308395870, + "step": 14297, + "time_per_iteration": 2.537940263748169 + }, + { + "auxiliary_loss_clip": 0.01094972, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.03478909, + "balance_loss_mlp": 1.01836681, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 1.6508411589915155, + "language_loss": 0.67850727, + "learning_rate": 2.030402708016954e-07, + "loss": 0.69974953, + "num_input_tokens_seen": 308417250, + "step": 14298, + "time_per_iteration": 2.519622325897217 + }, + { + "auxiliary_loss_clip": 0.01084084, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.03446293, + "balance_loss_mlp": 1.02154827, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 1.9144043736315357, + "language_loss": 0.68325925, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.70443666, + "num_input_tokens_seen": 308434565, + "step": 14299, + "time_per_iteration": 2.4618968963623047 + }, + { + "auxiliary_loss_clip": 0.01081816, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.03888917, + "balance_loss_mlp": 1.02427721, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 2.4461810673220716, + "language_loss": 0.71568906, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.7368747, + "num_input_tokens_seen": 308450040, + "step": 14300, + "time_per_iteration": 2.620356321334839 + }, + { + "auxiliary_loss_clip": 0.0108346, + "auxiliary_loss_mlp": 0.01039136, + "balance_loss_clip": 1.03228712, + "balance_loss_mlp": 1.02561164, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 1.5227530140793513, + "language_loss": 0.69160724, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71283317, + "num_input_tokens_seen": 308470545, + "step": 14301, + "time_per_iteration": 2.5415422916412354 + }, + { + "auxiliary_loss_clip": 0.0106264, + "auxiliary_loss_mlp": 0.01032671, + "balance_loss_clip": 1.03830373, + "balance_loss_mlp": 1.02035689, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 1.7195109600065008, + "language_loss": 0.74307042, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76402354, + "num_input_tokens_seen": 308490020, + "step": 14302, + "time_per_iteration": 2.60760498046875 + }, + { + "auxiliary_loss_clip": 0.01092389, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.03318012, + "balance_loss_mlp": 1.01910734, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 1.818663724441081, + "language_loss": 0.83829474, + "learning_rate": 2.02186225623733e-07, + "loss": 0.85952246, + "num_input_tokens_seen": 308509065, + "step": 14303, + "time_per_iteration": 2.5049102306365967 + }, + { + "auxiliary_loss_clip": 0.01096822, + "auxiliary_loss_mlp": 0.01038336, + "balance_loss_clip": 1.03350627, + "balance_loss_mlp": 1.0247519, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 1.9787734383235245, + "language_loss": 0.7729094, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.79426098, + "num_input_tokens_seen": 308524725, + "step": 14304, + "time_per_iteration": 2.410804033279419 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.03744781, + "balance_loss_mlp": 1.02034736, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 2.0731407581759775, + "language_loss": 0.54302388, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.56446761, + "num_input_tokens_seen": 308543525, + "step": 14305, + "time_per_iteration": 2.378669261932373 + }, + { + "auxiliary_loss_clip": 0.01109105, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.03757548, + "balance_loss_mlp": 1.01881433, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 2.1624440015010853, + "language_loss": 0.83287978, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.85428786, + "num_input_tokens_seen": 308557995, + "step": 14306, + "time_per_iteration": 2.398721933364868 + }, + { + "auxiliary_loss_clip": 0.01096972, + "auxiliary_loss_mlp": 0.00777849, + "balance_loss_clip": 1.03621054, + "balance_loss_mlp": 1.00064552, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 1.3750353793656829, + "language_loss": 0.71661341, + "learning_rate": 2.01504216561474e-07, + "loss": 0.7353617, + "num_input_tokens_seen": 308582750, + "step": 14307, + "time_per_iteration": 2.549766778945923 + }, + { + "auxiliary_loss_clip": 0.01099144, + "auxiliary_loss_mlp": 0.00780338, + "balance_loss_clip": 1.03438401, + "balance_loss_mlp": 1.00076103, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 1.5515448733310118, + "language_loss": 0.63910633, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.65790117, + "num_input_tokens_seen": 308603770, + "step": 14308, + "time_per_iteration": 2.498544216156006 + }, + { + "auxiliary_loss_clip": 0.01012089, + "auxiliary_loss_mlp": 0.01006763, + "balance_loss_clip": 1.00739169, + "balance_loss_mlp": 1.00552881, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.6222882788791213, + "language_loss": 0.48395586, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50414437, + "num_input_tokens_seen": 308667735, + "step": 14309, + "time_per_iteration": 3.1522154808044434 + }, + { + "auxiliary_loss_clip": 0.01056361, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.03637147, + "balance_loss_mlp": 1.0212841, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 1.9618769735737005, + "language_loss": 0.67178369, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69269323, + "num_input_tokens_seen": 308686300, + "step": 14310, + "time_per_iteration": 2.6076929569244385 + }, + { + "auxiliary_loss_clip": 0.01049426, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.03477585, + "balance_loss_mlp": 1.02049077, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 2.179700056318066, + "language_loss": 0.78212345, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80294555, + "num_input_tokens_seen": 308705825, + "step": 14311, + "time_per_iteration": 2.621779680252075 + }, + { + "auxiliary_loss_clip": 0.01096093, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.03627455, + "balance_loss_mlp": 1.01823545, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 2.0159809483278432, + "language_loss": 0.71394706, + "learning_rate": 2.006532397626639e-07, + "loss": 0.73520696, + "num_input_tokens_seen": 308723340, + "step": 14312, + "time_per_iteration": 3.9299468994140625 + }, + { + "auxiliary_loss_clip": 0.01081657, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.0340215, + "balance_loss_mlp": 1.01974583, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 2.3354695718932192, + "language_loss": 0.77801859, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.79915702, + "num_input_tokens_seen": 308741280, + "step": 14313, + "time_per_iteration": 2.486067295074463 + }, + { + "auxiliary_loss_clip": 0.01085485, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.03356171, + "balance_loss_mlp": 1.01841998, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 1.9771156525977527, + "language_loss": 0.729312, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75048661, + "num_input_tokens_seen": 308762875, + "step": 14314, + "time_per_iteration": 2.606654405593872 + }, + { + "auxiliary_loss_clip": 0.01085478, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.03560197, + "balance_loss_mlp": 1.01845694, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 1.8524109788255145, + "language_loss": 0.69026399, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71142435, + "num_input_tokens_seen": 308780315, + "step": 14315, + "time_per_iteration": 2.5583419799804688 + }, + { + "auxiliary_loss_clip": 0.01097542, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.03674078, + "balance_loss_mlp": 1.02118385, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 4.223765958317835, + "language_loss": 0.72006047, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74136627, + "num_input_tokens_seen": 308799435, + "step": 14316, + "time_per_iteration": 2.4995741844177246 + }, + { + "auxiliary_loss_clip": 0.01092699, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.04089713, + "balance_loss_mlp": 1.01778018, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 2.020650978196182, + "language_loss": 0.82941103, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.8506366, + "num_input_tokens_seen": 308817730, + "step": 14317, + "time_per_iteration": 2.5177183151245117 + }, + { + "auxiliary_loss_clip": 0.01092518, + "auxiliary_loss_mlp": 0.01027833, + "balance_loss_clip": 1.03903317, + "balance_loss_mlp": 1.01561379, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 1.612573118386388, + "language_loss": 0.66896296, + "learning_rate": 1.996343193113108e-07, + "loss": 0.69016647, + "num_input_tokens_seen": 308841735, + "step": 14318, + "time_per_iteration": 2.7838637828826904 + }, + { + "auxiliary_loss_clip": 0.01094749, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.03601551, + "balance_loss_mlp": 1.01872456, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 1.5340876209467424, + "language_loss": 0.71483713, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73608351, + "num_input_tokens_seen": 308865050, + "step": 14319, + "time_per_iteration": 2.683260202407837 + }, + { + "auxiliary_loss_clip": 0.01090688, + "auxiliary_loss_mlp": 0.00777548, + "balance_loss_clip": 1.0366298, + "balance_loss_mlp": 1.00067735, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 2.3167736186135968, + "language_loss": 0.67295974, + "learning_rate": 1.992952252525839e-07, + "loss": 0.69164211, + "num_input_tokens_seen": 308885375, + "step": 14320, + "time_per_iteration": 2.5316660404205322 + }, + { + "auxiliary_loss_clip": 0.0108587, + "auxiliary_loss_mlp": 0.01038994, + "balance_loss_clip": 1.03715694, + "balance_loss_mlp": 1.02433705, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 2.0436727845591207, + "language_loss": 0.80215895, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82340765, + "num_input_tokens_seen": 308904700, + "step": 14321, + "time_per_iteration": 2.4850828647613525 + }, + { + "auxiliary_loss_clip": 0.01093889, + "auxiliary_loss_mlp": 0.00778098, + "balance_loss_clip": 1.03432226, + "balance_loss_mlp": 1.00055861, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 2.0069332036345773, + "language_loss": 0.71105117, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.72977102, + "num_input_tokens_seen": 308922985, + "step": 14322, + "time_per_iteration": 2.4615776538848877 + }, + { + "auxiliary_loss_clip": 0.01091097, + "auxiliary_loss_mlp": 0.01037339, + "balance_loss_clip": 1.03572786, + "balance_loss_mlp": 1.02396321, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 3.313170748494139, + "language_loss": 0.55939615, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.58068049, + "num_input_tokens_seen": 308940765, + "step": 14323, + "time_per_iteration": 3.9850540161132812 + }, + { + "auxiliary_loss_clip": 0.01073416, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.0336237, + "balance_loss_mlp": 1.01667595, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 1.6732925318726515, + "language_loss": 0.75694877, + "learning_rate": 1.986178565813801e-07, + "loss": 0.77796912, + "num_input_tokens_seen": 308960110, + "step": 14324, + "time_per_iteration": 3.8056108951568604 + }, + { + "auxiliary_loss_clip": 0.01066225, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.03576815, + "balance_loss_mlp": 1.0199616, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 2.0314588983199684, + "language_loss": 0.66675305, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.68776274, + "num_input_tokens_seen": 308976665, + "step": 14325, + "time_per_iteration": 2.548029661178589 + }, + { + "auxiliary_loss_clip": 0.01099334, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.03609431, + "balance_loss_mlp": 1.01929545, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 1.9315341989766803, + "language_loss": 0.65071714, + "learning_rate": 1.982795820716472e-07, + "loss": 0.67203474, + "num_input_tokens_seen": 308997015, + "step": 14326, + "time_per_iteration": 2.4733710289001465 + }, + { + "auxiliary_loss_clip": 0.01087134, + "auxiliary_loss_mlp": 0.0103282, + "balance_loss_clip": 1.03276122, + "balance_loss_mlp": 1.01961756, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 1.980357570927985, + "language_loss": 0.84394902, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.86514854, + "num_input_tokens_seen": 309015250, + "step": 14327, + "time_per_iteration": 2.4769887924194336 + }, + { + "auxiliary_loss_clip": 0.01098526, + "auxiliary_loss_mlp": 0.01031832, + "balance_loss_clip": 1.03555202, + "balance_loss_mlp": 1.01958299, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 2.8750177210702996, + "language_loss": 0.75276816, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.77407175, + "num_input_tokens_seen": 309034140, + "step": 14328, + "time_per_iteration": 2.4822354316711426 + }, + { + "auxiliary_loss_clip": 0.01097791, + "auxiliary_loss_mlp": 0.01029455, + "balance_loss_clip": 1.03543639, + "balance_loss_mlp": 1.01736712, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 1.653236937681769, + "language_loss": 0.80285788, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.8241303, + "num_input_tokens_seen": 309055075, + "step": 14329, + "time_per_iteration": 2.502821922302246 + }, + { + "auxiliary_loss_clip": 0.0108282, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.03729904, + "balance_loss_mlp": 1.01890945, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 2.7808176487901197, + "language_loss": 0.76936877, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.79050791, + "num_input_tokens_seen": 309074650, + "step": 14330, + "time_per_iteration": 2.597679615020752 + }, + { + "auxiliary_loss_clip": 0.01096724, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.03471303, + "balance_loss_mlp": 1.01677227, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 2.2252538938059057, + "language_loss": 0.65015882, + "learning_rate": 1.974350915342702e-07, + "loss": 0.67142051, + "num_input_tokens_seen": 309094385, + "step": 14331, + "time_per_iteration": 2.4601383209228516 + }, + { + "auxiliary_loss_clip": 0.01083527, + "auxiliary_loss_mlp": 0.01036229, + "balance_loss_clip": 1.03686261, + "balance_loss_mlp": 1.0250349, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 1.9269101731287404, + "language_loss": 0.75893116, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.78012872, + "num_input_tokens_seen": 309111815, + "step": 14332, + "time_per_iteration": 2.4853646755218506 + }, + { + "auxiliary_loss_clip": 0.01100938, + "auxiliary_loss_mlp": 0.01031365, + "balance_loss_clip": 1.0400455, + "balance_loss_mlp": 1.01759052, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 2.0569691270670853, + "language_loss": 0.67350525, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.69482827, + "num_input_tokens_seen": 309131385, + "step": 14333, + "time_per_iteration": 2.4867806434631348 + }, + { + "auxiliary_loss_clip": 0.01087413, + "auxiliary_loss_mlp": 0.0103978, + "balance_loss_clip": 1.03656173, + "balance_loss_mlp": 1.02521253, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 3.6798354887672886, + "language_loss": 0.62494171, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64621359, + "num_input_tokens_seen": 309155020, + "step": 14334, + "time_per_iteration": 2.6326935291290283 + }, + { + "auxiliary_loss_clip": 0.01082332, + "auxiliary_loss_mlp": 0.0105038, + "balance_loss_clip": 1.03784704, + "balance_loss_mlp": 1.03677249, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 2.5025608654885416, + "language_loss": 0.69579804, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71712518, + "num_input_tokens_seen": 309172865, + "step": 14335, + "time_per_iteration": 2.5270330905914307 + }, + { + "auxiliary_loss_clip": 0.01099945, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.03730142, + "balance_loss_mlp": 1.01929903, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 1.4955384704474244, + "language_loss": 0.82891876, + "learning_rate": 1.965923098328135e-07, + "loss": 0.85023379, + "num_input_tokens_seen": 309193575, + "step": 14336, + "time_per_iteration": 4.16803765296936 + }, + { + "auxiliary_loss_clip": 0.01112526, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.03666461, + "balance_loss_mlp": 1.01876926, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 1.673397952266588, + "language_loss": 0.67274439, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.6941846, + "num_input_tokens_seen": 309212680, + "step": 14337, + "time_per_iteration": 2.4690182209014893 + }, + { + "auxiliary_loss_clip": 0.01071974, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.03035593, + "balance_loss_mlp": 1.02136207, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 1.7128362838427384, + "language_loss": 0.67306256, + "learning_rate": 1.962556758053089e-07, + "loss": 0.69412255, + "num_input_tokens_seen": 309234485, + "step": 14338, + "time_per_iteration": 2.6722710132598877 + }, + { + "auxiliary_loss_clip": 0.01087499, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.03655839, + "balance_loss_mlp": 1.01998401, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 3.0538421659915747, + "language_loss": 0.61906576, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.64025688, + "num_input_tokens_seen": 309253630, + "step": 14339, + "time_per_iteration": 2.489081382751465 + }, + { + "auxiliary_loss_clip": 0.0108749, + "auxiliary_loss_mlp": 0.00777814, + "balance_loss_clip": 1.03472888, + "balance_loss_mlp": 1.00062954, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 1.9278045710524259, + "language_loss": 0.62678546, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64543843, + "num_input_tokens_seen": 309270950, + "step": 14340, + "time_per_iteration": 2.471499443054199 + }, + { + "auxiliary_loss_clip": 0.0106017, + "auxiliary_loss_mlp": 0.01022739, + "balance_loss_clip": 1.03352928, + "balance_loss_mlp": 1.01136637, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 1.7687484593241325, + "language_loss": 0.80045092, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82128, + "num_input_tokens_seen": 309288780, + "step": 14341, + "time_per_iteration": 2.566405773162842 + }, + { + "auxiliary_loss_clip": 0.01096325, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.03932762, + "balance_loss_mlp": 1.02139091, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 3.667000099728719, + "language_loss": 0.74848783, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.76978719, + "num_input_tokens_seen": 309310875, + "step": 14342, + "time_per_iteration": 2.5024361610412598 + }, + { + "auxiliary_loss_clip": 0.01071634, + "auxiliary_loss_mlp": 0.01027562, + "balance_loss_clip": 1.03585362, + "balance_loss_mlp": 1.01449633, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 1.7980642199490584, + "language_loss": 0.68446875, + "learning_rate": 1.95415287816028e-07, + "loss": 0.70546067, + "num_input_tokens_seen": 309329900, + "step": 14343, + "time_per_iteration": 2.4903290271759033 + }, + { + "auxiliary_loss_clip": 0.01098872, + "auxiliary_loss_mlp": 0.01040752, + "balance_loss_clip": 1.03621364, + "balance_loss_mlp": 1.02672672, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 1.7315379966568716, + "language_loss": 0.6741966, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.69559288, + "num_input_tokens_seen": 309347870, + "step": 14344, + "time_per_iteration": 2.4182817935943604 + }, + { + "auxiliary_loss_clip": 0.01069555, + "auxiliary_loss_mlp": 0.01041869, + "balance_loss_clip": 1.03305709, + "balance_loss_mlp": 1.02798104, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 1.3986154198469172, + "language_loss": 0.81417096, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83528519, + "num_input_tokens_seen": 309371695, + "step": 14345, + "time_per_iteration": 2.59131121635437 + }, + { + "auxiliary_loss_clip": 0.0110353, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.03822136, + "balance_loss_mlp": 1.01608789, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 2.022025283648962, + "language_loss": 0.50340867, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.5247314, + "num_input_tokens_seen": 309394645, + "step": 14346, + "time_per_iteration": 2.5965416431427 + }, + { + "auxiliary_loss_clip": 0.01036306, + "auxiliary_loss_mlp": 0.01030907, + "balance_loss_clip": 1.03513014, + "balance_loss_mlp": 1.01768088, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 1.4118555669682185, + "language_loss": 0.74784738, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.76851952, + "num_input_tokens_seen": 309413170, + "step": 14347, + "time_per_iteration": 2.674562454223633 + }, + { + "auxiliary_loss_clip": 0.01083864, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.035218, + "balance_loss_mlp": 1.01804996, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 1.880685573594303, + "language_loss": 0.80887705, + "learning_rate": 1.945766105774449e-07, + "loss": 0.83002871, + "num_input_tokens_seen": 309431315, + "step": 14348, + "time_per_iteration": 2.523744583129883 + }, + { + "auxiliary_loss_clip": 0.01091752, + "auxiliary_loss_mlp": 0.01029356, + "balance_loss_clip": 1.03418875, + "balance_loss_mlp": 1.01766706, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 1.6290731071322986, + "language_loss": 0.66195136, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68316245, + "num_input_tokens_seen": 309453020, + "step": 14349, + "time_per_iteration": 2.6139981746673584 + }, + { + "auxiliary_loss_clip": 0.01099566, + "auxiliary_loss_mlp": 0.01041153, + "balance_loss_clip": 1.0360316, + "balance_loss_mlp": 1.02823019, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 2.1134998545018098, + "language_loss": 0.69952524, + "learning_rate": 1.942416188703573e-07, + "loss": 0.72093236, + "num_input_tokens_seen": 309469780, + "step": 14350, + "time_per_iteration": 2.4434385299682617 + }, + { + "auxiliary_loss_clip": 0.01082062, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.0341568, + "balance_loss_mlp": 1.02115726, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 1.6954437629057557, + "language_loss": 0.76993549, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.79109532, + "num_input_tokens_seen": 309489610, + "step": 14351, + "time_per_iteration": 2.491042375564575 + }, + { + "auxiliary_loss_clip": 0.01099372, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.03813672, + "balance_loss_mlp": 1.02201629, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 2.344971281651446, + "language_loss": 0.84484518, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.86617732, + "num_input_tokens_seen": 309508295, + "step": 14352, + "time_per_iteration": 3.964036464691162 + }, + { + "auxiliary_loss_clip": 0.01021729, + "auxiliary_loss_mlp": 0.01002343, + "balance_loss_clip": 1.00746703, + "balance_loss_mlp": 1.00121021, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.7854421123171762, + "language_loss": 0.61885935, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.63910007, + "num_input_tokens_seen": 309567960, + "step": 14353, + "time_per_iteration": 3.0850889682769775 + }, + { + "auxiliary_loss_clip": 0.01109803, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.03858948, + "balance_loss_mlp": 1.01617432, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 1.9797760766655503, + "language_loss": 0.81847697, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.83985078, + "num_input_tokens_seen": 309586050, + "step": 14354, + "time_per_iteration": 2.444509267807007 + }, + { + "auxiliary_loss_clip": 0.01088007, + "auxiliary_loss_mlp": 0.01025482, + "balance_loss_clip": 1.03347301, + "balance_loss_mlp": 1.01295269, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 1.9957323040330543, + "language_loss": 0.85611087, + "learning_rate": 1.934053380181031e-07, + "loss": 0.87724578, + "num_input_tokens_seen": 309602910, + "step": 14355, + "time_per_iteration": 2.4595160484313965 + }, + { + "auxiliary_loss_clip": 0.01073967, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.03543961, + "balance_loss_mlp": 1.02059186, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 1.9726644501775998, + "language_loss": 0.58922446, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.61029851, + "num_input_tokens_seen": 309621175, + "step": 14356, + "time_per_iteration": 2.534729480743408 + }, + { + "auxiliary_loss_clip": 0.0106844, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.03634155, + "balance_loss_mlp": 1.01955009, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 1.7715029405685552, + "language_loss": 0.76975971, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79077244, + "num_input_tokens_seen": 309639395, + "step": 14357, + "time_per_iteration": 2.528130054473877 + }, + { + "auxiliary_loss_clip": 0.01099736, + "auxiliary_loss_mlp": 0.01030055, + "balance_loss_clip": 1.03729725, + "balance_loss_mlp": 1.01719856, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 2.25530215413704, + "language_loss": 0.77302796, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.79432589, + "num_input_tokens_seen": 309657265, + "step": 14358, + "time_per_iteration": 2.4446120262145996 + }, + { + "auxiliary_loss_clip": 0.01075799, + "auxiliary_loss_mlp": 0.01036892, + "balance_loss_clip": 1.03273189, + "balance_loss_mlp": 1.02224123, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.4567490485631092, + "language_loss": 0.75179017, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.77291709, + "num_input_tokens_seen": 309678610, + "step": 14359, + "time_per_iteration": 2.585766553878784 + }, + { + "auxiliary_loss_clip": 0.01049027, + "auxiliary_loss_mlp": 0.01026185, + "balance_loss_clip": 1.03269839, + "balance_loss_mlp": 1.01362658, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 1.9448850134375366, + "language_loss": 0.70345271, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72420484, + "num_input_tokens_seen": 309697710, + "step": 14360, + "time_per_iteration": 2.591306686401367 + }, + { + "auxiliary_loss_clip": 0.01081169, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.03862023, + "balance_loss_mlp": 1.01962876, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 6.047302701015315, + "language_loss": 0.76024091, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.78138214, + "num_input_tokens_seen": 309715985, + "step": 14361, + "time_per_iteration": 2.539994955062866 + }, + { + "auxiliary_loss_clip": 0.01027708, + "auxiliary_loss_mlp": 0.01000561, + "balance_loss_clip": 1.00443196, + "balance_loss_mlp": 0.99936926, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.9612368343761608, + "language_loss": 0.58769214, + "learning_rate": 1.922374222645329e-07, + "loss": 0.60797483, + "num_input_tokens_seen": 309779930, + "step": 14362, + "time_per_iteration": 4.43407678604126 + }, + { + "auxiliary_loss_clip": 0.0105231, + "auxiliary_loss_mlp": 0.01032778, + "balance_loss_clip": 1.04418492, + "balance_loss_mlp": 1.01911092, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 1.7645440059448325, + "language_loss": 0.80717635, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.82802725, + "num_input_tokens_seen": 309800580, + "step": 14363, + "time_per_iteration": 4.059329032897949 + }, + { + "auxiliary_loss_clip": 0.01084612, + "auxiliary_loss_mlp": 0.01048506, + "balance_loss_clip": 1.03332353, + "balance_loss_mlp": 1.0336411, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 4.157146832394927, + "language_loss": 0.72322547, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.74455667, + "num_input_tokens_seen": 309821725, + "step": 14364, + "time_per_iteration": 2.539978504180908 + }, + { + "auxiliary_loss_clip": 0.01088397, + "auxiliary_loss_mlp": 0.01036888, + "balance_loss_clip": 1.03435981, + "balance_loss_mlp": 1.02387071, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 1.5222802863707698, + "language_loss": 0.71761054, + "learning_rate": 1.917379150731755e-07, + "loss": 0.73886335, + "num_input_tokens_seen": 309841565, + "step": 14365, + "time_per_iteration": 2.5422282218933105 + }, + { + "auxiliary_loss_clip": 0.01089631, + "auxiliary_loss_mlp": 0.01049521, + "balance_loss_clip": 1.03776228, + "balance_loss_mlp": 1.03400028, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 2.5928694314474816, + "language_loss": 0.70874983, + "learning_rate": 1.915715498065993e-07, + "loss": 0.73014134, + "num_input_tokens_seen": 309858635, + "step": 14366, + "time_per_iteration": 2.517674446105957 + }, + { + "auxiliary_loss_clip": 0.01082612, + "auxiliary_loss_mlp": 0.01026349, + "balance_loss_clip": 1.03808427, + "balance_loss_mlp": 1.01490533, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 2.766291426601554, + "language_loss": 0.8175562, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.83864582, + "num_input_tokens_seen": 309877885, + "step": 14367, + "time_per_iteration": 2.5040900707244873 + }, + { + "auxiliary_loss_clip": 0.01091175, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.04283917, + "balance_loss_mlp": 1.0180192, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 2.2069572441093372, + "language_loss": 0.61907494, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.64030313, + "num_input_tokens_seen": 309893140, + "step": 14368, + "time_per_iteration": 2.5137109756469727 + }, + { + "auxiliary_loss_clip": 0.01098246, + "auxiliary_loss_mlp": 0.010294, + "balance_loss_clip": 1.03708041, + "balance_loss_mlp": 1.01715755, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 2.471222971844612, + "language_loss": 0.76379514, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78507161, + "num_input_tokens_seen": 309914175, + "step": 14369, + "time_per_iteration": 2.4742071628570557 + }, + { + "auxiliary_loss_clip": 0.01086837, + "auxiliary_loss_mlp": 0.01036443, + "balance_loss_clip": 1.03711963, + "balance_loss_mlp": 1.02316284, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 1.9553237629955034, + "language_loss": 0.64371097, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66494381, + "num_input_tokens_seen": 309932395, + "step": 14370, + "time_per_iteration": 2.5440309047698975 + }, + { + "auxiliary_loss_clip": 0.01053226, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.03880787, + "balance_loss_mlp": 1.01987457, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 1.5955731113207736, + "language_loss": 0.6647988, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68565923, + "num_input_tokens_seen": 309951720, + "step": 14371, + "time_per_iteration": 2.6177351474761963 + }, + { + "auxiliary_loss_clip": 0.01012453, + "auxiliary_loss_mlp": 0.01000399, + "balance_loss_clip": 1.0075984, + "balance_loss_mlp": 0.99913579, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.8614410938672558, + "language_loss": 0.5694086, + "learning_rate": 1.905747985193107e-07, + "loss": 0.58953714, + "num_input_tokens_seen": 310006120, + "step": 14372, + "time_per_iteration": 2.94571852684021 + }, + { + "auxiliary_loss_clip": 0.01109881, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.03934741, + "balance_loss_mlp": 1.02268803, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 2.7249873164616427, + "language_loss": 0.79185188, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81330907, + "num_input_tokens_seen": 310026740, + "step": 14373, + "time_per_iteration": 2.445563316345215 + }, + { + "auxiliary_loss_clip": 0.01109311, + "auxiliary_loss_mlp": 0.01028094, + "balance_loss_clip": 1.03678441, + "balance_loss_mlp": 1.01534438, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 1.8048433113270774, + "language_loss": 0.63783765, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.65921175, + "num_input_tokens_seen": 310044135, + "step": 14374, + "time_per_iteration": 2.398439407348633 + }, + { + "auxiliary_loss_clip": 0.01081752, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.03853273, + "balance_loss_mlp": 1.02227163, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 1.8829207276449502, + "language_loss": 0.77169687, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79286498, + "num_input_tokens_seen": 310061560, + "step": 14375, + "time_per_iteration": 4.066818714141846 + }, + { + "auxiliary_loss_clip": 0.01066517, + "auxiliary_loss_mlp": 0.00777366, + "balance_loss_clip": 1.03572845, + "balance_loss_mlp": 1.00056911, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 1.6029359539394754, + "language_loss": 0.60404772, + "learning_rate": 1.899116698488117e-07, + "loss": 0.62248659, + "num_input_tokens_seen": 310087310, + "step": 14376, + "time_per_iteration": 2.896907329559326 + }, + { + "auxiliary_loss_clip": 0.01067023, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.03327966, + "balance_loss_mlp": 1.02580357, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 1.456589399305999, + "language_loss": 0.66149008, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68253756, + "num_input_tokens_seen": 310106260, + "step": 14377, + "time_per_iteration": 2.503887176513672 + }, + { + "auxiliary_loss_clip": 0.01083589, + "auxiliary_loss_mlp": 0.01039742, + "balance_loss_clip": 1.03222036, + "balance_loss_mlp": 1.02527046, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 1.6576423130145277, + "language_loss": 0.70095146, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72218478, + "num_input_tokens_seen": 310125305, + "step": 14378, + "time_per_iteration": 2.49234676361084 + }, + { + "auxiliary_loss_clip": 0.01019178, + "auxiliary_loss_mlp": 0.01003197, + "balance_loss_clip": 1.00564039, + "balance_loss_mlp": 1.00204647, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.8033199822274893, + "language_loss": 0.6028657, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62308949, + "num_input_tokens_seen": 310189270, + "step": 14379, + "time_per_iteration": 3.043177843093872 + }, + { + "auxiliary_loss_clip": 0.0108059, + "auxiliary_loss_mlp": 0.01032143, + "balance_loss_clip": 1.03451085, + "balance_loss_mlp": 1.01977515, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 1.5669066930402378, + "language_loss": 0.74540544, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76653278, + "num_input_tokens_seen": 310208395, + "step": 14380, + "time_per_iteration": 2.49349308013916 + }, + { + "auxiliary_loss_clip": 0.01082059, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.03193927, + "balance_loss_mlp": 1.01844692, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 1.9688173951310624, + "language_loss": 0.75622761, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.77736568, + "num_input_tokens_seen": 310227415, + "step": 14381, + "time_per_iteration": 2.482187032699585 + }, + { + "auxiliary_loss_clip": 0.0108548, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.03678226, + "balance_loss_mlp": 1.02201998, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 2.2992120505405014, + "language_loss": 0.84606832, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.86726308, + "num_input_tokens_seen": 310242625, + "step": 14382, + "time_per_iteration": 2.438328981399536 + }, + { + "auxiliary_loss_clip": 0.01101809, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.03808272, + "balance_loss_mlp": 1.01970291, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 2.137537903784015, + "language_loss": 0.75672889, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.77807033, + "num_input_tokens_seen": 310260585, + "step": 14383, + "time_per_iteration": 2.451669454574585 + }, + { + "auxiliary_loss_clip": 0.01089509, + "auxiliary_loss_mlp": 0.01029976, + "balance_loss_clip": 1.03779066, + "balance_loss_mlp": 1.01741159, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 1.8785028040642195, + "language_loss": 0.85277581, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87397075, + "num_input_tokens_seen": 310277210, + "step": 14384, + "time_per_iteration": 2.472929000854492 + }, + { + "auxiliary_loss_clip": 0.01095356, + "auxiliary_loss_mlp": 0.01029344, + "balance_loss_clip": 1.03482568, + "balance_loss_mlp": 1.0170064, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 2.161061849962912, + "language_loss": 0.81264496, + "learning_rate": 1.884236463176072e-07, + "loss": 0.83389199, + "num_input_tokens_seen": 310296610, + "step": 14385, + "time_per_iteration": 2.439546585083008 + }, + { + "auxiliary_loss_clip": 0.01094883, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.04003096, + "balance_loss_mlp": 1.01773632, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 2.988262447501721, + "language_loss": 0.7296229, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.75087875, + "num_input_tokens_seen": 310316830, + "step": 14386, + "time_per_iteration": 2.515266180038452 + }, + { + "auxiliary_loss_clip": 0.01093657, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.03371382, + "balance_loss_mlp": 1.0254966, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 1.8039726550667492, + "language_loss": 0.82228827, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84361064, + "num_input_tokens_seen": 310334355, + "step": 14387, + "time_per_iteration": 2.4103336334228516 + }, + { + "auxiliary_loss_clip": 0.01108071, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.03802133, + "balance_loss_mlp": 1.01905775, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 2.0918671824218706, + "language_loss": 0.68966508, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.71105576, + "num_input_tokens_seen": 310352900, + "step": 14388, + "time_per_iteration": 2.390690565109253 + }, + { + "auxiliary_loss_clip": 0.01075194, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.03716803, + "balance_loss_mlp": 1.02141929, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 1.7011930344335975, + "language_loss": 0.90304947, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92412704, + "num_input_tokens_seen": 310372855, + "step": 14389, + "time_per_iteration": 2.546614646911621 + }, + { + "auxiliary_loss_clip": 0.0106582, + "auxiliary_loss_mlp": 0.00776806, + "balance_loss_clip": 1.03842616, + "balance_loss_mlp": 1.00069678, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 2.5908615387752274, + "language_loss": 0.70864487, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.72707111, + "num_input_tokens_seen": 310391595, + "step": 14390, + "time_per_iteration": 2.547891139984131 + }, + { + "auxiliary_loss_clip": 0.01112206, + "auxiliary_loss_mlp": 0.0103616, + "balance_loss_clip": 1.03805923, + "balance_loss_mlp": 1.02363133, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 1.791103881242051, + "language_loss": 0.81989992, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84138358, + "num_input_tokens_seen": 310410090, + "step": 14391, + "time_per_iteration": 3.956988573074341 + }, + { + "auxiliary_loss_clip": 0.01014126, + "auxiliary_loss_mlp": 0.01002627, + "balance_loss_clip": 1.02353501, + "balance_loss_mlp": 1.0014292, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.7950641001800015, + "language_loss": 0.67934, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.69950753, + "num_input_tokens_seen": 310470055, + "step": 14392, + "time_per_iteration": 3.027092695236206 + }, + { + "auxiliary_loss_clip": 0.01104511, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_clip": 1.03767705, + "balance_loss_mlp": 1.0227952, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 2.090253373208343, + "language_loss": 0.75839865, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.77980357, + "num_input_tokens_seen": 310487665, + "step": 14393, + "time_per_iteration": 2.450969696044922 + }, + { + "auxiliary_loss_clip": 0.01086898, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.03293514, + "balance_loss_mlp": 1.02256346, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 1.834935728846406, + "language_loss": 0.73979294, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.76101029, + "num_input_tokens_seen": 310506130, + "step": 14394, + "time_per_iteration": 2.4948344230651855 + }, + { + "auxiliary_loss_clip": 0.01100755, + "auxiliary_loss_mlp": 0.01031007, + "balance_loss_clip": 1.03537035, + "balance_loss_mlp": 1.01757216, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 1.9040073967110782, + "language_loss": 0.65221655, + "learning_rate": 1.867768130747036e-07, + "loss": 0.67353415, + "num_input_tokens_seen": 310532445, + "step": 14395, + "time_per_iteration": 2.7780396938323975 + }, + { + "auxiliary_loss_clip": 0.01092668, + "auxiliary_loss_mlp": 0.01036199, + "balance_loss_clip": 1.03474808, + "balance_loss_mlp": 1.02362859, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 1.8315319329633868, + "language_loss": 0.67829347, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.6995821, + "num_input_tokens_seen": 310552300, + "step": 14396, + "time_per_iteration": 2.4697272777557373 + }, + { + "auxiliary_loss_clip": 0.01102239, + "auxiliary_loss_mlp": 0.01035048, + "balance_loss_clip": 1.03884292, + "balance_loss_mlp": 1.02197051, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 2.576751367727253, + "language_loss": 0.69433284, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71570569, + "num_input_tokens_seen": 310572710, + "step": 14397, + "time_per_iteration": 2.4793496131896973 + }, + { + "auxiliary_loss_clip": 0.01094379, + "auxiliary_loss_mlp": 0.01029756, + "balance_loss_clip": 1.03885865, + "balance_loss_mlp": 1.01782942, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 2.1320558957343665, + "language_loss": 0.63396311, + "learning_rate": 1.86284103591253e-07, + "loss": 0.65520447, + "num_input_tokens_seen": 310592460, + "step": 14398, + "time_per_iteration": 2.516051769256592 + }, + { + "auxiliary_loss_clip": 0.01071934, + "auxiliary_loss_mlp": 0.01037693, + "balance_loss_clip": 1.03426445, + "balance_loss_mlp": 1.02374601, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 2.411245307817647, + "language_loss": 0.76355422, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78465044, + "num_input_tokens_seen": 310609375, + "step": 14399, + "time_per_iteration": 2.5130372047424316 + }, + { + "auxiliary_loss_clip": 0.01092002, + "auxiliary_loss_mlp": 0.01029092, + "balance_loss_clip": 1.03706646, + "balance_loss_mlp": 1.01742744, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 2.115496034304214, + "language_loss": 0.92906547, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95027637, + "num_input_tokens_seen": 310627405, + "step": 14400, + "time_per_iteration": 2.431095838546753 + }, + { + "auxiliary_loss_clip": 0.01047441, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.03519773, + "balance_loss_mlp": 1.02204561, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 1.8469744493046447, + "language_loss": 0.67409444, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69491339, + "num_input_tokens_seen": 310649945, + "step": 14401, + "time_per_iteration": 4.137044668197632 + }, + { + "auxiliary_loss_clip": 0.01098649, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.0353092, + "balance_loss_mlp": 1.01788056, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 2.606469819768503, + "language_loss": 0.74122143, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.76251078, + "num_input_tokens_seen": 310668285, + "step": 14402, + "time_per_iteration": 2.422844648361206 + }, + { + "auxiliary_loss_clip": 0.01041241, + "auxiliary_loss_mlp": 0.01032394, + "balance_loss_clip": 1.03544402, + "balance_loss_mlp": 1.02063429, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 1.6449214796047056, + "language_loss": 0.74948084, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.77021724, + "num_input_tokens_seen": 310687015, + "step": 14403, + "time_per_iteration": 3.9213361740112305 + }, + { + "auxiliary_loss_clip": 0.01084429, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.03425753, + "balance_loss_mlp": 1.02215934, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 1.673696741001787, + "language_loss": 0.73141146, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75261509, + "num_input_tokens_seen": 310707580, + "step": 14404, + "time_per_iteration": 2.526460647583008 + }, + { + "auxiliary_loss_clip": 0.01073093, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.03791213, + "balance_loss_mlp": 1.02156329, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 1.6483452446298854, + "language_loss": 0.70790124, + "learning_rate": 1.851368555901447e-07, + "loss": 0.72897756, + "num_input_tokens_seen": 310727300, + "step": 14405, + "time_per_iteration": 2.5507447719573975 + }, + { + "auxiliary_loss_clip": 0.01099981, + "auxiliary_loss_mlp": 0.00778557, + "balance_loss_clip": 1.03563428, + "balance_loss_mlp": 1.00071311, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 1.716895131040654, + "language_loss": 0.66258991, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.68137532, + "num_input_tokens_seen": 310744935, + "step": 14406, + "time_per_iteration": 2.4346790313720703 + }, + { + "auxiliary_loss_clip": 0.01091001, + "auxiliary_loss_mlp": 0.01025769, + "balance_loss_clip": 1.03824234, + "balance_loss_mlp": 1.01458144, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 1.8032602505573296, + "language_loss": 0.83232158, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.85348934, + "num_input_tokens_seen": 310765085, + "step": 14407, + "time_per_iteration": 2.50221586227417 + }, + { + "auxiliary_loss_clip": 0.0109794, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.0371685, + "balance_loss_mlp": 1.02453673, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 1.853624515728243, + "language_loss": 0.70191407, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.72326046, + "num_input_tokens_seen": 310783260, + "step": 14408, + "time_per_iteration": 2.463480234146118 + }, + { + "auxiliary_loss_clip": 0.01090801, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.03717494, + "balance_loss_mlp": 1.01846719, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 2.154585742055808, + "language_loss": 0.77340281, + "learning_rate": 1.844827992025304e-07, + "loss": 0.7946099, + "num_input_tokens_seen": 310801970, + "step": 14409, + "time_per_iteration": 2.4232161045074463 + }, + { + "auxiliary_loss_clip": 0.01102248, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.03878689, + "balance_loss_mlp": 1.01913917, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 1.9676396459887768, + "language_loss": 0.77307642, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.79442644, + "num_input_tokens_seen": 310822070, + "step": 14410, + "time_per_iteration": 2.458724021911621 + }, + { + "auxiliary_loss_clip": 0.01072123, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.03798079, + "balance_loss_mlp": 1.02079749, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 2.2741560951281476, + "language_loss": 0.77706438, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79812515, + "num_input_tokens_seen": 310838355, + "step": 14411, + "time_per_iteration": 2.5286178588867188 + }, + { + "auxiliary_loss_clip": 0.01083178, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.03757811, + "balance_loss_mlp": 1.01991272, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 1.7180523498683593, + "language_loss": 0.73798662, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.75913197, + "num_input_tokens_seen": 310856055, + "step": 14412, + "time_per_iteration": 2.492805004119873 + }, + { + "auxiliary_loss_clip": 0.01091131, + "auxiliary_loss_mlp": 0.00780344, + "balance_loss_clip": 1.03345287, + "balance_loss_mlp": 1.00071812, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 2.014601763586116, + "language_loss": 0.69271207, + "learning_rate": 1.83829844328371e-07, + "loss": 0.71142679, + "num_input_tokens_seen": 310876695, + "step": 14413, + "time_per_iteration": 2.467954635620117 + }, + { + "auxiliary_loss_clip": 0.0109994, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.0387764, + "balance_loss_mlp": 1.02063227, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 2.4736305519590425, + "language_loss": 0.63111669, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.65245175, + "num_input_tokens_seen": 310893880, + "step": 14414, + "time_per_iteration": 4.017768621444702 + }, + { + "auxiliary_loss_clip": 0.0107912, + "auxiliary_loss_mlp": 0.00777751, + "balance_loss_clip": 1.03747749, + "balance_loss_mlp": 1.00065303, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 1.7211363344183022, + "language_loss": 0.64020932, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.65877807, + "num_input_tokens_seen": 310914145, + "step": 14415, + "time_per_iteration": 2.5476083755493164 + }, + { + "auxiliary_loss_clip": 0.010023, + "auxiliary_loss_mlp": 0.00998773, + "balance_loss_clip": 1.00834024, + "balance_loss_mlp": 0.99755156, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.855582212961333, + "language_loss": 0.60337138, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62338215, + "num_input_tokens_seen": 310972825, + "step": 14416, + "time_per_iteration": 3.155855894088745 + }, + { + "auxiliary_loss_clip": 0.01101707, + "auxiliary_loss_mlp": 0.0077945, + "balance_loss_clip": 1.03680062, + "balance_loss_mlp": 1.00065911, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 1.635876395714065, + "language_loss": 0.74486375, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76367527, + "num_input_tokens_seen": 310992050, + "step": 14417, + "time_per_iteration": 2.4700350761413574 + }, + { + "auxiliary_loss_clip": 0.01087995, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.03799284, + "balance_loss_mlp": 1.0217663, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 1.5029686359998342, + "language_loss": 0.74413067, + "learning_rate": 1.830152003424319e-07, + "loss": 0.76534921, + "num_input_tokens_seen": 311011105, + "step": 14418, + "time_per_iteration": 2.5139708518981934 + }, + { + "auxiliary_loss_clip": 0.01098648, + "auxiliary_loss_mlp": 0.01032573, + "balance_loss_clip": 1.03713965, + "balance_loss_mlp": 1.02033591, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 1.6131132664455092, + "language_loss": 0.6815486, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70286083, + "num_input_tokens_seen": 311032080, + "step": 14419, + "time_per_iteration": 2.5278987884521484 + }, + { + "auxiliary_loss_clip": 0.01099741, + "auxiliary_loss_mlp": 0.01030577, + "balance_loss_clip": 1.0357579, + "balance_loss_mlp": 1.01927018, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 1.787693450942173, + "language_loss": 0.79086149, + "learning_rate": 1.826898250065465e-07, + "loss": 0.81216466, + "num_input_tokens_seen": 311049735, + "step": 14420, + "time_per_iteration": 2.4134628772735596 + }, + { + "auxiliary_loss_clip": 0.01094005, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.0340941, + "balance_loss_mlp": 1.02029073, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 1.6612355064105473, + "language_loss": 0.83483434, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.85609829, + "num_input_tokens_seen": 311067675, + "step": 14421, + "time_per_iteration": 2.4527835845947266 + }, + { + "auxiliary_loss_clip": 0.01016699, + "auxiliary_loss_mlp": 0.01000585, + "balance_loss_clip": 1.01718688, + "balance_loss_mlp": 0.9993574, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.7082112382677288, + "language_loss": 0.49265936, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51283222, + "num_input_tokens_seen": 311126605, + "step": 14422, + "time_per_iteration": 3.065929412841797 + }, + { + "auxiliary_loss_clip": 0.01087048, + "auxiliary_loss_mlp": 0.00776845, + "balance_loss_clip": 1.03683543, + "balance_loss_mlp": 1.00062132, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 9.048109845922395, + "language_loss": 0.73503143, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.75367039, + "num_input_tokens_seen": 311147325, + "step": 14423, + "time_per_iteration": 2.5540943145751953 + }, + { + "auxiliary_loss_clip": 0.01065304, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.03237343, + "balance_loss_mlp": 1.01868653, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 1.658461725039191, + "language_loss": 0.76953262, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.790501, + "num_input_tokens_seen": 311165385, + "step": 14424, + "time_per_iteration": 2.5027763843536377 + }, + { + "auxiliary_loss_clip": 0.0106844, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.03186154, + "balance_loss_mlp": 1.02633333, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 1.6540911997336978, + "language_loss": 0.71430016, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73537403, + "num_input_tokens_seen": 311185860, + "step": 14425, + "time_per_iteration": 2.598541498184204 + }, + { + "auxiliary_loss_clip": 0.01101371, + "auxiliary_loss_mlp": 0.01034442, + "balance_loss_clip": 1.03807592, + "balance_loss_mlp": 1.0211978, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 1.8009523621696641, + "language_loss": 0.6834259, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70478404, + "num_input_tokens_seen": 311205810, + "step": 14426, + "time_per_iteration": 2.4598190784454346 + }, + { + "auxiliary_loss_clip": 0.01066749, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.03665507, + "balance_loss_mlp": 1.01527405, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 2.26012116498578, + "language_loss": 0.70698011, + "learning_rate": 1.815531824008234e-07, + "loss": 0.72793078, + "num_input_tokens_seen": 311226080, + "step": 14427, + "time_per_iteration": 2.530733585357666 + }, + { + "auxiliary_loss_clip": 0.01080374, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.03539932, + "balance_loss_mlp": 1.0194633, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 2.007410749934019, + "language_loss": 0.68012887, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70124537, + "num_input_tokens_seen": 311246380, + "step": 14428, + "time_per_iteration": 2.5704007148742676 + }, + { + "auxiliary_loss_clip": 0.0108153, + "auxiliary_loss_mlp": 0.01026879, + "balance_loss_clip": 1.03782964, + "balance_loss_mlp": 1.01523829, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 2.0134786905687205, + "language_loss": 0.70607233, + "learning_rate": 1.812290478794889e-07, + "loss": 0.7271564, + "num_input_tokens_seen": 311266465, + "step": 14429, + "time_per_iteration": 2.491757392883301 + }, + { + "auxiliary_loss_clip": 0.01087245, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.03595364, + "balance_loss_mlp": 1.01888418, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 2.157913971646584, + "language_loss": 0.66890109, + "learning_rate": 1.810670840677151e-07, + "loss": 0.69008094, + "num_input_tokens_seen": 311285075, + "step": 14430, + "time_per_iteration": 3.9538073539733887 + }, + { + "auxiliary_loss_clip": 0.01065033, + "auxiliary_loss_mlp": 0.01037401, + "balance_loss_clip": 1.03478742, + "balance_loss_mlp": 1.02418089, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 2.3859416553469224, + "language_loss": 0.69796872, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71899307, + "num_input_tokens_seen": 311303230, + "step": 14431, + "time_per_iteration": 2.572453737258911 + }, + { + "auxiliary_loss_clip": 0.01098305, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.03545499, + "balance_loss_mlp": 1.02303207, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 2.1171731686018602, + "language_loss": 0.63344759, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.6547882, + "num_input_tokens_seen": 311318070, + "step": 14432, + "time_per_iteration": 2.4463162422180176 + }, + { + "auxiliary_loss_clip": 0.01098584, + "auxiliary_loss_mlp": 0.01038664, + "balance_loss_clip": 1.03672838, + "balance_loss_mlp": 1.0273869, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 1.8687141009122854, + "language_loss": 0.78082198, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.80219448, + "num_input_tokens_seen": 311334885, + "step": 14433, + "time_per_iteration": 2.4345033168792725 + }, + { + "auxiliary_loss_clip": 0.01009238, + "auxiliary_loss_mlp": 0.01002782, + "balance_loss_clip": 1.00523496, + "balance_loss_mlp": 1.00145328, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 0.7037115262245365, + "language_loss": 0.58454311, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60466337, + "num_input_tokens_seen": 311399780, + "step": 14434, + "time_per_iteration": 3.1481306552886963 + }, + { + "auxiliary_loss_clip": 0.01085182, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.03516173, + "balance_loss_mlp": 1.02181721, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 1.7555579480959296, + "language_loss": 0.80245942, + "learning_rate": 1.802582997433628e-07, + "loss": 0.8236413, + "num_input_tokens_seen": 311419610, + "step": 14435, + "time_per_iteration": 2.5780136585235596 + }, + { + "auxiliary_loss_clip": 0.01087413, + "auxiliary_loss_mlp": 0.00778347, + "balance_loss_clip": 1.03409386, + "balance_loss_mlp": 1.00069916, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 1.9471529091483177, + "language_loss": 0.62481987, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.64347744, + "num_input_tokens_seen": 311440045, + "step": 14436, + "time_per_iteration": 2.608644723892212 + }, + { + "auxiliary_loss_clip": 0.01085214, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.03717756, + "balance_loss_mlp": 1.01573205, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 3.574285371512405, + "language_loss": 0.71066916, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.73180765, + "num_input_tokens_seen": 311456660, + "step": 14437, + "time_per_iteration": 2.4804084300994873 + }, + { + "auxiliary_loss_clip": 0.01078284, + "auxiliary_loss_mlp": 0.01028138, + "balance_loss_clip": 1.03846037, + "balance_loss_mlp": 1.01560307, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 2.3421141524941, + "language_loss": 0.80349678, + "learning_rate": 1.797738571571381e-07, + "loss": 0.824561, + "num_input_tokens_seen": 311475460, + "step": 14438, + "time_per_iteration": 2.5805230140686035 + }, + { + "auxiliary_loss_clip": 0.01093628, + "auxiliary_loss_mlp": 0.01023342, + "balance_loss_clip": 1.03889692, + "balance_loss_mlp": 1.01142073, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 1.9597369930458683, + "language_loss": 0.67593694, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.6971066, + "num_input_tokens_seen": 311494575, + "step": 14439, + "time_per_iteration": 2.4643445014953613 + }, + { + "auxiliary_loss_clip": 0.01097289, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.03729594, + "balance_loss_mlp": 1.0234462, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 1.512543562074904, + "language_loss": 0.64234674, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.66367054, + "num_input_tokens_seen": 311515805, + "step": 14440, + "time_per_iteration": 2.615168333053589 + }, + { + "auxiliary_loss_clip": 0.01097145, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.03667045, + "balance_loss_mlp": 1.01940143, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 1.698235849033824, + "language_loss": 0.65881312, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.68010014, + "num_input_tokens_seen": 311536000, + "step": 14441, + "time_per_iteration": 4.052567720413208 + }, + { + "auxiliary_loss_clip": 0.01092112, + "auxiliary_loss_mlp": 0.01027026, + "balance_loss_clip": 1.03906965, + "balance_loss_mlp": 1.01551628, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 1.9336287720516572, + "language_loss": 0.66335678, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68454814, + "num_input_tokens_seen": 311556220, + "step": 14442, + "time_per_iteration": 3.937333583831787 + }, + { + "auxiliary_loss_clip": 0.01085119, + "auxiliary_loss_mlp": 0.01029205, + "balance_loss_clip": 1.03469574, + "balance_loss_mlp": 1.01553154, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 1.875900587069949, + "language_loss": 0.72206473, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74320793, + "num_input_tokens_seen": 311572530, + "step": 14443, + "time_per_iteration": 2.4549500942230225 + }, + { + "auxiliary_loss_clip": 0.0111066, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.03728318, + "balance_loss_mlp": 1.01626945, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 1.8828854794669214, + "language_loss": 0.83184612, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85324568, + "num_input_tokens_seen": 311591105, + "step": 14444, + "time_per_iteration": 2.4519402980804443 + }, + { + "auxiliary_loss_clip": 0.01073204, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_clip": 1.03870714, + "balance_loss_mlp": 1.02035916, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 1.9201526147724584, + "language_loss": 0.77443129, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79549176, + "num_input_tokens_seen": 311608350, + "step": 14445, + "time_per_iteration": 2.5018274784088135 + }, + { + "auxiliary_loss_clip": 0.01099913, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.03736687, + "balance_loss_mlp": 1.02047944, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 1.8628956809443526, + "language_loss": 0.67809778, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.69943142, + "num_input_tokens_seen": 311626380, + "step": 14446, + "time_per_iteration": 2.540966272354126 + }, + { + "auxiliary_loss_clip": 0.01094765, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.03464746, + "balance_loss_mlp": 1.02281094, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 2.0609794278114655, + "language_loss": 0.82909805, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.85040307, + "num_input_tokens_seen": 311644345, + "step": 14447, + "time_per_iteration": 2.462724447250366 + }, + { + "auxiliary_loss_clip": 0.01032888, + "auxiliary_loss_mlp": 0.01028471, + "balance_loss_clip": 1.03412259, + "balance_loss_mlp": 1.01668096, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 1.6060367590435154, + "language_loss": 0.73875856, + "learning_rate": 1.781635359686515e-07, + "loss": 0.75937212, + "num_input_tokens_seen": 311663340, + "step": 14448, + "time_per_iteration": 2.651141881942749 + }, + { + "auxiliary_loss_clip": 0.01082365, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.03276849, + "balance_loss_mlp": 1.02316713, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 2.0542112194072946, + "language_loss": 0.80555147, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.82674271, + "num_input_tokens_seen": 311679860, + "step": 14449, + "time_per_iteration": 2.445396900177002 + }, + { + "auxiliary_loss_clip": 0.01005061, + "auxiliary_loss_mlp": 0.01000339, + "balance_loss_clip": 1.01002121, + "balance_loss_mlp": 0.9991765, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.8123490833195137, + "language_loss": 0.60591316, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62596714, + "num_input_tokens_seen": 311738135, + "step": 14450, + "time_per_iteration": 3.029585838317871 + }, + { + "auxiliary_loss_clip": 0.01086338, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.0378747, + "balance_loss_mlp": 1.02020872, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 1.821677217021877, + "language_loss": 0.7606647, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.78185415, + "num_input_tokens_seen": 311756975, + "step": 14451, + "time_per_iteration": 2.5624589920043945 + }, + { + "auxiliary_loss_clip": 0.01096118, + "auxiliary_loss_mlp": 0.01027696, + "balance_loss_clip": 1.03663039, + "balance_loss_mlp": 1.015692, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 2.731129345639417, + "language_loss": 0.7269882, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74822628, + "num_input_tokens_seen": 311771830, + "step": 14452, + "time_per_iteration": 2.4271769523620605 + }, + { + "auxiliary_loss_clip": 0.01092423, + "auxiliary_loss_mlp": 0.0077915, + "balance_loss_clip": 1.04206324, + "balance_loss_mlp": 1.00066698, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 1.8532950548656948, + "language_loss": 0.7199657, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.73868144, + "num_input_tokens_seen": 311790130, + "step": 14453, + "time_per_iteration": 4.008254051208496 + }, + { + "auxiliary_loss_clip": 0.01097059, + "auxiliary_loss_mlp": 0.01035477, + "balance_loss_clip": 1.03585315, + "balance_loss_mlp": 1.02259612, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 2.137294290535442, + "language_loss": 0.73789871, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.75922412, + "num_input_tokens_seen": 311808360, + "step": 14454, + "time_per_iteration": 2.4415934085845947 + }, + { + "auxiliary_loss_clip": 0.01109161, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.03764296, + "balance_loss_mlp": 1.01981974, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 4.642529222336424, + "language_loss": 0.59420729, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.61561787, + "num_input_tokens_seen": 311831325, + "step": 14455, + "time_per_iteration": 2.555312156677246 + }, + { + "auxiliary_loss_clip": 0.01091932, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.03778815, + "balance_loss_mlp": 1.01798964, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 2.242096182605699, + "language_loss": 0.80187929, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.82310587, + "num_input_tokens_seen": 311848090, + "step": 14456, + "time_per_iteration": 2.4595463275909424 + }, + { + "auxiliary_loss_clip": 0.01055318, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.03406096, + "balance_loss_mlp": 1.02544355, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 2.7852744207761098, + "language_loss": 0.74310434, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.76406658, + "num_input_tokens_seen": 311867855, + "step": 14457, + "time_per_iteration": 2.6304757595062256 + }, + { + "auxiliary_loss_clip": 0.0104836, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.03338122, + "balance_loss_mlp": 1.02113295, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 1.5709633209820517, + "language_loss": 0.78582394, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80663836, + "num_input_tokens_seen": 311888675, + "step": 14458, + "time_per_iteration": 2.6186304092407227 + }, + { + "auxiliary_loss_clip": 0.01100474, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.03703952, + "balance_loss_mlp": 1.0204339, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 1.776610909495414, + "language_loss": 0.71008217, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73142409, + "num_input_tokens_seen": 311907310, + "step": 14459, + "time_per_iteration": 2.4502222537994385 + }, + { + "auxiliary_loss_clip": 0.01082394, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.03394389, + "balance_loss_mlp": 1.01737475, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 2.0457779188063276, + "language_loss": 0.73882437, + "learning_rate": 1.762402701923398e-07, + "loss": 0.75993502, + "num_input_tokens_seen": 311929635, + "step": 14460, + "time_per_iteration": 2.5489609241485596 + }, + { + "auxiliary_loss_clip": 0.01094634, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.03929877, + "balance_loss_mlp": 1.02084434, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 1.8750374911661212, + "language_loss": 0.6523813, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.67366225, + "num_input_tokens_seen": 311948800, + "step": 14461, + "time_per_iteration": 2.526700258255005 + }, + { + "auxiliary_loss_clip": 0.01093188, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.03167176, + "balance_loss_mlp": 1.0247457, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 2.064378892853726, + "language_loss": 0.82496345, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.84627217, + "num_input_tokens_seen": 311964090, + "step": 14462, + "time_per_iteration": 2.4185099601745605 + }, + { + "auxiliary_loss_clip": 0.01098778, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.03518486, + "balance_loss_mlp": 1.01874804, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 1.8382677278941302, + "language_loss": 0.64974415, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67104828, + "num_input_tokens_seen": 311981460, + "step": 14463, + "time_per_iteration": 2.448834180831909 + }, + { + "auxiliary_loss_clip": 0.01092616, + "auxiliary_loss_mlp": 0.01038033, + "balance_loss_clip": 1.03944552, + "balance_loss_mlp": 1.02502131, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 1.959871403371559, + "language_loss": 0.66281545, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.68412191, + "num_input_tokens_seen": 312000115, + "step": 14464, + "time_per_iteration": 2.4584994316101074 + }, + { + "auxiliary_loss_clip": 0.01090726, + "auxiliary_loss_mlp": 0.01036363, + "balance_loss_clip": 1.03689742, + "balance_loss_mlp": 1.02347612, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 2.4928395516985264, + "language_loss": 0.62190759, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.64317846, + "num_input_tokens_seen": 312020770, + "step": 14465, + "time_per_iteration": 2.513237237930298 + }, + { + "auxiliary_loss_clip": 0.010935, + "auxiliary_loss_mlp": 0.01038062, + "balance_loss_clip": 1.03605938, + "balance_loss_mlp": 1.026636, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 1.4961632991122547, + "language_loss": 0.84647977, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.86779535, + "num_input_tokens_seen": 312041870, + "step": 14466, + "time_per_iteration": 2.476449966430664 + }, + { + "auxiliary_loss_clip": 0.01083988, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.03865027, + "balance_loss_mlp": 1.02883959, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 2.3432556862860716, + "language_loss": 0.62025702, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.64152873, + "num_input_tokens_seen": 312058210, + "step": 14467, + "time_per_iteration": 2.5031192302703857 + }, + { + "auxiliary_loss_clip": 0.01103888, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.03533459, + "balance_loss_mlp": 1.01925457, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 1.575453936687019, + "language_loss": 0.68844998, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.70979661, + "num_input_tokens_seen": 312082665, + "step": 14468, + "time_per_iteration": 2.5189077854156494 + }, + { + "auxiliary_loss_clip": 0.01083538, + "auxiliary_loss_mlp": 0.01030587, + "balance_loss_clip": 1.03508461, + "balance_loss_mlp": 1.01919031, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 1.8864866396943933, + "language_loss": 0.70958388, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.73072517, + "num_input_tokens_seen": 312101960, + "step": 14469, + "time_per_iteration": 4.050451040267944 + }, + { + "auxiliary_loss_clip": 0.01094218, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.03717089, + "balance_loss_mlp": 1.01760244, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 1.9197595567154049, + "language_loss": 0.84078991, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86202371, + "num_input_tokens_seen": 312117125, + "step": 14470, + "time_per_iteration": 2.4738802909851074 + }, + { + "auxiliary_loss_clip": 0.01084717, + "auxiliary_loss_mlp": 0.01034744, + "balance_loss_clip": 1.03547227, + "balance_loss_mlp": 1.02146983, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 1.8061772218212304, + "language_loss": 0.73146331, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.75265795, + "num_input_tokens_seen": 312135775, + "step": 14471, + "time_per_iteration": 2.527635335922241 + }, + { + "auxiliary_loss_clip": 0.01107077, + "auxiliary_loss_mlp": 0.01026372, + "balance_loss_clip": 1.03644609, + "balance_loss_mlp": 1.01543474, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 1.4779381528822475, + "language_loss": 0.7902317, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.81156617, + "num_input_tokens_seen": 312156070, + "step": 14472, + "time_per_iteration": 2.4302423000335693 + }, + { + "auxiliary_loss_clip": 0.01093127, + "auxiliary_loss_mlp": 0.00778664, + "balance_loss_clip": 1.03709054, + "balance_loss_mlp": 1.00074005, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 1.805334669761953, + "language_loss": 0.72905046, + "learning_rate": 1.741679706279644e-07, + "loss": 0.74776828, + "num_input_tokens_seen": 312174380, + "step": 14473, + "time_per_iteration": 2.4711008071899414 + }, + { + "auxiliary_loss_clip": 0.01113507, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.03906965, + "balance_loss_mlp": 1.01951134, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 1.5069708238910677, + "language_loss": 0.72284377, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.74429941, + "num_input_tokens_seen": 312195130, + "step": 14474, + "time_per_iteration": 2.463402509689331 + }, + { + "auxiliary_loss_clip": 0.01086188, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.03488541, + "balance_loss_mlp": 1.02748394, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 1.7330760912970422, + "language_loss": 0.6696173, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69089168, + "num_input_tokens_seen": 312212300, + "step": 14475, + "time_per_iteration": 2.5446665287017822 + }, + { + "auxiliary_loss_clip": 0.01108611, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.03504777, + "balance_loss_mlp": 1.01547277, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 1.612708355107856, + "language_loss": 0.77516806, + "learning_rate": 1.736914088262349e-07, + "loss": 0.79653561, + "num_input_tokens_seen": 312231735, + "step": 14476, + "time_per_iteration": 2.418954849243164 + }, + { + "auxiliary_loss_clip": 0.01091844, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.03408206, + "balance_loss_mlp": 1.02164602, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 1.5307476657008796, + "language_loss": 0.72311044, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74436837, + "num_input_tokens_seen": 312253060, + "step": 14477, + "time_per_iteration": 2.487877368927002 + }, + { + "auxiliary_loss_clip": 0.01100086, + "auxiliary_loss_mlp": 0.01027489, + "balance_loss_clip": 1.03689885, + "balance_loss_mlp": 1.01547289, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 1.8965671266255626, + "language_loss": 0.5950262, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61630195, + "num_input_tokens_seen": 312269460, + "step": 14478, + "time_per_iteration": 2.415552854537964 + }, + { + "auxiliary_loss_clip": 0.01100385, + "auxiliary_loss_mlp": 0.01026778, + "balance_loss_clip": 1.04064727, + "balance_loss_mlp": 1.01632357, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 1.4379599228300564, + "language_loss": 0.71364903, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73492062, + "num_input_tokens_seen": 312289830, + "step": 14479, + "time_per_iteration": 2.5390512943267822 + }, + { + "auxiliary_loss_clip": 0.01086286, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.03699994, + "balance_loss_mlp": 1.01875877, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 4.534425691976704, + "language_loss": 0.70818782, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.72936302, + "num_input_tokens_seen": 312311320, + "step": 14480, + "time_per_iteration": 2.5630271434783936 + }, + { + "auxiliary_loss_clip": 0.01063588, + "auxiliary_loss_mlp": 0.01030211, + "balance_loss_clip": 1.03689456, + "balance_loss_mlp": 1.0183382, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 2.9572558743269246, + "language_loss": 0.7035017, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72443968, + "num_input_tokens_seen": 312332095, + "step": 14481, + "time_per_iteration": 5.698948383331299 + }, + { + "auxiliary_loss_clip": 0.01096979, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.03635621, + "balance_loss_mlp": 1.02097058, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 1.7206951545875568, + "language_loss": 0.77121568, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.79251611, + "num_input_tokens_seen": 312351225, + "step": 14482, + "time_per_iteration": 2.4636149406433105 + }, + { + "auxiliary_loss_clip": 0.01087388, + "auxiliary_loss_mlp": 0.01034949, + "balance_loss_clip": 1.03588033, + "balance_loss_mlp": 1.02194321, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 1.8653908209113343, + "language_loss": 0.76637703, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.7876004, + "num_input_tokens_seen": 312369730, + "step": 14483, + "time_per_iteration": 2.497457504272461 + }, + { + "auxiliary_loss_clip": 0.01105284, + "auxiliary_loss_mlp": 0.01038672, + "balance_loss_clip": 1.03791869, + "balance_loss_mlp": 1.02507663, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 2.2477256402184227, + "language_loss": 0.61949527, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.64093482, + "num_input_tokens_seen": 312386780, + "step": 14484, + "time_per_iteration": 2.434095621109009 + }, + { + "auxiliary_loss_clip": 0.011098, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.03808761, + "balance_loss_mlp": 1.02393746, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 2.221367915271567, + "language_loss": 0.67957276, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.70103055, + "num_input_tokens_seen": 312404875, + "step": 14485, + "time_per_iteration": 2.406726121902466 + }, + { + "auxiliary_loss_clip": 0.01054441, + "auxiliary_loss_mlp": 0.00778046, + "balance_loss_clip": 1.03365564, + "balance_loss_mlp": 1.00062084, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 1.876560368480632, + "language_loss": 0.62768364, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.64600849, + "num_input_tokens_seen": 312425280, + "step": 14486, + "time_per_iteration": 2.689939260482788 + }, + { + "auxiliary_loss_clip": 0.01111287, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.03598833, + "balance_loss_mlp": 1.02203369, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 1.9045655628725209, + "language_loss": 0.62212992, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.64360279, + "num_input_tokens_seen": 312443835, + "step": 14487, + "time_per_iteration": 2.4181413650512695 + }, + { + "auxiliary_loss_clip": 0.01082361, + "auxiliary_loss_mlp": 0.00776328, + "balance_loss_clip": 1.03686857, + "balance_loss_mlp": 1.00064063, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 1.9920008099879944, + "language_loss": 0.67370713, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.692294, + "num_input_tokens_seen": 312460830, + "step": 14488, + "time_per_iteration": 2.467770576477051 + }, + { + "auxiliary_loss_clip": 0.01093659, + "auxiliary_loss_mlp": 0.00777407, + "balance_loss_clip": 1.04028809, + "balance_loss_mlp": 1.00065732, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 1.802287935889573, + "language_loss": 0.85771298, + "learning_rate": 1.716335121648338e-07, + "loss": 0.8764236, + "num_input_tokens_seen": 312477575, + "step": 14489, + "time_per_iteration": 2.465637683868408 + }, + { + "auxiliary_loss_clip": 0.01105649, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.03924859, + "balance_loss_mlp": 1.02143466, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 2.3919172919429967, + "language_loss": 0.7541008, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.7755065, + "num_input_tokens_seen": 312492140, + "step": 14490, + "time_per_iteration": 2.4216747283935547 + }, + { + "auxiliary_loss_clip": 0.01101527, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.0359273, + "balance_loss_mlp": 1.01617503, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 2.1013535239748014, + "language_loss": 0.76246428, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.7837739, + "num_input_tokens_seen": 312508400, + "step": 14491, + "time_per_iteration": 2.4214773178100586 + }, + { + "auxiliary_loss_clip": 0.01081457, + "auxiliary_loss_mlp": 0.0102465, + "balance_loss_clip": 1.04206038, + "balance_loss_mlp": 1.01272941, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 1.606195255283482, + "language_loss": 0.66665256, + "learning_rate": 1.711602764198723e-07, + "loss": 0.68771362, + "num_input_tokens_seen": 312525915, + "step": 14492, + "time_per_iteration": 2.509551763534546 + }, + { + "auxiliary_loss_clip": 0.0109782, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.03836298, + "balance_loss_mlp": 1.02053213, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 1.812401271842213, + "language_loss": 0.69629663, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.71759707, + "num_input_tokens_seen": 312544735, + "step": 14493, + "time_per_iteration": 3.98789119720459 + }, + { + "auxiliary_loss_clip": 0.01111077, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.03829527, + "balance_loss_mlp": 1.02116263, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 3.6400680201463764, + "language_loss": 0.89157474, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.91303188, + "num_input_tokens_seen": 312557910, + "step": 14494, + "time_per_iteration": 2.4205031394958496 + }, + { + "auxiliary_loss_clip": 0.010727, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.03648448, + "balance_loss_mlp": 1.01628268, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 1.794991017688856, + "language_loss": 0.59740132, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61840326, + "num_input_tokens_seen": 312580360, + "step": 14495, + "time_per_iteration": 2.6939401626586914 + }, + { + "auxiliary_loss_clip": 0.01078119, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.03472328, + "balance_loss_mlp": 1.018049, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 2.1642527577637054, + "language_loss": 0.80532265, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.82641208, + "num_input_tokens_seen": 312597550, + "step": 14496, + "time_per_iteration": 2.5353078842163086 + }, + { + "auxiliary_loss_clip": 0.01083112, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.0371145, + "balance_loss_mlp": 1.02354407, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 2.0645617328291315, + "language_loss": 0.78963089, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.81083786, + "num_input_tokens_seen": 312616435, + "step": 14497, + "time_per_iteration": 2.4850246906280518 + }, + { + "auxiliary_loss_clip": 0.01111733, + "auxiliary_loss_mlp": 0.01029926, + "balance_loss_clip": 1.03786135, + "balance_loss_mlp": 1.01669407, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 2.8507348340091503, + "language_loss": 0.67029905, + "learning_rate": 1.70215677535406e-07, + "loss": 0.69171572, + "num_input_tokens_seen": 312632770, + "step": 14498, + "time_per_iteration": 2.4497811794281006 + }, + { + "auxiliary_loss_clip": 0.0107006, + "auxiliary_loss_mlp": 0.01029732, + "balance_loss_clip": 1.03335404, + "balance_loss_mlp": 1.0177815, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 1.5520307505842936, + "language_loss": 0.57057774, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59157574, + "num_input_tokens_seen": 312651900, + "step": 14499, + "time_per_iteration": 2.6087145805358887 + }, + { + "auxiliary_loss_clip": 0.01069814, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.03486419, + "balance_loss_mlp": 1.02249765, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 1.9188075009975951, + "language_loss": 0.79481739, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.81587178, + "num_input_tokens_seen": 312671380, + "step": 14500, + "time_per_iteration": 2.531327247619629 + }, + { + "auxiliary_loss_clip": 0.01095991, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.04070044, + "balance_loss_mlp": 1.01733816, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 2.4809337798118385, + "language_loss": 0.72978479, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.75104165, + "num_input_tokens_seen": 312689215, + "step": 14501, + "time_per_iteration": 2.4365899562835693 + }, + { + "auxiliary_loss_clip": 0.01076003, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.03349245, + "balance_loss_mlp": 1.02126265, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 1.746249343564712, + "language_loss": 0.64773703, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66884309, + "num_input_tokens_seen": 312706400, + "step": 14502, + "time_per_iteration": 2.5074918270111084 + }, + { + "auxiliary_loss_clip": 0.0108605, + "auxiliary_loss_mlp": 0.010421, + "balance_loss_clip": 1.03475893, + "balance_loss_mlp": 1.02826595, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 2.0490591104345435, + "language_loss": 0.68776923, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.70905077, + "num_input_tokens_seen": 312727985, + "step": 14503, + "time_per_iteration": 2.5809805393218994 + }, + { + "auxiliary_loss_clip": 0.0108513, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.03620946, + "balance_loss_mlp": 1.01796532, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 2.5441387737046828, + "language_loss": 0.6974268, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.71858507, + "num_input_tokens_seen": 312745025, + "step": 14504, + "time_per_iteration": 2.4656310081481934 + }, + { + "auxiliary_loss_clip": 0.0109625, + "auxiliary_loss_mlp": 0.00777806, + "balance_loss_clip": 1.03682327, + "balance_loss_mlp": 1.00058782, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 1.7583533154524098, + "language_loss": 0.70032012, + "learning_rate": 1.691168026385552e-07, + "loss": 0.71906066, + "num_input_tokens_seen": 312764170, + "step": 14505, + "time_per_iteration": 2.4863810539245605 + }, + { + "auxiliary_loss_clip": 0.01087094, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.03702044, + "balance_loss_mlp": 1.02051187, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 1.5561905822504625, + "language_loss": 0.78662628, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.80781484, + "num_input_tokens_seen": 312783830, + "step": 14506, + "time_per_iteration": 2.4932901859283447 + }, + { + "auxiliary_loss_clip": 0.01090572, + "auxiliary_loss_mlp": 0.01026203, + "balance_loss_clip": 1.0356667, + "balance_loss_mlp": 1.01398396, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 2.044995483610197, + "language_loss": 0.73941088, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76057863, + "num_input_tokens_seen": 312802015, + "step": 14507, + "time_per_iteration": 2.489786148071289 + }, + { + "auxiliary_loss_clip": 0.01054068, + "auxiliary_loss_mlp": 0.01040696, + "balance_loss_clip": 1.03282952, + "balance_loss_mlp": 1.02533007, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 2.0547835908982517, + "language_loss": 0.72126693, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74221456, + "num_input_tokens_seen": 312820650, + "step": 14508, + "time_per_iteration": 2.5818111896514893 + }, + { + "auxiliary_loss_clip": 0.01092238, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.03821778, + "balance_loss_mlp": 1.02174187, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 1.7295786844812517, + "language_loss": 0.6893869, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.71065885, + "num_input_tokens_seen": 312841310, + "step": 14509, + "time_per_iteration": 4.001091003417969 + }, + { + "auxiliary_loss_clip": 0.01083584, + "auxiliary_loss_mlp": 0.01032557, + "balance_loss_clip": 1.0363996, + "balance_loss_mlp": 1.02086282, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 1.905133690829841, + "language_loss": 0.58687162, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60803306, + "num_input_tokens_seen": 312862100, + "step": 14510, + "time_per_iteration": 2.5700268745422363 + }, + { + "auxiliary_loss_clip": 0.01116083, + "auxiliary_loss_mlp": 0.01035105, + "balance_loss_clip": 1.03795004, + "balance_loss_mlp": 1.02137256, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 2.1909292955276793, + "language_loss": 0.67117149, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.69268334, + "num_input_tokens_seen": 312880220, + "step": 14511, + "time_per_iteration": 2.4319136142730713 + }, + { + "auxiliary_loss_clip": 0.0106814, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.03863037, + "balance_loss_mlp": 1.02068782, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 1.8346668282218588, + "language_loss": 0.81725478, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.83826995, + "num_input_tokens_seen": 312900765, + "step": 14512, + "time_per_iteration": 2.6565234661102295 + }, + { + "auxiliary_loss_clip": 0.01014217, + "auxiliary_loss_mlp": 0.01000917, + "balance_loss_clip": 1.00954938, + "balance_loss_mlp": 0.99966544, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.785410635214727, + "language_loss": 0.58552939, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.6056807, + "num_input_tokens_seen": 312955840, + "step": 14513, + "time_per_iteration": 2.9675827026367188 + }, + { + "auxiliary_loss_clip": 0.01098623, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.03684974, + "balance_loss_mlp": 1.01742387, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 1.7600526236448668, + "language_loss": 0.76735556, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.78864288, + "num_input_tokens_seen": 312973565, + "step": 14514, + "time_per_iteration": 2.466552257537842 + }, + { + "auxiliary_loss_clip": 0.0110398, + "auxiliary_loss_mlp": 0.01027703, + "balance_loss_clip": 1.03802776, + "balance_loss_mlp": 1.01600885, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 1.9001866755614578, + "language_loss": 0.65209752, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67341435, + "num_input_tokens_seen": 312994660, + "step": 14515, + "time_per_iteration": 2.5048367977142334 + }, + { + "auxiliary_loss_clip": 0.0109889, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.0345794, + "balance_loss_mlp": 1.02059758, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 2.010590244152563, + "language_loss": 0.79335022, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81467891, + "num_input_tokens_seen": 313009860, + "step": 14516, + "time_per_iteration": 2.4490134716033936 + }, + { + "auxiliary_loss_clip": 0.0111265, + "auxiliary_loss_mlp": 0.01029828, + "balance_loss_clip": 1.03828645, + "balance_loss_mlp": 1.01729369, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 2.4132928031220255, + "language_loss": 0.72037649, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74180126, + "num_input_tokens_seen": 313027025, + "step": 14517, + "time_per_iteration": 2.437917947769165 + }, + { + "auxiliary_loss_clip": 0.01069481, + "auxiliary_loss_mlp": 0.0102362, + "balance_loss_clip": 1.03383994, + "balance_loss_mlp": 1.01284969, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 1.8640154216508327, + "language_loss": 0.72404462, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.74497563, + "num_input_tokens_seen": 313046830, + "step": 14518, + "time_per_iteration": 2.5163156986236572 + }, + { + "auxiliary_loss_clip": 0.01084217, + "auxiliary_loss_mlp": 0.01038667, + "balance_loss_clip": 1.03453398, + "balance_loss_mlp": 1.02599466, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 1.353486103890563, + "language_loss": 0.74141455, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.7626434, + "num_input_tokens_seen": 313067715, + "step": 14519, + "time_per_iteration": 2.5422749519348145 + }, + { + "auxiliary_loss_clip": 0.01099476, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.03619742, + "balance_loss_mlp": 1.01811707, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 2.6386479468426245, + "language_loss": 0.76142824, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78274071, + "num_input_tokens_seen": 313082305, + "step": 14520, + "time_per_iteration": 3.8583109378814697 + }, + { + "auxiliary_loss_clip": 0.01089592, + "auxiliary_loss_mlp": 0.01037054, + "balance_loss_clip": 1.0367837, + "balance_loss_mlp": 1.02330959, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 2.0564267546819472, + "language_loss": 0.82325613, + "learning_rate": 1.666178664801816e-07, + "loss": 0.8445226, + "num_input_tokens_seen": 313101190, + "step": 14521, + "time_per_iteration": 3.7272839546203613 + }, + { + "auxiliary_loss_clip": 0.01101496, + "auxiliary_loss_mlp": 0.01036, + "balance_loss_clip": 1.04143572, + "balance_loss_mlp": 1.02268434, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 2.0010195489101132, + "language_loss": 0.76106608, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78244102, + "num_input_tokens_seen": 313118965, + "step": 14522, + "time_per_iteration": 2.4288413524627686 + }, + { + "auxiliary_loss_clip": 0.01095753, + "auxiliary_loss_mlp": 0.00776561, + "balance_loss_clip": 1.03429496, + "balance_loss_mlp": 1.0006237, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 2.544350379131539, + "language_loss": 0.75579911, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77452224, + "num_input_tokens_seen": 313139280, + "step": 14523, + "time_per_iteration": 2.4737164974212646 + }, + { + "auxiliary_loss_clip": 0.01096458, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.0346427, + "balance_loss_mlp": 1.01927722, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 2.2381933303421055, + "language_loss": 0.78316021, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.80443984, + "num_input_tokens_seen": 313156655, + "step": 14524, + "time_per_iteration": 2.434220790863037 + }, + { + "auxiliary_loss_clip": 0.01090035, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_clip": 1.04401708, + "balance_loss_mlp": 1.01670957, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 2.0777100865681013, + "language_loss": 0.77952611, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.80070853, + "num_input_tokens_seen": 313174050, + "step": 14525, + "time_per_iteration": 2.5589754581451416 + }, + { + "auxiliary_loss_clip": 0.01033428, + "auxiliary_loss_mlp": 0.01033422, + "balance_loss_clip": 1.03703523, + "balance_loss_mlp": 1.02127457, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 1.6020994360258214, + "language_loss": 0.68901944, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.70968801, + "num_input_tokens_seen": 313192765, + "step": 14526, + "time_per_iteration": 2.8365819454193115 + }, + { + "auxiliary_loss_clip": 0.01066594, + "auxiliary_loss_mlp": 0.01036986, + "balance_loss_clip": 1.04236066, + "balance_loss_mlp": 1.02350342, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 1.963720643973445, + "language_loss": 0.61056376, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.63159955, + "num_input_tokens_seen": 313210925, + "step": 14527, + "time_per_iteration": 3.1156632900238037 + }, + { + "auxiliary_loss_clip": 0.01101784, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.04048395, + "balance_loss_mlp": 1.02051914, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 1.7963694651152757, + "language_loss": 0.65695834, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.67832518, + "num_input_tokens_seen": 313228250, + "step": 14528, + "time_per_iteration": 2.4232027530670166 + }, + { + "auxiliary_loss_clip": 0.01081479, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.03887486, + "balance_loss_mlp": 1.01659775, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 12.218035211344269, + "language_loss": 0.89490247, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.91600513, + "num_input_tokens_seen": 313247880, + "step": 14529, + "time_per_iteration": 2.5411055088043213 + }, + { + "auxiliary_loss_clip": 0.01086541, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.03491306, + "balance_loss_mlp": 1.02003348, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 2.1932301996097427, + "language_loss": 0.84906077, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.87024796, + "num_input_tokens_seen": 313266790, + "step": 14530, + "time_per_iteration": 2.526829719543457 + }, + { + "auxiliary_loss_clip": 0.01087348, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.0347296, + "balance_loss_mlp": 1.02361023, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 1.4261884153639077, + "language_loss": 0.74480695, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76603448, + "num_input_tokens_seen": 313286805, + "step": 14531, + "time_per_iteration": 2.5207035541534424 + }, + { + "auxiliary_loss_clip": 0.01094958, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.03422296, + "balance_loss_mlp": 1.01973581, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 1.7538409848248206, + "language_loss": 0.61752927, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.63879168, + "num_input_tokens_seen": 313305415, + "step": 14532, + "time_per_iteration": 4.008764982223511 + }, + { + "auxiliary_loss_clip": 0.01019327, + "auxiliary_loss_mlp": 0.01000791, + "balance_loss_clip": 1.00600696, + "balance_loss_mlp": 0.99955714, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 0.8232475391612966, + "language_loss": 0.58673793, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60693914, + "num_input_tokens_seen": 313369940, + "step": 14533, + "time_per_iteration": 3.131136178970337 + }, + { + "auxiliary_loss_clip": 0.01080506, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.0333209, + "balance_loss_mlp": 1.02082741, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 1.4607139919086751, + "language_loss": 0.76606649, + "learning_rate": 1.646005846335954e-07, + "loss": 0.78720033, + "num_input_tokens_seen": 313390965, + "step": 14534, + "time_per_iteration": 2.588646173477173 + }, + { + "auxiliary_loss_clip": 0.01082309, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.03300107, + "balance_loss_mlp": 1.02276754, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 1.8801462157006057, + "language_loss": 0.75073278, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.771909, + "num_input_tokens_seen": 313409680, + "step": 14535, + "time_per_iteration": 2.5065178871154785 + }, + { + "auxiliary_loss_clip": 0.01108636, + "auxiliary_loss_mlp": 0.0103407, + "balance_loss_clip": 1.03595924, + "balance_loss_mlp": 1.02112389, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 2.175058365200065, + "language_loss": 0.74622953, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76765662, + "num_input_tokens_seen": 313431335, + "step": 14536, + "time_per_iteration": 2.510540246963501 + }, + { + "auxiliary_loss_clip": 0.01087437, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.03394735, + "balance_loss_mlp": 1.01868391, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 1.857424496969593, + "language_loss": 0.64134598, + "learning_rate": 1.641367279482304e-07, + "loss": 0.66252542, + "num_input_tokens_seen": 313449225, + "step": 14537, + "time_per_iteration": 2.4774153232574463 + }, + { + "auxiliary_loss_clip": 0.01096696, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.03440928, + "balance_loss_mlp": 1.01770544, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 3.1251432384155837, + "language_loss": 0.58687752, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.60815084, + "num_input_tokens_seen": 313467715, + "step": 14538, + "time_per_iteration": 2.510101795196533 + }, + { + "auxiliary_loss_clip": 0.01096285, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.03634655, + "balance_loss_mlp": 1.01775837, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 1.765764481359634, + "language_loss": 0.68352115, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70478159, + "num_input_tokens_seen": 313486805, + "step": 14539, + "time_per_iteration": 2.445502281188965 + }, + { + "auxiliary_loss_clip": 0.01102442, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.03554523, + "balance_loss_mlp": 1.0200963, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 2.106820228436578, + "language_loss": 0.73762089, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.75897902, + "num_input_tokens_seen": 313504880, + "step": 14540, + "time_per_iteration": 2.431525707244873 + }, + { + "auxiliary_loss_clip": 0.01085447, + "auxiliary_loss_mlp": 0.01044183, + "balance_loss_clip": 1.03518534, + "balance_loss_mlp": 1.03056908, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 1.7251377940302293, + "language_loss": 0.79030395, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81160021, + "num_input_tokens_seen": 313524995, + "step": 14541, + "time_per_iteration": 2.539562463760376 + }, + { + "auxiliary_loss_clip": 0.01072467, + "auxiliary_loss_mlp": 0.01033193, + "balance_loss_clip": 1.03499508, + "balance_loss_mlp": 1.01855361, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 2.4248006554816968, + "language_loss": 0.6646812, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.68573779, + "num_input_tokens_seen": 313541740, + "step": 14542, + "time_per_iteration": 2.583238124847412 + }, + { + "auxiliary_loss_clip": 0.01027778, + "auxiliary_loss_mlp": 0.01004604, + "balance_loss_clip": 1.00455332, + "balance_loss_mlp": 1.00333428, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.7879526685147777, + "language_loss": 0.54517448, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56549823, + "num_input_tokens_seen": 313593445, + "step": 14543, + "time_per_iteration": 2.810969114303589 + }, + { + "auxiliary_loss_clip": 0.01085808, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.03688371, + "balance_loss_mlp": 1.01836348, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 2.170818730745099, + "language_loss": 0.69677699, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.71794754, + "num_input_tokens_seen": 313615640, + "step": 14544, + "time_per_iteration": 2.5804946422576904 + }, + { + "auxiliary_loss_clip": 0.01066401, + "auxiliary_loss_mlp": 0.01027549, + "balance_loss_clip": 1.03785944, + "balance_loss_mlp": 1.01655793, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 1.6853174330970744, + "language_loss": 0.75920123, + "learning_rate": 1.62902840325714e-07, + "loss": 0.78014076, + "num_input_tokens_seen": 313635550, + "step": 14545, + "time_per_iteration": 2.600484848022461 + }, + { + "auxiliary_loss_clip": 0.01098248, + "auxiliary_loss_mlp": 0.00779631, + "balance_loss_clip": 1.03446734, + "balance_loss_mlp": 1.00069726, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 1.809185136730068, + "language_loss": 0.66282314, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.68160194, + "num_input_tokens_seen": 313659275, + "step": 14546, + "time_per_iteration": 2.6362080574035645 + }, + { + "auxiliary_loss_clip": 0.01108858, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.03736246, + "balance_loss_mlp": 1.01897955, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 1.571206285571172, + "language_loss": 0.72972608, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.75112808, + "num_input_tokens_seen": 313680595, + "step": 14547, + "time_per_iteration": 2.445267915725708 + }, + { + "auxiliary_loss_clip": 0.01116475, + "auxiliary_loss_mlp": 0.01041579, + "balance_loss_clip": 1.03804684, + "balance_loss_mlp": 1.02796507, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 2.2459635862602223, + "language_loss": 0.69007421, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.71165478, + "num_input_tokens_seen": 313699730, + "step": 14548, + "time_per_iteration": 4.068903923034668 + }, + { + "auxiliary_loss_clip": 0.01091057, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.03688955, + "balance_loss_mlp": 1.02275956, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 2.2269038280337554, + "language_loss": 0.70827603, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.72954106, + "num_input_tokens_seen": 313720090, + "step": 14549, + "time_per_iteration": 2.517941951751709 + }, + { + "auxiliary_loss_clip": 0.01101998, + "auxiliary_loss_mlp": 0.00780009, + "balance_loss_clip": 1.03507733, + "balance_loss_mlp": 1.00076187, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 9.950506465615339, + "language_loss": 0.83711767, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.85593778, + "num_input_tokens_seen": 313736795, + "step": 14550, + "time_per_iteration": 2.4748504161834717 + }, + { + "auxiliary_loss_clip": 0.01099762, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.03685451, + "balance_loss_mlp": 1.0265584, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 2.1173233839795307, + "language_loss": 0.71966755, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.74105072, + "num_input_tokens_seen": 313754820, + "step": 14551, + "time_per_iteration": 2.4144599437713623 + }, + { + "auxiliary_loss_clip": 0.01097165, + "auxiliary_loss_mlp": 0.00778925, + "balance_loss_clip": 1.03544915, + "balance_loss_mlp": 1.00066793, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 2.554955326788643, + "language_loss": 0.63826525, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.65702617, + "num_input_tokens_seen": 313775830, + "step": 14552, + "time_per_iteration": 2.515775203704834 + }, + { + "auxiliary_loss_clip": 0.01079109, + "auxiliary_loss_mlp": 0.01027799, + "balance_loss_clip": 1.03457463, + "balance_loss_mlp": 1.01335049, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 1.8572608076305943, + "language_loss": 0.79374796, + "learning_rate": 1.616734111284479e-07, + "loss": 0.81481701, + "num_input_tokens_seen": 313795745, + "step": 14553, + "time_per_iteration": 2.530791759490967 + }, + { + "auxiliary_loss_clip": 0.01094641, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.03308177, + "balance_loss_mlp": 1.02052724, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 2.242152371820368, + "language_loss": 0.69981819, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72109312, + "num_input_tokens_seen": 313813895, + "step": 14554, + "time_per_iteration": 2.4071786403656006 + }, + { + "auxiliary_loss_clip": 0.01091201, + "auxiliary_loss_mlp": 0.00778188, + "balance_loss_clip": 1.03864646, + "balance_loss_mlp": 1.00056148, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 1.472998401781482, + "language_loss": 0.83537585, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85406971, + "num_input_tokens_seen": 313834225, + "step": 14555, + "time_per_iteration": 2.523364543914795 + }, + { + "auxiliary_loss_clip": 0.01097995, + "auxiliary_loss_mlp": 0.01030445, + "balance_loss_clip": 1.03734195, + "balance_loss_mlp": 1.01755238, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 1.647536768285504, + "language_loss": 0.70897841, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73026282, + "num_input_tokens_seen": 313854430, + "step": 14556, + "time_per_iteration": 2.4981343746185303 + }, + { + "auxiliary_loss_clip": 0.01093145, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.03476202, + "balance_loss_mlp": 1.02072668, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 1.769289577381979, + "language_loss": 0.76689112, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.78816843, + "num_input_tokens_seen": 313871600, + "step": 14557, + "time_per_iteration": 2.4733104705810547 + }, + { + "auxiliary_loss_clip": 0.01078305, + "auxiliary_loss_mlp": 0.01039334, + "balance_loss_clip": 1.03999829, + "balance_loss_mlp": 1.02570271, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 2.048072537606495, + "language_loss": 0.82812238, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.84929872, + "num_input_tokens_seen": 313891570, + "step": 14558, + "time_per_iteration": 2.5561983585357666 + }, + { + "auxiliary_loss_clip": 0.01027223, + "auxiliary_loss_mlp": 0.01000658, + "balance_loss_clip": 1.00383425, + "balance_loss_mlp": 0.9993946, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.8029630660030717, + "language_loss": 0.56061828, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58089709, + "num_input_tokens_seen": 313951290, + "step": 14559, + "time_per_iteration": 4.489179611206055 + }, + { + "auxiliary_loss_clip": 0.01098389, + "auxiliary_loss_mlp": 0.010332, + "balance_loss_clip": 1.03752494, + "balance_loss_mlp": 1.021106, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 1.6958389538355287, + "language_loss": 0.66067088, + "learning_rate": 1.606013202286407e-07, + "loss": 0.68198675, + "num_input_tokens_seen": 313968645, + "step": 14560, + "time_per_iteration": 3.846449375152588 + }, + { + "auxiliary_loss_clip": 0.01106907, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.03590465, + "balance_loss_mlp": 1.01772928, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 2.3590474631098246, + "language_loss": 0.78760916, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.80897313, + "num_input_tokens_seen": 313987580, + "step": 14561, + "time_per_iteration": 2.479806900024414 + }, + { + "auxiliary_loss_clip": 0.01110768, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.03637958, + "balance_loss_mlp": 1.02208972, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 2.027910973571864, + "language_loss": 0.77713066, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.79859054, + "num_input_tokens_seen": 314004460, + "step": 14562, + "time_per_iteration": 2.3988258838653564 + }, + { + "auxiliary_loss_clip": 0.01104305, + "auxiliary_loss_mlp": 0.01031066, + "balance_loss_clip": 1.03574228, + "balance_loss_mlp": 1.01959848, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 1.948820702281333, + "language_loss": 0.71649098, + "learning_rate": 1.601428988367981e-07, + "loss": 0.73784471, + "num_input_tokens_seen": 314026855, + "step": 14563, + "time_per_iteration": 2.525341272354126 + }, + { + "auxiliary_loss_clip": 0.01114111, + "auxiliary_loss_mlp": 0.0103579, + "balance_loss_clip": 1.03895783, + "balance_loss_mlp": 1.02290928, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 2.294443687528252, + "language_loss": 0.65312719, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.67462623, + "num_input_tokens_seen": 314042830, + "step": 14564, + "time_per_iteration": 2.3753461837768555 + }, + { + "auxiliary_loss_clip": 0.01095667, + "auxiliary_loss_mlp": 0.01036273, + "balance_loss_clip": 1.03495109, + "balance_loss_mlp": 1.02412546, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 1.7625939266491972, + "language_loss": 0.70722818, + "learning_rate": 1.598376334037408e-07, + "loss": 0.72854757, + "num_input_tokens_seen": 314062225, + "step": 14565, + "time_per_iteration": 2.4762790203094482 + }, + { + "auxiliary_loss_clip": 0.01093605, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.03741097, + "balance_loss_mlp": 1.02054524, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 1.9889956971106328, + "language_loss": 0.77954537, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.80082536, + "num_input_tokens_seen": 314082325, + "step": 14566, + "time_per_iteration": 2.537045478820801 + }, + { + "auxiliary_loss_clip": 0.01090683, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.04091048, + "balance_loss_mlp": 1.02164006, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 1.711088850471202, + "language_loss": 0.70891571, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.73016447, + "num_input_tokens_seen": 314100310, + "step": 14567, + "time_per_iteration": 2.4526889324188232 + }, + { + "auxiliary_loss_clip": 0.01090556, + "auxiliary_loss_mlp": 0.00777648, + "balance_loss_clip": 1.0363934, + "balance_loss_mlp": 1.00062752, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 1.9893718111268355, + "language_loss": 0.74218565, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.76086771, + "num_input_tokens_seen": 314121330, + "step": 14568, + "time_per_iteration": 2.5254359245300293 + }, + { + "auxiliary_loss_clip": 0.01072055, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.03348565, + "balance_loss_mlp": 1.02124381, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 2.954620739116703, + "language_loss": 0.87197739, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.89302987, + "num_input_tokens_seen": 314139875, + "step": 14569, + "time_per_iteration": 2.5420334339141846 + }, + { + "auxiliary_loss_clip": 0.01076746, + "auxiliary_loss_mlp": 0.01027966, + "balance_loss_clip": 1.03880167, + "balance_loss_mlp": 1.01617026, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 1.6852031056877774, + "language_loss": 0.74519026, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76623738, + "num_input_tokens_seen": 314157850, + "step": 14570, + "time_per_iteration": 2.5251963138580322 + }, + { + "auxiliary_loss_clip": 0.01101133, + "auxiliary_loss_mlp": 0.00778374, + "balance_loss_clip": 1.03700602, + "balance_loss_mlp": 1.00075424, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 1.6958467931738805, + "language_loss": 0.67989999, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.69869506, + "num_input_tokens_seen": 314176720, + "step": 14571, + "time_per_iteration": 3.9785783290863037 + }, + { + "auxiliary_loss_clip": 0.01070823, + "auxiliary_loss_mlp": 0.01027835, + "balance_loss_clip": 1.03565311, + "balance_loss_mlp": 1.01599145, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 2.293780314842209, + "language_loss": 0.62783527, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64882183, + "num_input_tokens_seen": 314196645, + "step": 14572, + "time_per_iteration": 2.5324418544769287 + }, + { + "auxiliary_loss_clip": 0.01096688, + "auxiliary_loss_mlp": 0.01027031, + "balance_loss_clip": 1.0368917, + "balance_loss_mlp": 1.01586676, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 1.7153133841836767, + "language_loss": 0.73804009, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.75927734, + "num_input_tokens_seen": 314217430, + "step": 14573, + "time_per_iteration": 2.533747434616089 + }, + { + "auxiliary_loss_clip": 0.01063797, + "auxiliary_loss_mlp": 0.00777245, + "balance_loss_clip": 1.03925812, + "balance_loss_mlp": 1.00052285, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 1.946066068076029, + "language_loss": 0.72944224, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.74785262, + "num_input_tokens_seen": 314235310, + "step": 14574, + "time_per_iteration": 2.5627925395965576 + }, + { + "auxiliary_loss_clip": 0.0109683, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.03647375, + "balance_loss_mlp": 1.02231431, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 1.57424043876849, + "language_loss": 0.75951308, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.7808274, + "num_input_tokens_seen": 314252355, + "step": 14575, + "time_per_iteration": 2.4409024715423584 + }, + { + "auxiliary_loss_clip": 0.01083198, + "auxiliary_loss_mlp": 0.01039543, + "balance_loss_clip": 1.03659379, + "balance_loss_mlp": 1.02703834, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 1.7190408128088954, + "language_loss": 0.6654067, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.68663406, + "num_input_tokens_seen": 314272755, + "step": 14576, + "time_per_iteration": 2.5946803092956543 + }, + { + "auxiliary_loss_clip": 0.01079959, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.03218782, + "balance_loss_mlp": 1.02220511, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 1.955774863530091, + "language_loss": 0.66614676, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.68728137, + "num_input_tokens_seen": 314291365, + "step": 14577, + "time_per_iteration": 2.4535884857177734 + }, + { + "auxiliary_loss_clip": 0.01102159, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.03948641, + "balance_loss_mlp": 1.02100062, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 2.2675777032325017, + "language_loss": 0.71144581, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.73280537, + "num_input_tokens_seen": 314310075, + "step": 14578, + "time_per_iteration": 2.5001111030578613 + }, + { + "auxiliary_loss_clip": 0.01110525, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.03585505, + "balance_loss_mlp": 1.02041054, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 1.9193933390868878, + "language_loss": 0.71511942, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73655355, + "num_input_tokens_seen": 314325695, + "step": 14579, + "time_per_iteration": 2.375258684158325 + }, + { + "auxiliary_loss_clip": 0.01078763, + "auxiliary_loss_mlp": 0.01036525, + "balance_loss_clip": 1.03141427, + "balance_loss_mlp": 1.02369189, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 1.6592053371325945, + "language_loss": 0.70042807, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72158098, + "num_input_tokens_seen": 314343605, + "step": 14580, + "time_per_iteration": 2.452500104904175 + }, + { + "auxiliary_loss_clip": 0.01107693, + "auxiliary_loss_mlp": 0.0077732, + "balance_loss_clip": 1.03751016, + "balance_loss_mlp": 1.00066423, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 1.5870025571361615, + "language_loss": 0.65536332, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67421341, + "num_input_tokens_seen": 314364275, + "step": 14581, + "time_per_iteration": 2.435760974884033 + }, + { + "auxiliary_loss_clip": 0.01080582, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.03522658, + "balance_loss_mlp": 1.01996684, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 1.4607789491861416, + "language_loss": 0.73607969, + "learning_rate": 1.572541512164416e-07, + "loss": 0.75719976, + "num_input_tokens_seen": 314385140, + "step": 14582, + "time_per_iteration": 2.5367977619171143 + }, + { + "auxiliary_loss_clip": 0.01107828, + "auxiliary_loss_mlp": 0.00777964, + "balance_loss_clip": 1.03576255, + "balance_loss_mlp": 1.00055957, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 2.626990010472726, + "language_loss": 0.66890311, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.68776107, + "num_input_tokens_seen": 314403715, + "step": 14583, + "time_per_iteration": 2.400160789489746 + }, + { + "auxiliary_loss_clip": 0.01101729, + "auxiliary_loss_mlp": 0.00777354, + "balance_loss_clip": 1.03713083, + "balance_loss_mlp": 1.00066745, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 1.6018342821804623, + "language_loss": 0.79121184, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.81000268, + "num_input_tokens_seen": 314421880, + "step": 14584, + "time_per_iteration": 2.4376142024993896 + }, + { + "auxiliary_loss_clip": 0.01078273, + "auxiliary_loss_mlp": 0.01024645, + "balance_loss_clip": 1.03702784, + "balance_loss_mlp": 1.01289678, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 1.7067429166503363, + "language_loss": 0.72111535, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.74214453, + "num_input_tokens_seen": 314441585, + "step": 14585, + "time_per_iteration": 2.527259111404419 + }, + { + "auxiliary_loss_clip": 0.01088533, + "auxiliary_loss_mlp": 0.01028849, + "balance_loss_clip": 1.03446281, + "balance_loss_mlp": 1.01581943, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 2.3676347425161537, + "language_loss": 0.74289066, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.76406449, + "num_input_tokens_seen": 314459020, + "step": 14586, + "time_per_iteration": 2.476285696029663 + }, + { + "auxiliary_loss_clip": 0.01105532, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.03367698, + "balance_loss_mlp": 1.01734054, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 1.9834708004051833, + "language_loss": 0.78801286, + "learning_rate": 1.564981454895844e-07, + "loss": 0.80937088, + "num_input_tokens_seen": 314478935, + "step": 14587, + "time_per_iteration": 4.376430511474609 + }, + { + "auxiliary_loss_clip": 0.01098104, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.03660679, + "balance_loss_mlp": 1.0152241, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 1.6106226185736576, + "language_loss": 0.73949003, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.76076436, + "num_input_tokens_seen": 314497635, + "step": 14588, + "time_per_iteration": 2.501269817352295 + }, + { + "auxiliary_loss_clip": 0.01042055, + "auxiliary_loss_mlp": 0.00776804, + "balance_loss_clip": 1.03287423, + "balance_loss_mlp": 1.00057077, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 2.061082670802712, + "language_loss": 0.6664086, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68459719, + "num_input_tokens_seen": 314515445, + "step": 14589, + "time_per_iteration": 2.649813175201416 + }, + { + "auxiliary_loss_clip": 0.01099138, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.0377785, + "balance_loss_mlp": 1.02053618, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 3.372112365414537, + "language_loss": 0.70689714, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.72821474, + "num_input_tokens_seen": 314533040, + "step": 14590, + "time_per_iteration": 2.455317258834839 + }, + { + "auxiliary_loss_clip": 0.0108812, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.0370512, + "balance_loss_mlp": 1.03038669, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 2.0549929104698035, + "language_loss": 0.74521637, + "learning_rate": 1.558945991776086e-07, + "loss": 0.7665453, + "num_input_tokens_seen": 314548280, + "step": 14591, + "time_per_iteration": 2.4419825077056885 + }, + { + "auxiliary_loss_clip": 0.01104138, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.03649998, + "balance_loss_mlp": 1.01687837, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 1.8562366202877456, + "language_loss": 0.79769981, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.81902814, + "num_input_tokens_seen": 314565345, + "step": 14592, + "time_per_iteration": 2.380277633666992 + }, + { + "auxiliary_loss_clip": 0.011046, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.03600585, + "balance_loss_mlp": 1.01603699, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 1.7309296320995884, + "language_loss": 0.82653862, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.84785712, + "num_input_tokens_seen": 314584190, + "step": 14593, + "time_per_iteration": 2.407484292984009 + }, + { + "auxiliary_loss_clip": 0.01091495, + "auxiliary_loss_mlp": 0.01026438, + "balance_loss_clip": 1.03419411, + "balance_loss_mlp": 1.01432681, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 1.5373821538720562, + "language_loss": 0.75851864, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.77969801, + "num_input_tokens_seen": 314605625, + "step": 14594, + "time_per_iteration": 2.484462261199951 + }, + { + "auxiliary_loss_clip": 0.0105976, + "auxiliary_loss_mlp": 0.01036577, + "balance_loss_clip": 1.02981913, + "balance_loss_mlp": 1.02235556, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 3.080593852126007, + "language_loss": 0.77832806, + "learning_rate": 1.552921717241651e-07, + "loss": 0.79929143, + "num_input_tokens_seen": 314622630, + "step": 14595, + "time_per_iteration": 2.525644540786743 + }, + { + "auxiliary_loss_clip": 0.01077904, + "auxiliary_loss_mlp": 0.01031229, + "balance_loss_clip": 1.03715396, + "balance_loss_mlp": 1.01846206, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 2.2887679331718886, + "language_loss": 0.70817262, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.7292639, + "num_input_tokens_seen": 314642460, + "step": 14596, + "time_per_iteration": 2.5710296630859375 + }, + { + "auxiliary_loss_clip": 0.01080321, + "auxiliary_loss_mlp": 0.01024889, + "balance_loss_clip": 1.04114199, + "balance_loss_mlp": 1.01322436, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 1.9678457788789914, + "language_loss": 0.86157775, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88262981, + "num_input_tokens_seen": 314659875, + "step": 14597, + "time_per_iteration": 2.5489466190338135 + }, + { + "auxiliary_loss_clip": 0.01097497, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.037359, + "balance_loss_mlp": 1.02063823, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 1.8474085158541629, + "language_loss": 0.72777605, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.74907589, + "num_input_tokens_seen": 314680260, + "step": 14598, + "time_per_iteration": 2.502790927886963 + }, + { + "auxiliary_loss_clip": 0.01094095, + "auxiliary_loss_mlp": 0.00778619, + "balance_loss_clip": 1.03509927, + "balance_loss_mlp": 1.00068533, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 2.707135736611774, + "language_loss": 0.77908289, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79781008, + "num_input_tokens_seen": 314696260, + "step": 14599, + "time_per_iteration": 5.2250964641571045 + }, + { + "auxiliary_loss_clip": 0.01079971, + "auxiliary_loss_mlp": 0.01029825, + "balance_loss_clip": 1.03563499, + "balance_loss_mlp": 1.01763606, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 2.4495132232922825, + "language_loss": 0.67587656, + "learning_rate": 1.545407113589332e-07, + "loss": 0.69697452, + "num_input_tokens_seen": 314714215, + "step": 14600, + "time_per_iteration": 2.5141026973724365 + }, + { + "auxiliary_loss_clip": 0.01099352, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.03626812, + "balance_loss_mlp": 1.02727759, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 1.7708209208376569, + "language_loss": 0.6933316, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71472257, + "num_input_tokens_seen": 314735700, + "step": 14601, + "time_per_iteration": 2.694387435913086 + }, + { + "auxiliary_loss_clip": 0.01102396, + "auxiliary_loss_mlp": 0.01030699, + "balance_loss_clip": 1.03730893, + "balance_loss_mlp": 1.01839626, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 1.905348920959651, + "language_loss": 0.73173118, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75306213, + "num_input_tokens_seen": 314753335, + "step": 14602, + "time_per_iteration": 2.486001491546631 + }, + { + "auxiliary_loss_clip": 0.01106758, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.03586817, + "balance_loss_mlp": 1.02013683, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 1.7607616504487251, + "language_loss": 0.71282989, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73421466, + "num_input_tokens_seen": 314770800, + "step": 14603, + "time_per_iteration": 2.38576078414917 + }, + { + "auxiliary_loss_clip": 0.01009135, + "auxiliary_loss_mlp": 0.0099947, + "balance_loss_clip": 1.00515604, + "balance_loss_mlp": 0.99820632, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7371431080045219, + "language_loss": 0.5415535, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56163955, + "num_input_tokens_seen": 314837275, + "step": 14604, + "time_per_iteration": 3.078256130218506 + }, + { + "auxiliary_loss_clip": 0.01002307, + "auxiliary_loss_mlp": 0.01011449, + "balance_loss_clip": 1.00600123, + "balance_loss_mlp": 1.01013172, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 1.556637799229776, + "language_loss": 0.5921275, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61226505, + "num_input_tokens_seen": 314902220, + "step": 14605, + "time_per_iteration": 3.076221227645874 + }, + { + "auxiliary_loss_clip": 0.0106962, + "auxiliary_loss_mlp": 0.01029976, + "balance_loss_clip": 1.03815317, + "balance_loss_mlp": 1.01760221, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 1.6787788858352153, + "language_loss": 0.85248053, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87347645, + "num_input_tokens_seen": 314921645, + "step": 14606, + "time_per_iteration": 2.571523666381836 + }, + { + "auxiliary_loss_clip": 0.01112142, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.03831267, + "balance_loss_mlp": 1.01882505, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 2.3438539027503693, + "language_loss": 0.70761353, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72905838, + "num_input_tokens_seen": 314939390, + "step": 14607, + "time_per_iteration": 2.3814334869384766 + }, + { + "auxiliary_loss_clip": 0.01085763, + "auxiliary_loss_mlp": 0.01038369, + "balance_loss_clip": 1.03526294, + "balance_loss_mlp": 1.02732396, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 1.9713441793840227, + "language_loss": 0.72340107, + "learning_rate": 1.533420140300785e-07, + "loss": 0.74464238, + "num_input_tokens_seen": 314959205, + "step": 14608, + "time_per_iteration": 2.529057502746582 + }, + { + "auxiliary_loss_clip": 0.01100502, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.03498101, + "balance_loss_mlp": 1.02172804, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 3.438270385240666, + "language_loss": 0.87233746, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.89368826, + "num_input_tokens_seen": 314977485, + "step": 14609, + "time_per_iteration": 2.443864107131958 + }, + { + "auxiliary_loss_clip": 0.01065185, + "auxiliary_loss_mlp": 0.01031103, + "balance_loss_clip": 1.03693438, + "balance_loss_mlp": 1.01895595, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 1.530845025625381, + "language_loss": 0.7062729, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72723573, + "num_input_tokens_seen": 314997830, + "step": 14610, + "time_per_iteration": 2.565526008605957 + }, + { + "auxiliary_loss_clip": 0.01092374, + "auxiliary_loss_mlp": 0.00777736, + "balance_loss_clip": 1.03852296, + "balance_loss_mlp": 1.00068343, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 2.410175501164505, + "language_loss": 0.80622894, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82493001, + "num_input_tokens_seen": 315016480, + "step": 14611, + "time_per_iteration": 4.009986400604248 + }, + { + "auxiliary_loss_clip": 0.01110054, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.03738618, + "balance_loss_mlp": 1.02140498, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 1.810163024111784, + "language_loss": 0.7663945, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78783083, + "num_input_tokens_seen": 315036135, + "step": 14612, + "time_per_iteration": 2.4338672161102295 + }, + { + "auxiliary_loss_clip": 0.01058359, + "auxiliary_loss_mlp": 0.01031392, + "balance_loss_clip": 1.03544068, + "balance_loss_mlp": 1.01945281, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 1.5907476192382075, + "language_loss": 0.7240169, + "learning_rate": 1.525951038422002e-07, + "loss": 0.74491441, + "num_input_tokens_seen": 315057995, + "step": 14613, + "time_per_iteration": 2.584477186203003 + }, + { + "auxiliary_loss_clip": 0.01008833, + "auxiliary_loss_mlp": 0.00998076, + "balance_loss_clip": 1.01277339, + "balance_loss_mlp": 0.99688405, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 1.0313969675092796, + "language_loss": 0.64567721, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66574621, + "num_input_tokens_seen": 315104010, + "step": 14614, + "time_per_iteration": 2.8786847591400146 + }, + { + "auxiliary_loss_clip": 0.01028502, + "auxiliary_loss_mlp": 0.01002823, + "balance_loss_clip": 1.00515056, + "balance_loss_mlp": 1.00154161, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6617770732117095, + "language_loss": 0.58566844, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60598171, + "num_input_tokens_seen": 315174550, + "step": 14615, + "time_per_iteration": 3.064314365386963 + }, + { + "auxiliary_loss_clip": 0.01067528, + "auxiliary_loss_mlp": 0.01028911, + "balance_loss_clip": 1.03949547, + "balance_loss_mlp": 1.01722217, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 1.8847934045697452, + "language_loss": 0.7252667, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.74623108, + "num_input_tokens_seen": 315191825, + "step": 14616, + "time_per_iteration": 2.5384504795074463 + }, + { + "auxiliary_loss_clip": 0.01027092, + "auxiliary_loss_mlp": 0.01002305, + "balance_loss_clip": 1.00393021, + "balance_loss_mlp": 1.00100613, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.8191267253105109, + "language_loss": 0.57946503, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.59975898, + "num_input_tokens_seen": 315255075, + "step": 14617, + "time_per_iteration": 3.1338679790496826 + }, + { + "auxiliary_loss_clip": 0.01081027, + "auxiliary_loss_mlp": 0.01035373, + "balance_loss_clip": 1.0332346, + "balance_loss_mlp": 1.02242088, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 1.7649154315330795, + "language_loss": 0.83747512, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.85863912, + "num_input_tokens_seen": 315273995, + "step": 14618, + "time_per_iteration": 2.5408432483673096 + }, + { + "auxiliary_loss_clip": 0.01080712, + "auxiliary_loss_mlp": 0.01029299, + "balance_loss_clip": 1.03524733, + "balance_loss_mlp": 1.01768804, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 1.5674934786485872, + "language_loss": 0.68903881, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71013892, + "num_input_tokens_seen": 315294485, + "step": 14619, + "time_per_iteration": 2.5066356658935547 + }, + { + "auxiliary_loss_clip": 0.01067554, + "auxiliary_loss_mlp": 0.01033434, + "balance_loss_clip": 1.03476894, + "balance_loss_mlp": 1.02174544, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 1.7902052366361465, + "language_loss": 0.77488375, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.79589367, + "num_input_tokens_seen": 315310420, + "step": 14620, + "time_per_iteration": 2.554990530014038 + }, + { + "auxiliary_loss_clip": 0.01086801, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.03856981, + "balance_loss_mlp": 1.02142394, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 1.8187309325093086, + "language_loss": 0.79167557, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81289047, + "num_input_tokens_seen": 315330110, + "step": 14621, + "time_per_iteration": 2.4847359657287598 + }, + { + "auxiliary_loss_clip": 0.01089037, + "auxiliary_loss_mlp": 0.01038011, + "balance_loss_clip": 1.03505695, + "balance_loss_mlp": 1.02579236, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 1.8182529823929252, + "language_loss": 0.66769505, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.6889655, + "num_input_tokens_seen": 315350080, + "step": 14622, + "time_per_iteration": 2.512242078781128 + }, + { + "auxiliary_loss_clip": 0.01084834, + "auxiliary_loss_mlp": 0.01032243, + "balance_loss_clip": 1.0402422, + "balance_loss_mlp": 1.02059627, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 1.985821394388228, + "language_loss": 0.73043495, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75160569, + "num_input_tokens_seen": 315366360, + "step": 14623, + "time_per_iteration": 2.465816020965576 + }, + { + "auxiliary_loss_clip": 0.01056035, + "auxiliary_loss_mlp": 0.01031231, + "balance_loss_clip": 1.03112006, + "balance_loss_mlp": 1.01939392, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 1.6348387208867752, + "language_loss": 0.78552026, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80639291, + "num_input_tokens_seen": 315385890, + "step": 14624, + "time_per_iteration": 2.5994346141815186 + }, + { + "auxiliary_loss_clip": 0.01098142, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.03377819, + "balance_loss_mlp": 1.02284789, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 1.8233160835341569, + "language_loss": 0.79914415, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.8204881, + "num_input_tokens_seen": 315403400, + "step": 14625, + "time_per_iteration": 2.484487295150757 + }, + { + "auxiliary_loss_clip": 0.0108613, + "auxiliary_loss_mlp": 0.01036185, + "balance_loss_clip": 1.03664458, + "balance_loss_mlp": 1.02414465, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 1.636727454567476, + "language_loss": 0.74290776, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76413095, + "num_input_tokens_seen": 315423670, + "step": 14626, + "time_per_iteration": 2.5261166095733643 + }, + { + "auxiliary_loss_clip": 0.01097624, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.03545499, + "balance_loss_mlp": 1.01908326, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 1.7225694297568581, + "language_loss": 0.71132255, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73261321, + "num_input_tokens_seen": 315446265, + "step": 14627, + "time_per_iteration": 3.9759106636047363 + }, + { + "auxiliary_loss_clip": 0.01076814, + "auxiliary_loss_mlp": 0.0103286, + "balance_loss_clip": 1.03794348, + "balance_loss_mlp": 1.02030158, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 1.8445661196439231, + "language_loss": 0.72495574, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.7460525, + "num_input_tokens_seen": 315464655, + "step": 14628, + "time_per_iteration": 2.5034332275390625 + }, + { + "auxiliary_loss_clip": 0.01078399, + "auxiliary_loss_mlp": 0.01033849, + "balance_loss_clip": 1.03364539, + "balance_loss_mlp": 1.02139175, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 2.420485163525189, + "language_loss": 0.69178545, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.71290797, + "num_input_tokens_seen": 315481090, + "step": 14629, + "time_per_iteration": 2.4872093200683594 + }, + { + "auxiliary_loss_clip": 0.01081466, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.03212786, + "balance_loss_mlp": 1.02015853, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 1.5592860716431511, + "language_loss": 0.68474692, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.70587254, + "num_input_tokens_seen": 315502010, + "step": 14630, + "time_per_iteration": 2.5416243076324463 + }, + { + "auxiliary_loss_clip": 0.01082255, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.03488231, + "balance_loss_mlp": 1.02100658, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 1.4985298694465796, + "language_loss": 0.7442795, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76543391, + "num_input_tokens_seen": 315523040, + "step": 14631, + "time_per_iteration": 2.5843801498413086 + }, + { + "auxiliary_loss_clip": 0.01085055, + "auxiliary_loss_mlp": 0.00776825, + "balance_loss_clip": 1.03543997, + "balance_loss_mlp": 1.00056279, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 1.9875286721412952, + "language_loss": 0.69622982, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.71484864, + "num_input_tokens_seen": 315541865, + "step": 14632, + "time_per_iteration": 2.525489568710327 + }, + { + "auxiliary_loss_clip": 0.01087778, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.03768432, + "balance_loss_mlp": 1.01763916, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 2.5350239675021426, + "language_loss": 0.65486264, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.67602646, + "num_input_tokens_seen": 315561470, + "step": 14633, + "time_per_iteration": 2.5452866554260254 + }, + { + "auxiliary_loss_clip": 0.01077649, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.03372264, + "balance_loss_mlp": 1.02189779, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 1.734161448748237, + "language_loss": 0.84016436, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.86127925, + "num_input_tokens_seen": 315583140, + "step": 14634, + "time_per_iteration": 2.565396547317505 + }, + { + "auxiliary_loss_clip": 0.01084928, + "auxiliary_loss_mlp": 0.00778098, + "balance_loss_clip": 1.03530478, + "balance_loss_mlp": 1.00064421, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 1.7506642117108764, + "language_loss": 0.79909283, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.81772304, + "num_input_tokens_seen": 315601935, + "step": 14635, + "time_per_iteration": 2.5368144512176514 + }, + { + "auxiliary_loss_clip": 0.01081706, + "auxiliary_loss_mlp": 0.01024595, + "balance_loss_clip": 1.03653944, + "balance_loss_mlp": 1.01234603, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 1.6829039456635848, + "language_loss": 0.65244377, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.67350674, + "num_input_tokens_seen": 315619995, + "step": 14636, + "time_per_iteration": 2.500194787979126 + }, + { + "auxiliary_loss_clip": 0.0108397, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.03455842, + "balance_loss_mlp": 1.02226353, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 1.5948712120584392, + "language_loss": 0.70223093, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72342587, + "num_input_tokens_seen": 315637895, + "step": 14637, + "time_per_iteration": 2.50708270072937 + }, + { + "auxiliary_loss_clip": 0.01085592, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.03668308, + "balance_loss_mlp": 1.01793456, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 1.867704360973765, + "language_loss": 0.66257954, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.68373162, + "num_input_tokens_seen": 315655520, + "step": 14638, + "time_per_iteration": 3.9832170009613037 + }, + { + "auxiliary_loss_clip": 0.01100221, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.0383805, + "balance_loss_mlp": 1.01689506, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 1.5798965026808363, + "language_loss": 0.57905328, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.60034746, + "num_input_tokens_seen": 315678955, + "step": 14639, + "time_per_iteration": 4.021294355392456 + }, + { + "auxiliary_loss_clip": 0.01081113, + "auxiliary_loss_mlp": 0.01034959, + "balance_loss_clip": 1.03343773, + "balance_loss_mlp": 1.02231121, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 1.5270731663633592, + "language_loss": 0.74283075, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.76399142, + "num_input_tokens_seen": 315700360, + "step": 14640, + "time_per_iteration": 2.521728515625 + }, + { + "auxiliary_loss_clip": 0.01085569, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.03294563, + "balance_loss_mlp": 1.02801919, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 1.9321312109596456, + "language_loss": 0.69728649, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.71854782, + "num_input_tokens_seen": 315719270, + "step": 14641, + "time_per_iteration": 2.4962656497955322 + }, + { + "auxiliary_loss_clip": 0.01099221, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.03987122, + "balance_loss_mlp": 1.01424873, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 4.324076410562841, + "language_loss": 0.85173762, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.8730042, + "num_input_tokens_seen": 315737425, + "step": 14642, + "time_per_iteration": 2.4404780864715576 + }, + { + "auxiliary_loss_clip": 0.01067535, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.03857982, + "balance_loss_mlp": 1.02164638, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 2.018129554383414, + "language_loss": 0.78940547, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.81042564, + "num_input_tokens_seen": 315755725, + "step": 14643, + "time_per_iteration": 2.5416369438171387 + }, + { + "auxiliary_loss_clip": 0.01091663, + "auxiliary_loss_mlp": 0.01027007, + "balance_loss_clip": 1.03359258, + "balance_loss_mlp": 1.01567602, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 1.6652542957335814, + "language_loss": 0.7318542, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75304085, + "num_input_tokens_seen": 315773835, + "step": 14644, + "time_per_iteration": 2.4103927612304688 + }, + { + "auxiliary_loss_clip": 0.01110782, + "auxiliary_loss_mlp": 0.00778253, + "balance_loss_clip": 1.03712463, + "balance_loss_mlp": 1.00073278, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 2.9525874220869133, + "language_loss": 0.79150236, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81039274, + "num_input_tokens_seen": 315790615, + "step": 14645, + "time_per_iteration": 2.3801212310791016 + }, + { + "auxiliary_loss_clip": 0.01093222, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.03299379, + "balance_loss_mlp": 1.0220387, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 2.1953459589162665, + "language_loss": 0.63975382, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66102952, + "num_input_tokens_seen": 315811010, + "step": 14646, + "time_per_iteration": 2.452197551727295 + }, + { + "auxiliary_loss_clip": 0.01081984, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.03856897, + "balance_loss_mlp": 1.01919055, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 2.039069950558431, + "language_loss": 0.77791965, + "learning_rate": 1.475625963334055e-07, + "loss": 0.79906249, + "num_input_tokens_seen": 315828130, + "step": 14647, + "time_per_iteration": 2.465397357940674 + }, + { + "auxiliary_loss_clip": 0.01106393, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.03620791, + "balance_loss_mlp": 1.01840019, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 2.08453827716338, + "language_loss": 0.75063396, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.77199644, + "num_input_tokens_seen": 315844900, + "step": 14648, + "time_per_iteration": 2.367466449737549 + }, + { + "auxiliary_loss_clip": 0.01087317, + "auxiliary_loss_mlp": 0.01031356, + "balance_loss_clip": 1.0335772, + "balance_loss_mlp": 1.0190537, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 1.697036986199664, + "language_loss": 0.65543216, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67661893, + "num_input_tokens_seen": 315863745, + "step": 14649, + "time_per_iteration": 2.5027334690093994 + }, + { + "auxiliary_loss_clip": 0.01073838, + "auxiliary_loss_mlp": 0.01028942, + "balance_loss_clip": 1.03767645, + "balance_loss_mlp": 1.0159843, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 1.5467908838394555, + "language_loss": 0.62475675, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.64578462, + "num_input_tokens_seen": 315885765, + "step": 14650, + "time_per_iteration": 2.5613203048706055 + }, + { + "auxiliary_loss_clip": 0.01081897, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.03329086, + "balance_loss_mlp": 1.01673782, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 1.8325496227977165, + "language_loss": 0.72743833, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.74854118, + "num_input_tokens_seen": 315907340, + "step": 14651, + "time_per_iteration": 4.065206289291382 + }, + { + "auxiliary_loss_clip": 0.01102209, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.03729177, + "balance_loss_mlp": 1.01907015, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 1.8952128265560166, + "language_loss": 0.71988976, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.74123937, + "num_input_tokens_seen": 315924935, + "step": 14652, + "time_per_iteration": 2.426349639892578 + }, + { + "auxiliary_loss_clip": 0.01088797, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.03354597, + "balance_loss_mlp": 1.0193243, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 2.0135713278569045, + "language_loss": 0.75345576, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.77466065, + "num_input_tokens_seen": 315943165, + "step": 14653, + "time_per_iteration": 2.46390438079834 + }, + { + "auxiliary_loss_clip": 0.01111442, + "auxiliary_loss_mlp": 0.01034963, + "balance_loss_clip": 1.03628135, + "balance_loss_mlp": 1.02201676, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 2.4453893521169054, + "language_loss": 0.71107525, + "learning_rate": 1.465365647269421e-07, + "loss": 0.73253924, + "num_input_tokens_seen": 315961340, + "step": 14654, + "time_per_iteration": 2.373892307281494 + }, + { + "auxiliary_loss_clip": 0.01067678, + "auxiliary_loss_mlp": 0.01037749, + "balance_loss_clip": 1.0399648, + "balance_loss_mlp": 1.02449882, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 1.6915168296839906, + "language_loss": 0.71690089, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73795521, + "num_input_tokens_seen": 315981335, + "step": 14655, + "time_per_iteration": 2.637395143508911 + }, + { + "auxiliary_loss_clip": 0.0106133, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.03278196, + "balance_loss_mlp": 1.02258468, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 1.8011920096829561, + "language_loss": 0.81430089, + "learning_rate": 1.462440453077449e-07, + "loss": 0.8352654, + "num_input_tokens_seen": 316001325, + "step": 14656, + "time_per_iteration": 2.5411007404327393 + }, + { + "auxiliary_loss_clip": 0.01083276, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.03711414, + "balance_loss_mlp": 1.02058566, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 1.8205958072764197, + "language_loss": 0.68577325, + "learning_rate": 1.460978910372914e-07, + "loss": 0.70693195, + "num_input_tokens_seen": 316022540, + "step": 14657, + "time_per_iteration": 2.5171945095062256 + }, + { + "auxiliary_loss_clip": 0.01086802, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.03891027, + "balance_loss_mlp": 1.0247407, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 2.008246387400081, + "language_loss": 0.83896619, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.8602041, + "num_input_tokens_seen": 316037735, + "step": 14658, + "time_per_iteration": 2.5295403003692627 + }, + { + "auxiliary_loss_clip": 0.01094812, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.03872728, + "balance_loss_mlp": 1.01945066, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 1.6988969286845725, + "language_loss": 0.77242965, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79369843, + "num_input_tokens_seen": 316058105, + "step": 14659, + "time_per_iteration": 2.52327036857605 + }, + { + "auxiliary_loss_clip": 0.01086826, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.03761709, + "balance_loss_mlp": 1.02278543, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 2.0120161299352826, + "language_loss": 0.6038965, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62511069, + "num_input_tokens_seen": 316074415, + "step": 14660, + "time_per_iteration": 2.4968740940093994 + }, + { + "auxiliary_loss_clip": 0.01092593, + "auxiliary_loss_mlp": 0.0103887, + "balance_loss_clip": 1.03760171, + "balance_loss_mlp": 1.02547646, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 1.6881187023868112, + "language_loss": 0.77854955, + "learning_rate": 1.455139770123972e-07, + "loss": 0.79986411, + "num_input_tokens_seen": 316094405, + "step": 14661, + "time_per_iteration": 2.540388345718384 + }, + { + "auxiliary_loss_clip": 0.0106969, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_clip": 1.03882027, + "balance_loss_mlp": 1.03060889, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 1.6429221504512006, + "language_loss": 0.76708013, + "learning_rate": 1.45368174298081e-07, + "loss": 0.78821254, + "num_input_tokens_seen": 316113390, + "step": 14662, + "time_per_iteration": 2.5174524784088135 + }, + { + "auxiliary_loss_clip": 0.01064976, + "auxiliary_loss_mlp": 0.01028764, + "balance_loss_clip": 1.0360105, + "balance_loss_mlp": 1.01740277, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 1.9315125158655666, + "language_loss": 0.73645985, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.75739729, + "num_input_tokens_seen": 316131085, + "step": 14663, + "time_per_iteration": 2.584894895553589 + }, + { + "auxiliary_loss_clip": 0.01098805, + "auxiliary_loss_mlp": 0.00777606, + "balance_loss_clip": 1.03700018, + "balance_loss_mlp": 1.00063324, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 1.5972979479465053, + "language_loss": 0.69886339, + "learning_rate": 1.450767798584489e-07, + "loss": 0.71762753, + "num_input_tokens_seen": 316151440, + "step": 14664, + "time_per_iteration": 2.562607765197754 + }, + { + "auxiliary_loss_clip": 0.0103165, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.03329635, + "balance_loss_mlp": 1.02429605, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 1.8445155374234046, + "language_loss": 0.81265497, + "learning_rate": 1.449311881441828e-07, + "loss": 0.83333147, + "num_input_tokens_seen": 316170750, + "step": 14665, + "time_per_iteration": 2.6942551136016846 + }, + { + "auxiliary_loss_clip": 0.01084275, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.03529191, + "balance_loss_mlp": 1.02474427, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 13.681921855684356, + "language_loss": 0.58749402, + "learning_rate": 1.447856667743117e-07, + "loss": 0.60869867, + "num_input_tokens_seen": 316187265, + "step": 14666, + "time_per_iteration": 4.841364622116089 + }, + { + "auxiliary_loss_clip": 0.01099807, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.03737652, + "balance_loss_mlp": 1.01806688, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 2.8629080542769576, + "language_loss": 0.838552, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.85986716, + "num_input_tokens_seen": 316206555, + "step": 14667, + "time_per_iteration": 2.4234368801116943 + }, + { + "auxiliary_loss_clip": 0.01108961, + "auxiliary_loss_mlp": 0.01032521, + "balance_loss_clip": 1.03666902, + "balance_loss_mlp": 1.01925898, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 1.9539889322838366, + "language_loss": 0.62626338, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.64767814, + "num_input_tokens_seen": 316225210, + "step": 14668, + "time_per_iteration": 2.4106338024139404 + }, + { + "auxiliary_loss_clip": 0.01093472, + "auxiliary_loss_mlp": 0.01030184, + "balance_loss_clip": 1.03486907, + "balance_loss_mlp": 1.01951456, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 2.6636476272726264, + "language_loss": 0.56675112, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.58798766, + "num_input_tokens_seen": 316242685, + "step": 14669, + "time_per_iteration": 2.4099185466766357 + }, + { + "auxiliary_loss_clip": 0.01106057, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.03484392, + "balance_loss_mlp": 1.01821244, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 2.365334264496059, + "language_loss": 0.71388811, + "learning_rate": 1.442042848491043e-07, + "loss": 0.73524988, + "num_input_tokens_seen": 316260935, + "step": 14670, + "time_per_iteration": 2.3958938121795654 + }, + { + "auxiliary_loss_clip": 0.01090762, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.03254044, + "balance_loss_mlp": 1.01875448, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 4.471405205407009, + "language_loss": 0.73605549, + "learning_rate": 1.44059115283929e-07, + "loss": 0.75727606, + "num_input_tokens_seen": 316281190, + "step": 14671, + "time_per_iteration": 2.5013985633850098 + }, + { + "auxiliary_loss_clip": 0.01088268, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.03398526, + "balance_loss_mlp": 1.0173409, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 3.054823676185547, + "language_loss": 0.85306919, + "learning_rate": 1.43914016096218e-07, + "loss": 0.87425613, + "num_input_tokens_seen": 316297115, + "step": 14672, + "time_per_iteration": 2.468780517578125 + }, + { + "auxiliary_loss_clip": 0.01072161, + "auxiliary_loss_mlp": 0.01029893, + "balance_loss_clip": 1.03382146, + "balance_loss_mlp": 1.01800227, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 1.5183645971349056, + "language_loss": 0.7235105, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.74453104, + "num_input_tokens_seen": 316318235, + "step": 14673, + "time_per_iteration": 2.547105312347412 + }, + { + "auxiliary_loss_clip": 0.01010069, + "auxiliary_loss_mlp": 0.00999284, + "balance_loss_clip": 1.00527024, + "balance_loss_mlp": 0.99823523, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.8076264592920082, + "language_loss": 0.49353874, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51363224, + "num_input_tokens_seen": 316384705, + "step": 14674, + "time_per_iteration": 3.1674790382385254 + }, + { + "auxiliary_loss_clip": 0.01083932, + "auxiliary_loss_mlp": 0.00778464, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.00059724, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 2.3708053275162655, + "language_loss": 0.76485324, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.78347719, + "num_input_tokens_seen": 316401165, + "step": 14675, + "time_per_iteration": 2.4767937660217285 + }, + { + "auxiliary_loss_clip": 0.01080807, + "auxiliary_loss_mlp": 0.01032301, + "balance_loss_clip": 1.0341742, + "balance_loss_mlp": 1.02045798, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 3.2808142740271586, + "language_loss": 0.79594654, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.81707764, + "num_input_tokens_seen": 316418780, + "step": 14676, + "time_per_iteration": 2.4727632999420166 + }, + { + "auxiliary_loss_clip": 0.01009891, + "auxiliary_loss_mlp": 0.01000595, + "balance_loss_clip": 1.02105856, + "balance_loss_mlp": 0.99920064, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.6882724413686893, + "language_loss": 0.54739529, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56750011, + "num_input_tokens_seen": 316482030, + "step": 14677, + "time_per_iteration": 4.800127744674683 + }, + { + "auxiliary_loss_clip": 0.01104266, + "auxiliary_loss_mlp": 0.01028329, + "balance_loss_clip": 1.03443015, + "balance_loss_mlp": 1.01657498, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 2.447019556510436, + "language_loss": 0.64722383, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.66854978, + "num_input_tokens_seen": 316499175, + "step": 14678, + "time_per_iteration": 2.3888142108917236 + }, + { + "auxiliary_loss_clip": 0.01087443, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.03401434, + "balance_loss_mlp": 1.02101398, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 1.759077366959955, + "language_loss": 0.71028769, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.73149425, + "num_input_tokens_seen": 316519495, + "step": 14679, + "time_per_iteration": 3.8828654289245605 + }, + { + "auxiliary_loss_clip": 0.01085341, + "auxiliary_loss_mlp": 0.01030116, + "balance_loss_clip": 1.03771353, + "balance_loss_mlp": 1.01960742, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 1.5909854772946164, + "language_loss": 0.63713109, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.65828568, + "num_input_tokens_seen": 316538180, + "step": 14680, + "time_per_iteration": 2.497878074645996 + }, + { + "auxiliary_loss_clip": 0.01107787, + "auxiliary_loss_mlp": 0.01034065, + "balance_loss_clip": 1.03658164, + "balance_loss_mlp": 1.02185774, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 2.523617261542292, + "language_loss": 0.77274728, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79416579, + "num_input_tokens_seen": 316551750, + "step": 14681, + "time_per_iteration": 2.3608052730560303 + }, + { + "auxiliary_loss_clip": 0.01087099, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.03471875, + "balance_loss_mlp": 1.01617718, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 1.8085956490401833, + "language_loss": 0.72971845, + "learning_rate": 1.424668961888047e-07, + "loss": 0.75087965, + "num_input_tokens_seen": 316570680, + "step": 14682, + "time_per_iteration": 2.4905896186828613 + }, + { + "auxiliary_loss_clip": 0.01071756, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.0416981, + "balance_loss_mlp": 1.0169667, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 2.3453554751764356, + "language_loss": 0.74646592, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.76749384, + "num_input_tokens_seen": 316588635, + "step": 14683, + "time_per_iteration": 2.5456318855285645 + }, + { + "auxiliary_loss_clip": 0.01076803, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.0355134, + "balance_loss_mlp": 1.01927638, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 1.748683456207234, + "language_loss": 0.65555942, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.67664826, + "num_input_tokens_seen": 316607550, + "step": 14684, + "time_per_iteration": 2.5282509326934814 + }, + { + "auxiliary_loss_clip": 0.01091691, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.03525233, + "balance_loss_mlp": 1.01648402, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 5.748576245083062, + "language_loss": 0.69266462, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.71385777, + "num_input_tokens_seen": 316624460, + "step": 14685, + "time_per_iteration": 2.4151744842529297 + }, + { + "auxiliary_loss_clip": 0.01059702, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.03513789, + "balance_loss_mlp": 1.02292323, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 2.0456464288385394, + "language_loss": 0.74673712, + "learning_rate": 1.418900201783806e-07, + "loss": 0.76770449, + "num_input_tokens_seen": 316640765, + "step": 14686, + "time_per_iteration": 2.5469765663146973 + }, + { + "auxiliary_loss_clip": 0.01057305, + "auxiliary_loss_mlp": 0.01026017, + "balance_loss_clip": 1.03267932, + "balance_loss_mlp": 1.01365471, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 1.890839524491289, + "language_loss": 0.63015467, + "learning_rate": 1.417459773114007e-07, + "loss": 0.65098786, + "num_input_tokens_seen": 316656120, + "step": 14687, + "time_per_iteration": 2.5189738273620605 + }, + { + "auxiliary_loss_clip": 0.01099089, + "auxiliary_loss_mlp": 0.01035202, + "balance_loss_clip": 1.03616214, + "balance_loss_mlp": 1.02279866, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 1.919266691835692, + "language_loss": 0.69093448, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.71227741, + "num_input_tokens_seen": 316676095, + "step": 14688, + "time_per_iteration": 2.4988789558410645 + }, + { + "auxiliary_loss_clip": 0.01092944, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.0348568, + "balance_loss_mlp": 1.01674175, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 1.4934770334837522, + "language_loss": 0.66767657, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.6888988, + "num_input_tokens_seen": 316696235, + "step": 14689, + "time_per_iteration": 2.5140960216522217 + }, + { + "auxiliary_loss_clip": 0.01081035, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.03773832, + "balance_loss_mlp": 1.01651633, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 1.5808986491337746, + "language_loss": 0.7465589, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76765168, + "num_input_tokens_seen": 316719680, + "step": 14690, + "time_per_iteration": 4.126803874969482 + }, + { + "auxiliary_loss_clip": 0.0108639, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.03420961, + "balance_loss_mlp": 1.02164412, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 1.4875932599538164, + "language_loss": 0.72798383, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.74919879, + "num_input_tokens_seen": 316739830, + "step": 14691, + "time_per_iteration": 2.5242300033569336 + }, + { + "auxiliary_loss_clip": 0.0108212, + "auxiliary_loss_mlp": 0.01026893, + "balance_loss_clip": 1.0417372, + "balance_loss_mlp": 1.01407242, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 1.9087146129299277, + "language_loss": 0.51981258, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.54090273, + "num_input_tokens_seen": 316758105, + "step": 14692, + "time_per_iteration": 2.527318000793457 + }, + { + "auxiliary_loss_clip": 0.01075208, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.03973317, + "balance_loss_mlp": 1.01879156, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 2.1899052223043705, + "language_loss": 0.60362601, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.62468624, + "num_input_tokens_seen": 316777455, + "step": 14693, + "time_per_iteration": 2.53948712348938 + }, + { + "auxiliary_loss_clip": 0.01104594, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.03668761, + "balance_loss_mlp": 1.01482439, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 1.4932963291798702, + "language_loss": 0.75702786, + "learning_rate": 1.407396505730898e-07, + "loss": 0.77833372, + "num_input_tokens_seen": 316796300, + "step": 14694, + "time_per_iteration": 2.429095983505249 + }, + { + "auxiliary_loss_clip": 0.01093356, + "auxiliary_loss_mlp": 0.01031756, + "balance_loss_clip": 1.03391683, + "balance_loss_mlp": 1.0199604, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 2.201559653612019, + "language_loss": 0.72705293, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.74830401, + "num_input_tokens_seen": 316819090, + "step": 14695, + "time_per_iteration": 2.512070655822754 + }, + { + "auxiliary_loss_clip": 0.01092516, + "auxiliary_loss_mlp": 0.01026283, + "balance_loss_clip": 1.03534484, + "balance_loss_mlp": 1.01478481, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 1.597513004316273, + "language_loss": 0.80184877, + "learning_rate": 1.404527630961998e-07, + "loss": 0.82303673, + "num_input_tokens_seen": 316839250, + "step": 14696, + "time_per_iteration": 2.4676921367645264 + }, + { + "auxiliary_loss_clip": 0.01069794, + "auxiliary_loss_mlp": 0.01027029, + "balance_loss_clip": 1.03578305, + "balance_loss_mlp": 1.01553702, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 1.5976583134135267, + "language_loss": 0.7466625, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.76763076, + "num_input_tokens_seen": 316861315, + "step": 14697, + "time_per_iteration": 2.606313467025757 + }, + { + "auxiliary_loss_clip": 0.01085708, + "auxiliary_loss_mlp": 0.01037545, + "balance_loss_clip": 1.03526723, + "balance_loss_mlp": 1.0253799, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 2.107100136340577, + "language_loss": 0.72206438, + "learning_rate": 1.401661576761779e-07, + "loss": 0.74329686, + "num_input_tokens_seen": 316879325, + "step": 14698, + "time_per_iteration": 2.4730303287506104 + }, + { + "auxiliary_loss_clip": 0.01018637, + "auxiliary_loss_mlp": 0.0100047, + "balance_loss_clip": 1.00520909, + "balance_loss_mlp": 0.99941552, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.8027086911741236, + "language_loss": 0.53703618, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55722725, + "num_input_tokens_seen": 316936425, + "step": 14699, + "time_per_iteration": 3.041752338409424 + }, + { + "auxiliary_loss_clip": 0.01090284, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.03626513, + "balance_loss_mlp": 1.01732814, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 1.7161540490939133, + "language_loss": 0.76780683, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.78900826, + "num_input_tokens_seen": 316956360, + "step": 14700, + "time_per_iteration": 2.517565965652466 + }, + { + "auxiliary_loss_clip": 0.01076959, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.03679538, + "balance_loss_mlp": 1.01755071, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 2.636483556879649, + "language_loss": 0.72973025, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.75079191, + "num_input_tokens_seen": 316975295, + "step": 14701, + "time_per_iteration": 2.526059627532959 + }, + { + "auxiliary_loss_clip": 0.01087651, + "auxiliary_loss_mlp": 0.01034791, + "balance_loss_clip": 1.03464484, + "balance_loss_mlp": 1.02078438, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 1.726365000247592, + "language_loss": 0.71156621, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.73279059, + "num_input_tokens_seen": 316994520, + "step": 14702, + "time_per_iteration": 2.529662847518921 + }, + { + "auxiliary_loss_clip": 0.01074108, + "auxiliary_loss_mlp": 0.01038718, + "balance_loss_clip": 1.03421044, + "balance_loss_mlp": 1.02428746, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 1.8651973365578365, + "language_loss": 0.71279454, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.73392272, + "num_input_tokens_seen": 317018095, + "step": 14703, + "time_per_iteration": 2.7263550758361816 + }, + { + "auxiliary_loss_clip": 0.01066113, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.03773308, + "balance_loss_mlp": 1.0190444, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 1.9236225184856015, + "language_loss": 0.66848856, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.68945092, + "num_input_tokens_seen": 317035755, + "step": 14704, + "time_per_iteration": 2.55556058883667 + }, + { + "auxiliary_loss_clip": 0.01086991, + "auxiliary_loss_mlp": 0.01028371, + "balance_loss_clip": 1.03376937, + "balance_loss_mlp": 1.01717162, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 1.7652662427120624, + "language_loss": 0.70879245, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.72994608, + "num_input_tokens_seen": 317055765, + "step": 14705, + "time_per_iteration": 4.001708745956421 + }, + { + "auxiliary_loss_clip": 0.01087877, + "auxiliary_loss_mlp": 0.01030801, + "balance_loss_clip": 1.03707683, + "balance_loss_mlp": 1.02003026, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 1.6873680442479928, + "language_loss": 0.71000981, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.73119664, + "num_input_tokens_seen": 317077955, + "step": 14706, + "time_per_iteration": 2.574622392654419 + }, + { + "auxiliary_loss_clip": 0.01096449, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.03496826, + "balance_loss_mlp": 1.0180701, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 2.004167982787022, + "language_loss": 0.74442244, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.7656858, + "num_input_tokens_seen": 317095825, + "step": 14707, + "time_per_iteration": 2.473729133605957 + }, + { + "auxiliary_loss_clip": 0.01002498, + "auxiliary_loss_mlp": 0.01000037, + "balance_loss_clip": 1.00704885, + "balance_loss_mlp": 0.99867171, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.7940379764645213, + "language_loss": 0.60332346, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62334877, + "num_input_tokens_seen": 317152875, + "step": 14708, + "time_per_iteration": 2.9361915588378906 + }, + { + "auxiliary_loss_clip": 0.01077495, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.03222311, + "balance_loss_mlp": 1.01945949, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 7.061006421002505, + "language_loss": 0.67068952, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.69177181, + "num_input_tokens_seen": 317176725, + "step": 14709, + "time_per_iteration": 2.661437511444092 + }, + { + "auxiliary_loss_clip": 0.01091209, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.03692436, + "balance_loss_mlp": 1.02140105, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 1.6040498694486074, + "language_loss": 0.62662268, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64788204, + "num_input_tokens_seen": 317206880, + "step": 14710, + "time_per_iteration": 2.749446153640747 + }, + { + "auxiliary_loss_clip": 0.01075015, + "auxiliary_loss_mlp": 0.01026534, + "balance_loss_clip": 1.03512776, + "balance_loss_mlp": 1.01574588, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 2.8765523431542874, + "language_loss": 0.64486408, + "learning_rate": 1.38310100580431e-07, + "loss": 0.66587961, + "num_input_tokens_seen": 317224135, + "step": 14711, + "time_per_iteration": 2.5354421138763428 + }, + { + "auxiliary_loss_clip": 0.01073162, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.03306198, + "balance_loss_mlp": 1.01819921, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 2.007692563255726, + "language_loss": 0.76217788, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78321952, + "num_input_tokens_seen": 317244505, + "step": 14712, + "time_per_iteration": 2.5755953788757324 + }, + { + "auxiliary_loss_clip": 0.01051387, + "auxiliary_loss_mlp": 0.01027864, + "balance_loss_clip": 1.03131008, + "balance_loss_mlp": 1.01542473, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 2.230833802677638, + "language_loss": 0.81375468, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.83454722, + "num_input_tokens_seen": 317257830, + "step": 14713, + "time_per_iteration": 2.5594069957733154 + }, + { + "auxiliary_loss_clip": 0.01084668, + "auxiliary_loss_mlp": 0.01026141, + "balance_loss_clip": 1.03487706, + "balance_loss_mlp": 1.0137372, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 1.4412900725003397, + "language_loss": 0.55445993, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.57556796, + "num_input_tokens_seen": 317278430, + "step": 14714, + "time_per_iteration": 2.5489296913146973 + }, + { + "auxiliary_loss_clip": 0.01052371, + "auxiliary_loss_mlp": 0.01039351, + "balance_loss_clip": 1.03492618, + "balance_loss_mlp": 1.02645814, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 2.196154851383307, + "language_loss": 0.74099034, + "learning_rate": 1.377414057838755e-07, + "loss": 0.76190758, + "num_input_tokens_seen": 317295970, + "step": 14715, + "time_per_iteration": 2.6644413471221924 + }, + { + "auxiliary_loss_clip": 0.01098374, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.03535128, + "balance_loss_mlp": 1.01913714, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 1.5497392087723458, + "language_loss": 0.75299144, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77428341, + "num_input_tokens_seen": 317316185, + "step": 14716, + "time_per_iteration": 4.025964736938477 + }, + { + "auxiliary_loss_clip": 0.01078393, + "auxiliary_loss_mlp": 0.01040364, + "balance_loss_clip": 1.03615296, + "balance_loss_mlp": 1.02816939, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 1.8947292593921816, + "language_loss": 0.71081793, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.7320056, + "num_input_tokens_seen": 317333275, + "step": 14717, + "time_per_iteration": 4.0243189334869385 + }, + { + "auxiliary_loss_clip": 0.01093376, + "auxiliary_loss_mlp": 0.01032611, + "balance_loss_clip": 1.03505063, + "balance_loss_mlp": 1.02092814, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 1.9730563929282277, + "language_loss": 0.73750234, + "learning_rate": 1.373156261464208e-07, + "loss": 0.75876218, + "num_input_tokens_seen": 317351245, + "step": 14718, + "time_per_iteration": 2.5600426197052 + }, + { + "auxiliary_loss_clip": 0.01056867, + "auxiliary_loss_mlp": 0.01026748, + "balance_loss_clip": 1.03576982, + "balance_loss_mlp": 1.01463044, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 1.8032374573833108, + "language_loss": 0.78594518, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80678129, + "num_input_tokens_seen": 317370740, + "step": 14719, + "time_per_iteration": 2.639145612716675 + }, + { + "auxiliary_loss_clip": 0.01109048, + "auxiliary_loss_mlp": 0.01026402, + "balance_loss_clip": 1.03626728, + "balance_loss_mlp": 1.01426101, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 1.7831632261132586, + "language_loss": 0.71900195, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.7403565, + "num_input_tokens_seen": 317388370, + "step": 14720, + "time_per_iteration": 2.386683225631714 + }, + { + "auxiliary_loss_clip": 0.01088821, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.03538132, + "balance_loss_mlp": 1.0198071, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 2.2160750857019753, + "language_loss": 0.82219619, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84341252, + "num_input_tokens_seen": 317407390, + "step": 14721, + "time_per_iteration": 2.4941112995147705 + }, + { + "auxiliary_loss_clip": 0.01087881, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.0343287, + "balance_loss_mlp": 1.01961446, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 1.7153666674357397, + "language_loss": 0.62261999, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.64382708, + "num_input_tokens_seen": 317430825, + "step": 14722, + "time_per_iteration": 2.721419095993042 + }, + { + "auxiliary_loss_clip": 0.01098751, + "auxiliary_loss_mlp": 0.01029522, + "balance_loss_clip": 1.03535056, + "balance_loss_mlp": 1.01730919, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 1.9839306306514275, + "language_loss": 0.68821472, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.70949745, + "num_input_tokens_seen": 317451905, + "step": 14723, + "time_per_iteration": 2.5605406761169434 + }, + { + "auxiliary_loss_clip": 0.01075706, + "auxiliary_loss_mlp": 0.01032672, + "balance_loss_clip": 1.03306222, + "balance_loss_mlp": 1.02022028, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 1.9511436445585115, + "language_loss": 0.78174704, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.80283082, + "num_input_tokens_seen": 317470030, + "step": 14724, + "time_per_iteration": 2.546201705932617 + }, + { + "auxiliary_loss_clip": 0.01019177, + "auxiliary_loss_mlp": 0.01001183, + "balance_loss_clip": 1.0056982, + "balance_loss_mlp": 0.99997264, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.8032754094660042, + "language_loss": 0.58890855, + "learning_rate": 1.363246127376143e-07, + "loss": 0.6091122, + "num_input_tokens_seen": 317527460, + "step": 14725, + "time_per_iteration": 2.920001983642578 + }, + { + "auxiliary_loss_clip": 0.01087715, + "auxiliary_loss_mlp": 0.00779939, + "balance_loss_clip": 1.0326997, + "balance_loss_mlp": 1.00063646, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 2.290898697441549, + "language_loss": 0.69055212, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.70922869, + "num_input_tokens_seen": 317544070, + "step": 14726, + "time_per_iteration": 2.4638121128082275 + }, + { + "auxiliary_loss_clip": 0.01094579, + "auxiliary_loss_mlp": 0.0077749, + "balance_loss_clip": 1.03430843, + "balance_loss_mlp": 1.00065184, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 1.7579936439785036, + "language_loss": 0.6975826, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.71630323, + "num_input_tokens_seen": 317570275, + "step": 14727, + "time_per_iteration": 2.6450555324554443 + }, + { + "auxiliary_loss_clip": 0.01088536, + "auxiliary_loss_mlp": 0.0103351, + "balance_loss_clip": 1.03932929, + "balance_loss_mlp": 1.02112973, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 1.7384679351190162, + "language_loss": 0.70078158, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.72200209, + "num_input_tokens_seen": 317590160, + "step": 14728, + "time_per_iteration": 2.5120182037353516 + }, + { + "auxiliary_loss_clip": 0.01078601, + "auxiliary_loss_mlp": 0.0102867, + "balance_loss_clip": 1.03788781, + "balance_loss_mlp": 1.01755929, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 2.002965895792878, + "language_loss": 0.66327542, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.68434817, + "num_input_tokens_seen": 317608340, + "step": 14729, + "time_per_iteration": 4.010065317153931 + }, + { + "auxiliary_loss_clip": 0.01082055, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.03845024, + "balance_loss_mlp": 1.02280176, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 1.6062494768260678, + "language_loss": 0.63035357, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.65151656, + "num_input_tokens_seen": 317629910, + "step": 14730, + "time_per_iteration": 2.615464448928833 + }, + { + "auxiliary_loss_clip": 0.01070067, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.03349459, + "balance_loss_mlp": 1.02075458, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 1.520945324738281, + "language_loss": 0.79658699, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81761038, + "num_input_tokens_seen": 317650265, + "step": 14731, + "time_per_iteration": 2.53238844871521 + }, + { + "auxiliary_loss_clip": 0.01074686, + "auxiliary_loss_mlp": 0.01034673, + "balance_loss_clip": 1.03228521, + "balance_loss_mlp": 1.02225149, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 1.8170217005693963, + "language_loss": 0.83066624, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85175979, + "num_input_tokens_seen": 317669045, + "step": 14732, + "time_per_iteration": 2.5273427963256836 + }, + { + "auxiliary_loss_clip": 0.0100977, + "auxiliary_loss_mlp": 0.00999952, + "balance_loss_clip": 1.00661373, + "balance_loss_mlp": 0.99869484, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 0.9354859957280224, + "language_loss": 0.59881371, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.61891091, + "num_input_tokens_seen": 317728065, + "step": 14733, + "time_per_iteration": 3.057741165161133 + }, + { + "auxiliary_loss_clip": 0.011097, + "auxiliary_loss_mlp": 0.00777912, + "balance_loss_clip": 1.03779125, + "balance_loss_mlp": 1.00058055, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 2.8257412912923887, + "language_loss": 0.66735947, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.68623555, + "num_input_tokens_seen": 317746120, + "step": 14734, + "time_per_iteration": 2.43522310256958 + }, + { + "auxiliary_loss_clip": 0.01079822, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.03458023, + "balance_loss_mlp": 1.02399302, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 1.8708407128389204, + "language_loss": 0.75673938, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77789295, + "num_input_tokens_seen": 317762280, + "step": 14735, + "time_per_iteration": 2.455443859100342 + }, + { + "auxiliary_loss_clip": 0.01070852, + "auxiliary_loss_mlp": 0.01034963, + "balance_loss_clip": 1.03263485, + "balance_loss_mlp": 1.02213573, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 1.8056875077128192, + "language_loss": 0.70344508, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72450316, + "num_input_tokens_seen": 317780615, + "step": 14736, + "time_per_iteration": 2.5240159034729004 + }, + { + "auxiliary_loss_clip": 0.01080197, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.03937364, + "balance_loss_mlp": 1.01734364, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 1.7459717863753965, + "language_loss": 0.84603822, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.8671326, + "num_input_tokens_seen": 317798830, + "step": 14737, + "time_per_iteration": 2.468461036682129 + }, + { + "auxiliary_loss_clip": 0.01085727, + "auxiliary_loss_mlp": 0.01033356, + "balance_loss_clip": 1.03650296, + "balance_loss_mlp": 1.01974785, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 2.7620743780980357, + "language_loss": 0.67911136, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70030218, + "num_input_tokens_seen": 317819235, + "step": 14738, + "time_per_iteration": 2.643784761428833 + }, + { + "auxiliary_loss_clip": 0.01101809, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.03643942, + "balance_loss_mlp": 1.01782441, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 1.7058356454948027, + "language_loss": 0.7525382, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77386272, + "num_input_tokens_seen": 317836785, + "step": 14739, + "time_per_iteration": 2.478872537612915 + }, + { + "auxiliary_loss_clip": 0.01095079, + "auxiliary_loss_mlp": 0.01029144, + "balance_loss_clip": 1.03589082, + "balance_loss_mlp": 1.01793194, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 1.9643752813686954, + "language_loss": 0.87114394, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.89238614, + "num_input_tokens_seen": 317854225, + "step": 14740, + "time_per_iteration": 2.4421868324279785 + }, + { + "auxiliary_loss_clip": 0.01062447, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.03440726, + "balance_loss_mlp": 1.02155197, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 2.264864318081496, + "language_loss": 0.63521153, + "learning_rate": 1.34072445601471e-07, + "loss": 0.65617824, + "num_input_tokens_seen": 317874865, + "step": 14741, + "time_per_iteration": 2.6013035774230957 + }, + { + "auxiliary_loss_clip": 0.01108064, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.03670347, + "balance_loss_mlp": 1.01817489, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 2.1528688901402355, + "language_loss": 0.72676384, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.74814713, + "num_input_tokens_seen": 317892830, + "step": 14742, + "time_per_iteration": 2.3943607807159424 + }, + { + "auxiliary_loss_clip": 0.01096077, + "auxiliary_loss_mlp": 0.007772, + "balance_loss_clip": 1.03496289, + "balance_loss_mlp": 1.00058508, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 1.8661118571967241, + "language_loss": 0.58984268, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.60857546, + "num_input_tokens_seen": 317911780, + "step": 14743, + "time_per_iteration": 2.4774532318115234 + }, + { + "auxiliary_loss_clip": 0.01080857, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.03561592, + "balance_loss_mlp": 1.0221628, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 1.6782589781468686, + "language_loss": 0.6014111, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.62257916, + "num_input_tokens_seen": 317932855, + "step": 14744, + "time_per_iteration": 3.9784677028656006 + }, + { + "auxiliary_loss_clip": 0.0109264, + "auxiliary_loss_mlp": 0.0077771, + "balance_loss_clip": 1.03632259, + "balance_loss_mlp": 1.00060582, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 2.164744486715665, + "language_loss": 0.76980364, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78850716, + "num_input_tokens_seen": 317952090, + "step": 14745, + "time_per_iteration": 2.4449350833892822 + }, + { + "auxiliary_loss_clip": 0.01107239, + "auxiliary_loss_mlp": 0.00778576, + "balance_loss_clip": 1.03685522, + "balance_loss_mlp": 1.0005908, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 1.833239419130914, + "language_loss": 0.77550656, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.79436469, + "num_input_tokens_seen": 317970370, + "step": 14746, + "time_per_iteration": 2.4002809524536133 + }, + { + "auxiliary_loss_clip": 0.01087367, + "auxiliary_loss_mlp": 0.01035569, + "balance_loss_clip": 1.04040432, + "balance_loss_mlp": 1.02262866, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 2.8238882884044503, + "language_loss": 0.7644465, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.78567588, + "num_input_tokens_seen": 317989125, + "step": 14747, + "time_per_iteration": 2.4902031421661377 + }, + { + "auxiliary_loss_clip": 0.01081832, + "auxiliary_loss_mlp": 0.00776774, + "balance_loss_clip": 1.03295624, + "balance_loss_mlp": 1.00058079, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 1.8154339338089875, + "language_loss": 0.8237282, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.84231424, + "num_input_tokens_seen": 318007820, + "step": 14748, + "time_per_iteration": 2.491424798965454 + }, + { + "auxiliary_loss_clip": 0.01098299, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.03658676, + "balance_loss_mlp": 1.02163267, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 1.892127725297409, + "language_loss": 0.77269387, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.7940166, + "num_input_tokens_seen": 318030435, + "step": 14749, + "time_per_iteration": 2.6851162910461426 + }, + { + "auxiliary_loss_clip": 0.01046282, + "auxiliary_loss_mlp": 0.00777755, + "balance_loss_clip": 1.03685093, + "balance_loss_mlp": 1.00067616, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 1.950178510615369, + "language_loss": 0.69616449, + "learning_rate": 1.328135602550451e-07, + "loss": 0.71440482, + "num_input_tokens_seen": 318049465, + "step": 14750, + "time_per_iteration": 2.608279228210449 + }, + { + "auxiliary_loss_clip": 0.01095797, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.03529418, + "balance_loss_mlp": 1.02088094, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 1.8097134731277782, + "language_loss": 0.59548593, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.6167686, + "num_input_tokens_seen": 318067760, + "step": 14751, + "time_per_iteration": 2.461893081665039 + }, + { + "auxiliary_loss_clip": 0.01107615, + "auxiliary_loss_mlp": 0.0103412, + "balance_loss_clip": 1.03616619, + "balance_loss_mlp": 1.02163911, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 2.1281039549885787, + "language_loss": 0.81016606, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.8315835, + "num_input_tokens_seen": 318082785, + "step": 14752, + "time_per_iteration": 2.384307861328125 + }, + { + "auxiliary_loss_clip": 0.01090249, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.03707242, + "balance_loss_mlp": 1.01938057, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 2.5367857286030713, + "language_loss": 0.80096, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.82219172, + "num_input_tokens_seen": 318101925, + "step": 14753, + "time_per_iteration": 2.495453357696533 + }, + { + "auxiliary_loss_clip": 0.01105872, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.03520513, + "balance_loss_mlp": 1.02053547, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 1.5589593916629156, + "language_loss": 0.65331584, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.674698, + "num_input_tokens_seen": 318119945, + "step": 14754, + "time_per_iteration": 2.393505811691284 + }, + { + "auxiliary_loss_clip": 0.01110878, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.03812897, + "balance_loss_mlp": 1.01854384, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 1.8492003415452634, + "language_loss": 0.74619603, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.76761508, + "num_input_tokens_seen": 318139685, + "step": 14755, + "time_per_iteration": 4.010523796081543 + }, + { + "auxiliary_loss_clip": 0.01086412, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.0319922, + "balance_loss_mlp": 1.02024889, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 1.5562491579640605, + "language_loss": 0.77862394, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.79982471, + "num_input_tokens_seen": 318160375, + "step": 14756, + "time_per_iteration": 3.7500054836273193 + }, + { + "auxiliary_loss_clip": 0.01089059, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.0363034, + "balance_loss_mlp": 1.02335477, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 1.8887529371170866, + "language_loss": 0.76659322, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78783935, + "num_input_tokens_seen": 318177995, + "step": 14757, + "time_per_iteration": 2.4606540203094482 + }, + { + "auxiliary_loss_clip": 0.01051162, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.03110909, + "balance_loss_mlp": 1.02069902, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 1.8517859153250482, + "language_loss": 0.68226397, + "learning_rate": 1.316993656021632e-07, + "loss": 0.70310414, + "num_input_tokens_seen": 318197030, + "step": 14758, + "time_per_iteration": 2.6658477783203125 + }, + { + "auxiliary_loss_clip": 0.01108345, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.0367527, + "balance_loss_mlp": 1.0207907, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 1.6623141215844908, + "language_loss": 0.68954021, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.7109673, + "num_input_tokens_seen": 318221780, + "step": 14759, + "time_per_iteration": 2.729095697402954 + }, + { + "auxiliary_loss_clip": 0.01104519, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.03314459, + "balance_loss_mlp": 1.01987779, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 2.470297076440363, + "language_loss": 0.74655104, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76791829, + "num_input_tokens_seen": 318239710, + "step": 14760, + "time_per_iteration": 2.430783748626709 + }, + { + "auxiliary_loss_clip": 0.01090376, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.02305675, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 2.380782033247273, + "language_loss": 0.75927496, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.78053814, + "num_input_tokens_seen": 318257425, + "step": 14761, + "time_per_iteration": 2.5107550621032715 + }, + { + "auxiliary_loss_clip": 0.01110248, + "auxiliary_loss_mlp": 0.01038726, + "balance_loss_clip": 1.03633428, + "balance_loss_mlp": 1.02629828, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 1.6158096780439135, + "language_loss": 0.61303127, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.63452101, + "num_input_tokens_seen": 318278485, + "step": 14762, + "time_per_iteration": 2.498549699783325 + }, + { + "auxiliary_loss_clip": 0.01095847, + "auxiliary_loss_mlp": 0.01035322, + "balance_loss_clip": 1.03378725, + "balance_loss_mlp": 1.02176797, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 1.7554053428089091, + "language_loss": 0.64069659, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.66200823, + "num_input_tokens_seen": 318297560, + "step": 14763, + "time_per_iteration": 2.4570138454437256 + }, + { + "auxiliary_loss_clip": 0.01083982, + "auxiliary_loss_mlp": 0.00778228, + "balance_loss_clip": 1.03720689, + "balance_loss_mlp": 1.00066781, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 2.0363662285825703, + "language_loss": 0.71440846, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.73303056, + "num_input_tokens_seen": 318313060, + "step": 14764, + "time_per_iteration": 2.482483148574829 + }, + { + "auxiliary_loss_clip": 0.01112327, + "auxiliary_loss_mlp": 0.0103498, + "balance_loss_clip": 1.0369041, + "balance_loss_mlp": 1.02279639, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 2.2805852500523094, + "language_loss": 0.65861762, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68009067, + "num_input_tokens_seen": 318332030, + "step": 14765, + "time_per_iteration": 2.424703598022461 + }, + { + "auxiliary_loss_clip": 0.01069453, + "auxiliary_loss_mlp": 0.0102565, + "balance_loss_clip": 1.03679228, + "balance_loss_mlp": 1.01497436, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 1.890115584525155, + "language_loss": 0.76408768, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78503877, + "num_input_tokens_seen": 318351090, + "step": 14766, + "time_per_iteration": 2.5646462440490723 + }, + { + "auxiliary_loss_clip": 0.01077672, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.03301001, + "balance_loss_mlp": 1.02041376, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 1.9331697878953504, + "language_loss": 0.72838575, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.74948573, + "num_input_tokens_seen": 318372000, + "step": 14767, + "time_per_iteration": 2.489440441131592 + }, + { + "auxiliary_loss_clip": 0.01103901, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.03518891, + "balance_loss_mlp": 1.02021015, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 2.083226557708881, + "language_loss": 0.70825946, + "learning_rate": 1.303129987538778e-07, + "loss": 0.7296164, + "num_input_tokens_seen": 318391530, + "step": 14768, + "time_per_iteration": 2.4451122283935547 + }, + { + "auxiliary_loss_clip": 0.01095073, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.03467083, + "balance_loss_mlp": 1.0187248, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 1.666312132326374, + "language_loss": 0.70079941, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.72205597, + "num_input_tokens_seen": 318410690, + "step": 14769, + "time_per_iteration": 3.8369364738464355 + }, + { + "auxiliary_loss_clip": 0.01081517, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.03348351, + "balance_loss_mlp": 1.01877868, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 2.201043536224922, + "language_loss": 0.67198968, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69310606, + "num_input_tokens_seen": 318427380, + "step": 14770, + "time_per_iteration": 2.454843521118164 + }, + { + "auxiliary_loss_clip": 0.01094106, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.03665984, + "balance_loss_mlp": 1.02015042, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 1.7991052758510984, + "language_loss": 0.65218759, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.67344862, + "num_input_tokens_seen": 318448530, + "step": 14771, + "time_per_iteration": 2.490727186203003 + }, + { + "auxiliary_loss_clip": 0.010855, + "auxiliary_loss_mlp": 0.01026139, + "balance_loss_clip": 1.03423107, + "balance_loss_mlp": 1.01463556, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 1.451412228706949, + "language_loss": 0.8236829, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84479928, + "num_input_tokens_seen": 318468655, + "step": 14772, + "time_per_iteration": 2.5480825901031494 + }, + { + "auxiliary_loss_clip": 0.01082528, + "auxiliary_loss_mlp": 0.01025565, + "balance_loss_clip": 1.03215742, + "balance_loss_mlp": 1.01437747, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 1.5871846179495608, + "language_loss": 0.76267099, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78375196, + "num_input_tokens_seen": 318488740, + "step": 14773, + "time_per_iteration": 2.541348457336426 + }, + { + "auxiliary_loss_clip": 0.0108335, + "auxiliary_loss_mlp": 0.01027781, + "balance_loss_clip": 1.0346067, + "balance_loss_mlp": 1.01647377, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 1.7189341621761367, + "language_loss": 0.75172818, + "learning_rate": 1.294845814469907e-07, + "loss": 0.77283949, + "num_input_tokens_seen": 318508810, + "step": 14774, + "time_per_iteration": 2.5437023639678955 + }, + { + "auxiliary_loss_clip": 0.01063995, + "auxiliary_loss_mlp": 0.00776838, + "balance_loss_clip": 1.03702235, + "balance_loss_mlp": 1.00058687, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 2.484059975095638, + "language_loss": 0.71864104, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.7370494, + "num_input_tokens_seen": 318526860, + "step": 14775, + "time_per_iteration": 2.5810375213623047 + }, + { + "auxiliary_loss_clip": 0.01107117, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.03621817, + "balance_loss_mlp": 1.0193727, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 1.7923482302338505, + "language_loss": 0.80442357, + "learning_rate": 1.292090097299432e-07, + "loss": 0.82580709, + "num_input_tokens_seen": 318545180, + "step": 14776, + "time_per_iteration": 2.4140713214874268 + }, + { + "auxiliary_loss_clip": 0.01103539, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.03574979, + "balance_loss_mlp": 1.01832354, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 5.812339038284071, + "language_loss": 0.69508719, + "learning_rate": 1.290713302796802e-07, + "loss": 0.71643233, + "num_input_tokens_seen": 318564350, + "step": 14777, + "time_per_iteration": 2.501572370529175 + }, + { + "auxiliary_loss_clip": 0.0109212, + "auxiliary_loss_mlp": 0.01036075, + "balance_loss_clip": 1.03082633, + "balance_loss_mlp": 1.02381396, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 1.724126854175723, + "language_loss": 0.70214081, + "learning_rate": 1.2893372177522e-07, + "loss": 0.72342277, + "num_input_tokens_seen": 318582275, + "step": 14778, + "time_per_iteration": 2.4322216510772705 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.03581572, + "balance_loss_mlp": 1.01898587, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 1.8669111385294992, + "language_loss": 0.77610672, + "learning_rate": 1.287961842217804e-07, + "loss": 0.79749072, + "num_input_tokens_seen": 318601230, + "step": 14779, + "time_per_iteration": 2.4108774662017822 + }, + { + "auxiliary_loss_clip": 0.01016192, + "auxiliary_loss_mlp": 0.01002031, + "balance_loss_clip": 1.01722705, + "balance_loss_mlp": 1.00096381, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.878569927239539, + "language_loss": 0.56786031, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58804256, + "num_input_tokens_seen": 318645595, + "step": 14780, + "time_per_iteration": 2.876652240753174 + }, + { + "auxiliary_loss_clip": 0.01026094, + "auxiliary_loss_mlp": 0.01001925, + "balance_loss_clip": 1.00287795, + "balance_loss_mlp": 1.00067317, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.7902853194831266, + "language_loss": 0.62395716, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64423728, + "num_input_tokens_seen": 318707850, + "step": 14781, + "time_per_iteration": 3.1015355587005615 + }, + { + "auxiliary_loss_clip": 0.00982608, + "auxiliary_loss_mlp": 0.01006192, + "balance_loss_clip": 1.02721667, + "balance_loss_mlp": 1.00511289, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.7839949865043319, + "language_loss": 0.58144808, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60133606, + "num_input_tokens_seen": 318764915, + "step": 14782, + "time_per_iteration": 3.252070903778076 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.03678226, + "balance_loss_mlp": 1.02018452, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 1.5439759386477572, + "language_loss": 0.65715861, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.67854011, + "num_input_tokens_seen": 318785660, + "step": 14783, + "time_per_iteration": 4.454373359680176 + }, + { + "auxiliary_loss_clip": 0.01113438, + "auxiliary_loss_mlp": 0.01036821, + "balance_loss_clip": 1.03798676, + "balance_loss_mlp": 1.02399457, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 1.6225353998287866, + "language_loss": 0.77581805, + "learning_rate": 1.281095609023415e-07, + "loss": 0.79732066, + "num_input_tokens_seen": 318806080, + "step": 14784, + "time_per_iteration": 2.5331709384918213 + }, + { + "auxiliary_loss_clip": 0.01083568, + "auxiliary_loss_mlp": 0.01035188, + "balance_loss_clip": 1.03515649, + "balance_loss_mlp": 1.02220035, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 2.5437856461734873, + "language_loss": 0.60167003, + "learning_rate": 1.279724491644565e-07, + "loss": 0.62285757, + "num_input_tokens_seen": 318826445, + "step": 14785, + "time_per_iteration": 2.564931869506836 + }, + { + "auxiliary_loss_clip": 0.01075871, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.03602982, + "balance_loss_mlp": 1.02031147, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 2.1237798478607517, + "language_loss": 0.65061951, + "learning_rate": 1.278354084140445e-07, + "loss": 0.67170733, + "num_input_tokens_seen": 318843915, + "step": 14786, + "time_per_iteration": 2.526183605194092 + }, + { + "auxiliary_loss_clip": 0.01081908, + "auxiliary_loss_mlp": 0.00779881, + "balance_loss_clip": 1.03995574, + "balance_loss_mlp": 1.00064111, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 2.537062557087518, + "language_loss": 0.85484099, + "learning_rate": 1.276984386563009e-07, + "loss": 0.8734588, + "num_input_tokens_seen": 318859670, + "step": 14787, + "time_per_iteration": 2.5310580730438232 + }, + { + "auxiliary_loss_clip": 0.01083189, + "auxiliary_loss_mlp": 0.01029383, + "balance_loss_clip": 1.03597307, + "balance_loss_mlp": 1.01710463, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 1.7923874072648043, + "language_loss": 0.70998174, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.73110747, + "num_input_tokens_seen": 318877855, + "step": 14788, + "time_per_iteration": 2.4910173416137695 + }, + { + "auxiliary_loss_clip": 0.01105328, + "auxiliary_loss_mlp": 0.01032499, + "balance_loss_clip": 1.03641939, + "balance_loss_mlp": 1.0213052, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 1.78269279774711, + "language_loss": 0.70061886, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72199714, + "num_input_tokens_seen": 318896045, + "step": 14789, + "time_per_iteration": 2.434948205947876 + }, + { + "auxiliary_loss_clip": 0.01100049, + "auxiliary_loss_mlp": 0.01026057, + "balance_loss_clip": 1.03714108, + "balance_loss_mlp": 1.01369476, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 1.526106888229676, + "language_loss": 0.70678842, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72804946, + "num_input_tokens_seen": 318915515, + "step": 14790, + "time_per_iteration": 2.49438738822937 + }, + { + "auxiliary_loss_clip": 0.01087649, + "auxiliary_loss_mlp": 0.01026621, + "balance_loss_clip": 1.0362463, + "balance_loss_mlp": 1.01569593, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 1.847597525844274, + "language_loss": 0.72864181, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.74978447, + "num_input_tokens_seen": 318934305, + "step": 14791, + "time_per_iteration": 2.5023603439331055 + }, + { + "auxiliary_loss_clip": 0.01079006, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.03571689, + "balance_loss_mlp": 1.02177954, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 1.4040397461062795, + "language_loss": 0.73642588, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.7575537, + "num_input_tokens_seen": 318953880, + "step": 14792, + "time_per_iteration": 2.5759665966033936 + }, + { + "auxiliary_loss_clip": 0.01041949, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.03184676, + "balance_loss_mlp": 1.01914096, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 2.2382447010177766, + "language_loss": 0.66312683, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.68387312, + "num_input_tokens_seen": 318971395, + "step": 14793, + "time_per_iteration": 2.650961399078369 + }, + { + "auxiliary_loss_clip": 0.01080404, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_clip": 1.03608453, + "balance_loss_mlp": 1.0200609, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 1.5349892778950218, + "language_loss": 0.715469, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.73660147, + "num_input_tokens_seen": 318990580, + "step": 14794, + "time_per_iteration": 4.2693188190460205 + }, + { + "auxiliary_loss_clip": 0.01102886, + "auxiliary_loss_mlp": 0.01031887, + "balance_loss_clip": 1.0417918, + "balance_loss_mlp": 1.01882815, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 2.0375472270553403, + "language_loss": 0.75640738, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77775502, + "num_input_tokens_seen": 319010040, + "step": 14795, + "time_per_iteration": 2.5630152225494385 + }, + { + "auxiliary_loss_clip": 0.01003081, + "auxiliary_loss_mlp": 0.01000975, + "balance_loss_clip": 1.00802946, + "balance_loss_mlp": 0.99974108, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.7658733217389481, + "language_loss": 0.56136274, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58140337, + "num_input_tokens_seen": 319063860, + "step": 14796, + "time_per_iteration": 4.205257415771484 + }, + { + "auxiliary_loss_clip": 0.011105, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.03708339, + "balance_loss_mlp": 1.01822376, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 2.007024068757125, + "language_loss": 0.70121658, + "learning_rate": 1.263326468169843e-07, + "loss": 0.72264552, + "num_input_tokens_seen": 319082335, + "step": 14797, + "time_per_iteration": 2.451362371444702 + }, + { + "auxiliary_loss_clip": 0.0101394, + "auxiliary_loss_mlp": 0.01002232, + "balance_loss_clip": 1.00799894, + "balance_loss_mlp": 1.00090885, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 0.7476094368309188, + "language_loss": 0.57936907, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.59953082, + "num_input_tokens_seen": 319147075, + "step": 14798, + "time_per_iteration": 3.1468605995178223 + }, + { + "auxiliary_loss_clip": 0.01098084, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.03507829, + "balance_loss_mlp": 1.01659977, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 1.762397781873083, + "language_loss": 0.7923438, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.8136192, + "num_input_tokens_seen": 319166630, + "step": 14799, + "time_per_iteration": 2.4814302921295166 + }, + { + "auxiliary_loss_clip": 0.01017498, + "auxiliary_loss_mlp": 0.01001418, + "balance_loss_clip": 1.00380552, + "balance_loss_mlp": 1.00030351, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.8797120145295099, + "language_loss": 0.58108294, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60127211, + "num_input_tokens_seen": 319221865, + "step": 14800, + "time_per_iteration": 2.967132091522217 + }, + { + "auxiliary_loss_clip": 0.01099736, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.03889287, + "balance_loss_mlp": 1.02095318, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 1.7771186482779244, + "language_loss": 0.66164082, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68296158, + "num_input_tokens_seen": 319240710, + "step": 14801, + "time_per_iteration": 2.4766340255737305 + }, + { + "auxiliary_loss_clip": 0.01076463, + "auxiliary_loss_mlp": 0.01039335, + "balance_loss_clip": 1.03640687, + "balance_loss_mlp": 1.02484512, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 2.3268275673753607, + "language_loss": 0.7542845, + "learning_rate": 1.256524149358682e-07, + "loss": 0.77544248, + "num_input_tokens_seen": 319256495, + "step": 14802, + "time_per_iteration": 2.50508975982666 + }, + { + "auxiliary_loss_clip": 0.01095894, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.04010653, + "balance_loss_mlp": 1.0192678, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 1.7829854976395845, + "language_loss": 0.73823225, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.75949967, + "num_input_tokens_seen": 319273620, + "step": 14803, + "time_per_iteration": 2.4828906059265137 + }, + { + "auxiliary_loss_clip": 0.01080675, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.03492379, + "balance_loss_mlp": 1.01973307, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 2.1397562663103513, + "language_loss": 0.71779525, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.73892212, + "num_input_tokens_seen": 319291720, + "step": 14804, + "time_per_iteration": 2.5054383277893066 + }, + { + "auxiliary_loss_clip": 0.01097372, + "auxiliary_loss_mlp": 0.01031787, + "balance_loss_clip": 1.03951287, + "balance_loss_mlp": 1.01920497, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 1.6497150290400373, + "language_loss": 0.8120625, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83335412, + "num_input_tokens_seen": 319310380, + "step": 14805, + "time_per_iteration": 2.492304563522339 + }, + { + "auxiliary_loss_clip": 0.01099669, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.03667259, + "balance_loss_mlp": 1.01962137, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 1.7974501354213852, + "language_loss": 0.67103314, + "learning_rate": 1.251095087580505e-07, + "loss": 0.6923492, + "num_input_tokens_seen": 319331765, + "step": 14806, + "time_per_iteration": 2.5639021396636963 + }, + { + "auxiliary_loss_clip": 0.01084742, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.03353882, + "balance_loss_mlp": 1.01910484, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 2.583125526845808, + "language_loss": 0.66970199, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.69086564, + "num_input_tokens_seen": 319349135, + "step": 14807, + "time_per_iteration": 2.462923049926758 + }, + { + "auxiliary_loss_clip": 0.01083144, + "auxiliary_loss_mlp": 0.0102602, + "balance_loss_clip": 1.0344851, + "balance_loss_mlp": 1.01487386, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 1.7050170730465546, + "language_loss": 0.75364852, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77474016, + "num_input_tokens_seen": 319368410, + "step": 14808, + "time_per_iteration": 4.045866012573242 + }, + { + "auxiliary_loss_clip": 0.01074936, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.03489757, + "balance_loss_mlp": 1.02028048, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 2.702375056615278, + "language_loss": 0.81357646, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83464277, + "num_input_tokens_seen": 319387535, + "step": 14809, + "time_per_iteration": 2.5393736362457275 + }, + { + "auxiliary_loss_clip": 0.01097838, + "auxiliary_loss_mlp": 0.01030759, + "balance_loss_clip": 1.0349896, + "balance_loss_mlp": 1.01911259, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 1.8656449916318656, + "language_loss": 0.6877166, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.70900261, + "num_input_tokens_seen": 319407210, + "step": 14810, + "time_per_iteration": 2.486971616744995 + }, + { + "auxiliary_loss_clip": 0.01077499, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.0346272, + "balance_loss_mlp": 1.01411796, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 1.9686576971129308, + "language_loss": 0.70201659, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.72306061, + "num_input_tokens_seen": 319425340, + "step": 14811, + "time_per_iteration": 2.5374045372009277 + }, + { + "auxiliary_loss_clip": 0.01078874, + "auxiliary_loss_mlp": 0.00778093, + "balance_loss_clip": 1.03600001, + "balance_loss_mlp": 1.00058973, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 2.3518835680889354, + "language_loss": 0.6546582, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67322779, + "num_input_tokens_seen": 319448150, + "step": 14812, + "time_per_iteration": 2.812023878097534 + }, + { + "auxiliary_loss_clip": 0.01067514, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.04199994, + "balance_loss_mlp": 1.02067351, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 1.7871454554149453, + "language_loss": 0.68657422, + "learning_rate": 1.24162160341861e-07, + "loss": 0.70756781, + "num_input_tokens_seen": 319466115, + "step": 14813, + "time_per_iteration": 2.576338768005371 + }, + { + "auxiliary_loss_clip": 0.01086819, + "auxiliary_loss_mlp": 0.01037701, + "balance_loss_clip": 1.03171062, + "balance_loss_mlp": 1.0209043, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 2.0819100525697216, + "language_loss": 0.7538929, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77513814, + "num_input_tokens_seen": 319485255, + "step": 14814, + "time_per_iteration": 2.5125064849853516 + }, + { + "auxiliary_loss_clip": 0.01098673, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.03407645, + "balance_loss_mlp": 1.01729918, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 1.9728281700902428, + "language_loss": 0.74308389, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76437384, + "num_input_tokens_seen": 319501800, + "step": 14815, + "time_per_iteration": 2.4748151302337646 + }, + { + "auxiliary_loss_clip": 0.01070874, + "auxiliary_loss_mlp": 0.01032356, + "balance_loss_clip": 1.03155518, + "balance_loss_mlp": 1.02013063, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 1.7721359075216625, + "language_loss": 0.75139624, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77242851, + "num_input_tokens_seen": 319520415, + "step": 14816, + "time_per_iteration": 2.555755853652954 + }, + { + "auxiliary_loss_clip": 0.0108896, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.03485417, + "balance_loss_mlp": 1.0158155, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 2.421148230902451, + "language_loss": 0.77913117, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.8003034, + "num_input_tokens_seen": 319538410, + "step": 14817, + "time_per_iteration": 2.5135233402252197 + }, + { + "auxiliary_loss_clip": 0.01004829, + "auxiliary_loss_mlp": 0.01001576, + "balance_loss_clip": 1.00847244, + "balance_loss_mlp": 1.00054502, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.7435185735930221, + "language_loss": 0.56499386, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58505785, + "num_input_tokens_seen": 319602565, + "step": 14818, + "time_per_iteration": 3.163846492767334 + }, + { + "auxiliary_loss_clip": 0.01059986, + "auxiliary_loss_mlp": 0.01032962, + "balance_loss_clip": 1.03621936, + "balance_loss_mlp": 1.0206058, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 1.6970864045766678, + "language_loss": 0.64397025, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66489971, + "num_input_tokens_seen": 319624645, + "step": 14819, + "time_per_iteration": 2.673257350921631 + }, + { + "auxiliary_loss_clip": 0.01096804, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.03498125, + "balance_loss_mlp": 1.01772642, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 1.7840539190564844, + "language_loss": 0.78605437, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80732644, + "num_input_tokens_seen": 319644040, + "step": 14820, + "time_per_iteration": 2.5057075023651123 + }, + { + "auxiliary_loss_clip": 0.0107097, + "auxiliary_loss_mlp": 0.00776511, + "balance_loss_clip": 1.03548694, + "balance_loss_mlp": 1.00059617, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 1.917761618476086, + "language_loss": 0.76623094, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78470576, + "num_input_tokens_seen": 319663930, + "step": 14821, + "time_per_iteration": 2.570371389389038 + }, + { + "auxiliary_loss_clip": 0.01025151, + "auxiliary_loss_mlp": 0.00753673, + "balance_loss_clip": 1.01628828, + "balance_loss_mlp": 1.00007391, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 1.5472182204942053, + "language_loss": 0.59357166, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61135989, + "num_input_tokens_seen": 319721245, + "step": 14822, + "time_per_iteration": 4.392891883850098 + }, + { + "auxiliary_loss_clip": 0.01093207, + "auxiliary_loss_mlp": 0.01030515, + "balance_loss_clip": 1.03667974, + "balance_loss_mlp": 1.01783144, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 1.8136273870330102, + "language_loss": 0.69316781, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.714405, + "num_input_tokens_seen": 319741200, + "step": 14823, + "time_per_iteration": 2.4945993423461914 + }, + { + "auxiliary_loss_clip": 0.01090483, + "auxiliary_loss_mlp": 0.01033499, + "balance_loss_clip": 1.03345919, + "balance_loss_mlp": 1.02058828, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 1.7475756098882378, + "language_loss": 0.69360584, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.7148456, + "num_input_tokens_seen": 319759265, + "step": 14824, + "time_per_iteration": 2.454730749130249 + }, + { + "auxiliary_loss_clip": 0.01060595, + "auxiliary_loss_mlp": 0.01040664, + "balance_loss_clip": 1.03421724, + "balance_loss_mlp": 1.02654922, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 1.7691604330216186, + "language_loss": 0.70206141, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72307396, + "num_input_tokens_seen": 319777560, + "step": 14825, + "time_per_iteration": 2.592150926589966 + }, + { + "auxiliary_loss_clip": 0.01085653, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.03329611, + "balance_loss_mlp": 1.01866913, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 1.914816363347178, + "language_loss": 0.71481991, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.73599029, + "num_input_tokens_seen": 319794125, + "step": 14826, + "time_per_iteration": 2.4622788429260254 + }, + { + "auxiliary_loss_clip": 0.01096476, + "auxiliary_loss_mlp": 0.01028403, + "balance_loss_clip": 1.03660512, + "balance_loss_mlp": 1.01647019, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 2.2179394021002206, + "language_loss": 0.75134563, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.77259439, + "num_input_tokens_seen": 319810310, + "step": 14827, + "time_per_iteration": 2.447829008102417 + }, + { + "auxiliary_loss_clip": 0.0109847, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.03587222, + "balance_loss_mlp": 1.02054989, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 1.9124376110116899, + "language_loss": 0.78639162, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80770421, + "num_input_tokens_seen": 319828505, + "step": 14828, + "time_per_iteration": 2.4619390964508057 + }, + { + "auxiliary_loss_clip": 0.01068573, + "auxiliary_loss_mlp": 0.01033164, + "balance_loss_clip": 1.03486919, + "balance_loss_mlp": 1.0204916, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 19.317265750355613, + "language_loss": 0.75315064, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77416801, + "num_input_tokens_seen": 319848680, + "step": 14829, + "time_per_iteration": 2.5738844871520996 + }, + { + "auxiliary_loss_clip": 0.01108218, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.03569603, + "balance_loss_mlp": 1.02376699, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 1.5803286785580541, + "language_loss": 0.84601152, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.8674463, + "num_input_tokens_seen": 319868835, + "step": 14830, + "time_per_iteration": 2.486201286315918 + }, + { + "auxiliary_loss_clip": 0.01094388, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.03532171, + "balance_loss_mlp": 1.01811886, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 1.3312712507170843, + "language_loss": 0.74835432, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.76959074, + "num_input_tokens_seen": 319891585, + "step": 14831, + "time_per_iteration": 2.5489277839660645 + }, + { + "auxiliary_loss_clip": 0.01100201, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.03513122, + "balance_loss_mlp": 1.01315379, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 1.8297219239960831, + "language_loss": 0.73074079, + "learning_rate": 1.216083607088847e-07, + "loss": 0.75200009, + "num_input_tokens_seen": 319910315, + "step": 14832, + "time_per_iteration": 2.4610464572906494 + }, + { + "auxiliary_loss_clip": 0.01049684, + "auxiliary_loss_mlp": 0.00778404, + "balance_loss_clip": 1.03326178, + "balance_loss_mlp": 1.00059271, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 2.027160686294132, + "language_loss": 0.67214847, + "learning_rate": 1.214746621848355e-07, + "loss": 0.69042939, + "num_input_tokens_seen": 319932275, + "step": 14833, + "time_per_iteration": 2.656564712524414 + }, + { + "auxiliary_loss_clip": 0.01105553, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.03831708, + "balance_loss_mlp": 1.02128005, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 2.094104784098938, + "language_loss": 0.74320793, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.76460993, + "num_input_tokens_seen": 319955335, + "step": 14834, + "time_per_iteration": 4.02483057975769 + }, + { + "auxiliary_loss_clip": 0.01065763, + "auxiliary_loss_mlp": 0.01035403, + "balance_loss_clip": 1.03362441, + "balance_loss_mlp": 1.0233686, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 2.347043056851757, + "language_loss": 0.79163802, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.81264961, + "num_input_tokens_seen": 319973990, + "step": 14835, + "time_per_iteration": 3.8680624961853027 + }, + { + "auxiliary_loss_clip": 0.01102671, + "auxiliary_loss_mlp": 0.01027051, + "balance_loss_clip": 1.03313172, + "balance_loss_mlp": 1.01560152, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 1.861887549714212, + "language_loss": 0.73861158, + "learning_rate": 1.210739940361689e-07, + "loss": 0.7599088, + "num_input_tokens_seen": 319995555, + "step": 14836, + "time_per_iteration": 2.51413631439209 + }, + { + "auxiliary_loss_clip": 0.01087597, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.03482437, + "balance_loss_mlp": 1.01740122, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 2.065686122398588, + "language_loss": 0.68613958, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.70731533, + "num_input_tokens_seen": 320012385, + "step": 14837, + "time_per_iteration": 2.462146282196045 + }, + { + "auxiliary_loss_clip": 0.01053588, + "auxiliary_loss_mlp": 0.01029365, + "balance_loss_clip": 1.03434432, + "balance_loss_mlp": 1.0169909, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 1.7423706089799296, + "language_loss": 0.67651498, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.69734454, + "num_input_tokens_seen": 320032390, + "step": 14838, + "time_per_iteration": 2.6190879344940186 + }, + { + "auxiliary_loss_clip": 0.01097507, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.03341484, + "balance_loss_mlp": 1.0171597, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 2.1887529848052543, + "language_loss": 0.76546109, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78673583, + "num_input_tokens_seen": 320052885, + "step": 14839, + "time_per_iteration": 2.473830461502075 + }, + { + "auxiliary_loss_clip": 0.01001717, + "auxiliary_loss_mlp": 0.0075317, + "balance_loss_clip": 1.00657201, + "balance_loss_mlp": 1.00020301, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.6814534270895487, + "language_loss": 0.49406672, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51161557, + "num_input_tokens_seen": 320113685, + "step": 14840, + "time_per_iteration": 3.0741281509399414 + }, + { + "auxiliary_loss_clip": 0.01114788, + "auxiliary_loss_mlp": 0.01034874, + "balance_loss_clip": 1.03733289, + "balance_loss_mlp": 1.02080727, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 2.1492483459679077, + "language_loss": 0.64146465, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66296124, + "num_input_tokens_seen": 320130810, + "step": 14841, + "time_per_iteration": 2.4228579998016357 + }, + { + "auxiliary_loss_clip": 0.01073331, + "auxiliary_loss_mlp": 0.00775848, + "balance_loss_clip": 1.03614521, + "balance_loss_mlp": 1.00058603, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 1.5593440338765903, + "language_loss": 0.68110931, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.69960105, + "num_input_tokens_seen": 320152170, + "step": 14842, + "time_per_iteration": 2.588374614715576 + }, + { + "auxiliary_loss_clip": 0.01107471, + "auxiliary_loss_mlp": 0.01031762, + "balance_loss_clip": 1.0373702, + "balance_loss_mlp": 1.0202527, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 2.927010578829348, + "language_loss": 0.80135447, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.82274675, + "num_input_tokens_seen": 320172360, + "step": 14843, + "time_per_iteration": 2.482736110687256 + }, + { + "auxiliary_loss_clip": 0.01085867, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.03383482, + "balance_loss_mlp": 1.02044141, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 1.8273207333528874, + "language_loss": 0.68760121, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.70879608, + "num_input_tokens_seen": 320192130, + "step": 14844, + "time_per_iteration": 2.490746021270752 + }, + { + "auxiliary_loss_clip": 0.01067464, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.03407395, + "balance_loss_mlp": 1.02094245, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 2.491601255420414, + "language_loss": 0.91382575, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.93483448, + "num_input_tokens_seen": 320207760, + "step": 14845, + "time_per_iteration": 2.5287816524505615 + }, + { + "auxiliary_loss_clip": 0.01090625, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.03429222, + "balance_loss_mlp": 1.02010977, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 1.9860291510174022, + "language_loss": 0.71764284, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.73886579, + "num_input_tokens_seen": 320225325, + "step": 14846, + "time_per_iteration": 2.462266445159912 + }, + { + "auxiliary_loss_clip": 0.01085765, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.03991795, + "balance_loss_mlp": 1.01885486, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 1.609478903437939, + "language_loss": 0.56935513, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.59052038, + "num_input_tokens_seen": 320247645, + "step": 14847, + "time_per_iteration": 4.297422409057617 + }, + { + "auxiliary_loss_clip": 0.01069425, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.03371382, + "balance_loss_mlp": 1.02224803, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 2.3908204781362, + "language_loss": 0.76456082, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78559226, + "num_input_tokens_seen": 320266005, + "step": 14848, + "time_per_iteration": 2.539344310760498 + }, + { + "auxiliary_loss_clip": 0.01046637, + "auxiliary_loss_mlp": 0.01039769, + "balance_loss_clip": 1.03012383, + "balance_loss_mlp": 1.02630436, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 1.7158244865747372, + "language_loss": 0.69047511, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.71133912, + "num_input_tokens_seen": 320285555, + "step": 14849, + "time_per_iteration": 2.6723995208740234 + }, + { + "auxiliary_loss_clip": 0.01100763, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.03972065, + "balance_loss_mlp": 1.0220238, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 1.8181840413166015, + "language_loss": 0.80713409, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.82848275, + "num_input_tokens_seen": 320305395, + "step": 14850, + "time_per_iteration": 2.4993064403533936 + }, + { + "auxiliary_loss_clip": 0.01087973, + "auxiliary_loss_mlp": 0.01034611, + "balance_loss_clip": 1.0352273, + "balance_loss_mlp": 1.02284467, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 1.949184268699433, + "language_loss": 0.74775106, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.76897687, + "num_input_tokens_seen": 320324220, + "step": 14851, + "time_per_iteration": 2.531210422515869 + }, + { + "auxiliary_loss_clip": 0.01083387, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.03425229, + "balance_loss_mlp": 1.0195483, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 1.5325690752703973, + "language_loss": 0.78651655, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.80767024, + "num_input_tokens_seen": 320347195, + "step": 14852, + "time_per_iteration": 2.578433036804199 + }, + { + "auxiliary_loss_clip": 0.01091516, + "auxiliary_loss_mlp": 0.01036886, + "balance_loss_clip": 1.03784239, + "balance_loss_mlp": 1.02511382, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 1.4615527057706668, + "language_loss": 0.69300818, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71429217, + "num_input_tokens_seen": 320366850, + "step": 14853, + "time_per_iteration": 2.4891836643218994 + }, + { + "auxiliary_loss_clip": 0.01064654, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.03982258, + "balance_loss_mlp": 1.02254391, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 1.7247860262903871, + "language_loss": 0.66914499, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.6901437, + "num_input_tokens_seen": 320388895, + "step": 14854, + "time_per_iteration": 2.702504873275757 + }, + { + "auxiliary_loss_clip": 0.01081999, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.03251362, + "balance_loss_mlp": 1.022928, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 1.8836562590065744, + "language_loss": 0.74496883, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.76614487, + "num_input_tokens_seen": 320408520, + "step": 14855, + "time_per_iteration": 2.5168821811676025 + }, + { + "auxiliary_loss_clip": 0.01084155, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.03442264, + "balance_loss_mlp": 1.01796925, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 3.3266733343304473, + "language_loss": 0.64508915, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.66622531, + "num_input_tokens_seen": 320427400, + "step": 14856, + "time_per_iteration": 2.5931830406188965 + }, + { + "auxiliary_loss_clip": 0.01106707, + "auxiliary_loss_mlp": 0.01028524, + "balance_loss_clip": 1.03486395, + "balance_loss_mlp": 1.01668024, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 1.5660622462113873, + "language_loss": 0.66505104, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68640339, + "num_input_tokens_seen": 320447570, + "step": 14857, + "time_per_iteration": 2.4553561210632324 + }, + { + "auxiliary_loss_clip": 0.01068158, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.03689277, + "balance_loss_mlp": 1.02135301, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 2.686837958691147, + "language_loss": 0.75498426, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.77601045, + "num_input_tokens_seen": 320464405, + "step": 14858, + "time_per_iteration": 2.6232399940490723 + }, + { + "auxiliary_loss_clip": 0.01096681, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.03565311, + "balance_loss_mlp": 1.01853919, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 1.5105441281598695, + "language_loss": 0.69247335, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71375442, + "num_input_tokens_seen": 320485525, + "step": 14859, + "time_per_iteration": 2.5225181579589844 + }, + { + "auxiliary_loss_clip": 0.01060325, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.03696752, + "balance_loss_mlp": 1.01862776, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 1.9465670537060102, + "language_loss": 0.75832397, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.77922666, + "num_input_tokens_seen": 320506725, + "step": 14860, + "time_per_iteration": 2.6149027347564697 + }, + { + "auxiliary_loss_clip": 0.010885, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.03505576, + "balance_loss_mlp": 1.01907969, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 1.7489910737697505, + "language_loss": 0.57524526, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.59646094, + "num_input_tokens_seen": 320525425, + "step": 14861, + "time_per_iteration": 2.5188262462615967 + }, + { + "auxiliary_loss_clip": 0.01081192, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.03507102, + "balance_loss_mlp": 1.02142119, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 1.748174192146682, + "language_loss": 0.63731241, + "learning_rate": 1.176284122190685e-07, + "loss": 0.65845978, + "num_input_tokens_seen": 320543010, + "step": 14862, + "time_per_iteration": 3.9365711212158203 + }, + { + "auxiliary_loss_clip": 0.01093678, + "auxiliary_loss_mlp": 0.01030646, + "balance_loss_clip": 1.03389835, + "balance_loss_mlp": 1.01858854, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 1.659359546900576, + "language_loss": 0.78226513, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.8035084, + "num_input_tokens_seen": 320562180, + "step": 14863, + "time_per_iteration": 2.4890356063842773 + }, + { + "auxiliary_loss_clip": 0.01080266, + "auxiliary_loss_mlp": 0.01033493, + "balance_loss_clip": 1.03195751, + "balance_loss_mlp": 1.02166748, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 2.064385407088972, + "language_loss": 0.71110439, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.73224193, + "num_input_tokens_seen": 320580395, + "step": 14864, + "time_per_iteration": 2.5002081394195557 + }, + { + "auxiliary_loss_clip": 0.01104607, + "auxiliary_loss_mlp": 0.01038202, + "balance_loss_clip": 1.03747833, + "balance_loss_mlp": 1.02518392, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 2.466968737904999, + "language_loss": 0.75449312, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.77592117, + "num_input_tokens_seen": 320599505, + "step": 14865, + "time_per_iteration": 2.4518790245056152 + }, + { + "auxiliary_loss_clip": 0.01072354, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.03737414, + "balance_loss_mlp": 1.02096546, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 1.5038960060184854, + "language_loss": 0.71741289, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.73845971, + "num_input_tokens_seen": 320619825, + "step": 14866, + "time_per_iteration": 2.576260566711426 + }, + { + "auxiliary_loss_clip": 0.01103188, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.03938794, + "balance_loss_mlp": 1.01676393, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 4.6694460645116305, + "language_loss": 0.83765584, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.85899061, + "num_input_tokens_seen": 320638515, + "step": 14867, + "time_per_iteration": 2.4934887886047363 + }, + { + "auxiliary_loss_clip": 0.01097587, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.03550315, + "balance_loss_mlp": 1.01704454, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 1.6479003458792008, + "language_loss": 0.8029533, + "learning_rate": 1.168401272009567e-07, + "loss": 0.82420778, + "num_input_tokens_seen": 320659430, + "step": 14868, + "time_per_iteration": 2.529993772506714 + }, + { + "auxiliary_loss_clip": 0.01084535, + "auxiliary_loss_mlp": 0.01029695, + "balance_loss_clip": 1.03660405, + "balance_loss_mlp": 1.01670694, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 2.029584428551673, + "language_loss": 0.77594507, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79708737, + "num_input_tokens_seen": 320679295, + "step": 14869, + "time_per_iteration": 2.5400049686431885 + }, + { + "auxiliary_loss_clip": 0.01095917, + "auxiliary_loss_mlp": 0.00776484, + "balance_loss_clip": 1.03584838, + "balance_loss_mlp": 1.00062847, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 1.9979363030351158, + "language_loss": 0.65913177, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67785579, + "num_input_tokens_seen": 320697535, + "step": 14870, + "time_per_iteration": 2.444439172744751 + }, + { + "auxiliary_loss_clip": 0.00994818, + "auxiliary_loss_mlp": 0.01012208, + "balance_loss_clip": 1.00587225, + "balance_loss_mlp": 1.01070011, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.7951439133398288, + "language_loss": 0.5594632, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.57953346, + "num_input_tokens_seen": 320758635, + "step": 14871, + "time_per_iteration": 3.173024892807007 + }, + { + "auxiliary_loss_clip": 0.01096986, + "auxiliary_loss_mlp": 0.01036115, + "balance_loss_clip": 1.04149365, + "balance_loss_mlp": 1.0246644, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 1.8869176488899235, + "language_loss": 0.76312649, + "learning_rate": 1.16316031981331e-07, + "loss": 0.78445756, + "num_input_tokens_seen": 320777175, + "step": 14872, + "time_per_iteration": 2.4624879360198975 + }, + { + "auxiliary_loss_clip": 0.01094512, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.03728187, + "balance_loss_mlp": 1.01760268, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 1.5243741705566491, + "language_loss": 0.66948843, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.6907177, + "num_input_tokens_seen": 320797670, + "step": 14873, + "time_per_iteration": 3.9891231060028076 + }, + { + "auxiliary_loss_clip": 0.01105521, + "auxiliary_loss_mlp": 0.01035235, + "balance_loss_clip": 1.03587711, + "balance_loss_mlp": 1.02301061, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 1.7145243827725627, + "language_loss": 0.59514821, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.61655575, + "num_input_tokens_seen": 320817410, + "step": 14874, + "time_per_iteration": 2.4902613162994385 + }, + { + "auxiliary_loss_clip": 0.01080489, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.03878474, + "balance_loss_mlp": 1.02013814, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 4.3487999308531835, + "language_loss": 0.75768912, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.7788223, + "num_input_tokens_seen": 320836745, + "step": 14875, + "time_per_iteration": 3.883183002471924 + }, + { + "auxiliary_loss_clip": 0.01079214, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.03458869, + "balance_loss_mlp": 1.01959944, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 2.5405485659281455, + "language_loss": 0.77882272, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79995418, + "num_input_tokens_seen": 320853305, + "step": 14876, + "time_per_iteration": 2.544057607650757 + }, + { + "auxiliary_loss_clip": 0.01096528, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.0359751, + "balance_loss_mlp": 1.01909029, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 2.1259739660658656, + "language_loss": 0.78431988, + "learning_rate": 1.156625201573287e-07, + "loss": 0.80558938, + "num_input_tokens_seen": 320872885, + "step": 14877, + "time_per_iteration": 2.453263759613037 + }, + { + "auxiliary_loss_clip": 0.01061131, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.03169918, + "balance_loss_mlp": 1.02189517, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 1.9624473251748544, + "language_loss": 0.75062197, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77158821, + "num_input_tokens_seen": 320889755, + "step": 14878, + "time_per_iteration": 2.5839035511016846 + }, + { + "auxiliary_loss_clip": 0.01094466, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.03370786, + "balance_loss_mlp": 1.01767075, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 1.7068324170899176, + "language_loss": 0.76265007, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.78391534, + "num_input_tokens_seen": 320907860, + "step": 14879, + "time_per_iteration": 2.4811089038848877 + }, + { + "auxiliary_loss_clip": 0.01079448, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.03743863, + "balance_loss_mlp": 1.01850486, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 2.018026836320782, + "language_loss": 0.74433875, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.76543832, + "num_input_tokens_seen": 320925825, + "step": 14880, + "time_per_iteration": 2.526961088180542 + }, + { + "auxiliary_loss_clip": 0.01092234, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.03391826, + "balance_loss_mlp": 1.01876163, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 1.8066906408096368, + "language_loss": 0.82669955, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.84793758, + "num_input_tokens_seen": 320946165, + "step": 14881, + "time_per_iteration": 2.540952444076538 + }, + { + "auxiliary_loss_clip": 0.01072058, + "auxiliary_loss_mlp": 0.00778132, + "balance_loss_clip": 1.03736329, + "balance_loss_mlp": 1.00057697, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 1.864535207097152, + "language_loss": 0.67498785, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69348973, + "num_input_tokens_seen": 320969330, + "step": 14882, + "time_per_iteration": 2.659738540649414 + }, + { + "auxiliary_loss_clip": 0.01088704, + "auxiliary_loss_mlp": 0.01033473, + "balance_loss_clip": 1.0335089, + "balance_loss_mlp": 1.01978183, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 2.8460242873441413, + "language_loss": 0.74907422, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.77029598, + "num_input_tokens_seen": 320985055, + "step": 14883, + "time_per_iteration": 2.4937613010406494 + }, + { + "auxiliary_loss_clip": 0.01081847, + "auxiliary_loss_mlp": 0.01029506, + "balance_loss_clip": 1.03386807, + "balance_loss_mlp": 1.01738834, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 1.5760706063482168, + "language_loss": 0.72155488, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74266839, + "num_input_tokens_seen": 321004720, + "step": 14884, + "time_per_iteration": 2.548762083053589 + }, + { + "auxiliary_loss_clip": 0.01076971, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.0308764, + "balance_loss_mlp": 1.01882577, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 2.1876561429393364, + "language_loss": 0.75735855, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.7784344, + "num_input_tokens_seen": 321022350, + "step": 14885, + "time_per_iteration": 2.527820110321045 + }, + { + "auxiliary_loss_clip": 0.0108313, + "auxiliary_loss_mlp": 0.01030135, + "balance_loss_clip": 1.03701448, + "balance_loss_mlp": 1.01752901, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 2.191073118672206, + "language_loss": 0.81639457, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.83752722, + "num_input_tokens_seen": 321040450, + "step": 14886, + "time_per_iteration": 2.5077695846557617 + }, + { + "auxiliary_loss_clip": 0.01050242, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.0350045, + "balance_loss_mlp": 1.01860619, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 1.6972076842745678, + "language_loss": 0.6384899, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.65929556, + "num_input_tokens_seen": 321063970, + "step": 14887, + "time_per_iteration": 4.388996839523315 + }, + { + "auxiliary_loss_clip": 0.01088251, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.03323674, + "balance_loss_mlp": 1.02194417, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 1.8114893111455734, + "language_loss": 0.60842824, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.62965661, + "num_input_tokens_seen": 321083840, + "step": 14888, + "time_per_iteration": 2.5147275924682617 + }, + { + "auxiliary_loss_clip": 0.01109172, + "auxiliary_loss_mlp": 0.00777687, + "balance_loss_clip": 1.03565788, + "balance_loss_mlp": 1.00057888, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 1.8564610358262459, + "language_loss": 0.69944483, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.71831346, + "num_input_tokens_seen": 321104165, + "step": 14889, + "time_per_iteration": 2.5132365226745605 + }, + { + "auxiliary_loss_clip": 0.0110106, + "auxiliary_loss_mlp": 0.00779502, + "balance_loss_clip": 1.04189706, + "balance_loss_mlp": 1.00062764, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 2.5980883450677026, + "language_loss": 0.7151159, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.73392153, + "num_input_tokens_seen": 321117290, + "step": 14890, + "time_per_iteration": 2.4363090991973877 + }, + { + "auxiliary_loss_clip": 0.01031674, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.03317213, + "balance_loss_mlp": 1.01835907, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 1.593001889875431, + "language_loss": 0.75712311, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.77775806, + "num_input_tokens_seen": 321137115, + "step": 14891, + "time_per_iteration": 2.758201837539673 + }, + { + "auxiliary_loss_clip": 0.010527, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.03201246, + "balance_loss_mlp": 1.01954365, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 1.8358624963105932, + "language_loss": 0.76535702, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.78620332, + "num_input_tokens_seen": 321154490, + "step": 14892, + "time_per_iteration": 2.6103620529174805 + }, + { + "auxiliary_loss_clip": 0.01093517, + "auxiliary_loss_mlp": 0.01035216, + "balance_loss_clip": 1.03776848, + "balance_loss_mlp": 1.02305055, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 1.545171163384581, + "language_loss": 0.81857955, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.83986688, + "num_input_tokens_seen": 321175625, + "step": 14893, + "time_per_iteration": 2.4979114532470703 + }, + { + "auxiliary_loss_clip": 0.01063407, + "auxiliary_loss_mlp": 0.01035675, + "balance_loss_clip": 1.03343153, + "balance_loss_mlp": 1.02344394, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 1.6573793997166568, + "language_loss": 0.74582601, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.76681685, + "num_input_tokens_seen": 321193895, + "step": 14894, + "time_per_iteration": 2.526426076889038 + }, + { + "auxiliary_loss_clip": 0.01101128, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.04252505, + "balance_loss_mlp": 1.02108002, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 1.9156594033173904, + "language_loss": 0.67107379, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.69242603, + "num_input_tokens_seen": 321211610, + "step": 14895, + "time_per_iteration": 2.471644401550293 + }, + { + "auxiliary_loss_clip": 0.01099707, + "auxiliary_loss_mlp": 0.01029781, + "balance_loss_clip": 1.03741097, + "balance_loss_mlp": 1.01664972, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 1.549583300319043, + "language_loss": 0.67056519, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.69186008, + "num_input_tokens_seen": 321229805, + "step": 14896, + "time_per_iteration": 2.4353864192962646 + }, + { + "auxiliary_loss_clip": 0.01098685, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.03741586, + "balance_loss_mlp": 1.01852727, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 2.167842836597552, + "language_loss": 0.75616729, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.77746165, + "num_input_tokens_seen": 321247165, + "step": 14897, + "time_per_iteration": 2.4650275707244873 + }, + { + "auxiliary_loss_clip": 0.00994154, + "auxiliary_loss_mlp": 0.00754511, + "balance_loss_clip": 1.00581288, + "balance_loss_mlp": 1.00030029, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.7338821989896007, + "language_loss": 0.55292183, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57040846, + "num_input_tokens_seen": 321308425, + "step": 14898, + "time_per_iteration": 3.1635444164276123 + }, + { + "auxiliary_loss_clip": 0.01109267, + "auxiliary_loss_mlp": 0.00778494, + "balance_loss_clip": 1.0369767, + "balance_loss_mlp": 1.00059485, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 1.3983901045797695, + "language_loss": 0.70109487, + "learning_rate": 1.12808298352008e-07, + "loss": 0.71997249, + "num_input_tokens_seen": 321329295, + "step": 14899, + "time_per_iteration": 2.4955575466156006 + }, + { + "auxiliary_loss_clip": 0.01053752, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.03703356, + "balance_loss_mlp": 1.02384698, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 1.8546696743764919, + "language_loss": 0.73990381, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.76081371, + "num_input_tokens_seen": 321347580, + "step": 14900, + "time_per_iteration": 2.6156883239746094 + }, + { + "auxiliary_loss_clip": 0.0099921, + "auxiliary_loss_mlp": 0.01000843, + "balance_loss_clip": 1.01196527, + "balance_loss_mlp": 0.99971646, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.7993781797296822, + "language_loss": 0.6179772, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.63797772, + "num_input_tokens_seen": 321407820, + "step": 14901, + "time_per_iteration": 4.628660678863525 + }, + { + "auxiliary_loss_clip": 0.01099564, + "auxiliary_loss_mlp": 0.01030067, + "balance_loss_clip": 1.03665757, + "balance_loss_mlp": 1.01761603, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 1.6296521047370471, + "language_loss": 0.70802826, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72932458, + "num_input_tokens_seen": 321426745, + "step": 14902, + "time_per_iteration": 2.5085365772247314 + }, + { + "auxiliary_loss_clip": 0.0108468, + "auxiliary_loss_mlp": 0.0102538, + "balance_loss_clip": 1.03852367, + "balance_loss_mlp": 1.01417446, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 1.8193952742779869, + "language_loss": 0.78275937, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.80385995, + "num_input_tokens_seen": 321446165, + "step": 14903, + "time_per_iteration": 2.5486013889312744 + }, + { + "auxiliary_loss_clip": 0.01085803, + "auxiliary_loss_mlp": 0.01036791, + "balance_loss_clip": 1.03507936, + "balance_loss_mlp": 1.02374983, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 1.7787798841437599, + "language_loss": 0.72754508, + "learning_rate": 1.121644401702877e-07, + "loss": 0.74877101, + "num_input_tokens_seen": 321465285, + "step": 14904, + "time_per_iteration": 2.536644220352173 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01028583, + "balance_loss_clip": 1.03618491, + "balance_loss_mlp": 1.01513577, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 2.0994237630580046, + "language_loss": 0.74760199, + "learning_rate": 1.12035883275166e-07, + "loss": 0.7688719, + "num_input_tokens_seen": 321483670, + "step": 14905, + "time_per_iteration": 2.479442596435547 + }, + { + "auxiliary_loss_clip": 0.01097006, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.03483498, + "balance_loss_mlp": 1.02049494, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 1.5150084923626312, + "language_loss": 0.76615608, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78745079, + "num_input_tokens_seen": 321501190, + "step": 14906, + "time_per_iteration": 2.4794540405273438 + }, + { + "auxiliary_loss_clip": 0.01098936, + "auxiliary_loss_mlp": 0.01031666, + "balance_loss_clip": 1.03769612, + "balance_loss_mlp": 1.01953602, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 1.6621869412424821, + "language_loss": 0.74122804, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76253414, + "num_input_tokens_seen": 321518540, + "step": 14907, + "time_per_iteration": 2.4330763816833496 + }, + { + "auxiliary_loss_clip": 0.01098761, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.03681195, + "balance_loss_mlp": 1.02601814, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 2.8347530696924106, + "language_loss": 0.83279872, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85416734, + "num_input_tokens_seen": 321536555, + "step": 14908, + "time_per_iteration": 2.4292612075805664 + }, + { + "auxiliary_loss_clip": 0.01085987, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.03630996, + "balance_loss_mlp": 1.01801658, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 1.859001612181376, + "language_loss": 0.70642763, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72759867, + "num_input_tokens_seen": 321557655, + "step": 14909, + "time_per_iteration": 2.5002951622009277 + }, + { + "auxiliary_loss_clip": 0.01080804, + "auxiliary_loss_mlp": 0.01036977, + "balance_loss_clip": 1.04204857, + "balance_loss_mlp": 1.02426302, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 1.7881564331512434, + "language_loss": 0.72144592, + "learning_rate": 1.113941727737877e-07, + "loss": 0.74262369, + "num_input_tokens_seen": 321576160, + "step": 14910, + "time_per_iteration": 2.5534276962280273 + }, + { + "auxiliary_loss_clip": 0.01094905, + "auxiliary_loss_mlp": 0.01031467, + "balance_loss_clip": 1.03412914, + "balance_loss_mlp": 1.01981449, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 2.2068989821048093, + "language_loss": 0.63715333, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65841705, + "num_input_tokens_seen": 321596205, + "step": 14911, + "time_per_iteration": 2.481411933898926 + }, + { + "auxiliary_loss_clip": 0.01083694, + "auxiliary_loss_mlp": 0.00778225, + "balance_loss_clip": 1.03606534, + "balance_loss_mlp": 1.00061548, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 3.5390017807351297, + "language_loss": 0.75443137, + "learning_rate": 1.111379898520437e-07, + "loss": 0.77305049, + "num_input_tokens_seen": 321614800, + "step": 14912, + "time_per_iteration": 4.044902801513672 + }, + { + "auxiliary_loss_clip": 0.01082711, + "auxiliary_loss_mlp": 0.01035698, + "balance_loss_clip": 1.03299689, + "balance_loss_mlp": 1.02263856, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 1.9265764090337005, + "language_loss": 0.81943774, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.84062177, + "num_input_tokens_seen": 321633445, + "step": 14913, + "time_per_iteration": 2.5401880741119385 + }, + { + "auxiliary_loss_clip": 0.01101757, + "auxiliary_loss_mlp": 0.01037152, + "balance_loss_clip": 1.03833199, + "balance_loss_mlp": 1.02377641, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 2.2716306143475276, + "language_loss": 0.6125443, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.63393342, + "num_input_tokens_seen": 321650890, + "step": 14914, + "time_per_iteration": 2.4809927940368652 + }, + { + "auxiliary_loss_clip": 0.01017933, + "auxiliary_loss_mlp": 0.01001392, + "balance_loss_clip": 1.01816428, + "balance_loss_mlp": 1.00018811, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 0.7098300933982606, + "language_loss": 0.55019057, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57038379, + "num_input_tokens_seen": 321710960, + "step": 14915, + "time_per_iteration": 4.3908350467681885 + }, + { + "auxiliary_loss_clip": 0.01072931, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.03717709, + "balance_loss_mlp": 1.01807511, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 1.4578829961838207, + "language_loss": 0.716272, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.73729372, + "num_input_tokens_seen": 321733290, + "step": 14916, + "time_per_iteration": 2.6763863563537598 + }, + { + "auxiliary_loss_clip": 0.01085806, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.03516495, + "balance_loss_mlp": 1.01863146, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 1.9951428235585926, + "language_loss": 0.78042567, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.80158424, + "num_input_tokens_seen": 321753120, + "step": 14917, + "time_per_iteration": 2.5392417907714844 + }, + { + "auxiliary_loss_clip": 0.01102859, + "auxiliary_loss_mlp": 0.01039933, + "balance_loss_clip": 1.03877258, + "balance_loss_mlp": 1.02702236, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 2.1815391842581504, + "language_loss": 0.68121111, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70263904, + "num_input_tokens_seen": 321772840, + "step": 14918, + "time_per_iteration": 2.565101146697998 + }, + { + "auxiliary_loss_clip": 0.01062582, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.0383656, + "balance_loss_mlp": 1.01565218, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 2.3176637287608615, + "language_loss": 0.83312631, + "learning_rate": 1.102436060943881e-07, + "loss": 0.85402775, + "num_input_tokens_seen": 321791020, + "step": 14919, + "time_per_iteration": 2.62697172164917 + }, + { + "auxiliary_loss_clip": 0.01109505, + "auxiliary_loss_mlp": 0.00780021, + "balance_loss_clip": 1.03594255, + "balance_loss_mlp": 1.00065601, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 2.072178785369209, + "language_loss": 0.72108448, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.73997974, + "num_input_tokens_seen": 321810075, + "step": 14920, + "time_per_iteration": 2.4328479766845703 + }, + { + "auxiliary_loss_clip": 0.01096132, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.03416324, + "balance_loss_mlp": 1.01839733, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 3.0479501185418556, + "language_loss": 0.90872848, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.93000722, + "num_input_tokens_seen": 321822635, + "step": 14921, + "time_per_iteration": 2.411827325820923 + }, + { + "auxiliary_loss_clip": 0.01055387, + "auxiliary_loss_mlp": 0.01033006, + "balance_loss_clip": 1.03415251, + "balance_loss_mlp": 1.01942778, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 1.8661411832835184, + "language_loss": 0.73824447, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.75912839, + "num_input_tokens_seen": 321841130, + "step": 14922, + "time_per_iteration": 2.6297333240509033 + }, + { + "auxiliary_loss_clip": 0.01063577, + "auxiliary_loss_mlp": 0.01037765, + "balance_loss_clip": 1.03371835, + "balance_loss_mlp": 1.02398491, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 1.6542256875701322, + "language_loss": 0.70146614, + "learning_rate": 1.097341060694219e-07, + "loss": 0.72247958, + "num_input_tokens_seen": 321859855, + "step": 14923, + "time_per_iteration": 2.606762170791626 + }, + { + "auxiliary_loss_clip": 0.01091288, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.03951383, + "balance_loss_mlp": 1.01626015, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 2.5867157250589767, + "language_loss": 0.703565, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.72477514, + "num_input_tokens_seen": 321877990, + "step": 14924, + "time_per_iteration": 2.50066876411438 + }, + { + "auxiliary_loss_clip": 0.0109434, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.03287137, + "balance_loss_mlp": 1.02561831, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 1.39809601521002, + "language_loss": 0.72182965, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74314082, + "num_input_tokens_seen": 321898120, + "step": 14925, + "time_per_iteration": 2.493561267852783 + }, + { + "auxiliary_loss_clip": 0.01087511, + "auxiliary_loss_mlp": 0.00782108, + "balance_loss_clip": 1.03503847, + "balance_loss_mlp": 1.00059366, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 1.8125352562728991, + "language_loss": 0.82541239, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.84410858, + "num_input_tokens_seen": 321918140, + "step": 14926, + "time_per_iteration": 4.0984718799591064 + }, + { + "auxiliary_loss_clip": 0.01057346, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.03216922, + "balance_loss_mlp": 1.02145433, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 1.4318143053808592, + "language_loss": 0.79246706, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81337345, + "num_input_tokens_seen": 321938580, + "step": 14927, + "time_per_iteration": 2.5985171794891357 + }, + { + "auxiliary_loss_clip": 0.01082418, + "auxiliary_loss_mlp": 0.01026561, + "balance_loss_clip": 1.03417444, + "balance_loss_mlp": 1.01514065, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 1.7365546762143775, + "language_loss": 0.664572, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68566179, + "num_input_tokens_seen": 321961135, + "step": 14928, + "time_per_iteration": 2.6910016536712646 + }, + { + "auxiliary_loss_clip": 0.01090592, + "auxiliary_loss_mlp": 0.01045593, + "balance_loss_clip": 1.03750849, + "balance_loss_mlp": 1.02944636, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 3.930751267747025, + "language_loss": 0.70721972, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.72858161, + "num_input_tokens_seen": 321980945, + "step": 14929, + "time_per_iteration": 2.5372862815856934 + }, + { + "auxiliary_loss_clip": 0.01090227, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.03865731, + "balance_loss_mlp": 1.02174664, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 1.8472338783754918, + "language_loss": 0.67905474, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.70028806, + "num_input_tokens_seen": 322000350, + "step": 14930, + "time_per_iteration": 2.5320990085601807 + }, + { + "auxiliary_loss_clip": 0.01079248, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.03334308, + "balance_loss_mlp": 1.01749802, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 1.8250857439907655, + "language_loss": 0.74884725, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.76993752, + "num_input_tokens_seen": 322018980, + "step": 14931, + "time_per_iteration": 2.5044214725494385 + }, + { + "auxiliary_loss_clip": 0.01098631, + "auxiliary_loss_mlp": 0.01027335, + "balance_loss_clip": 1.03766394, + "balance_loss_mlp": 1.01623678, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 2.528686364083587, + "language_loss": 0.62902701, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.65028667, + "num_input_tokens_seen": 322037675, + "step": 14932, + "time_per_iteration": 2.4736359119415283 + }, + { + "auxiliary_loss_clip": 0.01092053, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.03438151, + "balance_loss_mlp": 1.019382, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 1.7750239387312905, + "language_loss": 0.72018528, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.74140847, + "num_input_tokens_seen": 322055130, + "step": 14933, + "time_per_iteration": 2.4771690368652344 + }, + { + "auxiliary_loss_clip": 0.01063971, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.03127909, + "balance_loss_mlp": 1.01950395, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 1.6217357805070163, + "language_loss": 0.74600697, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76696658, + "num_input_tokens_seen": 322074850, + "step": 14934, + "time_per_iteration": 2.6286022663116455 + }, + { + "auxiliary_loss_clip": 0.01065736, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.03099167, + "balance_loss_mlp": 1.02299106, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 1.6225224330372185, + "language_loss": 0.61013287, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.63116127, + "num_input_tokens_seen": 322093315, + "step": 14935, + "time_per_iteration": 2.543339490890503 + }, + { + "auxiliary_loss_clip": 0.01074074, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.03987241, + "balance_loss_mlp": 1.01471233, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 3.2278893023681197, + "language_loss": 0.7649914, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.78600711, + "num_input_tokens_seen": 322112555, + "step": 14936, + "time_per_iteration": 2.593294858932495 + }, + { + "auxiliary_loss_clip": 0.01085942, + "auxiliary_loss_mlp": 0.01032035, + "balance_loss_clip": 1.0344795, + "balance_loss_mlp": 1.01999474, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 1.5817755464644845, + "language_loss": 0.73750675, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.75868654, + "num_input_tokens_seen": 322130440, + "step": 14937, + "time_per_iteration": 2.537004232406616 + }, + { + "auxiliary_loss_clip": 0.01006529, + "auxiliary_loss_mlp": 0.01003273, + "balance_loss_clip": 1.00992608, + "balance_loss_mlp": 1.00209284, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.8412945854941212, + "language_loss": 0.63486952, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65496755, + "num_input_tokens_seen": 322187295, + "step": 14938, + "time_per_iteration": 3.0010883808135986 + }, + { + "auxiliary_loss_clip": 0.01087947, + "auxiliary_loss_mlp": 0.01028417, + "balance_loss_clip": 1.03677821, + "balance_loss_mlp": 1.01665711, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 3.0571259266304613, + "language_loss": 0.80285686, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.8240205, + "num_input_tokens_seen": 322202965, + "step": 14939, + "time_per_iteration": 2.4876508712768555 + }, + { + "auxiliary_loss_clip": 0.01001756, + "auxiliary_loss_mlp": 0.01004536, + "balance_loss_clip": 1.00816846, + "balance_loss_mlp": 1.00332594, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.7152975839866265, + "language_loss": 0.52848792, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.54855084, + "num_input_tokens_seen": 322269490, + "step": 14940, + "time_per_iteration": 4.65031886100769 + }, + { + "auxiliary_loss_clip": 0.01108215, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.03537536, + "balance_loss_mlp": 1.01714659, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 1.7977248163211077, + "language_loss": 0.77941287, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.80080181, + "num_input_tokens_seen": 322288060, + "step": 14941, + "time_per_iteration": 2.4482569694519043 + }, + { + "auxiliary_loss_clip": 0.01098932, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.03487563, + "balance_loss_mlp": 1.02323318, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 1.9988997106228334, + "language_loss": 0.73473644, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75609052, + "num_input_tokens_seen": 322307930, + "step": 14942, + "time_per_iteration": 2.546816825866699 + }, + { + "auxiliary_loss_clip": 0.01089007, + "auxiliary_loss_mlp": 0.01038848, + "balance_loss_clip": 1.03352869, + "balance_loss_mlp": 1.02596164, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 2.1837817746969685, + "language_loss": 0.79946065, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.82073915, + "num_input_tokens_seen": 322326155, + "step": 14943, + "time_per_iteration": 2.4721362590789795 + }, + { + "auxiliary_loss_clip": 0.01090612, + "auxiliary_loss_mlp": 0.01033758, + "balance_loss_clip": 1.03965926, + "balance_loss_mlp": 1.02035272, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 1.4500995610657021, + "language_loss": 0.71182454, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.73306823, + "num_input_tokens_seen": 322345850, + "step": 14944, + "time_per_iteration": 2.534411668777466 + }, + { + "auxiliary_loss_clip": 0.01072309, + "auxiliary_loss_mlp": 0.01036165, + "balance_loss_clip": 1.03168797, + "balance_loss_mlp": 1.02236056, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 2.2484954213535553, + "language_loss": 0.75936466, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.78044939, + "num_input_tokens_seen": 322364715, + "step": 14945, + "time_per_iteration": 2.557596445083618 + }, + { + "auxiliary_loss_clip": 0.01114513, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.03662395, + "balance_loss_mlp": 1.02173233, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 1.9795122181923974, + "language_loss": 0.73671353, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.75821269, + "num_input_tokens_seen": 322383570, + "step": 14946, + "time_per_iteration": 2.4314143657684326 + }, + { + "auxiliary_loss_clip": 0.0105728, + "auxiliary_loss_mlp": 0.01029563, + "balance_loss_clip": 1.03199589, + "balance_loss_mlp": 1.01625371, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 2.0533703993331187, + "language_loss": 0.64208144, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66294992, + "num_input_tokens_seen": 322401375, + "step": 14947, + "time_per_iteration": 2.5699899196624756 + }, + { + "auxiliary_loss_clip": 0.01087619, + "auxiliary_loss_mlp": 0.01036177, + "balance_loss_clip": 1.03739798, + "balance_loss_mlp": 1.02395821, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 2.3664765864620563, + "language_loss": 0.69855583, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.7197938, + "num_input_tokens_seen": 322421890, + "step": 14948, + "time_per_iteration": 2.5256271362304688 + }, + { + "auxiliary_loss_clip": 0.01079128, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.03675568, + "balance_loss_mlp": 1.01553512, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 2.1041212163900664, + "language_loss": 0.74668336, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76775384, + "num_input_tokens_seen": 322445730, + "step": 14949, + "time_per_iteration": 2.7143726348876953 + }, + { + "auxiliary_loss_clip": 0.01067725, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.03549182, + "balance_loss_mlp": 1.01897001, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 2.5453428044709696, + "language_loss": 0.75689983, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.7779004, + "num_input_tokens_seen": 322464595, + "step": 14950, + "time_per_iteration": 2.6372833251953125 + }, + { + "auxiliary_loss_clip": 0.01084031, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.03513432, + "balance_loss_mlp": 1.01995468, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 1.7058213955490915, + "language_loss": 0.66401994, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.68517721, + "num_input_tokens_seen": 322483305, + "step": 14951, + "time_per_iteration": 4.04653263092041 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.03466034, + "balance_loss_mlp": 1.0157969, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 1.8927286176527456, + "language_loss": 0.73616332, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.75753903, + "num_input_tokens_seen": 322501905, + "step": 14952, + "time_per_iteration": 2.449270248413086 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01034476, + "balance_loss_clip": 1.03725386, + "balance_loss_mlp": 1.02202463, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 2.2687803086698084, + "language_loss": 0.56561577, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.58704674, + "num_input_tokens_seen": 322518135, + "step": 14953, + "time_per_iteration": 2.441136360168457 + }, + { + "auxiliary_loss_clip": 0.01086832, + "auxiliary_loss_mlp": 0.01039302, + "balance_loss_clip": 1.03450608, + "balance_loss_mlp": 1.02648687, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 1.850174832648572, + "language_loss": 0.82255501, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.8438164, + "num_input_tokens_seen": 322537905, + "step": 14954, + "time_per_iteration": 2.552574396133423 + }, + { + "auxiliary_loss_clip": 0.01108237, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.03782642, + "balance_loss_mlp": 1.01960063, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 2.7492866461753778, + "language_loss": 0.59844625, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.61984587, + "num_input_tokens_seen": 322557945, + "step": 14955, + "time_per_iteration": 3.833388566970825 + }, + { + "auxiliary_loss_clip": 0.01096331, + "auxiliary_loss_mlp": 0.01030615, + "balance_loss_clip": 1.03557205, + "balance_loss_mlp": 1.01906967, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 2.547526607351237, + "language_loss": 0.55288935, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.57415891, + "num_input_tokens_seen": 322575765, + "step": 14956, + "time_per_iteration": 2.4812207221984863 + }, + { + "auxiliary_loss_clip": 0.01065643, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.03378868, + "balance_loss_mlp": 1.02045059, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 1.6727837197436142, + "language_loss": 0.80087996, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.82186514, + "num_input_tokens_seen": 322595665, + "step": 14957, + "time_per_iteration": 2.6373753547668457 + }, + { + "auxiliary_loss_clip": 0.01112888, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.0380466, + "balance_loss_mlp": 1.01751983, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 1.6780719165549636, + "language_loss": 0.78843397, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.80986512, + "num_input_tokens_seen": 322614755, + "step": 14958, + "time_per_iteration": 2.4769644737243652 + }, + { + "auxiliary_loss_clip": 0.01042379, + "auxiliary_loss_mlp": 0.01028271, + "balance_loss_clip": 1.03887296, + "balance_loss_mlp": 1.01694, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 1.4381036046145008, + "language_loss": 0.74717867, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.76788521, + "num_input_tokens_seen": 322633425, + "step": 14959, + "time_per_iteration": 2.6449666023254395 + }, + { + "auxiliary_loss_clip": 0.01103581, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.03387713, + "balance_loss_mlp": 1.01727402, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 1.8055356817933903, + "language_loss": 0.6843102, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.70563859, + "num_input_tokens_seen": 322652065, + "step": 14960, + "time_per_iteration": 2.4205780029296875 + }, + { + "auxiliary_loss_clip": 0.01085605, + "auxiliary_loss_mlp": 0.01027653, + "balance_loss_clip": 1.03598201, + "balance_loss_mlp": 1.01620889, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 1.7982717226245184, + "language_loss": 0.65776324, + "learning_rate": 1.049510991294591e-07, + "loss": 0.67889583, + "num_input_tokens_seen": 322673275, + "step": 14961, + "time_per_iteration": 2.551232099533081 + }, + { + "auxiliary_loss_clip": 0.01084382, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.03430104, + "balance_loss_mlp": 1.01779258, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 1.5063912611161148, + "language_loss": 0.83203614, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85316783, + "num_input_tokens_seen": 322693375, + "step": 14962, + "time_per_iteration": 2.5340781211853027 + }, + { + "auxiliary_loss_clip": 0.01090623, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.03801334, + "balance_loss_mlp": 1.0172044, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 3.0191450862224145, + "language_loss": 0.76262844, + "learning_rate": 1.047022340612298e-07, + "loss": 0.7838428, + "num_input_tokens_seen": 322712615, + "step": 14963, + "time_per_iteration": 2.5125577449798584 + }, + { + "auxiliary_loss_clip": 0.00995881, + "auxiliary_loss_mlp": 0.01004251, + "balance_loss_clip": 1.01726735, + "balance_loss_mlp": 1.00290442, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.7756208753205804, + "language_loss": 0.57493937, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59494066, + "num_input_tokens_seen": 322766855, + "step": 14964, + "time_per_iteration": 3.022996425628662 + }, + { + "auxiliary_loss_clip": 0.011059, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.03984332, + "balance_loss_mlp": 1.02063286, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 2.794162919392285, + "language_loss": 0.6772002, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.69859552, + "num_input_tokens_seen": 322781130, + "step": 14965, + "time_per_iteration": 3.968111276626587 + }, + { + "auxiliary_loss_clip": 0.01111347, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.03743613, + "balance_loss_mlp": 1.01980591, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 1.9353679327998825, + "language_loss": 0.71921813, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.74064958, + "num_input_tokens_seen": 322800310, + "step": 14966, + "time_per_iteration": 2.423311233520508 + }, + { + "auxiliary_loss_clip": 0.01078904, + "auxiliary_loss_mlp": 0.01031979, + "balance_loss_clip": 1.035779, + "balance_loss_mlp": 1.01921141, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 2.035879646173059, + "language_loss": 0.73516417, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.75627297, + "num_input_tokens_seen": 322820955, + "step": 14967, + "time_per_iteration": 2.5935986042022705 + }, + { + "auxiliary_loss_clip": 0.01072313, + "auxiliary_loss_mlp": 0.00778182, + "balance_loss_clip": 1.04136062, + "balance_loss_mlp": 1.00053179, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 2.2649365456775903, + "language_loss": 0.718647, + "learning_rate": 1.040813291960323e-07, + "loss": 0.73715198, + "num_input_tokens_seen": 322838780, + "step": 14968, + "time_per_iteration": 2.564490795135498 + }, + { + "auxiliary_loss_clip": 0.01099866, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.0417012, + "balance_loss_mlp": 1.02254236, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 2.2080375989427456, + "language_loss": 0.70781898, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.72916687, + "num_input_tokens_seen": 322856710, + "step": 14969, + "time_per_iteration": 2.492335319519043 + }, + { + "auxiliary_loss_clip": 0.01112484, + "auxiliary_loss_mlp": 0.01030541, + "balance_loss_clip": 1.03921556, + "balance_loss_mlp": 1.01797605, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 1.8531081023711278, + "language_loss": 0.75986499, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.7812953, + "num_input_tokens_seen": 322876070, + "step": 14970, + "time_per_iteration": 2.4488117694854736 + }, + { + "auxiliary_loss_clip": 0.01101991, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.03685379, + "balance_loss_mlp": 1.02034903, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 1.8076725502162376, + "language_loss": 0.72995865, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.75129628, + "num_input_tokens_seen": 322895095, + "step": 14971, + "time_per_iteration": 2.462085485458374 + }, + { + "auxiliary_loss_clip": 0.01077229, + "auxiliary_loss_mlp": 0.01030613, + "balance_loss_clip": 1.0403111, + "balance_loss_mlp": 1.01789927, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 3.4292980343951327, + "language_loss": 0.81592137, + "learning_rate": 1.035858993572476e-07, + "loss": 0.83699977, + "num_input_tokens_seen": 322911845, + "step": 14972, + "time_per_iteration": 2.5319676399230957 + }, + { + "auxiliary_loss_clip": 0.01083289, + "auxiliary_loss_mlp": 0.01030406, + "balance_loss_clip": 1.0337832, + "balance_loss_mlp": 1.01799011, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 2.0505133958494897, + "language_loss": 0.81466603, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.83580303, + "num_input_tokens_seen": 322928170, + "step": 14973, + "time_per_iteration": 2.453000068664551 + }, + { + "auxiliary_loss_clip": 0.01106774, + "auxiliary_loss_mlp": 0.01034888, + "balance_loss_clip": 1.03492451, + "balance_loss_mlp": 1.02216876, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 2.2779161054072308, + "language_loss": 0.5824554, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.60387206, + "num_input_tokens_seen": 322948165, + "step": 14974, + "time_per_iteration": 2.471820831298828 + }, + { + "auxiliary_loss_clip": 0.01111132, + "auxiliary_loss_mlp": 0.01033255, + "balance_loss_clip": 1.03873479, + "balance_loss_mlp": 1.02104819, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 1.7166300449505494, + "language_loss": 0.63187635, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65332019, + "num_input_tokens_seen": 322968880, + "step": 14975, + "time_per_iteration": 2.4524552822113037 + }, + { + "auxiliary_loss_clip": 0.01098458, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.03633976, + "balance_loss_mlp": 1.02023673, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 1.5847913338266197, + "language_loss": 0.73328388, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75459385, + "num_input_tokens_seen": 322989395, + "step": 14976, + "time_per_iteration": 2.490081548690796 + }, + { + "auxiliary_loss_clip": 0.01094301, + "auxiliary_loss_mlp": 0.0103284, + "balance_loss_clip": 1.0379647, + "balance_loss_mlp": 1.02112746, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 2.0022355051894243, + "language_loss": 0.69659865, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.71787, + "num_input_tokens_seen": 323009060, + "step": 14977, + "time_per_iteration": 2.5185141563415527 + }, + { + "auxiliary_loss_clip": 0.01083811, + "auxiliary_loss_mlp": 0.00779498, + "balance_loss_clip": 1.03617108, + "balance_loss_mlp": 1.00058627, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 2.6515067050345547, + "language_loss": 0.65491664, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.67354977, + "num_input_tokens_seen": 323027530, + "step": 14978, + "time_per_iteration": 2.4761621952056885 + }, + { + "auxiliary_loss_clip": 0.01078866, + "auxiliary_loss_mlp": 0.01035592, + "balance_loss_clip": 1.03422976, + "balance_loss_mlp": 1.02192426, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 1.8125087867585663, + "language_loss": 0.7896595, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.81080401, + "num_input_tokens_seen": 323045370, + "step": 14979, + "time_per_iteration": 2.5070133209228516 + }, + { + "auxiliary_loss_clip": 0.01010582, + "auxiliary_loss_mlp": 0.01007422, + "balance_loss_clip": 1.00617659, + "balance_loss_mlp": 1.00627124, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.7210444464005739, + "language_loss": 0.53597116, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.55615115, + "num_input_tokens_seen": 323105660, + "step": 14980, + "time_per_iteration": 4.542876720428467 + }, + { + "auxiliary_loss_clip": 0.01102257, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.03862381, + "balance_loss_mlp": 1.02846766, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 2.024283760312849, + "language_loss": 0.82311118, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.84454894, + "num_input_tokens_seen": 323126365, + "step": 14981, + "time_per_iteration": 2.526047945022583 + }, + { + "auxiliary_loss_clip": 0.01067511, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.03690481, + "balance_loss_mlp": 1.01788306, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 1.4680461606248683, + "language_loss": 0.81594229, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83691061, + "num_input_tokens_seen": 323145655, + "step": 14982, + "time_per_iteration": 2.6168713569641113 + }, + { + "auxiliary_loss_clip": 0.01076482, + "auxiliary_loss_mlp": 0.01041032, + "balance_loss_clip": 1.03116584, + "balance_loss_mlp": 1.02827084, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 1.934716733065804, + "language_loss": 0.71451473, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.73568988, + "num_input_tokens_seen": 323164540, + "step": 14983, + "time_per_iteration": 2.538154363632202 + }, + { + "auxiliary_loss_clip": 0.01095763, + "auxiliary_loss_mlp": 0.01022845, + "balance_loss_clip": 1.03639615, + "balance_loss_mlp": 1.01181817, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 1.304448698990258, + "language_loss": 0.74683249, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.7680186, + "num_input_tokens_seen": 323186960, + "step": 14984, + "time_per_iteration": 2.497941732406616 + }, + { + "auxiliary_loss_clip": 0.01103212, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.03369915, + "balance_loss_mlp": 1.01972651, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 1.6165768973772008, + "language_loss": 0.70236313, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.72371566, + "num_input_tokens_seen": 323206135, + "step": 14985, + "time_per_iteration": 2.4406893253326416 + }, + { + "auxiliary_loss_clip": 0.01088117, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.03598094, + "balance_loss_mlp": 1.01793647, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 2.0267367763821054, + "language_loss": 0.7098403, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.73102379, + "num_input_tokens_seen": 323225980, + "step": 14986, + "time_per_iteration": 2.5187535285949707 + }, + { + "auxiliary_loss_clip": 0.01098099, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.03343546, + "balance_loss_mlp": 1.0213573, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 1.7067935745700593, + "language_loss": 0.76670712, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.78802919, + "num_input_tokens_seen": 323243700, + "step": 14987, + "time_per_iteration": 2.434025764465332 + }, + { + "auxiliary_loss_clip": 0.01097958, + "auxiliary_loss_mlp": 0.01030395, + "balance_loss_clip": 1.04121113, + "balance_loss_mlp": 1.01783657, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 1.9682079383530193, + "language_loss": 0.7351312, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.75641477, + "num_input_tokens_seen": 323261535, + "step": 14988, + "time_per_iteration": 2.4899795055389404 + }, + { + "auxiliary_loss_clip": 0.01088034, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.03932238, + "balance_loss_mlp": 1.01717293, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 2.341726168533962, + "language_loss": 0.69402111, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.7152065, + "num_input_tokens_seen": 323281855, + "step": 14989, + "time_per_iteration": 2.5256826877593994 + }, + { + "auxiliary_loss_clip": 0.01109917, + "auxiliary_loss_mlp": 0.01028489, + "balance_loss_clip": 1.03754711, + "balance_loss_mlp": 1.01637125, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 2.633951515263465, + "language_loss": 0.80354226, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.82492626, + "num_input_tokens_seen": 323299505, + "step": 14990, + "time_per_iteration": 2.3925113677978516 + }, + { + "auxiliary_loss_clip": 0.01072792, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.03491604, + "balance_loss_mlp": 1.01997828, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 1.8045557909495185, + "language_loss": 0.77922869, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.80028152, + "num_input_tokens_seen": 323318365, + "step": 14991, + "time_per_iteration": 4.218637228012085 + }, + { + "auxiliary_loss_clip": 0.01000772, + "auxiliary_loss_mlp": 0.00753606, + "balance_loss_clip": 1.00919878, + "balance_loss_mlp": 1.00024784, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 0.7767378818115587, + "language_loss": 0.60255444, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.62009823, + "num_input_tokens_seen": 323371835, + "step": 14992, + "time_per_iteration": 3.018434524536133 + }, + { + "auxiliary_loss_clip": 0.01093, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.03470135, + "balance_loss_mlp": 1.01928771, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 1.9560940834971976, + "language_loss": 0.83078235, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.85202861, + "num_input_tokens_seen": 323388495, + "step": 14993, + "time_per_iteration": 2.4669806957244873 + }, + { + "auxiliary_loss_clip": 0.01107641, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.03539681, + "balance_loss_mlp": 1.02207589, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 2.2199555461107754, + "language_loss": 0.73735976, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.75878108, + "num_input_tokens_seen": 323405280, + "step": 14994, + "time_per_iteration": 3.710956335067749 + }, + { + "auxiliary_loss_clip": 0.01093228, + "auxiliary_loss_mlp": 0.01027148, + "balance_loss_clip": 1.03465044, + "balance_loss_mlp": 1.0157162, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 3.7439305874504005, + "language_loss": 0.64556998, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.66677374, + "num_input_tokens_seen": 323425310, + "step": 14995, + "time_per_iteration": 2.537029504776001 + }, + { + "auxiliary_loss_clip": 0.01071971, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.03029764, + "balance_loss_mlp": 1.02177024, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 1.6952047981022365, + "language_loss": 0.66483182, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68590581, + "num_input_tokens_seen": 323447805, + "step": 14996, + "time_per_iteration": 2.621011257171631 + }, + { + "auxiliary_loss_clip": 0.01096254, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.0347476, + "balance_loss_mlp": 1.01580215, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 1.635736854621331, + "language_loss": 0.65867066, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.679905, + "num_input_tokens_seen": 323467150, + "step": 14997, + "time_per_iteration": 2.4777379035949707 + }, + { + "auxiliary_loss_clip": 0.01082825, + "auxiliary_loss_mlp": 0.01034808, + "balance_loss_clip": 1.03299129, + "balance_loss_mlp": 1.02238619, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 1.6916491453151603, + "language_loss": 0.77552766, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.79670399, + "num_input_tokens_seen": 323484250, + "step": 14998, + "time_per_iteration": 2.4969193935394287 + }, + { + "auxiliary_loss_clip": 0.0110801, + "auxiliary_loss_mlp": 0.01030905, + "balance_loss_clip": 1.03537726, + "balance_loss_mlp": 1.01877546, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 1.7824721031739421, + "language_loss": 0.75333297, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.7747221, + "num_input_tokens_seen": 323502910, + "step": 14999, + "time_per_iteration": 2.44477915763855 + }, + { + "auxiliary_loss_clip": 0.01048053, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.03734374, + "balance_loss_mlp": 1.01830041, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 2.0298279187787305, + "language_loss": 0.76236957, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.78316045, + "num_input_tokens_seen": 323521820, + "step": 15000, + "time_per_iteration": 2.637854814529419 + }, + { + "auxiliary_loss_clip": 0.01090634, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.03622031, + "balance_loss_mlp": 1.01533532, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 2.6354309462004304, + "language_loss": 0.8055135, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.82668853, + "num_input_tokens_seen": 323543200, + "step": 15001, + "time_per_iteration": 2.7941501140594482 + }, + { + "auxiliary_loss_clip": 0.01075832, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.0363518, + "balance_loss_mlp": 1.01806223, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 3.552700745507823, + "language_loss": 0.78301692, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80406404, + "num_input_tokens_seen": 323563075, + "step": 15002, + "time_per_iteration": 2.56937313079834 + }, + { + "auxiliary_loss_clip": 0.01080548, + "auxiliary_loss_mlp": 0.01038534, + "balance_loss_clip": 1.03419149, + "balance_loss_mlp": 1.02406192, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 2.2450962564626824, + "language_loss": 0.68400025, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70519102, + "num_input_tokens_seen": 323579065, + "step": 15003, + "time_per_iteration": 2.531311511993408 + }, + { + "auxiliary_loss_clip": 0.01085533, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.03355396, + "balance_loss_mlp": 1.01832473, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 2.317467892490426, + "language_loss": 0.85726798, + "learning_rate": 9.9663907182292e-08, + "loss": 0.87843597, + "num_input_tokens_seen": 323594835, + "step": 15004, + "time_per_iteration": 2.474076271057129 + }, + { + "auxiliary_loss_clip": 0.01075328, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.03402853, + "balance_loss_mlp": 1.02041745, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 2.3171623038972116, + "language_loss": 0.72858763, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74967587, + "num_input_tokens_seen": 323611475, + "step": 15005, + "time_per_iteration": 4.0656867027282715 + }, + { + "auxiliary_loss_clip": 0.01100587, + "auxiliary_loss_mlp": 0.01031696, + "balance_loss_clip": 1.03435314, + "balance_loss_mlp": 1.01873159, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 1.9331464263111386, + "language_loss": 0.71063775, + "learning_rate": 9.942123117037748e-08, + "loss": 0.73196054, + "num_input_tokens_seen": 323629730, + "step": 15006, + "time_per_iteration": 2.4599926471710205 + }, + { + "auxiliary_loss_clip": 0.01091389, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.03731155, + "balance_loss_mlp": 1.01911759, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 2.1899246213554657, + "language_loss": 0.84586871, + "learning_rate": 9.930000126732618e-08, + "loss": 0.8670935, + "num_input_tokens_seen": 323646000, + "step": 15007, + "time_per_iteration": 2.488445520401001 + }, + { + "auxiliary_loss_clip": 0.01080459, + "auxiliary_loss_mlp": 0.0103089, + "balance_loss_clip": 1.03250146, + "balance_loss_mlp": 1.01831913, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 1.5934724031211056, + "language_loss": 0.78639209, + "learning_rate": 9.917884343900928e-08, + "loss": 0.80750549, + "num_input_tokens_seen": 323667250, + "step": 15008, + "time_per_iteration": 2.5639350414276123 + }, + { + "auxiliary_loss_clip": 0.0106741, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.03755617, + "balance_loss_mlp": 1.01905465, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 1.566353825686051, + "language_loss": 0.73046201, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75144446, + "num_input_tokens_seen": 323687150, + "step": 15009, + "time_per_iteration": 2.558123826980591 + }, + { + "auxiliary_loss_clip": 0.01106875, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.03593874, + "balance_loss_mlp": 1.01940382, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 1.7888092145585108, + "language_loss": 0.73438883, + "learning_rate": 9.893674402495399e-08, + "loss": 0.75577265, + "num_input_tokens_seen": 323703660, + "step": 15010, + "time_per_iteration": 2.3998618125915527 + }, + { + "auxiliary_loss_clip": 0.01084594, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.03894639, + "balance_loss_mlp": 1.02103865, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 1.6966576527153123, + "language_loss": 0.7390902, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76027387, + "num_input_tokens_seen": 323722060, + "step": 15011, + "time_per_iteration": 2.545037269592285 + }, + { + "auxiliary_loss_clip": 0.01100645, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.03457904, + "balance_loss_mlp": 1.01659346, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 1.9088060609576325, + "language_loss": 0.73433721, + "learning_rate": 9.869493296493204e-08, + "loss": 0.75563687, + "num_input_tokens_seen": 323740645, + "step": 15012, + "time_per_iteration": 2.4673943519592285 + }, + { + "auxiliary_loss_clip": 0.01076357, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.03618085, + "balance_loss_mlp": 1.02157211, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 2.598083154451654, + "language_loss": 0.69282448, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71392214, + "num_input_tokens_seen": 323758905, + "step": 15013, + "time_per_iteration": 2.5223233699798584 + }, + { + "auxiliary_loss_clip": 0.01092276, + "auxiliary_loss_mlp": 0.01031515, + "balance_loss_clip": 1.0337801, + "balance_loss_mlp": 1.02007711, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 1.5511717584173894, + "language_loss": 0.72979224, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75103021, + "num_input_tokens_seen": 323780595, + "step": 15014, + "time_per_iteration": 2.4976320266723633 + }, + { + "auxiliary_loss_clip": 0.01106843, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.03443944, + "balance_loss_mlp": 1.0171411, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 1.7937880791070229, + "language_loss": 0.7172485, + "learning_rate": 9.833275711893474e-08, + "loss": 0.73861611, + "num_input_tokens_seen": 323798160, + "step": 15015, + "time_per_iteration": 2.4119269847869873 + }, + { + "auxiliary_loss_clip": 0.01082336, + "auxiliary_loss_mlp": 0.01031769, + "balance_loss_clip": 1.03309178, + "balance_loss_mlp": 1.01993799, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 1.8229611925434084, + "language_loss": 0.69112396, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71226501, + "num_input_tokens_seen": 323816810, + "step": 15016, + "time_per_iteration": 2.494014263153076 + }, + { + "auxiliary_loss_clip": 0.01105928, + "auxiliary_loss_mlp": 0.01027153, + "balance_loss_clip": 1.03511798, + "balance_loss_mlp": 1.01598299, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 2.0115806372434757, + "language_loss": 0.70648032, + "learning_rate": 9.809166710436855e-08, + "loss": 0.72781116, + "num_input_tokens_seen": 323836900, + "step": 15017, + "time_per_iteration": 2.4881608486175537 + }, + { + "auxiliary_loss_clip": 0.01084685, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.03800225, + "balance_loss_mlp": 1.0198853, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 1.6796284168131497, + "language_loss": 0.69586563, + "learning_rate": 9.797123027563237e-08, + "loss": 0.71702433, + "num_input_tokens_seen": 323855325, + "step": 15018, + "time_per_iteration": 2.5121243000030518 + }, + { + "auxiliary_loss_clip": 0.01097209, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.03600156, + "balance_loss_mlp": 1.01786411, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 2.0313623221020793, + "language_loss": 0.69118035, + "learning_rate": 9.785086557201782e-08, + "loss": 0.71245706, + "num_input_tokens_seen": 323875650, + "step": 15019, + "time_per_iteration": 4.212982892990112 + }, + { + "auxiliary_loss_clip": 0.0110508, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.03515625, + "balance_loss_mlp": 1.02205682, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 4.170226248096894, + "language_loss": 0.72428828, + "learning_rate": 9.773057299808951e-08, + "loss": 0.74567801, + "num_input_tokens_seen": 323892920, + "step": 15020, + "time_per_iteration": 2.4313223361968994 + }, + { + "auxiliary_loss_clip": 0.01094804, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.03243375, + "balance_loss_mlp": 1.02117836, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 1.8733397887595737, + "language_loss": 0.74246538, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76375282, + "num_input_tokens_seen": 323913835, + "step": 15021, + "time_per_iteration": 2.4604358673095703 + }, + { + "auxiliary_loss_clip": 0.01112834, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.03822982, + "balance_loss_mlp": 1.01741481, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 2.4139179984260966, + "language_loss": 0.72522956, + "learning_rate": 9.749020425753251e-08, + "loss": 0.74666244, + "num_input_tokens_seen": 323933440, + "step": 15022, + "time_per_iteration": 2.4294354915618896 + }, + { + "auxiliary_loss_clip": 0.01067282, + "auxiliary_loss_mlp": 0.01026694, + "balance_loss_clip": 1.03296673, + "balance_loss_mlp": 1.01552463, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 1.8306663610435996, + "language_loss": 0.72290069, + "learning_rate": 9.737012810001943e-08, + "loss": 0.74384046, + "num_input_tokens_seen": 323954090, + "step": 15023, + "time_per_iteration": 2.623314619064331 + }, + { + "auxiliary_loss_clip": 0.01093711, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.03624511, + "balance_loss_mlp": 1.02074027, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 1.6690149594400787, + "language_loss": 0.82597041, + "learning_rate": 9.725012409042155e-08, + "loss": 0.84723043, + "num_input_tokens_seen": 323974040, + "step": 15024, + "time_per_iteration": 2.469301223754883 + }, + { + "auxiliary_loss_clip": 0.0109823, + "auxiliary_loss_mlp": 0.01026742, + "balance_loss_clip": 1.03603303, + "balance_loss_mlp": 1.01533389, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 1.689786229031095, + "language_loss": 0.69460583, + "learning_rate": 9.713019223328966e-08, + "loss": 0.71585554, + "num_input_tokens_seen": 323996125, + "step": 15025, + "time_per_iteration": 2.469435691833496 + }, + { + "auxiliary_loss_clip": 0.01073423, + "auxiliary_loss_mlp": 0.01036536, + "balance_loss_clip": 1.03385973, + "balance_loss_mlp": 1.024436, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 1.7287679110728982, + "language_loss": 0.77137816, + "learning_rate": 9.70103325331717e-08, + "loss": 0.79247767, + "num_input_tokens_seen": 324017645, + "step": 15026, + "time_per_iteration": 2.5692176818847656 + }, + { + "auxiliary_loss_clip": 0.0109694, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.03706145, + "balance_loss_mlp": 1.01838374, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 1.708437579802041, + "language_loss": 0.68297297, + "learning_rate": 9.68905449946129e-08, + "loss": 0.70424217, + "num_input_tokens_seen": 324036875, + "step": 15027, + "time_per_iteration": 2.4456515312194824 + }, + { + "auxiliary_loss_clip": 0.01054344, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.03135777, + "balance_loss_mlp": 1.0243603, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 1.5937673238013605, + "language_loss": 0.75722045, + "learning_rate": 9.677082962215477e-08, + "loss": 0.77813709, + "num_input_tokens_seen": 324057045, + "step": 15028, + "time_per_iteration": 2.599750280380249 + }, + { + "auxiliary_loss_clip": 0.01058143, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.0370363, + "balance_loss_mlp": 1.02232826, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 1.925892526070124, + "language_loss": 0.68940878, + "learning_rate": 9.665118642033765e-08, + "loss": 0.71033573, + "num_input_tokens_seen": 324079735, + "step": 15029, + "time_per_iteration": 2.61596941947937 + }, + { + "auxiliary_loss_clip": 0.0109566, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.03803456, + "balance_loss_mlp": 1.01841021, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 4.377254440042338, + "language_loss": 0.73767775, + "learning_rate": 9.653161539369858e-08, + "loss": 0.75894821, + "num_input_tokens_seen": 324097785, + "step": 15030, + "time_per_iteration": 2.438493490219116 + }, + { + "auxiliary_loss_clip": 0.01099678, + "auxiliary_loss_mlp": 0.01033233, + "balance_loss_clip": 1.03576458, + "balance_loss_mlp": 1.02146125, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 1.8180129394903066, + "language_loss": 0.68162817, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70295721, + "num_input_tokens_seen": 324121625, + "step": 15031, + "time_per_iteration": 4.112116575241089 + }, + { + "auxiliary_loss_clip": 0.01085853, + "auxiliary_loss_mlp": 0.01024427, + "balance_loss_clip": 1.03702855, + "balance_loss_mlp": 1.01293528, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 1.6596161012050643, + "language_loss": 0.76792061, + "learning_rate": 9.629268988408723e-08, + "loss": 0.7890234, + "num_input_tokens_seen": 324142535, + "step": 15032, + "time_per_iteration": 2.5284574031829834 + }, + { + "auxiliary_loss_clip": 0.01108563, + "auxiliary_loss_mlp": 0.01031254, + "balance_loss_clip": 1.03560543, + "balance_loss_mlp": 1.01916575, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 1.7779370926604743, + "language_loss": 0.75390172, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77529985, + "num_input_tokens_seen": 324159610, + "step": 15033, + "time_per_iteration": 2.4443297386169434 + }, + { + "auxiliary_loss_clip": 0.01077296, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.03204775, + "balance_loss_mlp": 1.02153671, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 1.7022170739066005, + "language_loss": 0.74287117, + "learning_rate": 9.605405312956105e-08, + "loss": 0.763991, + "num_input_tokens_seen": 324182510, + "step": 15034, + "time_per_iteration": 3.954406261444092 + }, + { + "auxiliary_loss_clip": 0.01078477, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.03739965, + "balance_loss_mlp": 1.01904178, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 2.354729423722464, + "language_loss": 0.63296789, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65406287, + "num_input_tokens_seen": 324200555, + "step": 15035, + "time_per_iteration": 2.564354181289673 + }, + { + "auxiliary_loss_clip": 0.01108563, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.0369103, + "balance_loss_mlp": 1.01928961, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 2.833906565081733, + "language_loss": 0.62418276, + "learning_rate": 9.581570516631643e-08, + "loss": 0.64559209, + "num_input_tokens_seen": 324220255, + "step": 15036, + "time_per_iteration": 2.4662909507751465 + }, + { + "auxiliary_loss_clip": 0.01056875, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.03558397, + "balance_loss_mlp": 1.01614344, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 1.673478800069058, + "language_loss": 0.82343531, + "learning_rate": 9.569663949272455e-08, + "loss": 0.84428328, + "num_input_tokens_seen": 324237855, + "step": 15037, + "time_per_iteration": 2.585418939590454 + }, + { + "auxiliary_loss_clip": 0.01110938, + "auxiliary_loss_mlp": 0.01032714, + "balance_loss_clip": 1.03641796, + "balance_loss_mlp": 1.02042913, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 1.9845426619198818, + "language_loss": 0.67929375, + "learning_rate": 9.557764603050667e-08, + "loss": 0.7007302, + "num_input_tokens_seen": 324257050, + "step": 15038, + "time_per_iteration": 2.4185383319854736 + }, + { + "auxiliary_loss_clip": 0.01085241, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.03300774, + "balance_loss_mlp": 1.02165329, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 2.023031794408634, + "language_loss": 0.75210893, + "learning_rate": 9.545872478417494e-08, + "loss": 0.77330399, + "num_input_tokens_seen": 324275510, + "step": 15039, + "time_per_iteration": 2.496366024017334 + }, + { + "auxiliary_loss_clip": 0.01081666, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.03555465, + "balance_loss_mlp": 1.01807499, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 1.5217577304641714, + "language_loss": 0.70380503, + "learning_rate": 9.533987575823977e-08, + "loss": 0.72492212, + "num_input_tokens_seen": 324295150, + "step": 15040, + "time_per_iteration": 2.5161147117614746 + }, + { + "auxiliary_loss_clip": 0.01074093, + "auxiliary_loss_mlp": 0.0102788, + "balance_loss_clip": 1.03324103, + "balance_loss_mlp": 1.01618576, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 1.790398562402608, + "language_loss": 0.67631006, + "learning_rate": 9.522109895720709e-08, + "loss": 0.69732982, + "num_input_tokens_seen": 324313855, + "step": 15041, + "time_per_iteration": 2.530705213546753 + }, + { + "auxiliary_loss_clip": 0.01096497, + "auxiliary_loss_mlp": 0.01034206, + "balance_loss_clip": 1.03460193, + "balance_loss_mlp": 1.0220232, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 1.741998722191202, + "language_loss": 0.57699084, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59829783, + "num_input_tokens_seen": 324338465, + "step": 15042, + "time_per_iteration": 2.5446181297302246 + }, + { + "auxiliary_loss_clip": 0.01012455, + "auxiliary_loss_mlp": 0.00753117, + "balance_loss_clip": 1.00807643, + "balance_loss_mlp": 1.00020111, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.8196594338717129, + "language_loss": 0.56956303, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58721876, + "num_input_tokens_seen": 324398740, + "step": 15043, + "time_per_iteration": 3.0651142597198486 + }, + { + "auxiliary_loss_clip": 0.01085611, + "auxiliary_loss_mlp": 0.0102829, + "balance_loss_clip": 1.03423333, + "balance_loss_mlp": 1.01530242, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 2.0019878267842355, + "language_loss": 0.70075399, + "learning_rate": 9.486520194855274e-08, + "loss": 0.72189301, + "num_input_tokens_seen": 324417335, + "step": 15044, + "time_per_iteration": 3.921424150466919 + }, + { + "auxiliary_loss_clip": 0.01088357, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.03612661, + "balance_loss_mlp": 1.02328467, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 2.4997381794605, + "language_loss": 0.69875556, + "learning_rate": 9.474671409214407e-08, + "loss": 0.72000366, + "num_input_tokens_seen": 324433240, + "step": 15045, + "time_per_iteration": 2.4623939990997314 + }, + { + "auxiliary_loss_clip": 0.01078583, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.03577363, + "balance_loss_mlp": 1.02133346, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 2.0328708447273494, + "language_loss": 0.65979624, + "learning_rate": 9.462829848313081e-08, + "loss": 0.6809231, + "num_input_tokens_seen": 324452675, + "step": 15046, + "time_per_iteration": 2.556419610977173 + }, + { + "auxiliary_loss_clip": 0.01080484, + "auxiliary_loss_mlp": 0.01036369, + "balance_loss_clip": 1.03845239, + "balance_loss_mlp": 1.02366102, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 2.013262653186321, + "language_loss": 0.62115324, + "learning_rate": 9.450995512600379e-08, + "loss": 0.64232171, + "num_input_tokens_seen": 324467865, + "step": 15047, + "time_per_iteration": 2.4934206008911133 + }, + { + "auxiliary_loss_clip": 0.01107422, + "auxiliary_loss_mlp": 0.00777543, + "balance_loss_clip": 1.0366286, + "balance_loss_mlp": 1.00059652, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 2.1297470400504848, + "language_loss": 0.71309459, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73194432, + "num_input_tokens_seen": 324490430, + "step": 15048, + "time_per_iteration": 2.5041263103485107 + }, + { + "auxiliary_loss_clip": 0.01096965, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.03396869, + "balance_loss_mlp": 1.01794171, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 2.260472586524025, + "language_loss": 0.75194371, + "learning_rate": 9.427348518535483e-08, + "loss": 0.7732203, + "num_input_tokens_seen": 324506620, + "step": 15049, + "time_per_iteration": 2.444995641708374 + }, + { + "auxiliary_loss_clip": 0.0109498, + "auxiliary_loss_mlp": 0.01029183, + "balance_loss_clip": 1.03713298, + "balance_loss_mlp": 1.01695836, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 1.7200179205747772, + "language_loss": 0.75551546, + "learning_rate": 9.415535861079993e-08, + "loss": 0.77675706, + "num_input_tokens_seen": 324525505, + "step": 15050, + "time_per_iteration": 2.5558297634124756 + }, + { + "auxiliary_loss_clip": 0.01108421, + "auxiliary_loss_mlp": 0.007773, + "balance_loss_clip": 1.03575587, + "balance_loss_mlp": 1.00058436, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 1.8559670457423738, + "language_loss": 0.81904793, + "learning_rate": 9.403730430606472e-08, + "loss": 0.83790517, + "num_input_tokens_seen": 324544415, + "step": 15051, + "time_per_iteration": 2.430994749069214 + }, + { + "auxiliary_loss_clip": 0.01099149, + "auxiliary_loss_mlp": 0.01026667, + "balance_loss_clip": 1.03746545, + "balance_loss_mlp": 1.01524079, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 1.9709751987664346, + "language_loss": 0.89273369, + "learning_rate": 9.391932227562582e-08, + "loss": 0.91399187, + "num_input_tokens_seen": 324562555, + "step": 15052, + "time_per_iteration": 2.45534348487854 + }, + { + "auxiliary_loss_clip": 0.01101273, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.03562498, + "balance_loss_mlp": 1.02144492, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 2.01477053178522, + "language_loss": 0.77530545, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79665554, + "num_input_tokens_seen": 324580865, + "step": 15053, + "time_per_iteration": 2.463916301727295 + }, + { + "auxiliary_loss_clip": 0.01095992, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.03631234, + "balance_loss_mlp": 1.02306402, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 1.6086388056347494, + "language_loss": 0.72901094, + "learning_rate": 9.368357505553049e-08, + "loss": 0.75032234, + "num_input_tokens_seen": 324600665, + "step": 15054, + "time_per_iteration": 2.498074769973755 + }, + { + "auxiliary_loss_clip": 0.01056019, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.03082061, + "balance_loss_mlp": 1.01974237, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 2.402972173461691, + "language_loss": 0.83491933, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85579967, + "num_input_tokens_seen": 324618145, + "step": 15055, + "time_per_iteration": 2.5687661170959473 + }, + { + "auxiliary_loss_clip": 0.01094316, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.03520751, + "balance_loss_mlp": 1.01964962, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 1.7732950676917467, + "language_loss": 0.85155427, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87281561, + "num_input_tokens_seen": 324638165, + "step": 15056, + "time_per_iteration": 2.4692251682281494 + }, + { + "auxiliary_loss_clip": 0.01081111, + "auxiliary_loss_mlp": 0.01029123, + "balance_loss_clip": 1.03575397, + "balance_loss_mlp": 1.01785779, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 1.946983444773047, + "language_loss": 0.72245717, + "learning_rate": 9.333049639436863e-08, + "loss": 0.74355948, + "num_input_tokens_seen": 324658560, + "step": 15057, + "time_per_iteration": 2.565526247024536 + }, + { + "auxiliary_loss_clip": 0.01094523, + "auxiliary_loss_mlp": 0.01028422, + "balance_loss_clip": 1.03358376, + "balance_loss_mlp": 1.01646566, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 1.7573044307690322, + "language_loss": 0.80842769, + "learning_rate": 9.321294810356418e-08, + "loss": 0.8296572, + "num_input_tokens_seen": 324679185, + "step": 15058, + "time_per_iteration": 3.9208872318267822 + }, + { + "auxiliary_loss_clip": 0.01018878, + "auxiliary_loss_mlp": 0.00998927, + "balance_loss_clip": 1.00638199, + "balance_loss_mlp": 0.99774665, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.6739719283313943, + "language_loss": 0.51385599, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53403401, + "num_input_tokens_seen": 324744830, + "step": 15059, + "time_per_iteration": 3.1358726024627686 + }, + { + "auxiliary_loss_clip": 0.01072243, + "auxiliary_loss_mlp": 0.01027778, + "balance_loss_clip": 1.03923273, + "balance_loss_mlp": 1.01508772, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 2.1309867655982444, + "language_loss": 0.6690222, + "learning_rate": 9.297806844307831e-08, + "loss": 0.69002247, + "num_input_tokens_seen": 324762905, + "step": 15060, + "time_per_iteration": 2.5597903728485107 + }, + { + "auxiliary_loss_clip": 0.01089206, + "auxiliary_loss_mlp": 0.01031229, + "balance_loss_clip": 1.03653562, + "balance_loss_mlp": 1.0188967, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 1.9868401282561436, + "language_loss": 0.64142376, + "learning_rate": 9.286073708230357e-08, + "loss": 0.66262811, + "num_input_tokens_seen": 324781905, + "step": 15061, + "time_per_iteration": 2.482172727584839 + }, + { + "auxiliary_loss_clip": 0.01084023, + "auxiliary_loss_mlp": 0.01038028, + "balance_loss_clip": 1.03480625, + "balance_loss_mlp": 1.02525449, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 2.2584671232574793, + "language_loss": 0.71614605, + "learning_rate": 9.274347804044058e-08, + "loss": 0.73736656, + "num_input_tokens_seen": 324799260, + "step": 15062, + "time_per_iteration": 2.4846291542053223 + }, + { + "auxiliary_loss_clip": 0.01105878, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.0349704, + "balance_loss_mlp": 1.01851296, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 1.541850177053073, + "language_loss": 0.70527822, + "learning_rate": 9.2626291321936e-08, + "loss": 0.72663689, + "num_input_tokens_seen": 324817800, + "step": 15063, + "time_per_iteration": 2.4111759662628174 + }, + { + "auxiliary_loss_clip": 0.01067936, + "auxiliary_loss_mlp": 0.01031084, + "balance_loss_clip": 1.03482449, + "balance_loss_mlp": 1.01927638, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 4.2618193005009495, + "language_loss": 0.72240549, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74339563, + "num_input_tokens_seen": 324838445, + "step": 15064, + "time_per_iteration": 2.6053466796875 + }, + { + "auxiliary_loss_clip": 0.01098075, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.03394985, + "balance_loss_mlp": 1.0185442, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 1.781211507610801, + "language_loss": 0.6929425, + "learning_rate": 9.23921348727752e-08, + "loss": 0.71423376, + "num_input_tokens_seen": 324859895, + "step": 15065, + "time_per_iteration": 2.530266046524048 + }, + { + "auxiliary_loss_clip": 0.01078396, + "auxiliary_loss_mlp": 0.0103624, + "balance_loss_clip": 1.03383994, + "balance_loss_mlp": 1.02373457, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 1.6404319588654848, + "language_loss": 0.63055336, + "learning_rate": 9.227516515099743e-08, + "loss": 0.65169966, + "num_input_tokens_seen": 324879580, + "step": 15066, + "time_per_iteration": 2.4836223125457764 + }, + { + "auxiliary_loss_clip": 0.01039279, + "auxiliary_loss_mlp": 0.01032071, + "balance_loss_clip": 1.02979922, + "balance_loss_mlp": 1.01852334, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 2.229633069598639, + "language_loss": 0.80138826, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82210171, + "num_input_tokens_seen": 324898950, + "step": 15067, + "time_per_iteration": 2.6259865760803223 + }, + { + "auxiliary_loss_clip": 0.01090561, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.03795886, + "balance_loss_mlp": 1.01906228, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 1.59484718703927, + "language_loss": 0.70099509, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72222126, + "num_input_tokens_seen": 324917455, + "step": 15068, + "time_per_iteration": 2.470008373260498 + }, + { + "auxiliary_loss_clip": 0.01104668, + "auxiliary_loss_mlp": 0.01027588, + "balance_loss_clip": 1.03448915, + "balance_loss_mlp": 1.01548815, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 1.9558110566887779, + "language_loss": 0.8530671, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87438965, + "num_input_tokens_seen": 324934495, + "step": 15069, + "time_per_iteration": 2.4387128353118896 + }, + { + "auxiliary_loss_clip": 0.01101161, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.03389263, + "balance_loss_mlp": 1.01901722, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 1.999983153626738, + "language_loss": 0.5927698, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61410415, + "num_input_tokens_seen": 324953230, + "step": 15070, + "time_per_iteration": 2.4830222129821777 + }, + { + "auxiliary_loss_clip": 0.0107787, + "auxiliary_loss_mlp": 0.01025069, + "balance_loss_clip": 1.03905988, + "balance_loss_mlp": 1.01194382, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 2.193884325161824, + "language_loss": 0.81564677, + "learning_rate": 9.169140174747724e-08, + "loss": 0.83667612, + "num_input_tokens_seen": 324969880, + "step": 15071, + "time_per_iteration": 4.018348455429077 + }, + { + "auxiliary_loss_clip": 0.01112252, + "auxiliary_loss_mlp": 0.01041693, + "balance_loss_clip": 1.03654814, + "balance_loss_mlp": 1.02834153, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 1.7992377016169672, + "language_loss": 0.61739564, + "learning_rate": 9.157486613883758e-08, + "loss": 0.63893509, + "num_input_tokens_seen": 324987005, + "step": 15072, + "time_per_iteration": 2.392092227935791 + }, + { + "auxiliary_loss_clip": 0.01087189, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.03422427, + "balance_loss_mlp": 1.02002954, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 1.9714223140170497, + "language_loss": 0.73377383, + "learning_rate": 9.145840289787021e-08, + "loss": 0.754969, + "num_input_tokens_seen": 325010700, + "step": 15073, + "time_per_iteration": 4.142418146133423 + }, + { + "auxiliary_loss_clip": 0.01094718, + "auxiliary_loss_mlp": 0.01026604, + "balance_loss_clip": 1.03598154, + "balance_loss_mlp": 1.01494503, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 3.921723061042667, + "language_loss": 0.81079912, + "learning_rate": 9.134201202899161e-08, + "loss": 0.83201236, + "num_input_tokens_seen": 325028760, + "step": 15074, + "time_per_iteration": 2.4424843788146973 + }, + { + "auxiliary_loss_clip": 0.00987923, + "auxiliary_loss_mlp": 0.00754866, + "balance_loss_clip": 1.01151729, + "balance_loss_mlp": 1.0001626, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.8896393040664297, + "language_loss": 0.52362388, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54105175, + "num_input_tokens_seen": 325093545, + "step": 15075, + "time_per_iteration": 3.365694284439087 + }, + { + "auxiliary_loss_clip": 0.01010847, + "auxiliary_loss_mlp": 0.01000866, + "balance_loss_clip": 1.02088487, + "balance_loss_mlp": 0.99971533, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 0.7266803307358832, + "language_loss": 0.62094855, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64106566, + "num_input_tokens_seen": 325152295, + "step": 15076, + "time_per_iteration": 3.7390408515930176 + }, + { + "auxiliary_loss_clip": 0.01095125, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.03418756, + "balance_loss_mlp": 1.02110124, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 2.0278180103219934, + "language_loss": 0.82557869, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84686011, + "num_input_tokens_seen": 325169705, + "step": 15077, + "time_per_iteration": 2.4684853553771973 + }, + { + "auxiliary_loss_clip": 0.01082542, + "auxiliary_loss_mlp": 0.00776813, + "balance_loss_clip": 1.03225422, + "balance_loss_mlp": 1.00049555, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 1.5045514296811546, + "language_loss": 0.84114981, + "learning_rate": 9.08771723625934e-08, + "loss": 0.85974336, + "num_input_tokens_seen": 325189175, + "step": 15078, + "time_per_iteration": 2.509917974472046 + }, + { + "auxiliary_loss_clip": 0.01092493, + "auxiliary_loss_mlp": 0.0077687, + "balance_loss_clip": 1.03611374, + "balance_loss_mlp": 1.0004667, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 4.1369133470202915, + "language_loss": 0.65257084, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67126447, + "num_input_tokens_seen": 325211020, + "step": 15079, + "time_per_iteration": 2.6493353843688965 + }, + { + "auxiliary_loss_clip": 0.01031823, + "auxiliary_loss_mlp": 0.01029076, + "balance_loss_clip": 1.03201318, + "balance_loss_mlp": 1.01707768, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 1.6907300164969674, + "language_loss": 0.71075159, + "learning_rate": 9.064518687654765e-08, + "loss": 0.73136061, + "num_input_tokens_seen": 325236970, + "step": 15080, + "time_per_iteration": 2.891134023666382 + }, + { + "auxiliary_loss_clip": 0.01096838, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.0386076, + "balance_loss_mlp": 1.01762915, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 2.199984511439112, + "language_loss": 0.70584786, + "learning_rate": 9.052930273571547e-08, + "loss": 0.72711849, + "num_input_tokens_seen": 325252670, + "step": 15081, + "time_per_iteration": 2.4419138431549072 + }, + { + "auxiliary_loss_clip": 0.01082817, + "auxiliary_loss_mlp": 0.01032437, + "balance_loss_clip": 1.0381242, + "balance_loss_mlp": 1.02036679, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 1.817955262013025, + "language_loss": 0.74358535, + "learning_rate": 9.04134910022032e-08, + "loss": 0.7647379, + "num_input_tokens_seen": 325273860, + "step": 15082, + "time_per_iteration": 2.531855344772339 + }, + { + "auxiliary_loss_clip": 0.01074178, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.0363065, + "balance_loss_mlp": 1.02120256, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 2.577643232754893, + "language_loss": 0.78126884, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80233777, + "num_input_tokens_seen": 325294140, + "step": 15083, + "time_per_iteration": 2.5840375423431396 + }, + { + "auxiliary_loss_clip": 0.01082842, + "auxiliary_loss_mlp": 0.00776159, + "balance_loss_clip": 1.03713655, + "balance_loss_mlp": 1.00055313, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 2.2743239664040913, + "language_loss": 0.68650317, + "learning_rate": 9.01820847747028e-08, + "loss": 0.7050932, + "num_input_tokens_seen": 325313130, + "step": 15084, + "time_per_iteration": 4.464092969894409 + }, + { + "auxiliary_loss_clip": 0.01108336, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.03717661, + "balance_loss_mlp": 1.02043533, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 2.4191694476041703, + "language_loss": 0.66876709, + "learning_rate": 9.006649028948965e-08, + "loss": 0.69017529, + "num_input_tokens_seen": 325334880, + "step": 15085, + "time_per_iteration": 2.494722366333008 + }, + { + "auxiliary_loss_clip": 0.01009468, + "auxiliary_loss_mlp": 0.01007213, + "balance_loss_clip": 1.01869452, + "balance_loss_mlp": 1.00572932, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.7815369667451877, + "language_loss": 0.61285686, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63302368, + "num_input_tokens_seen": 325394175, + "step": 15086, + "time_per_iteration": 3.174633026123047 + }, + { + "auxiliary_loss_clip": 0.01093429, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.03322494, + "balance_loss_mlp": 1.0253278, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 1.4805956687328135, + "language_loss": 0.72342062, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74473774, + "num_input_tokens_seen": 325415020, + "step": 15087, + "time_per_iteration": 2.4624083042144775 + }, + { + "auxiliary_loss_clip": 0.01086197, + "auxiliary_loss_mlp": 0.01028984, + "balance_loss_clip": 1.03445709, + "balance_loss_mlp": 1.01717615, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 2.3022166106268807, + "language_loss": 0.76821876, + "learning_rate": 8.972014140059058e-08, + "loss": 0.78937054, + "num_input_tokens_seen": 325433595, + "step": 15088, + "time_per_iteration": 2.4790661334991455 + }, + { + "auxiliary_loss_clip": 0.01078415, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.03739738, + "balance_loss_mlp": 1.01783776, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 1.9862484714228992, + "language_loss": 0.73074859, + "learning_rate": 8.960483664113038e-08, + "loss": 0.75182849, + "num_input_tokens_seen": 325451605, + "step": 15089, + "time_per_iteration": 2.5141143798828125 + }, + { + "auxiliary_loss_clip": 0.01101481, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.0346241, + "balance_loss_mlp": 1.0193615, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 1.8163664910238941, + "language_loss": 0.75533104, + "learning_rate": 8.948960432404628e-08, + "loss": 0.7766521, + "num_input_tokens_seen": 325470645, + "step": 15090, + "time_per_iteration": 2.481153964996338 + }, + { + "auxiliary_loss_clip": 0.01082269, + "auxiliary_loss_mlp": 0.01028017, + "balance_loss_clip": 1.03448296, + "balance_loss_mlp": 1.01479721, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 7.341301185559, + "language_loss": 0.77788389, + "learning_rate": 8.93744444537079e-08, + "loss": 0.79898673, + "num_input_tokens_seen": 325488070, + "step": 15091, + "time_per_iteration": 2.507237434387207 + }, + { + "auxiliary_loss_clip": 0.01079398, + "auxiliary_loss_mlp": 0.01024308, + "balance_loss_clip": 1.03141856, + "balance_loss_mlp": 1.01348364, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 6.0296770922706004, + "language_loss": 0.86226404, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88330108, + "num_input_tokens_seen": 325509285, + "step": 15092, + "time_per_iteration": 2.5126214027404785 + }, + { + "auxiliary_loss_clip": 0.01085497, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.03777957, + "balance_loss_mlp": 1.01813173, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 1.5625034157781126, + "language_loss": 0.7894609, + "learning_rate": 8.914434207073296e-08, + "loss": 0.81061947, + "num_input_tokens_seen": 325529360, + "step": 15093, + "time_per_iteration": 2.5330965518951416 + }, + { + "auxiliary_loss_clip": 0.01018669, + "auxiliary_loss_mlp": 0.01001601, + "balance_loss_clip": 1.00419223, + "balance_loss_mlp": 1.00037313, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 0.7370869231718794, + "language_loss": 0.57006156, + "learning_rate": 8.902939956682188e-08, + "loss": 0.59026432, + "num_input_tokens_seen": 325583565, + "step": 15094, + "time_per_iteration": 2.9761180877685547 + }, + { + "auxiliary_loss_clip": 0.01099391, + "auxiliary_loss_mlp": 0.01031517, + "balance_loss_clip": 1.03475523, + "balance_loss_mlp": 1.01836848, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 3.0187622429308782, + "language_loss": 0.71521676, + "learning_rate": 8.891452952710742e-08, + "loss": 0.73652577, + "num_input_tokens_seen": 325603690, + "step": 15095, + "time_per_iteration": 2.457981586456299 + }, + { + "auxiliary_loss_clip": 0.01066935, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.03377414, + "balance_loss_mlp": 1.02218461, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 2.0126451517561943, + "language_loss": 0.74326122, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76427388, + "num_input_tokens_seen": 325622255, + "step": 15096, + "time_per_iteration": 2.5327768325805664 + }, + { + "auxiliary_loss_clip": 0.01109708, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.03586531, + "balance_loss_mlp": 1.0228219, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 1.9469780714581215, + "language_loss": 0.5715853, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59304649, + "num_input_tokens_seen": 325640165, + "step": 15097, + "time_per_iteration": 4.364338159561157 + }, + { + "auxiliary_loss_clip": 0.01086366, + "auxiliary_loss_mlp": 0.01024406, + "balance_loss_clip": 1.03261638, + "balance_loss_mlp": 1.01340258, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 1.5750832048244086, + "language_loss": 0.79474849, + "learning_rate": 8.857035423668935e-08, + "loss": 0.81585622, + "num_input_tokens_seen": 325659455, + "step": 15098, + "time_per_iteration": 2.477220058441162 + }, + { + "auxiliary_loss_clip": 0.0106742, + "auxiliary_loss_mlp": 0.00779624, + "balance_loss_clip": 1.03414893, + "balance_loss_mlp": 1.00062275, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 1.7864033733090985, + "language_loss": 0.66135824, + "learning_rate": 8.845577409729266e-08, + "loss": 0.67982864, + "num_input_tokens_seen": 325678095, + "step": 15099, + "time_per_iteration": 2.6370747089385986 + }, + { + "auxiliary_loss_clip": 0.01088547, + "auxiliary_loss_mlp": 0.01032253, + "balance_loss_clip": 1.03499174, + "balance_loss_mlp": 1.01987946, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 2.4629433066467206, + "language_loss": 0.7000109, + "learning_rate": 8.834126644384477e-08, + "loss": 0.72121894, + "num_input_tokens_seen": 325695825, + "step": 15100, + "time_per_iteration": 2.5100293159484863 + }, + { + "auxiliary_loss_clip": 0.01017746, + "auxiliary_loss_mlp": 0.01001984, + "balance_loss_clip": 1.00411797, + "balance_loss_mlp": 1.0007267, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.6225171332946645, + "language_loss": 0.53422379, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55442119, + "num_input_tokens_seen": 325764515, + "step": 15101, + "time_per_iteration": 3.098710298538208 + }, + { + "auxiliary_loss_clip": 0.01075742, + "auxiliary_loss_mlp": 0.01028683, + "balance_loss_clip": 1.034657, + "balance_loss_mlp": 1.01613057, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 3.1189550044796834, + "language_loss": 0.6818949, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70293915, + "num_input_tokens_seen": 325783235, + "step": 15102, + "time_per_iteration": 2.574235439300537 + }, + { + "auxiliary_loss_clip": 0.01093694, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.03392911, + "balance_loss_mlp": 1.01806569, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 2.657051306854493, + "language_loss": 0.78852856, + "learning_rate": 8.799817844260049e-08, + "loss": 0.80976367, + "num_input_tokens_seen": 325800195, + "step": 15103, + "time_per_iteration": 2.4692251682281494 + }, + { + "auxiliary_loss_clip": 0.01081492, + "auxiliary_loss_mlp": 0.01030763, + "balance_loss_clip": 1.0333885, + "balance_loss_mlp": 1.01845455, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 1.6786171857018317, + "language_loss": 0.7230804, + "learning_rate": 8.78839607763413e-08, + "loss": 0.74420291, + "num_input_tokens_seen": 325820215, + "step": 15104, + "time_per_iteration": 2.554072141647339 + }, + { + "auxiliary_loss_clip": 0.01085072, + "auxiliary_loss_mlp": 0.01027485, + "balance_loss_clip": 1.03516245, + "balance_loss_mlp": 1.01652968, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 1.74075481483346, + "language_loss": 0.77227908, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79340464, + "num_input_tokens_seen": 325838415, + "step": 15105, + "time_per_iteration": 2.528047800064087 + }, + { + "auxiliary_loss_clip": 0.01105611, + "auxiliary_loss_mlp": 0.00778271, + "balance_loss_clip": 1.03370965, + "balance_loss_mlp": 1.00060332, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 1.9858694773792944, + "language_loss": 0.74138254, + "learning_rate": 8.765574297104628e-08, + "loss": 0.76022136, + "num_input_tokens_seen": 325855580, + "step": 15106, + "time_per_iteration": 2.4505674839019775 + }, + { + "auxiliary_loss_clip": 0.01062601, + "auxiliary_loss_mlp": 0.0103819, + "balance_loss_clip": 1.02943563, + "balance_loss_mlp": 1.02473676, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 1.76701290823139, + "language_loss": 0.80264395, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82365179, + "num_input_tokens_seen": 325874890, + "step": 15107, + "time_per_iteration": 2.5800366401672363 + }, + { + "auxiliary_loss_clip": 0.01015424, + "auxiliary_loss_mlp": 0.00999666, + "balance_loss_clip": 1.01654458, + "balance_loss_mlp": 0.99859351, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8116700535006551, + "language_loss": 0.59658498, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61673588, + "num_input_tokens_seen": 325935835, + "step": 15108, + "time_per_iteration": 3.0514495372772217 + }, + { + "auxiliary_loss_clip": 0.01085135, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.03342867, + "balance_loss_mlp": 1.01350951, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 4.16256422052927, + "language_loss": 0.73384285, + "learning_rate": 8.73139601460482e-08, + "loss": 0.75495386, + "num_input_tokens_seen": 325958035, + "step": 15109, + "time_per_iteration": 2.593003988265991 + }, + { + "auxiliary_loss_clip": 0.01075504, + "auxiliary_loss_mlp": 0.01028523, + "balance_loss_clip": 1.0352242, + "balance_loss_mlp": 1.01709712, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 1.8822548964266423, + "language_loss": 0.71458918, + "learning_rate": 8.720017759045073e-08, + "loss": 0.7356295, + "num_input_tokens_seen": 325979870, + "step": 15110, + "time_per_iteration": 4.115428686141968 + }, + { + "auxiliary_loss_clip": 0.01079726, + "auxiliary_loss_mlp": 0.01035251, + "balance_loss_clip": 1.03072333, + "balance_loss_mlp": 1.02225137, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 1.9800030236902457, + "language_loss": 0.68889725, + "learning_rate": 8.708646756841421e-08, + "loss": 0.71004707, + "num_input_tokens_seen": 325998245, + "step": 15111, + "time_per_iteration": 2.5683841705322266 + }, + { + "auxiliary_loss_clip": 0.00999739, + "auxiliary_loss_mlp": 0.01003246, + "balance_loss_clip": 1.0057081, + "balance_loss_mlp": 1.00208378, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.7049609161427235, + "language_loss": 0.51665723, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53668702, + "num_input_tokens_seen": 326061770, + "step": 15112, + "time_per_iteration": 4.530330419540405 + }, + { + "auxiliary_loss_clip": 0.01097434, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.03376663, + "balance_loss_mlp": 1.02138543, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 1.7766982212388414, + "language_loss": 0.69790626, + "learning_rate": 8.685926514226837e-08, + "loss": 0.71921432, + "num_input_tokens_seen": 326080945, + "step": 15113, + "time_per_iteration": 2.4486405849456787 + }, + { + "auxiliary_loss_clip": 0.01098467, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.03780484, + "balance_loss_mlp": 1.0182395, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 2.2175763202947323, + "language_loss": 0.79121733, + "learning_rate": 8.674577274677508e-08, + "loss": 0.81249833, + "num_input_tokens_seen": 326100630, + "step": 15114, + "time_per_iteration": 2.5565361976623535 + }, + { + "auxiliary_loss_clip": 0.01071123, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.0367732, + "balance_loss_mlp": 1.01884389, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 2.0988203176441305, + "language_loss": 0.7022028, + "learning_rate": 8.663235290207405e-08, + "loss": 0.72324109, + "num_input_tokens_seen": 326120145, + "step": 15115, + "time_per_iteration": 2.5759449005126953 + }, + { + "auxiliary_loss_clip": 0.01087793, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.03893781, + "balance_loss_mlp": 1.0153141, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 1.5752879517458094, + "language_loss": 0.65946233, + "learning_rate": 8.651900561246561e-08, + "loss": 0.68062866, + "num_input_tokens_seen": 326140715, + "step": 15116, + "time_per_iteration": 2.5603699684143066 + }, + { + "auxiliary_loss_clip": 0.01107018, + "auxiliary_loss_mlp": 0.01034175, + "balance_loss_clip": 1.03730667, + "balance_loss_mlp": 1.02141929, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 1.7129374088143157, + "language_loss": 0.69527519, + "learning_rate": 8.640573088224812e-08, + "loss": 0.71668708, + "num_input_tokens_seen": 326159130, + "step": 15117, + "time_per_iteration": 2.436314821243286 + }, + { + "auxiliary_loss_clip": 0.01066948, + "auxiliary_loss_mlp": 0.01026092, + "balance_loss_clip": 1.03580832, + "balance_loss_mlp": 1.01454091, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 1.4404420900994306, + "language_loss": 0.74677145, + "learning_rate": 8.629252871571745e-08, + "loss": 0.76770186, + "num_input_tokens_seen": 326181375, + "step": 15118, + "time_per_iteration": 2.5779218673706055 + }, + { + "auxiliary_loss_clip": 0.01084513, + "auxiliary_loss_mlp": 0.01036904, + "balance_loss_clip": 1.0339576, + "balance_loss_mlp": 1.02297413, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 2.5420694572115994, + "language_loss": 0.73327708, + "learning_rate": 8.617939911716554e-08, + "loss": 0.75449127, + "num_input_tokens_seen": 326199740, + "step": 15119, + "time_per_iteration": 2.509202480316162 + }, + { + "auxiliary_loss_clip": 0.01075321, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.03590834, + "balance_loss_mlp": 1.02076364, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 2.503347683027668, + "language_loss": 0.71373761, + "learning_rate": 8.60663420908827e-08, + "loss": 0.73484236, + "num_input_tokens_seen": 326214350, + "step": 15120, + "time_per_iteration": 2.5034661293029785 + }, + { + "auxiliary_loss_clip": 0.01109397, + "auxiliary_loss_mlp": 0.00777985, + "balance_loss_clip": 1.03645015, + "balance_loss_mlp": 1.00058091, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 2.175646333296631, + "language_loss": 0.65840209, + "learning_rate": 8.595335764115596e-08, + "loss": 0.6772759, + "num_input_tokens_seen": 326234580, + "step": 15121, + "time_per_iteration": 2.4597811698913574 + }, + { + "auxiliary_loss_clip": 0.01098654, + "auxiliary_loss_mlp": 0.01039878, + "balance_loss_clip": 1.03582704, + "balance_loss_mlp": 1.02731967, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 1.7311137254508966, + "language_loss": 0.70072579, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72211111, + "num_input_tokens_seen": 326259080, + "step": 15122, + "time_per_iteration": 2.726898193359375 + }, + { + "auxiliary_loss_clip": 0.01059007, + "auxiliary_loss_mlp": 0.01033316, + "balance_loss_clip": 1.03582859, + "balance_loss_mlp": 1.0212816, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 1.4629074241221565, + "language_loss": 0.74734735, + "learning_rate": 8.572760648850575e-08, + "loss": 0.76827061, + "num_input_tokens_seen": 326280175, + "step": 15123, + "time_per_iteration": 4.167697906494141 + }, + { + "auxiliary_loss_clip": 0.01094834, + "auxiliary_loss_mlp": 0.01033276, + "balance_loss_clip": 1.03576016, + "balance_loss_mlp": 1.02149189, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 1.919844018645518, + "language_loss": 0.76195121, + "learning_rate": 8.561483979414253e-08, + "loss": 0.78323221, + "num_input_tokens_seen": 326297990, + "step": 15124, + "time_per_iteration": 2.5470027923583984 + }, + { + "auxiliary_loss_clip": 0.01091879, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.03394115, + "balance_loss_mlp": 1.01947451, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 2.327612230247668, + "language_loss": 0.72157729, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74282098, + "num_input_tokens_seen": 326316735, + "step": 15125, + "time_per_iteration": 2.4726693630218506 + }, + { + "auxiliary_loss_clip": 0.01068985, + "auxiliary_loss_mlp": 0.01034874, + "balance_loss_clip": 1.03680634, + "balance_loss_mlp": 1.02275014, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 2.5499592464527465, + "language_loss": 0.79163182, + "learning_rate": 8.538952419072143e-08, + "loss": 0.81267035, + "num_input_tokens_seen": 326334370, + "step": 15126, + "time_per_iteration": 2.5147433280944824 + }, + { + "auxiliary_loss_clip": 0.01066395, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.0366466, + "balance_loss_mlp": 1.02121854, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 1.5847954141192155, + "language_loss": 0.75607789, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77708089, + "num_input_tokens_seen": 326353435, + "step": 15127, + "time_per_iteration": 2.636127471923828 + }, + { + "auxiliary_loss_clip": 0.01032404, + "auxiliary_loss_mlp": 0.01036686, + "balance_loss_clip": 1.03248096, + "balance_loss_mlp": 1.02428198, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 1.8982350203987666, + "language_loss": 0.62492144, + "learning_rate": 8.516449899618173e-08, + "loss": 0.64561236, + "num_input_tokens_seen": 326371810, + "step": 15128, + "time_per_iteration": 2.6358911991119385 + }, + { + "auxiliary_loss_clip": 0.01074277, + "auxiliary_loss_mlp": 0.01024182, + "balance_loss_clip": 1.03423858, + "balance_loss_mlp": 1.01259518, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 1.5585324017839066, + "language_loss": 0.7669723, + "learning_rate": 8.505209531291013e-08, + "loss": 0.78795683, + "num_input_tokens_seen": 326391380, + "step": 15129, + "time_per_iteration": 2.533536911010742 + }, + { + "auxiliary_loss_clip": 0.01097211, + "auxiliary_loss_mlp": 0.0102577, + "balance_loss_clip": 1.03406858, + "balance_loss_mlp": 1.01363444, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 2.1823222474451396, + "language_loss": 0.83717889, + "learning_rate": 8.49397642446552e-08, + "loss": 0.85840869, + "num_input_tokens_seen": 326408800, + "step": 15130, + "time_per_iteration": 2.483489751815796 + }, + { + "auxiliary_loss_clip": 0.01087396, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.03478765, + "balance_loss_mlp": 1.02063322, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 1.873504879067482, + "language_loss": 0.74899113, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77019423, + "num_input_tokens_seen": 326431565, + "step": 15131, + "time_per_iteration": 2.6575253009796143 + }, + { + "auxiliary_loss_clip": 0.01083303, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.0348587, + "balance_loss_mlp": 1.0211314, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 1.8160938257411727, + "language_loss": 0.5973419, + "learning_rate": 8.471531997023085e-08, + "loss": 0.61851567, + "num_input_tokens_seen": 326451715, + "step": 15132, + "time_per_iteration": 2.5971455574035645 + }, + { + "auxiliary_loss_clip": 0.01068245, + "auxiliary_loss_mlp": 0.01030331, + "balance_loss_clip": 1.0364877, + "balance_loss_mlp": 1.01888108, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 1.5291380951753708, + "language_loss": 0.82543868, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84642446, + "num_input_tokens_seen": 326470855, + "step": 15133, + "time_per_iteration": 2.534975290298462 + }, + { + "auxiliary_loss_clip": 0.01086174, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.0333426, + "balance_loss_mlp": 1.01991916, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 1.5731879647828808, + "language_loss": 0.73916805, + "learning_rate": 8.449116620695118e-08, + "loss": 0.76035303, + "num_input_tokens_seen": 326490480, + "step": 15134, + "time_per_iteration": 2.551917552947998 + }, + { + "auxiliary_loss_clip": 0.01086386, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.04215384, + "balance_loss_mlp": 1.02171695, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 1.5076263575061597, + "language_loss": 0.72683287, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74803907, + "num_input_tokens_seen": 326509445, + "step": 15135, + "time_per_iteration": 2.5762076377868652 + }, + { + "auxiliary_loss_clip": 0.01097046, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.03705883, + "balance_loss_mlp": 1.01867044, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 1.8399094260339, + "language_loss": 0.69957578, + "learning_rate": 8.426730298881702e-08, + "loss": 0.72084814, + "num_input_tokens_seen": 326528380, + "step": 15136, + "time_per_iteration": 4.012638807296753 + }, + { + "auxiliary_loss_clip": 0.00993397, + "auxiliary_loss_mlp": 0.01005685, + "balance_loss_clip": 1.00742924, + "balance_loss_mlp": 1.00461245, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.8197604054258827, + "language_loss": 0.59289467, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61288548, + "num_input_tokens_seen": 326576940, + "step": 15137, + "time_per_iteration": 2.8617188930511475 + }, + { + "auxiliary_loss_clip": 0.01098784, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.03612041, + "balance_loss_mlp": 1.02377152, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 1.6928046381231097, + "language_loss": 0.8258332, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84717953, + "num_input_tokens_seen": 326596100, + "step": 15138, + "time_per_iteration": 2.5080699920654297 + }, + { + "auxiliary_loss_clip": 0.01094686, + "auxiliary_loss_mlp": 0.01026367, + "balance_loss_clip": 1.03730476, + "balance_loss_mlp": 1.01513755, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 1.5735182823501126, + "language_loss": 0.81217301, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83338356, + "num_input_tokens_seen": 326615700, + "step": 15139, + "time_per_iteration": 2.5218210220336914 + }, + { + "auxiliary_loss_clip": 0.01076595, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.03669834, + "balance_loss_mlp": 1.022578, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 1.6963907221947998, + "language_loss": 0.77678406, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79789054, + "num_input_tokens_seen": 326635905, + "step": 15140, + "time_per_iteration": 2.5259578227996826 + }, + { + "auxiliary_loss_clip": 0.01106464, + "auxiliary_loss_mlp": 0.01027352, + "balance_loss_clip": 1.03516626, + "balance_loss_mlp": 1.01580679, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 1.7053281870181172, + "language_loss": 0.66508996, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68642807, + "num_input_tokens_seen": 326661855, + "step": 15141, + "time_per_iteration": 2.5701913833618164 + }, + { + "auxiliary_loss_clip": 0.01098019, + "auxiliary_loss_mlp": 0.01034401, + "balance_loss_clip": 1.03546095, + "balance_loss_mlp": 1.02224791, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 1.86033702688652, + "language_loss": 0.75089276, + "learning_rate": 8.359745694462005e-08, + "loss": 0.77221704, + "num_input_tokens_seen": 326679320, + "step": 15142, + "time_per_iteration": 2.4681968688964844 + }, + { + "auxiliary_loss_clip": 0.01070436, + "auxiliary_loss_mlp": 0.01036498, + "balance_loss_clip": 1.03098273, + "balance_loss_mlp": 1.02412391, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 1.9747365886759258, + "language_loss": 0.64292103, + "learning_rate": 8.348607025820076e-08, + "loss": 0.66399038, + "num_input_tokens_seen": 326698110, + "step": 15143, + "time_per_iteration": 2.5003156661987305 + }, + { + "auxiliary_loss_clip": 0.01109022, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.03527784, + "balance_loss_mlp": 1.02064157, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 1.9192458046906176, + "language_loss": 0.61258328, + "learning_rate": 8.337475624618152e-08, + "loss": 0.63400722, + "num_input_tokens_seen": 326718370, + "step": 15144, + "time_per_iteration": 2.5198590755462646 + }, + { + "auxiliary_loss_clip": 0.01068054, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.03071642, + "balance_loss_mlp": 1.01730418, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 1.6405383662156579, + "language_loss": 0.70755172, + "learning_rate": 8.326351491278382e-08, + "loss": 0.72852516, + "num_input_tokens_seen": 326738445, + "step": 15145, + "time_per_iteration": 2.6137564182281494 + }, + { + "auxiliary_loss_clip": 0.01053839, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.03681374, + "balance_loss_mlp": 1.01692998, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 1.5033488933236954, + "language_loss": 0.70654303, + "learning_rate": 8.315234626222545e-08, + "loss": 0.72736776, + "num_input_tokens_seen": 326758855, + "step": 15146, + "time_per_iteration": 2.659705400466919 + }, + { + "auxiliary_loss_clip": 0.01086622, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.0343045, + "balance_loss_mlp": 1.02018917, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 2.1302660805669165, + "language_loss": 0.72693598, + "learning_rate": 8.304125029872233e-08, + "loss": 0.74811888, + "num_input_tokens_seen": 326777140, + "step": 15147, + "time_per_iteration": 2.5336899757385254 + }, + { + "auxiliary_loss_clip": 0.01080887, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.03649688, + "balance_loss_mlp": 1.01795983, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 1.934344899561876, + "language_loss": 0.80151594, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82262182, + "num_input_tokens_seen": 326794070, + "step": 15148, + "time_per_iteration": 2.518200635910034 + }, + { + "auxiliary_loss_clip": 0.01075708, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.03448963, + "balance_loss_mlp": 1.02525377, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 2.0443170189826723, + "language_loss": 0.67767072, + "learning_rate": 8.281927644972996e-08, + "loss": 0.69880217, + "num_input_tokens_seen": 326814695, + "step": 15149, + "time_per_iteration": 4.087657690048218 + }, + { + "auxiliary_loss_clip": 0.01108184, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.03648257, + "balance_loss_mlp": 1.01650953, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 3.9303466783834238, + "language_loss": 0.6332764, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65465039, + "num_input_tokens_seen": 326835295, + "step": 15150, + "time_per_iteration": 2.4906692504882812 + }, + { + "auxiliary_loss_clip": 0.01067068, + "auxiliary_loss_mlp": 0.01034151, + "balance_loss_clip": 1.03453314, + "balance_loss_mlp": 1.02169967, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 1.909207512570382, + "language_loss": 0.72315294, + "learning_rate": 8.259759339947514e-08, + "loss": 0.74416512, + "num_input_tokens_seen": 326853350, + "step": 15151, + "time_per_iteration": 3.954902410507202 + }, + { + "auxiliary_loss_clip": 0.01095595, + "auxiliary_loss_mlp": 0.01026602, + "balance_loss_clip": 1.03579617, + "balance_loss_mlp": 1.01486623, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1.803020538308117, + "language_loss": 0.64644367, + "learning_rate": 8.248686093438429e-08, + "loss": 0.6676656, + "num_input_tokens_seen": 326873425, + "step": 15152, + "time_per_iteration": 2.507982015609741 + }, + { + "auxiliary_loss_clip": 0.0108838, + "auxiliary_loss_mlp": 0.00778016, + "balance_loss_clip": 1.03606081, + "balance_loss_mlp": 1.00061822, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 1.9408442037321099, + "language_loss": 0.73442268, + "learning_rate": 8.23762011815834e-08, + "loss": 0.75308669, + "num_input_tokens_seen": 326893455, + "step": 15153, + "time_per_iteration": 2.518634080886841 + }, + { + "auxiliary_loss_clip": 0.01069199, + "auxiliary_loss_mlp": 0.01044876, + "balance_loss_clip": 1.03119397, + "balance_loss_mlp": 1.03057098, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 1.9846253870091228, + "language_loss": 0.72217315, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74331391, + "num_input_tokens_seen": 326910210, + "step": 15154, + "time_per_iteration": 2.486862897872925 + }, + { + "auxiliary_loss_clip": 0.01086667, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.03828907, + "balance_loss_mlp": 1.0206902, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 2.2724245247122026, + "language_loss": 0.82532954, + "learning_rate": 8.215509982963564e-08, + "loss": 0.84651828, + "num_input_tokens_seen": 326929350, + "step": 15155, + "time_per_iteration": 2.487100601196289 + }, + { + "auxiliary_loss_clip": 0.01094755, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.03711486, + "balance_loss_mlp": 1.01818073, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 1.506953603993499, + "language_loss": 0.59806383, + "learning_rate": 8.204465823887252e-08, + "loss": 0.61931932, + "num_input_tokens_seen": 326949060, + "step": 15156, + "time_per_iteration": 2.4496397972106934 + }, + { + "auxiliary_loss_clip": 0.01098815, + "auxiliary_loss_mlp": 0.01029955, + "balance_loss_clip": 1.03456461, + "balance_loss_mlp": 1.01749134, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 2.182587433966662, + "language_loss": 0.74015957, + "learning_rate": 8.193428937716796e-08, + "loss": 0.76144725, + "num_input_tokens_seen": 326968950, + "step": 15157, + "time_per_iteration": 2.4964191913604736 + }, + { + "auxiliary_loss_clip": 0.01065182, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.03211212, + "balance_loss_mlp": 1.02001405, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 1.5962938599244632, + "language_loss": 0.59439445, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61535615, + "num_input_tokens_seen": 326989455, + "step": 15158, + "time_per_iteration": 2.695765495300293 + }, + { + "auxiliary_loss_clip": 0.01054837, + "auxiliary_loss_mlp": 0.01033253, + "balance_loss_clip": 1.04001415, + "balance_loss_mlp": 1.02213669, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 1.5607055407321333, + "language_loss": 0.68007648, + "learning_rate": 8.171376985767375e-08, + "loss": 0.70095742, + "num_input_tokens_seen": 327009640, + "step": 15159, + "time_per_iteration": 2.6446783542633057 + }, + { + "auxiliary_loss_clip": 0.01088037, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.03530359, + "balance_loss_mlp": 1.01699817, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 1.9130566076181965, + "language_loss": 0.78575182, + "learning_rate": 8.160361920824588e-08, + "loss": 0.80692321, + "num_input_tokens_seen": 327027690, + "step": 15160, + "time_per_iteration": 2.5318315029144287 + }, + { + "auxiliary_loss_clip": 0.01110535, + "auxiliary_loss_mlp": 0.01027982, + "balance_loss_clip": 1.03778815, + "balance_loss_mlp": 1.01462412, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 1.8040724907642516, + "language_loss": 0.68701321, + "learning_rate": 8.149354130460073e-08, + "loss": 0.7083984, + "num_input_tokens_seen": 327045915, + "step": 15161, + "time_per_iteration": 2.3954317569732666 + }, + { + "auxiliary_loss_clip": 0.01065288, + "auxiliary_loss_mlp": 0.01040297, + "balance_loss_clip": 1.03444254, + "balance_loss_mlp": 1.02610493, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 1.9499394894066966, + "language_loss": 0.76311982, + "learning_rate": 8.138353615091321e-08, + "loss": 0.78417575, + "num_input_tokens_seen": 327066355, + "step": 15162, + "time_per_iteration": 4.196109056472778 + }, + { + "auxiliary_loss_clip": 0.01082547, + "auxiliary_loss_mlp": 0.01034365, + "balance_loss_clip": 1.03611803, + "balance_loss_mlp": 1.02247381, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 1.9074489850640914, + "language_loss": 0.66578037, + "learning_rate": 8.127360375135395e-08, + "loss": 0.68694949, + "num_input_tokens_seen": 327086735, + "step": 15163, + "time_per_iteration": 2.5320420265197754 + }, + { + "auxiliary_loss_clip": 0.01068608, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.03545868, + "balance_loss_mlp": 1.02038407, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 2.0303337249475324, + "language_loss": 0.70647389, + "learning_rate": 8.116374411009186e-08, + "loss": 0.72749037, + "num_input_tokens_seen": 327104035, + "step": 15164, + "time_per_iteration": 2.539942502975464 + }, + { + "auxiliary_loss_clip": 0.01106361, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.03791058, + "balance_loss_mlp": 1.02102435, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 1.5107135206452451, + "language_loss": 0.75942934, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78082192, + "num_input_tokens_seen": 327124370, + "step": 15165, + "time_per_iteration": 2.4248642921447754 + }, + { + "auxiliary_loss_clip": 0.01093936, + "auxiliary_loss_mlp": 0.01034572, + "balance_loss_clip": 1.0343504, + "balance_loss_mlp": 1.02186394, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 2.1712550442701524, + "language_loss": 0.72877407, + "learning_rate": 8.094424311912074e-08, + "loss": 0.75005919, + "num_input_tokens_seen": 327140915, + "step": 15166, + "time_per_iteration": 2.4647130966186523 + }, + { + "auxiliary_loss_clip": 0.01061977, + "auxiliary_loss_mlp": 0.01034773, + "balance_loss_clip": 1.034073, + "balance_loss_mlp": 1.02145171, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 4.033803700518343, + "language_loss": 0.72907168, + "learning_rate": 8.083460177773482e-08, + "loss": 0.75003916, + "num_input_tokens_seen": 327158940, + "step": 15167, + "time_per_iteration": 2.575835943222046 + }, + { + "auxiliary_loss_clip": 0.01013489, + "auxiliary_loss_mlp": 0.01001244, + "balance_loss_clip": 1.00784886, + "balance_loss_mlp": 0.99993247, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.7732280066263209, + "language_loss": 0.65562773, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67577505, + "num_input_tokens_seen": 327217450, + "step": 15168, + "time_per_iteration": 3.052065134048462 + }, + { + "auxiliary_loss_clip": 0.01078148, + "auxiliary_loss_mlp": 0.01030412, + "balance_loss_clip": 1.03594601, + "balance_loss_mlp": 1.01896191, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 1.921516415450786, + "language_loss": 0.77982491, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80091053, + "num_input_tokens_seen": 327233905, + "step": 15169, + "time_per_iteration": 2.4744443893432617 + }, + { + "auxiliary_loss_clip": 0.01097726, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.03653026, + "balance_loss_mlp": 1.01604033, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 1.9463077948919427, + "language_loss": 0.82095933, + "learning_rate": 8.05061144198591e-08, + "loss": 0.84221703, + "num_input_tokens_seen": 327252430, + "step": 15170, + "time_per_iteration": 2.455413818359375 + }, + { + "auxiliary_loss_clip": 0.01099735, + "auxiliary_loss_mlp": 0.01031517, + "balance_loss_clip": 1.03755701, + "balance_loss_mlp": 1.01879144, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 1.9622632368658344, + "language_loss": 0.77275497, + "learning_rate": 8.039676420316799e-08, + "loss": 0.7940675, + "num_input_tokens_seen": 327269215, + "step": 15171, + "time_per_iteration": 2.414698600769043 + }, + { + "auxiliary_loss_clip": 0.01034802, + "auxiliary_loss_mlp": 0.01037541, + "balance_loss_clip": 1.03730428, + "balance_loss_mlp": 1.02421927, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 1.3712060427823147, + "language_loss": 0.66981119, + "learning_rate": 8.02874867780241e-08, + "loss": 0.69053459, + "num_input_tokens_seen": 327290320, + "step": 15172, + "time_per_iteration": 2.637033700942993 + }, + { + "auxiliary_loss_clip": 0.01086225, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.04051113, + "balance_loss_mlp": 1.02077568, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 1.9618045556987294, + "language_loss": 0.74799621, + "learning_rate": 8.017828214857103e-08, + "loss": 0.76919204, + "num_input_tokens_seen": 327310150, + "step": 15173, + "time_per_iteration": 2.513502359390259 + }, + { + "auxiliary_loss_clip": 0.01094191, + "auxiliary_loss_mlp": 0.01033614, + "balance_loss_clip": 1.03878748, + "balance_loss_mlp": 1.01990461, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 2.3817105666605443, + "language_loss": 0.66215706, + "learning_rate": 8.00691503189499e-08, + "loss": 0.68343514, + "num_input_tokens_seen": 327326660, + "step": 15174, + "time_per_iteration": 2.4592697620391846 + }, + { + "auxiliary_loss_clip": 0.0109646, + "auxiliary_loss_mlp": 0.0103418, + "balance_loss_clip": 1.03452957, + "balance_loss_mlp": 1.02033341, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 2.831524553392763, + "language_loss": 0.74555534, + "learning_rate": 7.996009129329894e-08, + "loss": 0.76686174, + "num_input_tokens_seen": 327346700, + "step": 15175, + "time_per_iteration": 3.929596424102783 + }, + { + "auxiliary_loss_clip": 0.01018249, + "auxiliary_loss_mlp": 0.01005697, + "balance_loss_clip": 1.00478065, + "balance_loss_mlp": 1.00451112, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 0.9666196336751469, + "language_loss": 0.58412921, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60436869, + "num_input_tokens_seen": 327403050, + "step": 15176, + "time_per_iteration": 3.090426206588745 + }, + { + "auxiliary_loss_clip": 0.01084773, + "auxiliary_loss_mlp": 0.01037207, + "balance_loss_clip": 1.03244615, + "balance_loss_mlp": 1.02473152, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 1.8681618061064098, + "language_loss": 0.65309644, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67431623, + "num_input_tokens_seen": 327422225, + "step": 15177, + "time_per_iteration": 2.5063610076904297 + }, + { + "auxiliary_loss_clip": 0.01077661, + "auxiliary_loss_mlp": 0.01027336, + "balance_loss_clip": 1.03376234, + "balance_loss_mlp": 1.01525402, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 1.9016505716741399, + "language_loss": 0.81084621, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83189619, + "num_input_tokens_seen": 327437025, + "step": 15178, + "time_per_iteration": 2.469416618347168 + }, + { + "auxiliary_loss_clip": 0.01053424, + "auxiliary_loss_mlp": 0.01035437, + "balance_loss_clip": 1.03250301, + "balance_loss_mlp": 1.02212095, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 2.6713938945029003, + "language_loss": 0.78811526, + "learning_rate": 7.952458331306711e-08, + "loss": 0.80900383, + "num_input_tokens_seen": 327453915, + "step": 15179, + "time_per_iteration": 2.6655631065368652 + }, + { + "auxiliary_loss_clip": 0.01086078, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.03699708, + "balance_loss_mlp": 1.01827741, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 1.5996669467684264, + "language_loss": 0.68358243, + "learning_rate": 7.941588836924507e-08, + "loss": 0.70474333, + "num_input_tokens_seen": 327474415, + "step": 15180, + "time_per_iteration": 2.5882835388183594 + }, + { + "auxiliary_loss_clip": 0.01095001, + "auxiliary_loss_mlp": 0.01025325, + "balance_loss_clip": 1.03373218, + "balance_loss_mlp": 1.01425052, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 1.8665923932133683, + "language_loss": 0.75407171, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77527499, + "num_input_tokens_seen": 327492750, + "step": 15181, + "time_per_iteration": 2.4713075160980225 + }, + { + "auxiliary_loss_clip": 0.01112667, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.03779685, + "balance_loss_mlp": 1.02008247, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 1.7435881649125864, + "language_loss": 0.74596226, + "learning_rate": 7.919871697194614e-08, + "loss": 0.76741421, + "num_input_tokens_seen": 327509470, + "step": 15182, + "time_per_iteration": 2.4122424125671387 + }, + { + "auxiliary_loss_clip": 0.01110096, + "auxiliary_loss_mlp": 0.0103178, + "balance_loss_clip": 1.03614366, + "balance_loss_mlp": 1.01956105, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 1.433065431305403, + "language_loss": 0.76448786, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78590661, + "num_input_tokens_seen": 327530520, + "step": 15183, + "time_per_iteration": 2.456610918045044 + }, + { + "auxiliary_loss_clip": 0.01102062, + "auxiliary_loss_mlp": 0.01028289, + "balance_loss_clip": 1.04192066, + "balance_loss_mlp": 1.01576042, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 4.486240375411252, + "language_loss": 0.76831508, + "learning_rate": 7.898183692255256e-08, + "loss": 0.78961855, + "num_input_tokens_seen": 327546960, + "step": 15184, + "time_per_iteration": 2.4492015838623047 + }, + { + "auxiliary_loss_clip": 0.01094062, + "auxiliary_loss_mlp": 0.01037631, + "balance_loss_clip": 1.03713429, + "balance_loss_mlp": 1.02585959, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 1.8691696041037735, + "language_loss": 0.74573898, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76705593, + "num_input_tokens_seen": 327564830, + "step": 15185, + "time_per_iteration": 2.464522123336792 + }, + { + "auxiliary_loss_clip": 0.01081792, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.03403568, + "balance_loss_mlp": 1.02017474, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 2.2934100091973155, + "language_loss": 0.68870032, + "learning_rate": 7.876524825396158e-08, + "loss": 0.70984608, + "num_input_tokens_seen": 327583675, + "step": 15186, + "time_per_iteration": 2.4694364070892334 + }, + { + "auxiliary_loss_clip": 0.01089767, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.03459716, + "balance_loss_mlp": 1.02480865, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 1.8988656004011306, + "language_loss": 0.77464175, + "learning_rate": 7.865706319773502e-08, + "loss": 0.79593825, + "num_input_tokens_seen": 327602280, + "step": 15187, + "time_per_iteration": 2.457516670227051 + }, + { + "auxiliary_loss_clip": 0.0110823, + "auxiliary_loss_mlp": 0.00777507, + "balance_loss_clip": 1.03578329, + "balance_loss_mlp": 1.00060558, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 5.074067695720794, + "language_loss": 0.6572094, + "learning_rate": 7.854895099902515e-08, + "loss": 0.67606676, + "num_input_tokens_seen": 327623515, + "step": 15188, + "time_per_iteration": 3.9705522060394287 + }, + { + "auxiliary_loss_clip": 0.01035894, + "auxiliary_loss_mlp": 0.01037757, + "balance_loss_clip": 1.03010428, + "balance_loss_mlp": 1.02485228, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 4.248686124682684, + "language_loss": 0.76433682, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78507334, + "num_input_tokens_seen": 327642875, + "step": 15189, + "time_per_iteration": 2.6159508228302 + }, + { + "auxiliary_loss_clip": 0.01095713, + "auxiliary_loss_mlp": 0.01028702, + "balance_loss_clip": 1.03417861, + "balance_loss_mlp": 1.01799655, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 2.669474405691612, + "language_loss": 0.7532531, + "learning_rate": 7.8332945190551e-08, + "loss": 0.77449721, + "num_input_tokens_seen": 327662450, + "step": 15190, + "time_per_iteration": 3.8579766750335693 + }, + { + "auxiliary_loss_clip": 0.01018562, + "auxiliary_loss_mlp": 0.01001166, + "balance_loss_clip": 1.00508749, + "balance_loss_mlp": 0.99998552, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.6974825040087524, + "language_loss": 0.57339478, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59359205, + "num_input_tokens_seen": 327723845, + "step": 15191, + "time_per_iteration": 3.0604248046875 + }, + { + "auxiliary_loss_clip": 0.01112382, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.03791356, + "balance_loss_mlp": 1.02183235, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 2.0762515223242333, + "language_loss": 0.74431586, + "learning_rate": 7.81172308613034e-08, + "loss": 0.7657876, + "num_input_tokens_seen": 327742590, + "step": 15192, + "time_per_iteration": 2.466801643371582 + }, + { + "auxiliary_loss_clip": 0.01096193, + "auxiliary_loss_mlp": 0.01026704, + "balance_loss_clip": 1.03749585, + "balance_loss_mlp": 1.01488423, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 3.3336149407544373, + "language_loss": 0.69462585, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71585482, + "num_input_tokens_seen": 327764350, + "step": 15193, + "time_per_iteration": 2.6432981491088867 + }, + { + "auxiliary_loss_clip": 0.01097219, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.03861248, + "balance_loss_mlp": 1.02428329, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 1.9387895437939764, + "language_loss": 0.73458606, + "learning_rate": 7.790180804400215e-08, + "loss": 0.75591731, + "num_input_tokens_seen": 327783120, + "step": 15194, + "time_per_iteration": 2.491084337234497 + }, + { + "auxiliary_loss_clip": 0.01064123, + "auxiliary_loss_mlp": 0.01040175, + "balance_loss_clip": 1.0322839, + "balance_loss_mlp": 1.02522039, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 1.782830473777864, + "language_loss": 0.61433613, + "learning_rate": 7.779420596254383e-08, + "loss": 0.63537908, + "num_input_tokens_seen": 327801960, + "step": 15195, + "time_per_iteration": 2.5683791637420654 + }, + { + "auxiliary_loss_clip": 0.01099913, + "auxiliary_loss_mlp": 0.01032625, + "balance_loss_clip": 1.03616333, + "balance_loss_mlp": 1.02020931, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 1.5584941117580802, + "language_loss": 0.71610945, + "learning_rate": 7.768667677132201e-08, + "loss": 0.73743486, + "num_input_tokens_seen": 327823795, + "step": 15196, + "time_per_iteration": 2.521261215209961 + }, + { + "auxiliary_loss_clip": 0.01086927, + "auxiliary_loss_mlp": 0.01032111, + "balance_loss_clip": 1.03556621, + "balance_loss_mlp": 1.02024937, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 1.5200567987606843, + "language_loss": 0.71444184, + "learning_rate": 7.757922047441411e-08, + "loss": 0.73563218, + "num_input_tokens_seen": 327845175, + "step": 15197, + "time_per_iteration": 2.582120418548584 + }, + { + "auxiliary_loss_clip": 0.01086875, + "auxiliary_loss_mlp": 0.01028902, + "balance_loss_clip": 1.03253293, + "balance_loss_mlp": 1.01643848, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 1.8822440987426003, + "language_loss": 0.7794466, + "learning_rate": 7.747183707589489e-08, + "loss": 0.8006044, + "num_input_tokens_seen": 327863150, + "step": 15198, + "time_per_iteration": 2.5145957469940186 + }, + { + "auxiliary_loss_clip": 0.01094756, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.03975677, + "balance_loss_mlp": 1.02255356, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 1.3989164058776262, + "language_loss": 0.68156445, + "learning_rate": 7.736452657983616e-08, + "loss": 0.70285738, + "num_input_tokens_seen": 327883445, + "step": 15199, + "time_per_iteration": 2.519380569458008 + }, + { + "auxiliary_loss_clip": 0.01098137, + "auxiliary_loss_mlp": 0.0077743, + "balance_loss_clip": 1.03518093, + "balance_loss_mlp": 1.00060487, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 1.7355461877604559, + "language_loss": 0.67682302, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69557869, + "num_input_tokens_seen": 327905745, + "step": 15200, + "time_per_iteration": 2.5384182929992676 + }, + { + "auxiliary_loss_clip": 0.0109494, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.03623104, + "balance_loss_mlp": 1.02059805, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 1.5158788880820997, + "language_loss": 0.71617329, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73744148, + "num_input_tokens_seen": 327925435, + "step": 15201, + "time_per_iteration": 4.0560503005981445 + }, + { + "auxiliary_loss_clip": 0.01093616, + "auxiliary_loss_mlp": 0.01027416, + "balance_loss_clip": 1.03342962, + "balance_loss_mlp": 1.01627553, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 2.1784104517242397, + "language_loss": 0.70499134, + "learning_rate": 7.704303254710165e-08, + "loss": 0.72620165, + "num_input_tokens_seen": 327944145, + "step": 15202, + "time_per_iteration": 2.42993426322937 + }, + { + "auxiliary_loss_clip": 0.01107617, + "auxiliary_loss_mlp": 0.01028728, + "balance_loss_clip": 1.03586698, + "balance_loss_mlp": 1.01644325, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 1.76808060358515, + "language_loss": 0.6654979, + "learning_rate": 7.693601370155001e-08, + "loss": 0.68686128, + "num_input_tokens_seen": 327960565, + "step": 15203, + "time_per_iteration": 2.386573553085327 + }, + { + "auxiliary_loss_clip": 0.01098759, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.03556943, + "balance_loss_mlp": 1.01944959, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 1.838459324870151, + "language_loss": 0.69043839, + "learning_rate": 7.682906777877751e-08, + "loss": 0.71175134, + "num_input_tokens_seen": 327981180, + "step": 15204, + "time_per_iteration": 2.5276098251342773 + }, + { + "auxiliary_loss_clip": 0.01096726, + "auxiliary_loss_mlp": 0.01027975, + "balance_loss_clip": 1.03303361, + "balance_loss_mlp": 1.01529717, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 1.9044988754145253, + "language_loss": 0.60369766, + "learning_rate": 7.672219478283915e-08, + "loss": 0.62494469, + "num_input_tokens_seen": 328001500, + "step": 15205, + "time_per_iteration": 2.477066993713379 + }, + { + "auxiliary_loss_clip": 0.01066789, + "auxiliary_loss_mlp": 0.01033281, + "balance_loss_clip": 1.03330421, + "balance_loss_mlp": 1.02077019, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 1.8678364870242834, + "language_loss": 0.81245494, + "learning_rate": 7.661539471778811e-08, + "loss": 0.83345568, + "num_input_tokens_seen": 328023025, + "step": 15206, + "time_per_iteration": 2.586243152618408 + }, + { + "auxiliary_loss_clip": 0.01064382, + "auxiliary_loss_mlp": 0.01026694, + "balance_loss_clip": 1.03389633, + "balance_loss_mlp": 1.01444542, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 2.3651786352196034, + "language_loss": 0.73442113, + "learning_rate": 7.650866758767382e-08, + "loss": 0.75533187, + "num_input_tokens_seen": 328041410, + "step": 15207, + "time_per_iteration": 2.5504729747772217 + }, + { + "auxiliary_loss_clip": 0.01064561, + "auxiliary_loss_mlp": 0.01038587, + "balance_loss_clip": 1.03786564, + "balance_loss_mlp": 1.02619541, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 6.589662010517021, + "language_loss": 0.7304256, + "learning_rate": 7.640201339654373e-08, + "loss": 0.75145704, + "num_input_tokens_seen": 328060495, + "step": 15208, + "time_per_iteration": 2.579958438873291 + }, + { + "auxiliary_loss_clip": 0.01091136, + "auxiliary_loss_mlp": 0.01028639, + "balance_loss_clip": 1.03687406, + "balance_loss_mlp": 1.01728451, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 3.5935642941690698, + "language_loss": 0.86667717, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88787496, + "num_input_tokens_seen": 328076905, + "step": 15209, + "time_per_iteration": 2.4301750659942627 + }, + { + "auxiliary_loss_clip": 0.01090077, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.03897476, + "balance_loss_mlp": 1.02103782, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 1.7173134230013838, + "language_loss": 0.75122237, + "learning_rate": 7.618892384741093e-08, + "loss": 0.7724449, + "num_input_tokens_seen": 328096960, + "step": 15210, + "time_per_iteration": 2.5342955589294434 + }, + { + "auxiliary_loss_clip": 0.01084661, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.03186989, + "balance_loss_mlp": 1.02126896, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 2.2475186000176497, + "language_loss": 0.77897674, + "learning_rate": 7.6082488497488e-08, + "loss": 0.80015683, + "num_input_tokens_seen": 328115445, + "step": 15211, + "time_per_iteration": 2.517185688018799 + }, + { + "auxiliary_loss_clip": 0.01100553, + "auxiliary_loss_mlp": 0.01030867, + "balance_loss_clip": 1.03707063, + "balance_loss_mlp": 1.0191071, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 7.236304504674556, + "language_loss": 0.82684255, + "learning_rate": 7.597612610270986e-08, + "loss": 0.84815675, + "num_input_tokens_seen": 328133965, + "step": 15212, + "time_per_iteration": 2.4518182277679443 + }, + { + "auxiliary_loss_clip": 0.01095084, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.03581071, + "balance_loss_mlp": 1.01806688, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 1.791159047696892, + "language_loss": 0.83697313, + "learning_rate": 7.586983666711022e-08, + "loss": 0.85821772, + "num_input_tokens_seen": 328151520, + "step": 15213, + "time_per_iteration": 2.4295458793640137 + }, + { + "auxiliary_loss_clip": 0.01092205, + "auxiliary_loss_mlp": 0.01028563, + "balance_loss_clip": 1.03602564, + "balance_loss_mlp": 1.01699948, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 1.7807977630847185, + "language_loss": 0.70615447, + "learning_rate": 7.576362019471894e-08, + "loss": 0.72736216, + "num_input_tokens_seen": 328171275, + "step": 15214, + "time_per_iteration": 3.9750871658325195 + }, + { + "auxiliary_loss_clip": 0.0110146, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.03776503, + "balance_loss_mlp": 1.02645779, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 1.725344791629375, + "language_loss": 0.62605393, + "learning_rate": 7.565747668956413e-08, + "loss": 0.64746219, + "num_input_tokens_seen": 328192115, + "step": 15215, + "time_per_iteration": 2.494661569595337 + }, + { + "auxiliary_loss_clip": 0.01087922, + "auxiliary_loss_mlp": 0.01027618, + "balance_loss_clip": 1.04432678, + "balance_loss_mlp": 1.01510131, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 2.363803055147033, + "language_loss": 0.76036537, + "learning_rate": 7.555140615567058e-08, + "loss": 0.78152072, + "num_input_tokens_seen": 328208990, + "step": 15216, + "time_per_iteration": 2.5336835384368896 + }, + { + "auxiliary_loss_clip": 0.01083814, + "auxiliary_loss_mlp": 0.010414, + "balance_loss_clip": 1.03506875, + "balance_loss_mlp": 1.02713704, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 7.986410800097935, + "language_loss": 0.68246204, + "learning_rate": 7.544540859706062e-08, + "loss": 0.70371419, + "num_input_tokens_seen": 328227840, + "step": 15217, + "time_per_iteration": 2.4986605644226074 + }, + { + "auxiliary_loss_clip": 0.01095408, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.03629422, + "balance_loss_mlp": 1.02092123, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 4.198177519817049, + "language_loss": 0.80224848, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82352841, + "num_input_tokens_seen": 328246250, + "step": 15218, + "time_per_iteration": 2.4287655353546143 + }, + { + "auxiliary_loss_clip": 0.00998192, + "auxiliary_loss_mlp": 0.01001363, + "balance_loss_clip": 1.00709748, + "balance_loss_mlp": 1.00015283, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.8494538644771645, + "language_loss": 0.59223568, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61223125, + "num_input_tokens_seen": 328303625, + "step": 15219, + "time_per_iteration": 3.0609633922576904 + }, + { + "auxiliary_loss_clip": 0.0109505, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.03396606, + "balance_loss_mlp": 1.0195235, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 2.406703976645856, + "language_loss": 0.78977716, + "learning_rate": 7.512785381311216e-08, + "loss": 0.81104243, + "num_input_tokens_seen": 328322135, + "step": 15220, + "time_per_iteration": 2.42728328704834 + }, + { + "auxiliary_loss_clip": 0.01060599, + "auxiliary_loss_mlp": 0.01047247, + "balance_loss_clip": 1.03524566, + "balance_loss_mlp": 1.03231645, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 3.1211338671896813, + "language_loss": 0.66007942, + "learning_rate": 7.50221481958031e-08, + "loss": 0.68115789, + "num_input_tokens_seen": 328340750, + "step": 15221, + "time_per_iteration": 2.531203269958496 + }, + { + "auxiliary_loss_clip": 0.01086975, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.03588343, + "balance_loss_mlp": 1.02077103, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 5.302448301023596, + "language_loss": 0.84107101, + "learning_rate": 7.491651557384692e-08, + "loss": 0.86226255, + "num_input_tokens_seen": 328359995, + "step": 15222, + "time_per_iteration": 2.4988791942596436 + }, + { + "auxiliary_loss_clip": 0.0101462, + "auxiliary_loss_mlp": 0.01001695, + "balance_loss_clip": 1.01034725, + "balance_loss_mlp": 1.00041986, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.7251918015045187, + "language_loss": 0.49590138, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51606452, + "num_input_tokens_seen": 328426865, + "step": 15223, + "time_per_iteration": 3.1021015644073486 + }, + { + "auxiliary_loss_clip": 0.0107796, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.03711677, + "balance_loss_mlp": 1.02668536, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 1.8878920912611228, + "language_loss": 0.72362703, + "learning_rate": 7.470546933201349e-08, + "loss": 0.74479795, + "num_input_tokens_seen": 328445970, + "step": 15224, + "time_per_iteration": 2.5425684452056885 + }, + { + "auxiliary_loss_clip": 0.0109225, + "auxiliary_loss_mlp": 0.01027983, + "balance_loss_clip": 1.03497267, + "balance_loss_mlp": 1.01555502, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 2.3796763014493116, + "language_loss": 0.80945045, + "learning_rate": 7.460005572013895e-08, + "loss": 0.83065271, + "num_input_tokens_seen": 328464585, + "step": 15225, + "time_per_iteration": 2.49695086479187 + }, + { + "auxiliary_loss_clip": 0.01106292, + "auxiliary_loss_mlp": 0.01025175, + "balance_loss_clip": 1.03488934, + "balance_loss_mlp": 1.01393938, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 1.4018884879103837, + "language_loss": 0.7135191, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73483384, + "num_input_tokens_seen": 328490155, + "step": 15226, + "time_per_iteration": 2.5431931018829346 + }, + { + "auxiliary_loss_clip": 0.01043673, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.03401184, + "balance_loss_mlp": 1.01848543, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 1.9995808177616974, + "language_loss": 0.74774373, + "learning_rate": 7.43894475344613e-08, + "loss": 0.76849449, + "num_input_tokens_seen": 328508275, + "step": 15227, + "time_per_iteration": 4.173069000244141 + }, + { + "auxiliary_loss_clip": 0.01084149, + "auxiliary_loss_mlp": 0.0102885, + "balance_loss_clip": 1.03408313, + "balance_loss_mlp": 1.01693559, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 1.4150341933741188, + "language_loss": 0.73799545, + "learning_rate": 7.428425296864404e-08, + "loss": 0.75912547, + "num_input_tokens_seen": 328529425, + "step": 15228, + "time_per_iteration": 2.540290355682373 + }, + { + "auxiliary_loss_clip": 0.01073815, + "auxiliary_loss_mlp": 0.01032695, + "balance_loss_clip": 1.03732681, + "balance_loss_mlp": 1.02099419, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 1.5464846724420698, + "language_loss": 0.72341591, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74448097, + "num_input_tokens_seen": 328550200, + "step": 15229, + "time_per_iteration": 3.957542896270752 + }, + { + "auxiliary_loss_clip": 0.01110135, + "auxiliary_loss_mlp": 0.01034656, + "balance_loss_clip": 1.03778481, + "balance_loss_mlp": 1.02148974, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 1.9627853209298924, + "language_loss": 0.83192706, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85337496, + "num_input_tokens_seen": 328568540, + "step": 15230, + "time_per_iteration": 2.4103870391845703 + }, + { + "auxiliary_loss_clip": 0.01069797, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.03577304, + "balance_loss_mlp": 1.01783645, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 1.788311719920776, + "language_loss": 0.83314705, + "learning_rate": 7.396910742713957e-08, + "loss": 0.85413903, + "num_input_tokens_seen": 328587300, + "step": 15231, + "time_per_iteration": 2.593940496444702 + }, + { + "auxiliary_loss_clip": 0.01091646, + "auxiliary_loss_mlp": 0.01027883, + "balance_loss_clip": 1.03098416, + "balance_loss_mlp": 1.01595008, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.4509425279435135, + "language_loss": 0.72264934, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74384463, + "num_input_tokens_seen": 328610055, + "step": 15232, + "time_per_iteration": 2.5070853233337402 + }, + { + "auxiliary_loss_clip": 0.01109143, + "auxiliary_loss_mlp": 0.01035334, + "balance_loss_clip": 1.03600955, + "balance_loss_mlp": 1.0228703, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 2.307258163278866, + "language_loss": 0.67882639, + "learning_rate": 7.375937556925338e-08, + "loss": 0.70027119, + "num_input_tokens_seen": 328626815, + "step": 15233, + "time_per_iteration": 2.3929338455200195 + }, + { + "auxiliary_loss_clip": 0.01084718, + "auxiliary_loss_mlp": 0.01033585, + "balance_loss_clip": 1.03819108, + "balance_loss_mlp": 1.02105618, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 5.208328324410644, + "language_loss": 0.69532979, + "learning_rate": 7.365461920317861e-08, + "loss": 0.7165128, + "num_input_tokens_seen": 328643995, + "step": 15234, + "time_per_iteration": 2.4936392307281494 + }, + { + "auxiliary_loss_clip": 0.01086702, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.03642941, + "balance_loss_mlp": 1.02047801, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 1.8041144577222121, + "language_loss": 0.88213682, + "learning_rate": 7.354993588431391e-08, + "loss": 0.90333289, + "num_input_tokens_seen": 328659565, + "step": 15235, + "time_per_iteration": 2.4964826107025146 + }, + { + "auxiliary_loss_clip": 0.01044051, + "auxiliary_loss_mlp": 0.01040534, + "balance_loss_clip": 1.03224981, + "balance_loss_mlp": 1.02630019, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 1.6748654537614653, + "language_loss": 0.76824319, + "learning_rate": 7.344532561662853e-08, + "loss": 0.78908908, + "num_input_tokens_seen": 328679045, + "step": 15236, + "time_per_iteration": 2.6526432037353516 + }, + { + "auxiliary_loss_clip": 0.00989324, + "auxiliary_loss_mlp": 0.01000206, + "balance_loss_clip": 1.01259375, + "balance_loss_mlp": 0.99902564, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.6864001550031946, + "language_loss": 0.62176526, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64166057, + "num_input_tokens_seen": 328744565, + "step": 15237, + "time_per_iteration": 3.3600549697875977 + }, + { + "auxiliary_loss_clip": 0.01110164, + "auxiliary_loss_mlp": 0.00778418, + "balance_loss_clip": 1.03626788, + "balance_loss_mlp": 1.00069249, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 2.0063955944215026, + "language_loss": 0.7462821, + "learning_rate": 7.323632425066151e-08, + "loss": 0.76516795, + "num_input_tokens_seen": 328762455, + "step": 15238, + "time_per_iteration": 2.6771607398986816 + }, + { + "auxiliary_loss_clip": 0.0110846, + "auxiliary_loss_mlp": 0.01029309, + "balance_loss_clip": 1.03567255, + "balance_loss_mlp": 1.01749516, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 1.7858986280217577, + "language_loss": 0.7462604, + "learning_rate": 7.313193316030464e-08, + "loss": 0.76763809, + "num_input_tokens_seen": 328780320, + "step": 15239, + "time_per_iteration": 2.4269299507141113 + }, + { + "auxiliary_loss_clip": 0.01077481, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.03360772, + "balance_loss_mlp": 1.01935065, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 1.9096035909661848, + "language_loss": 0.63539314, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65648216, + "num_input_tokens_seen": 328797570, + "step": 15240, + "time_per_iteration": 2.5116424560546875 + }, + { + "auxiliary_loss_clip": 0.01086432, + "auxiliary_loss_mlp": 0.00776985, + "balance_loss_clip": 1.03561592, + "balance_loss_mlp": 1.0006249, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 1.7304038096569554, + "language_loss": 0.7640475, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78268164, + "num_input_tokens_seen": 328814075, + "step": 15241, + "time_per_iteration": 4.062218427658081 + }, + { + "auxiliary_loss_clip": 0.01104087, + "auxiliary_loss_mlp": 0.0102978, + "balance_loss_clip": 1.0366689, + "balance_loss_mlp": 1.01605308, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 2.3890192613025034, + "language_loss": 0.67802584, + "learning_rate": 7.281919830723549e-08, + "loss": 0.69936454, + "num_input_tokens_seen": 328831990, + "step": 15242, + "time_per_iteration": 2.5102086067199707 + }, + { + "auxiliary_loss_clip": 0.01097382, + "auxiliary_loss_mlp": 0.01031085, + "balance_loss_clip": 1.03382802, + "balance_loss_mlp": 1.01785243, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 2.060935899467284, + "language_loss": 0.80971462, + "learning_rate": 7.271509950872334e-08, + "loss": 0.83099926, + "num_input_tokens_seen": 328849105, + "step": 15243, + "time_per_iteration": 2.436824321746826 + }, + { + "auxiliary_loss_clip": 0.01086155, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.03145897, + "balance_loss_mlp": 1.01873827, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 1.713972054178594, + "language_loss": 0.81914365, + "learning_rate": 7.261107379304721e-08, + "loss": 0.8403163, + "num_input_tokens_seen": 328866810, + "step": 15244, + "time_per_iteration": 2.4987993240356445 + }, + { + "auxiliary_loss_clip": 0.0111111, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.03588986, + "balance_loss_mlp": 1.02516472, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 5.498824464530019, + "language_loss": 0.72276843, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74426568, + "num_input_tokens_seen": 328885325, + "step": 15245, + "time_per_iteration": 2.408642053604126 + }, + { + "auxiliary_loss_clip": 0.0108378, + "auxiliary_loss_mlp": 0.01028825, + "balance_loss_clip": 1.03345108, + "balance_loss_mlp": 1.01752996, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 1.6370159772535249, + "language_loss": 0.75066662, + "learning_rate": 7.240324162598033e-08, + "loss": 0.77179265, + "num_input_tokens_seen": 328902655, + "step": 15246, + "time_per_iteration": 2.470398426055908 + }, + { + "auxiliary_loss_clip": 0.01081356, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.03377581, + "balance_loss_mlp": 1.02062941, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 1.9964119700771987, + "language_loss": 0.75166035, + "learning_rate": 7.229943518247106e-08, + "loss": 0.77281266, + "num_input_tokens_seen": 328918440, + "step": 15247, + "time_per_iteration": 2.460184097290039 + }, + { + "auxiliary_loss_clip": 0.01102468, + "auxiliary_loss_mlp": 0.01027769, + "balance_loss_clip": 1.03954804, + "balance_loss_mlp": 1.01517427, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 1.865535814875826, + "language_loss": 0.75680315, + "learning_rate": 7.219570183756052e-08, + "loss": 0.7781055, + "num_input_tokens_seen": 328938055, + "step": 15248, + "time_per_iteration": 2.489887237548828 + }, + { + "auxiliary_loss_clip": 0.01098633, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.03480387, + "balance_loss_mlp": 1.02211702, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 2.0491236267508874, + "language_loss": 0.73354822, + "learning_rate": 7.209204159518178e-08, + "loss": 0.75489116, + "num_input_tokens_seen": 328957895, + "step": 15249, + "time_per_iteration": 2.510774850845337 + }, + { + "auxiliary_loss_clip": 0.01062474, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.03441679, + "balance_loss_mlp": 1.01569128, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 1.8942249969330045, + "language_loss": 0.75300789, + "learning_rate": 7.198845445926616e-08, + "loss": 0.77392191, + "num_input_tokens_seen": 328971365, + "step": 15250, + "time_per_iteration": 2.614469528198242 + }, + { + "auxiliary_loss_clip": 0.01069963, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.03750122, + "balance_loss_mlp": 1.01575494, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 1.781214994982757, + "language_loss": 0.76012522, + "learning_rate": 7.188494043374138e-08, + "loss": 0.78109878, + "num_input_tokens_seen": 328990830, + "step": 15251, + "time_per_iteration": 2.5877180099487305 + }, + { + "auxiliary_loss_clip": 0.01085373, + "auxiliary_loss_mlp": 0.01033657, + "balance_loss_clip": 1.03634799, + "balance_loss_mlp": 1.01960838, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 2.578382453377054, + "language_loss": 0.80097443, + "learning_rate": 7.178149952253298e-08, + "loss": 0.82216477, + "num_input_tokens_seen": 329008345, + "step": 15252, + "time_per_iteration": 2.5382297039031982 + }, + { + "auxiliary_loss_clip": 0.01109401, + "auxiliary_loss_mlp": 0.01033784, + "balance_loss_clip": 1.03622317, + "balance_loss_mlp": 1.02124882, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 1.8084734021392248, + "language_loss": 0.77497864, + "learning_rate": 7.167813172956316e-08, + "loss": 0.79641044, + "num_input_tokens_seen": 329027440, + "step": 15253, + "time_per_iteration": 2.4189107418060303 + }, + { + "auxiliary_loss_clip": 0.01100302, + "auxiliary_loss_mlp": 0.01027226, + "balance_loss_clip": 1.03730857, + "balance_loss_mlp": 1.01538301, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 1.888254173458046, + "language_loss": 0.73002458, + "learning_rate": 7.157483705875256e-08, + "loss": 0.75129986, + "num_input_tokens_seen": 329046445, + "step": 15254, + "time_per_iteration": 4.2805609703063965 + }, + { + "auxiliary_loss_clip": 0.01073444, + "auxiliary_loss_mlp": 0.01023682, + "balance_loss_clip": 1.03952408, + "balance_loss_mlp": 1.01229143, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 1.5710470795113272, + "language_loss": 0.79249823, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81346947, + "num_input_tokens_seen": 329065555, + "step": 15255, + "time_per_iteration": 2.6411492824554443 + }, + { + "auxiliary_loss_clip": 0.01099403, + "auxiliary_loss_mlp": 0.0103717, + "balance_loss_clip": 1.03594851, + "balance_loss_mlp": 1.02452183, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 2.038952746125524, + "language_loss": 0.68825686, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70962262, + "num_input_tokens_seen": 329087515, + "step": 15256, + "time_per_iteration": 2.6086902618408203 + }, + { + "auxiliary_loss_clip": 0.0109166, + "auxiliary_loss_mlp": 0.01040498, + "balance_loss_clip": 1.03347397, + "balance_loss_mlp": 1.02758789, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 1.5948120323143418, + "language_loss": 0.83870232, + "learning_rate": 7.126539181842561e-08, + "loss": 0.86002398, + "num_input_tokens_seen": 329106820, + "step": 15257, + "time_per_iteration": 2.474468946456909 + }, + { + "auxiliary_loss_clip": 0.01083165, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.03179145, + "balance_loss_mlp": 1.02508926, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 1.5432251514541333, + "language_loss": 0.77562761, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79682565, + "num_input_tokens_seen": 329126515, + "step": 15258, + "time_per_iteration": 2.513522148132324 + }, + { + "auxiliary_loss_clip": 0.0109481, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.03891718, + "balance_loss_mlp": 1.01893234, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 1.9541567062757617, + "language_loss": 0.78584862, + "learning_rate": 7.105946067406999e-08, + "loss": 0.8071084, + "num_input_tokens_seen": 329142660, + "step": 15259, + "time_per_iteration": 2.4233272075653076 + }, + { + "auxiliary_loss_clip": 0.01062398, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.03336179, + "balance_loss_mlp": 1.01969683, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 2.222206550451084, + "language_loss": 0.76464546, + "learning_rate": 7.095660481836895e-08, + "loss": 0.78557873, + "num_input_tokens_seen": 329162575, + "step": 15260, + "time_per_iteration": 2.6293461322784424 + }, + { + "auxiliary_loss_clip": 0.01067451, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.03335643, + "balance_loss_mlp": 1.01754642, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 1.7483692883829154, + "language_loss": 0.60923761, + "learning_rate": 7.085382211218637e-08, + "loss": 0.63020569, + "num_input_tokens_seen": 329182090, + "step": 15261, + "time_per_iteration": 2.679128408432007 + }, + { + "auxiliary_loss_clip": 0.01083703, + "auxiliary_loss_mlp": 0.01031773, + "balance_loss_clip": 1.03386652, + "balance_loss_mlp": 1.01993597, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 1.9153013694509762, + "language_loss": 0.73965502, + "learning_rate": 7.075111255942002e-08, + "loss": 0.76080978, + "num_input_tokens_seen": 329196535, + "step": 15262, + "time_per_iteration": 2.4584317207336426 + }, + { + "auxiliary_loss_clip": 0.01110178, + "auxiliary_loss_mlp": 0.01036701, + "balance_loss_clip": 1.03472495, + "balance_loss_mlp": 1.02427912, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 1.9659075197878217, + "language_loss": 0.77329564, + "learning_rate": 7.064847616396496e-08, + "loss": 0.7947644, + "num_input_tokens_seen": 329215135, + "step": 15263, + "time_per_iteration": 2.418339490890503 + }, + { + "auxiliary_loss_clip": 0.0111159, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.03632462, + "balance_loss_mlp": 1.02074778, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 1.9437881153790428, + "language_loss": 0.75631207, + "learning_rate": 7.054591292971324e-08, + "loss": 0.77776164, + "num_input_tokens_seen": 329235150, + "step": 15264, + "time_per_iteration": 2.4267334938049316 + }, + { + "auxiliary_loss_clip": 0.01087055, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.0373615, + "balance_loss_mlp": 1.02219748, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 2.371556634874941, + "language_loss": 0.8336966, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85490435, + "num_input_tokens_seen": 329254365, + "step": 15265, + "time_per_iteration": 4.0376129150390625 + }, + { + "auxiliary_loss_clip": 0.01112836, + "auxiliary_loss_mlp": 0.01042845, + "balance_loss_clip": 1.03722918, + "balance_loss_mlp": 1.02966666, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 1.5651781459779044, + "language_loss": 0.73553252, + "learning_rate": 7.034100596037306e-08, + "loss": 0.75708938, + "num_input_tokens_seen": 329274385, + "step": 15266, + "time_per_iteration": 2.4448940753936768 + }, + { + "auxiliary_loss_clip": 0.0110731, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.03524685, + "balance_loss_mlp": 1.01668632, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 1.6089020555288938, + "language_loss": 0.78116214, + "learning_rate": 7.023866223305486e-08, + "loss": 0.80251771, + "num_input_tokens_seen": 329292160, + "step": 15267, + "time_per_iteration": 2.409322500228882 + }, + { + "auxiliary_loss_clip": 0.01015793, + "auxiliary_loss_mlp": 0.00753484, + "balance_loss_clip": 1.00542593, + "balance_loss_mlp": 1.00021219, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7367721392784004, + "language_loss": 0.56256509, + "learning_rate": 7.013639168247975e-08, + "loss": 0.58025789, + "num_input_tokens_seen": 329351870, + "step": 15268, + "time_per_iteration": 3.0591962337493896 + }, + { + "auxiliary_loss_clip": 0.01109261, + "auxiliary_loss_mlp": 0.00777739, + "balance_loss_clip": 1.03612065, + "balance_loss_mlp": 1.00059342, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 1.7514500504943649, + "language_loss": 0.76803571, + "learning_rate": 7.0034194312526e-08, + "loss": 0.78690571, + "num_input_tokens_seen": 329370930, + "step": 15269, + "time_per_iteration": 3.893084764480591 + }, + { + "auxiliary_loss_clip": 0.0107481, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.03297246, + "balance_loss_mlp": 1.01832235, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 2.312081942190411, + "language_loss": 0.72581553, + "learning_rate": 6.993207012706936e-08, + "loss": 0.74686986, + "num_input_tokens_seen": 329391275, + "step": 15270, + "time_per_iteration": 2.7039999961853027 + }, + { + "auxiliary_loss_clip": 0.01105208, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.03455949, + "balance_loss_mlp": 1.0190568, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 1.5722054961669067, + "language_loss": 0.7935226, + "learning_rate": 6.98300191299821e-08, + "loss": 0.81488872, + "num_input_tokens_seen": 329412775, + "step": 15271, + "time_per_iteration": 2.4703590869903564 + }, + { + "auxiliary_loss_clip": 0.01071132, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.03534925, + "balance_loss_mlp": 1.01953244, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 2.353401151074762, + "language_loss": 0.73027289, + "learning_rate": 6.972804132513355e-08, + "loss": 0.75130707, + "num_input_tokens_seen": 329432440, + "step": 15272, + "time_per_iteration": 2.632909059524536 + }, + { + "auxiliary_loss_clip": 0.01078613, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.03337932, + "balance_loss_mlp": 1.02083862, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 2.370086447212316, + "language_loss": 0.72705877, + "learning_rate": 6.962613671639105e-08, + "loss": 0.74816668, + "num_input_tokens_seen": 329450605, + "step": 15273, + "time_per_iteration": 2.4876222610473633 + }, + { + "auxiliary_loss_clip": 0.01070112, + "auxiliary_loss_mlp": 0.01025517, + "balance_loss_clip": 1.03491569, + "balance_loss_mlp": 1.01479375, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 1.54322038210742, + "language_loss": 0.74310857, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76406491, + "num_input_tokens_seen": 329470550, + "step": 15274, + "time_per_iteration": 2.532784938812256 + }, + { + "auxiliary_loss_clip": 0.01097541, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.03395534, + "balance_loss_mlp": 1.02770925, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 1.509549408127571, + "language_loss": 0.68690324, + "learning_rate": 6.942254710267902e-08, + "loss": 0.70827234, + "num_input_tokens_seen": 329489765, + "step": 15275, + "time_per_iteration": 2.4438586235046387 + }, + { + "auxiliary_loss_clip": 0.01093226, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.03449786, + "balance_loss_mlp": 1.02138972, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 1.8581134064061064, + "language_loss": 0.72314608, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74441206, + "num_input_tokens_seen": 329507040, + "step": 15276, + "time_per_iteration": 2.41011905670166 + }, + { + "auxiliary_loss_clip": 0.01085944, + "auxiliary_loss_mlp": 0.01025673, + "balance_loss_clip": 1.03582311, + "balance_loss_mlp": 1.01451457, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 1.7352953710231622, + "language_loss": 0.73397994, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75509614, + "num_input_tokens_seen": 329525540, + "step": 15277, + "time_per_iteration": 2.5159804821014404 + }, + { + "auxiliary_loss_clip": 0.01004791, + "auxiliary_loss_mlp": 0.01004256, + "balance_loss_clip": 1.00930882, + "balance_loss_mlp": 1.003034, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7219583852380784, + "language_loss": 0.5922454, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61233586, + "num_input_tokens_seen": 329592905, + "step": 15278, + "time_per_iteration": 3.2007694244384766 + }, + { + "auxiliary_loss_clip": 0.01069489, + "auxiliary_loss_mlp": 0.01024222, + "balance_loss_clip": 1.0321759, + "balance_loss_mlp": 1.01395237, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 2.0429240454147517, + "language_loss": 0.63974285, + "learning_rate": 6.901624639836879e-08, + "loss": 0.66068, + "num_input_tokens_seen": 329610150, + "step": 15279, + "time_per_iteration": 3.995636224746704 + }, + { + "auxiliary_loss_clip": 0.01026446, + "auxiliary_loss_mlp": 0.00753371, + "balance_loss_clip": 1.00326598, + "balance_loss_mlp": 1.00018811, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 1.3131036550712507, + "language_loss": 0.60176456, + "learning_rate": 6.891485427041211e-08, + "loss": 0.61956275, + "num_input_tokens_seen": 329673650, + "step": 15280, + "time_per_iteration": 2.9862494468688965 + }, + { + "auxiliary_loss_clip": 0.01088849, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.03527343, + "balance_loss_mlp": 1.02072263, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 1.9557558260700731, + "language_loss": 0.69220221, + "learning_rate": 6.881353536939815e-08, + "loss": 0.71342152, + "num_input_tokens_seen": 329692520, + "step": 15281, + "time_per_iteration": 2.50704026222229 + }, + { + "auxiliary_loss_clip": 0.0108689, + "auxiliary_loss_mlp": 0.01027941, + "balance_loss_clip": 1.03514874, + "balance_loss_mlp": 1.01471508, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 1.9528515516476832, + "language_loss": 0.84542298, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86657131, + "num_input_tokens_seen": 329713750, + "step": 15282, + "time_per_iteration": 2.5479536056518555 + }, + { + "auxiliary_loss_clip": 0.01086169, + "auxiliary_loss_mlp": 0.0103467, + "balance_loss_clip": 1.03557658, + "balance_loss_mlp": 1.02140188, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 1.8732709325332833, + "language_loss": 0.60043389, + "learning_rate": 6.861111726356194e-08, + "loss": 0.62164229, + "num_input_tokens_seen": 329730960, + "step": 15283, + "time_per_iteration": 2.5235373973846436 + }, + { + "auxiliary_loss_clip": 0.01102194, + "auxiliary_loss_mlp": 0.00778704, + "balance_loss_clip": 1.03672981, + "balance_loss_mlp": 1.00061929, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 2.1953876954731504, + "language_loss": 0.65873265, + "learning_rate": 6.851001806641554e-08, + "loss": 0.67754161, + "num_input_tokens_seen": 329750975, + "step": 15284, + "time_per_iteration": 2.481919050216675 + }, + { + "auxiliary_loss_clip": 0.0110555, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.034567, + "balance_loss_mlp": 1.02016234, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 1.8059449385774486, + "language_loss": 0.73238868, + "learning_rate": 6.840899211156292e-08, + "loss": 0.75377119, + "num_input_tokens_seen": 329769645, + "step": 15285, + "time_per_iteration": 2.4325435161590576 + }, + { + "auxiliary_loss_clip": 0.01106437, + "auxiliary_loss_mlp": 0.01034337, + "balance_loss_clip": 1.03455651, + "balance_loss_mlp": 1.02186131, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 2.053004581081291, + "language_loss": 0.7173931, + "learning_rate": 6.830803940283458e-08, + "loss": 0.73880082, + "num_input_tokens_seen": 329788185, + "step": 15286, + "time_per_iteration": 2.4044361114501953 + }, + { + "auxiliary_loss_clip": 0.01107532, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.0347302, + "balance_loss_mlp": 1.01952291, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 1.817721669598594, + "language_loss": 0.737526, + "learning_rate": 6.820715994405945e-08, + "loss": 0.75892448, + "num_input_tokens_seen": 329806780, + "step": 15287, + "time_per_iteration": 2.440807819366455 + }, + { + "auxiliary_loss_clip": 0.01110968, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.0383215, + "balance_loss_mlp": 1.01963806, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 8.825697665117383, + "language_loss": 0.65304232, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67448419, + "num_input_tokens_seen": 329826350, + "step": 15288, + "time_per_iteration": 2.392451524734497 + }, + { + "auxiliary_loss_clip": 0.01110748, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.03895736, + "balance_loss_mlp": 1.02237034, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 2.154258826685672, + "language_loss": 0.7128408, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73429215, + "num_input_tokens_seen": 329846160, + "step": 15289, + "time_per_iteration": 2.5000555515289307 + }, + { + "auxiliary_loss_clip": 0.01070832, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.03286409, + "balance_loss_mlp": 1.01974368, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 1.961530718721178, + "language_loss": 0.74277818, + "learning_rate": 6.790496110568921e-08, + "loss": 0.76381445, + "num_input_tokens_seen": 329862020, + "step": 15290, + "time_per_iteration": 2.483347177505493 + }, + { + "auxiliary_loss_clip": 0.01065805, + "auxiliary_loss_mlp": 0.01026289, + "balance_loss_clip": 1.03412008, + "balance_loss_mlp": 1.0148505, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 2.0479193382269734, + "language_loss": 0.71966779, + "learning_rate": 6.78043746849506e-08, + "loss": 0.74058872, + "num_input_tokens_seen": 329880185, + "step": 15291, + "time_per_iteration": 2.6605021953582764 + }, + { + "auxiliary_loss_clip": 0.01084949, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.03528059, + "balance_loss_mlp": 1.01581454, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 1.6515299999951436, + "language_loss": 0.7106728, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73179805, + "num_input_tokens_seen": 329900255, + "step": 15292, + "time_per_iteration": 2.522050619125366 + }, + { + "auxiliary_loss_clip": 0.01087309, + "auxiliary_loss_mlp": 0.01030009, + "balance_loss_clip": 1.03419781, + "balance_loss_mlp": 1.01722431, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 1.7832037004349963, + "language_loss": 0.72651356, + "learning_rate": 6.760342165443988e-08, + "loss": 0.74768674, + "num_input_tokens_seen": 329919095, + "step": 15293, + "time_per_iteration": 4.0057361125946045 + }, + { + "auxiliary_loss_clip": 0.01105703, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.03522849, + "balance_loss_mlp": 1.01736546, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 1.8659344173148098, + "language_loss": 0.78428602, + "learning_rate": 6.750305505228837e-08, + "loss": 0.80563998, + "num_input_tokens_seen": 329936505, + "step": 15294, + "time_per_iteration": 2.3984169960021973 + }, + { + "auxiliary_loss_clip": 0.01087726, + "auxiliary_loss_mlp": 0.0103229, + "balance_loss_clip": 1.03365493, + "balance_loss_mlp": 1.01886094, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 1.9079656750872096, + "language_loss": 0.77062356, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79182374, + "num_input_tokens_seen": 329956795, + "step": 15295, + "time_per_iteration": 2.5447895526885986 + }, + { + "auxiliary_loss_clip": 0.01104999, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.03676045, + "balance_loss_mlp": 1.02091765, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 1.9966449864741538, + "language_loss": 0.71636647, + "learning_rate": 6.730254169322114e-08, + "loss": 0.7377305, + "num_input_tokens_seen": 329977195, + "step": 15296, + "time_per_iteration": 2.4809553623199463 + }, + { + "auxiliary_loss_clip": 0.01109286, + "auxiliary_loss_mlp": 0.01041372, + "balance_loss_clip": 1.03718257, + "balance_loss_mlp": 1.02962399, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 2.032552378714203, + "language_loss": 0.7521981, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77370465, + "num_input_tokens_seen": 329992095, + "step": 15297, + "time_per_iteration": 2.3955142498016357 + }, + { + "auxiliary_loss_clip": 0.01096291, + "auxiliary_loss_mlp": 0.00777108, + "balance_loss_clip": 1.03970444, + "balance_loss_mlp": 1.00053811, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 5.556040210980383, + "language_loss": 0.73422575, + "learning_rate": 6.710232148647676e-08, + "loss": 0.75295973, + "num_input_tokens_seen": 330011490, + "step": 15298, + "time_per_iteration": 2.5328543186187744 + }, + { + "auxiliary_loss_clip": 0.01083678, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.03804255, + "balance_loss_mlp": 1.02405453, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 2.4641683196145174, + "language_loss": 0.79191881, + "learning_rate": 6.70023213247175e-08, + "loss": 0.81312323, + "num_input_tokens_seen": 330027885, + "step": 15299, + "time_per_iteration": 2.4525399208068848 + }, + { + "auxiliary_loss_clip": 0.01081685, + "auxiliary_loss_mlp": 0.01022823, + "balance_loss_clip": 1.03888202, + "balance_loss_mlp": 1.01158786, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 2.13022688728762, + "language_loss": 0.63854134, + "learning_rate": 6.690239446242385e-08, + "loss": 0.65958643, + "num_input_tokens_seen": 330046230, + "step": 15300, + "time_per_iteration": 2.5255942344665527 + }, + { + "auxiliary_loss_clip": 0.01080841, + "auxiliary_loss_mlp": 0.0077453, + "balance_loss_clip": 1.03578997, + "balance_loss_mlp": 1.00045884, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 1.9426624475724263, + "language_loss": 0.69686985, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71542358, + "num_input_tokens_seen": 330065535, + "step": 15301, + "time_per_iteration": 2.5038797855377197 + }, + { + "auxiliary_loss_clip": 0.01096896, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_clip": 1.03453732, + "balance_loss_mlp": 1.02863955, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 2.817379401990614, + "language_loss": 0.7114324, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73283839, + "num_input_tokens_seen": 330082920, + "step": 15302, + "time_per_iteration": 2.4334652423858643 + }, + { + "auxiliary_loss_clip": 0.01109178, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.03654671, + "balance_loss_mlp": 1.01887584, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 1.6711363274075857, + "language_loss": 0.7668587, + "learning_rate": 6.660305371021579e-08, + "loss": 0.78826046, + "num_input_tokens_seen": 330101165, + "step": 15303, + "time_per_iteration": 2.4765450954437256 + }, + { + "auxiliary_loss_clip": 0.01088238, + "auxiliary_loss_mlp": 0.01030759, + "balance_loss_clip": 1.03660107, + "balance_loss_mlp": 1.01896906, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 2.4138317180740483, + "language_loss": 0.87937737, + "learning_rate": 6.650342008365006e-08, + "loss": 0.90056729, + "num_input_tokens_seen": 330118775, + "step": 15304, + "time_per_iteration": 2.4661407470703125 + }, + { + "auxiliary_loss_clip": 0.01048696, + "auxiliary_loss_mlp": 0.01035012, + "balance_loss_clip": 1.03308725, + "balance_loss_mlp": 1.01989627, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 2.8823313209108954, + "language_loss": 0.77563506, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79647219, + "num_input_tokens_seen": 330135570, + "step": 15305, + "time_per_iteration": 4.131372928619385 + }, + { + "auxiliary_loss_clip": 0.01090944, + "auxiliary_loss_mlp": 0.01039608, + "balance_loss_clip": 1.03471279, + "balance_loss_mlp": 1.02688265, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 2.04137654491133, + "language_loss": 0.81662989, + "learning_rate": 6.630437278944501e-08, + "loss": 0.83793533, + "num_input_tokens_seen": 330152840, + "step": 15306, + "time_per_iteration": 2.5447356700897217 + }, + { + "auxiliary_loss_clip": 0.01073876, + "auxiliary_loss_mlp": 0.01034641, + "balance_loss_clip": 1.03815925, + "balance_loss_mlp": 1.023417, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 1.9602920030815445, + "language_loss": 0.72257113, + "learning_rate": 6.62049591293541e-08, + "loss": 0.74365628, + "num_input_tokens_seen": 330168605, + "step": 15307, + "time_per_iteration": 2.5342042446136475 + }, + { + "auxiliary_loss_clip": 0.01099047, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.03542626, + "balance_loss_mlp": 1.02050281, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 1.8199077345179686, + "language_loss": 0.78735709, + "learning_rate": 6.610561879896526e-08, + "loss": 0.80867672, + "num_input_tokens_seen": 330186160, + "step": 15308, + "time_per_iteration": 3.8610944747924805 + }, + { + "auxiliary_loss_clip": 0.01084422, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.03240049, + "balance_loss_mlp": 1.0203867, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 1.7989604622947781, + "language_loss": 0.77811337, + "learning_rate": 6.600635180204484e-08, + "loss": 0.79928726, + "num_input_tokens_seen": 330201780, + "step": 15309, + "time_per_iteration": 2.462707281112671 + }, + { + "auxiliary_loss_clip": 0.01054985, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.02998376, + "balance_loss_mlp": 1.01845527, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 2.853099476833449, + "language_loss": 0.66500068, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68586206, + "num_input_tokens_seen": 330219165, + "step": 15310, + "time_per_iteration": 2.536832332611084 + }, + { + "auxiliary_loss_clip": 0.01046668, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.03253317, + "balance_loss_mlp": 1.0195756, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 1.5883210494000726, + "language_loss": 0.66089135, + "learning_rate": 6.580803782366495e-08, + "loss": 0.681683, + "num_input_tokens_seen": 330238975, + "step": 15311, + "time_per_iteration": 2.741082191467285 + }, + { + "auxiliary_loss_clip": 0.01098262, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.03585005, + "balance_loss_mlp": 1.01878262, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 1.6519657774456802, + "language_loss": 0.75928164, + "learning_rate": 6.570899084972503e-08, + "loss": 0.78057486, + "num_input_tokens_seen": 330259755, + "step": 15312, + "time_per_iteration": 2.870088815689087 + }, + { + "auxiliary_loss_clip": 0.01094888, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.03650904, + "balance_loss_mlp": 1.02111745, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 2.230166738447699, + "language_loss": 0.79455042, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81582314, + "num_input_tokens_seen": 330277660, + "step": 15313, + "time_per_iteration": 2.4528515338897705 + }, + { + "auxiliary_loss_clip": 0.01099678, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.0352397, + "balance_loss_mlp": 1.01747835, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 1.7931717686472122, + "language_loss": 0.78352302, + "learning_rate": 6.55111169511251e-08, + "loss": 0.80481637, + "num_input_tokens_seen": 330295455, + "step": 15314, + "time_per_iteration": 2.4655792713165283 + }, + { + "auxiliary_loss_clip": 0.01091474, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.03604531, + "balance_loss_mlp": 1.0221833, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 2.059803761087819, + "language_loss": 0.79208654, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81335676, + "num_input_tokens_seen": 330315310, + "step": 15315, + "time_per_iteration": 2.5026450157165527 + }, + { + "auxiliary_loss_clip": 0.01089718, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.03592801, + "balance_loss_mlp": 1.0180223, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 1.8795214156723536, + "language_loss": 0.76095319, + "learning_rate": 6.531353647657156e-08, + "loss": 0.78215325, + "num_input_tokens_seen": 330333260, + "step": 15316, + "time_per_iteration": 2.4734766483306885 + }, + { + "auxiliary_loss_clip": 0.01108067, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.03423905, + "balance_loss_mlp": 1.02100134, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 1.6966036412842218, + "language_loss": 0.69325244, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71467423, + "num_input_tokens_seen": 330352465, + "step": 15317, + "time_per_iteration": 2.4520299434661865 + }, + { + "auxiliary_loss_clip": 0.01098493, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.03716552, + "balance_loss_mlp": 1.0200721, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 2.040785223218539, + "language_loss": 0.83808154, + "learning_rate": 6.511624945603378e-08, + "loss": 0.85938984, + "num_input_tokens_seen": 330372685, + "step": 15318, + "time_per_iteration": 2.596917152404785 + }, + { + "auxiliary_loss_clip": 0.010879, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.03725648, + "balance_loss_mlp": 1.01954472, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 2.038579820574787, + "language_loss": 0.8561002, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87729532, + "num_input_tokens_seen": 330388860, + "step": 15319, + "time_per_iteration": 4.056842088699341 + }, + { + "auxiliary_loss_clip": 0.01026161, + "auxiliary_loss_mlp": 0.01000968, + "balance_loss_clip": 1.00285888, + "balance_loss_mlp": 0.99980015, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.768048908665613, + "language_loss": 0.56172967, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58200097, + "num_input_tokens_seen": 330448735, + "step": 15320, + "time_per_iteration": 3.0500380992889404 + }, + { + "auxiliary_loss_clip": 0.01063578, + "auxiliary_loss_mlp": 0.01048223, + "balance_loss_clip": 1.0377748, + "balance_loss_mlp": 1.03302383, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 2.0231620893951523, + "language_loss": 0.63819301, + "learning_rate": 6.482086921695384e-08, + "loss": 0.65931106, + "num_input_tokens_seen": 330465600, + "step": 15321, + "time_per_iteration": 2.528482437133789 + }, + { + "auxiliary_loss_clip": 0.01069524, + "auxiliary_loss_mlp": 0.01028084, + "balance_loss_clip": 1.03441632, + "balance_loss_mlp": 1.01671696, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 1.4943872662424018, + "language_loss": 0.71793222, + "learning_rate": 6.47225558966582e-08, + "loss": 0.73890829, + "num_input_tokens_seen": 330485770, + "step": 15322, + "time_per_iteration": 2.5678231716156006 + }, + { + "auxiliary_loss_clip": 0.01059399, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.03552198, + "balance_loss_mlp": 1.02051187, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 1.7274331191696481, + "language_loss": 0.69814622, + "learning_rate": 6.462431596227725e-08, + "loss": 0.71905982, + "num_input_tokens_seen": 330504255, + "step": 15323, + "time_per_iteration": 2.5457961559295654 + }, + { + "auxiliary_loss_clip": 0.01082657, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.03221202, + "balance_loss_mlp": 1.02437723, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 2.1340149502429226, + "language_loss": 0.74482882, + "learning_rate": 6.452614941753597e-08, + "loss": 0.76603758, + "num_input_tokens_seen": 330520705, + "step": 15324, + "time_per_iteration": 2.4767775535583496 + }, + { + "auxiliary_loss_clip": 0.0109901, + "auxiliary_loss_mlp": 0.01043237, + "balance_loss_clip": 1.03651142, + "balance_loss_mlp": 1.03086877, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 1.814448777181626, + "language_loss": 0.71262473, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73404717, + "num_input_tokens_seen": 330539245, + "step": 15325, + "time_per_iteration": 2.466200351715088 + }, + { + "auxiliary_loss_clip": 0.01082089, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.03600526, + "balance_loss_mlp": 1.02244794, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 1.5616160089948417, + "language_loss": 0.78364784, + "learning_rate": 6.433003651186109e-08, + "loss": 0.80481499, + "num_input_tokens_seen": 330561815, + "step": 15326, + "time_per_iteration": 2.5684008598327637 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.037166, + "balance_loss_mlp": 1.02054024, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 2.3458715424724312, + "language_loss": 0.7151854, + "learning_rate": 6.42320901583635e-08, + "loss": 0.73652226, + "num_input_tokens_seen": 330579760, + "step": 15327, + "time_per_iteration": 2.4409968852996826 + }, + { + "auxiliary_loss_clip": 0.01103856, + "auxiliary_loss_mlp": 0.0104018, + "balance_loss_clip": 1.0393157, + "balance_loss_mlp": 1.02639318, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 1.75381883746106, + "language_loss": 0.77838147, + "learning_rate": 6.413421720937906e-08, + "loss": 0.79982185, + "num_input_tokens_seen": 330598545, + "step": 15328, + "time_per_iteration": 2.5037851333618164 + }, + { + "auxiliary_loss_clip": 0.01087092, + "auxiliary_loss_mlp": 0.01032521, + "balance_loss_clip": 1.03580296, + "balance_loss_mlp": 1.0208385, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 2.0556791764944298, + "language_loss": 0.71147418, + "learning_rate": 6.4036417668619e-08, + "loss": 0.73267025, + "num_input_tokens_seen": 330616700, + "step": 15329, + "time_per_iteration": 2.532440185546875 + }, + { + "auxiliary_loss_clip": 0.01093736, + "auxiliary_loss_mlp": 0.01024155, + "balance_loss_clip": 1.03383315, + "balance_loss_mlp": 1.01293719, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 1.9071044329590103, + "language_loss": 0.87029254, + "learning_rate": 6.393869153979192e-08, + "loss": 0.89147139, + "num_input_tokens_seen": 330633355, + "step": 15330, + "time_per_iteration": 2.4624977111816406 + }, + { + "auxiliary_loss_clip": 0.01076788, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.03242254, + "balance_loss_mlp": 1.0193038, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 2.4978828792342105, + "language_loss": 0.75662935, + "learning_rate": 6.384103882660397e-08, + "loss": 0.77771366, + "num_input_tokens_seen": 330651470, + "step": 15331, + "time_per_iteration": 2.5196588039398193 + }, + { + "auxiliary_loss_clip": 0.01095775, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.03407311, + "balance_loss_mlp": 1.01642036, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 1.873780259360794, + "language_loss": 0.75488836, + "learning_rate": 6.374345953275794e-08, + "loss": 0.77613044, + "num_input_tokens_seen": 330669170, + "step": 15332, + "time_per_iteration": 3.966493844985962 + }, + { + "auxiliary_loss_clip": 0.01059416, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.03385115, + "balance_loss_mlp": 1.02011585, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 1.9044503791481544, + "language_loss": 0.74653578, + "learning_rate": 6.364595366195358e-08, + "loss": 0.76744425, + "num_input_tokens_seen": 330686635, + "step": 15333, + "time_per_iteration": 2.5497591495513916 + }, + { + "auxiliary_loss_clip": 0.01015746, + "auxiliary_loss_mlp": 0.0100704, + "balance_loss_clip": 1.00533712, + "balance_loss_mlp": 1.0059309, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.8133838952731816, + "language_loss": 0.52865934, + "learning_rate": 6.354852121788879e-08, + "loss": 0.54888725, + "num_input_tokens_seen": 330749160, + "step": 15334, + "time_per_iteration": 3.004093885421753 + }, + { + "auxiliary_loss_clip": 0.01085006, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.03939295, + "balance_loss_mlp": 1.01839375, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 2.1780163674683255, + "language_loss": 0.62490726, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64605534, + "num_input_tokens_seen": 330766840, + "step": 15335, + "time_per_iteration": 2.478280782699585 + }, + { + "auxiliary_loss_clip": 0.0106113, + "auxiliary_loss_mlp": 0.01033485, + "balance_loss_clip": 1.03221226, + "balance_loss_mlp": 1.02047324, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 1.7229048970038088, + "language_loss": 0.71516275, + "learning_rate": 6.335387662475366e-08, + "loss": 0.73610896, + "num_input_tokens_seen": 330785585, + "step": 15336, + "time_per_iteration": 2.5863640308380127 + }, + { + "auxiliary_loss_clip": 0.01083358, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.0352993, + "balance_loss_mlp": 1.01953566, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 2.2883050732418817, + "language_loss": 0.71824628, + "learning_rate": 6.325666448306433e-08, + "loss": 0.73937994, + "num_input_tokens_seen": 330800750, + "step": 15337, + "time_per_iteration": 2.456634044647217 + }, + { + "auxiliary_loss_clip": 0.01017798, + "auxiliary_loss_mlp": 0.00998496, + "balance_loss_clip": 1.00346684, + "balance_loss_mlp": 0.9972021, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 0.881048558507811, + "language_loss": 0.65348059, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67364353, + "num_input_tokens_seen": 330863640, + "step": 15338, + "time_per_iteration": 2.997159004211426 + }, + { + "auxiliary_loss_clip": 0.01100755, + "auxiliary_loss_mlp": 0.01036344, + "balance_loss_clip": 1.03801346, + "balance_loss_mlp": 1.02397013, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 2.2159751976588256, + "language_loss": 0.67266983, + "learning_rate": 6.306246052787289e-08, + "loss": 0.69404078, + "num_input_tokens_seen": 330884675, + "step": 15339, + "time_per_iteration": 2.560986042022705 + }, + { + "auxiliary_loss_clip": 0.0110777, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.03645778, + "balance_loss_mlp": 1.01867294, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 2.544901126386359, + "language_loss": 0.72367519, + "learning_rate": 6.296546872173513e-08, + "loss": 0.74506044, + "num_input_tokens_seen": 330904125, + "step": 15340, + "time_per_iteration": 2.4900543689727783 + }, + { + "auxiliary_loss_clip": 0.01074004, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.03391433, + "balance_loss_mlp": 1.01695859, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 1.7227195678843277, + "language_loss": 0.7033509, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72437823, + "num_input_tokens_seen": 330925140, + "step": 15341, + "time_per_iteration": 2.6346561908721924 + }, + { + "auxiliary_loss_clip": 0.01056463, + "auxiliary_loss_mlp": 0.01027778, + "balance_loss_clip": 1.03652656, + "balance_loss_mlp": 1.01698315, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 1.6636954166946383, + "language_loss": 0.67347175, + "learning_rate": 6.277170547076571e-08, + "loss": 0.69431412, + "num_input_tokens_seen": 330946625, + "step": 15342, + "time_per_iteration": 2.6444993019104004 + }, + { + "auxiliary_loss_clip": 0.01060027, + "auxiliary_loss_mlp": 0.01032031, + "balance_loss_clip": 1.03709805, + "balance_loss_mlp": 1.02103961, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 1.9262261956278082, + "language_loss": 0.69488746, + "learning_rate": 6.26749340332815e-08, + "loss": 0.71580803, + "num_input_tokens_seen": 330967795, + "step": 15343, + "time_per_iteration": 2.8398144245147705 + }, + { + "auxiliary_loss_clip": 0.01009272, + "auxiliary_loss_mlp": 0.01000122, + "balance_loss_clip": 1.00459576, + "balance_loss_mlp": 0.99888861, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.7234472494357167, + "language_loss": 0.52009767, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54019159, + "num_input_tokens_seen": 331040850, + "step": 15344, + "time_per_iteration": 4.798701286315918 + }, + { + "auxiliary_loss_clip": 0.01103132, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.03586948, + "balance_loss_mlp": 1.02079892, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 1.6685378178963117, + "language_loss": 0.70404249, + "learning_rate": 6.248161155266162e-08, + "loss": 0.72539121, + "num_input_tokens_seen": 331060595, + "step": 15345, + "time_per_iteration": 2.435863971710205 + }, + { + "auxiliary_loss_clip": 0.01087077, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.03558636, + "balance_loss_mlp": 1.02083635, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 1.904809571773224, + "language_loss": 0.77487272, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79607582, + "num_input_tokens_seen": 331080195, + "step": 15346, + "time_per_iteration": 2.512554168701172 + }, + { + "auxiliary_loss_clip": 0.01088343, + "auxiliary_loss_mlp": 0.01036889, + "balance_loss_clip": 1.03762627, + "balance_loss_mlp": 1.0241034, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 1.8718872076752737, + "language_loss": 0.75915879, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78041112, + "num_input_tokens_seen": 331097645, + "step": 15347, + "time_per_iteration": 3.9394052028656006 + }, + { + "auxiliary_loss_clip": 0.01093751, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.03651094, + "balance_loss_mlp": 1.02000403, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 1.4933571614071188, + "language_loss": 0.76929539, + "learning_rate": 6.219217887256367e-08, + "loss": 0.79054207, + "num_input_tokens_seen": 331116830, + "step": 15348, + "time_per_iteration": 2.464174747467041 + }, + { + "auxiliary_loss_clip": 0.01086537, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.03348708, + "balance_loss_mlp": 1.01598275, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 2.0864157891545614, + "language_loss": 0.67716825, + "learning_rate": 6.209584827138959e-08, + "loss": 0.69831884, + "num_input_tokens_seen": 331137235, + "step": 15349, + "time_per_iteration": 2.516444683074951 + }, + { + "auxiliary_loss_clip": 0.01068606, + "auxiliary_loss_mlp": 0.01029121, + "balance_loss_clip": 1.03203487, + "balance_loss_mlp": 1.01687217, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 3.3079323589480127, + "language_loss": 0.87072253, + "learning_rate": 6.199959115573495e-08, + "loss": 0.89169979, + "num_input_tokens_seen": 331153155, + "step": 15350, + "time_per_iteration": 2.4912309646606445 + }, + { + "auxiliary_loss_clip": 0.01010911, + "auxiliary_loss_mlp": 0.01000428, + "balance_loss_clip": 1.00642192, + "balance_loss_mlp": 0.99930173, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.7689349231908875, + "language_loss": 0.60338378, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62349725, + "num_input_tokens_seen": 331214895, + "step": 15351, + "time_per_iteration": 3.0365006923675537 + }, + { + "auxiliary_loss_clip": 0.01082022, + "auxiliary_loss_mlp": 0.01026327, + "balance_loss_clip": 1.03290892, + "balance_loss_mlp": 1.01463807, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 2.1658339253130667, + "language_loss": 0.77778232, + "learning_rate": 6.180729739558233e-08, + "loss": 0.7988658, + "num_input_tokens_seen": 331232185, + "step": 15352, + "time_per_iteration": 2.486499309539795 + }, + { + "auxiliary_loss_clip": 0.01075605, + "auxiliary_loss_mlp": 0.01044758, + "balance_loss_clip": 1.0331856, + "balance_loss_mlp": 1.03027415, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 2.087910142669834, + "language_loss": 0.59483951, + "learning_rate": 6.171126075837585e-08, + "loss": 0.61604309, + "num_input_tokens_seen": 331251065, + "step": 15353, + "time_per_iteration": 2.53031325340271 + }, + { + "auxiliary_loss_clip": 0.01083399, + "auxiliary_loss_mlp": 0.01026293, + "balance_loss_clip": 1.03387499, + "balance_loss_mlp": 1.01523638, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 1.7296929361769253, + "language_loss": 0.74640882, + "learning_rate": 6.161529762127293e-08, + "loss": 0.76750565, + "num_input_tokens_seen": 331269110, + "step": 15354, + "time_per_iteration": 2.4797470569610596 + }, + { + "auxiliary_loss_clip": 0.01111355, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.03619409, + "balance_loss_mlp": 1.02239299, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 3.343481215166121, + "language_loss": 0.65244806, + "learning_rate": 6.1519407987912e-08, + "loss": 0.67391729, + "num_input_tokens_seen": 331286555, + "step": 15355, + "time_per_iteration": 2.409888744354248 + }, + { + "auxiliary_loss_clip": 0.01081948, + "auxiliary_loss_mlp": 0.01039839, + "balance_loss_clip": 1.03388691, + "balance_loss_mlp": 1.02751255, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 1.5245533902379724, + "language_loss": 0.74253875, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76375663, + "num_input_tokens_seen": 331307660, + "step": 15356, + "time_per_iteration": 2.5358123779296875 + }, + { + "auxiliary_loss_clip": 0.01085483, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.03451133, + "balance_loss_mlp": 1.01736999, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 2.052540174626389, + "language_loss": 0.60949147, + "learning_rate": 6.132784924695844e-08, + "loss": 0.63064897, + "num_input_tokens_seen": 331324885, + "step": 15357, + "time_per_iteration": 2.4500174522399902 + }, + { + "auxiliary_loss_clip": 0.01081487, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.03618288, + "balance_loss_mlp": 1.01963663, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 1.5120949685986143, + "language_loss": 0.70007193, + "learning_rate": 6.123218014662956e-08, + "loss": 0.72120726, + "num_input_tokens_seen": 331345885, + "step": 15358, + "time_per_iteration": 4.015915393829346 + }, + { + "auxiliary_loss_clip": 0.0110757, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.03554821, + "balance_loss_mlp": 1.0210228, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 1.933301233745224, + "language_loss": 0.73816317, + "learning_rate": 6.113658456457104e-08, + "loss": 0.75956756, + "num_input_tokens_seen": 331364320, + "step": 15359, + "time_per_iteration": 2.4575767517089844 + }, + { + "auxiliary_loss_clip": 0.01051726, + "auxiliary_loss_mlp": 0.01028187, + "balance_loss_clip": 1.03816676, + "balance_loss_mlp": 1.01665354, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 2.10871048681733, + "language_loss": 0.64700568, + "learning_rate": 6.104106250440732e-08, + "loss": 0.66780484, + "num_input_tokens_seen": 331384135, + "step": 15360, + "time_per_iteration": 2.9052889347076416 + }, + { + "auxiliary_loss_clip": 0.01017384, + "auxiliary_loss_mlp": 0.0075278, + "balance_loss_clip": 1.0037539, + "balance_loss_mlp": 1.00024879, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.7665495117158443, + "language_loss": 0.55096447, + "learning_rate": 6.094561396976083e-08, + "loss": 0.5686661, + "num_input_tokens_seen": 331440645, + "step": 15361, + "time_per_iteration": 3.332604169845581 + }, + { + "auxiliary_loss_clip": 0.01078248, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.03291535, + "balance_loss_mlp": 1.01860535, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 1.8460122379833415, + "language_loss": 0.7009033, + "learning_rate": 6.085023896425112e-08, + "loss": 0.72200054, + "num_input_tokens_seen": 331459580, + "step": 15362, + "time_per_iteration": 2.5357673168182373 + }, + { + "auxiliary_loss_clip": 0.01095121, + "auxiliary_loss_mlp": 0.01035842, + "balance_loss_clip": 1.03369296, + "balance_loss_mlp": 1.02070284, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 1.5097429602834307, + "language_loss": 0.76054364, + "learning_rate": 6.075493749149463e-08, + "loss": 0.78185326, + "num_input_tokens_seen": 331481560, + "step": 15363, + "time_per_iteration": 2.5183544158935547 + }, + { + "auxiliary_loss_clip": 0.01108496, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.03673244, + "balance_loss_mlp": 1.02042878, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 2.0523862328059814, + "language_loss": 0.83202839, + "learning_rate": 6.065970955510514e-08, + "loss": 0.8534379, + "num_input_tokens_seen": 331499090, + "step": 15364, + "time_per_iteration": 2.4703540802001953 + }, + { + "auxiliary_loss_clip": 0.01073232, + "auxiliary_loss_mlp": 0.0102376, + "balance_loss_clip": 1.03452444, + "balance_loss_mlp": 1.01301956, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 1.6276920616560466, + "language_loss": 0.67649245, + "learning_rate": 6.056455515869419e-08, + "loss": 0.69746232, + "num_input_tokens_seen": 331519420, + "step": 15365, + "time_per_iteration": 2.554506778717041 + }, + { + "auxiliary_loss_clip": 0.01110074, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.03728294, + "balance_loss_mlp": 1.01948476, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 2.339638982325558, + "language_loss": 0.62826836, + "learning_rate": 6.046947430586913e-08, + "loss": 0.64968503, + "num_input_tokens_seen": 331538720, + "step": 15366, + "time_per_iteration": 2.47196888923645 + }, + { + "auxiliary_loss_clip": 0.01076524, + "auxiliary_loss_mlp": 0.01030559, + "balance_loss_clip": 1.03729367, + "balance_loss_mlp": 1.01814342, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 1.4082896367876045, + "language_loss": 0.74722987, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76830071, + "num_input_tokens_seen": 331558505, + "step": 15367, + "time_per_iteration": 2.548966646194458 + }, + { + "auxiliary_loss_clip": 0.0108375, + "auxiliary_loss_mlp": 0.00775609, + "balance_loss_clip": 1.03512478, + "balance_loss_mlp": 1.00049591, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 1.843775506910525, + "language_loss": 0.6476382, + "learning_rate": 6.027953324539759e-08, + "loss": 0.66623175, + "num_input_tokens_seen": 331578440, + "step": 15368, + "time_per_iteration": 2.533946990966797 + }, + { + "auxiliary_loss_clip": 0.01099326, + "auxiliary_loss_mlp": 0.01032663, + "balance_loss_clip": 1.03450692, + "balance_loss_mlp": 1.02037835, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 1.6757655552050952, + "language_loss": 0.74415034, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76547027, + "num_input_tokens_seen": 331598945, + "step": 15369, + "time_per_iteration": 2.4940338134765625 + }, + { + "auxiliary_loss_clip": 0.01103197, + "auxiliary_loss_mlp": 0.01040222, + "balance_loss_clip": 1.0384407, + "balance_loss_mlp": 1.02647138, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 2.524189560446268, + "language_loss": 0.76551396, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78694814, + "num_input_tokens_seen": 331616700, + "step": 15370, + "time_per_iteration": 2.4478073120117188 + }, + { + "auxiliary_loss_clip": 0.01109051, + "auxiliary_loss_mlp": 0.01031354, + "balance_loss_clip": 1.03641665, + "balance_loss_mlp": 1.01922476, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 2.101307344135711, + "language_loss": 0.66988707, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69129109, + "num_input_tokens_seen": 331635625, + "step": 15371, + "time_per_iteration": 2.4410483837127686 + }, + { + "auxiliary_loss_clip": 0.01013736, + "auxiliary_loss_mlp": 0.01003366, + "balance_loss_clip": 1.00845957, + "balance_loss_mlp": 1.00211453, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.7238697510341162, + "language_loss": 0.57745385, + "learning_rate": 5.99005338059464e-08, + "loss": 0.5976249, + "num_input_tokens_seen": 331698595, + "step": 15372, + "time_per_iteration": 4.4880592823028564 + }, + { + "auxiliary_loss_clip": 0.01106847, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.03753674, + "balance_loss_mlp": 1.01989055, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 1.83208524322613, + "language_loss": 0.69776559, + "learning_rate": 5.98059678590237e-08, + "loss": 0.71914339, + "num_input_tokens_seen": 331717975, + "step": 15373, + "time_per_iteration": 2.412245750427246 + }, + { + "auxiliary_loss_clip": 0.01095041, + "auxiliary_loss_mlp": 0.01039429, + "balance_loss_clip": 1.0346365, + "balance_loss_mlp": 1.02719235, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 2.4541914440119568, + "language_loss": 0.75498068, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77632535, + "num_input_tokens_seen": 331737220, + "step": 15374, + "time_per_iteration": 2.430351734161377 + }, + { + "auxiliary_loss_clip": 0.01071183, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.0319258, + "balance_loss_mlp": 1.01945806, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 1.6910972038665462, + "language_loss": 0.64710379, + "learning_rate": 5.961705668581784e-08, + "loss": 0.66813558, + "num_input_tokens_seen": 331757300, + "step": 15375, + "time_per_iteration": 2.5540120601654053 + }, + { + "auxiliary_loss_clip": 0.01082574, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.03970671, + "balance_loss_mlp": 1.02225041, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 2.141790813960873, + "language_loss": 0.66037339, + "learning_rate": 5.952271146669829e-08, + "loss": 0.68154091, + "num_input_tokens_seen": 331776995, + "step": 15376, + "time_per_iteration": 2.548936605453491 + }, + { + "auxiliary_loss_clip": 0.01026335, + "auxiliary_loss_mlp": 0.01004869, + "balance_loss_clip": 1.00299644, + "balance_loss_mlp": 1.00368297, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.6502461018083978, + "language_loss": 0.6120106, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63232267, + "num_input_tokens_seen": 331845015, + "step": 15377, + "time_per_iteration": 3.0333194732666016 + }, + { + "auxiliary_loss_clip": 0.01070545, + "auxiliary_loss_mlp": 0.01034308, + "balance_loss_clip": 1.03413177, + "balance_loss_mlp": 1.02223182, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 3.604305019714901, + "language_loss": 0.73920393, + "learning_rate": 5.933424178131341e-08, + "loss": 0.76025248, + "num_input_tokens_seen": 331862795, + "step": 15378, + "time_per_iteration": 2.5251142978668213 + }, + { + "auxiliary_loss_clip": 0.01110719, + "auxiliary_loss_mlp": 0.01028573, + "balance_loss_clip": 1.0377847, + "balance_loss_mlp": 1.01572227, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 1.993255290924553, + "language_loss": 0.62436026, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64575315, + "num_input_tokens_seen": 331882535, + "step": 15379, + "time_per_iteration": 2.5240044593811035 + }, + { + "auxiliary_loss_clip": 0.01028171, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.02954054, + "balance_loss_mlp": 1.0200727, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 2.154984358010696, + "language_loss": 0.84013635, + "learning_rate": 5.914606645688591e-08, + "loss": 0.86075234, + "num_input_tokens_seen": 331899335, + "step": 15380, + "time_per_iteration": 2.61576509475708 + }, + { + "auxiliary_loss_clip": 0.01109831, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.03549302, + "balance_loss_mlp": 1.02092385, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 1.6695508092348768, + "language_loss": 0.73245573, + "learning_rate": 5.905208918895233e-08, + "loss": 0.75389415, + "num_input_tokens_seen": 331919030, + "step": 15381, + "time_per_iteration": 2.4453284740448 + }, + { + "auxiliary_loss_clip": 0.01093568, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.03829718, + "balance_loss_mlp": 1.01862574, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 1.9585210571621274, + "language_loss": 0.78491139, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80615509, + "num_input_tokens_seen": 331936465, + "step": 15382, + "time_per_iteration": 2.46868634223938 + }, + { + "auxiliary_loss_clip": 0.01089622, + "auxiliary_loss_mlp": 0.01035097, + "balance_loss_clip": 1.03827751, + "balance_loss_mlp": 1.02213931, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 1.8651983391241451, + "language_loss": 0.75362599, + "learning_rate": 5.886435545946455e-08, + "loss": 0.77487314, + "num_input_tokens_seen": 331954625, + "step": 15383, + "time_per_iteration": 2.5021679401397705 + }, + { + "auxiliary_loss_clip": 0.01085506, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.03374732, + "balance_loss_mlp": 1.02053785, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 2.9386443487719394, + "language_loss": 0.75358307, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.77475846, + "num_input_tokens_seen": 331975865, + "step": 15384, + "time_per_iteration": 4.309897422790527 + }, + { + "auxiliary_loss_clip": 0.01083185, + "auxiliary_loss_mlp": 0.0103368, + "balance_loss_clip": 1.04330611, + "balance_loss_mlp": 1.02093077, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 2.8247974968659446, + "language_loss": 0.66534054, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.68650919, + "num_input_tokens_seen": 331992760, + "step": 15385, + "time_per_iteration": 2.4926397800445557 + }, + { + "auxiliary_loss_clip": 0.01105774, + "auxiliary_loss_mlp": 0.01035677, + "balance_loss_clip": 1.03413117, + "balance_loss_mlp": 1.02357101, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 2.267959486684944, + "language_loss": 0.80964422, + "learning_rate": 5.85833069345496e-08, + "loss": 0.83105874, + "num_input_tokens_seen": 332011890, + "step": 15386, + "time_per_iteration": 2.4462993144989014 + }, + { + "auxiliary_loss_clip": 0.01095787, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.03495336, + "balance_loss_mlp": 1.02671003, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 1.9353131336140892, + "language_loss": 0.75170326, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.77305567, + "num_input_tokens_seen": 332029485, + "step": 15387, + "time_per_iteration": 3.967172622680664 + }, + { + "auxiliary_loss_clip": 0.01090703, + "auxiliary_loss_mlp": 0.01033624, + "balance_loss_clip": 1.03468359, + "balance_loss_mlp": 1.02257311, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 1.6740452459184594, + "language_loss": 0.70038283, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72162616, + "num_input_tokens_seen": 332052970, + "step": 15388, + "time_per_iteration": 2.5650463104248047 + }, + { + "auxiliary_loss_clip": 0.01098434, + "auxiliary_loss_mlp": 0.01025005, + "balance_loss_clip": 1.03587472, + "balance_loss_mlp": 1.0125711, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 1.9718208734768743, + "language_loss": 0.82146394, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84269834, + "num_input_tokens_seen": 332070395, + "step": 15389, + "time_per_iteration": 2.4669137001037598 + }, + { + "auxiliary_loss_clip": 0.01104431, + "auxiliary_loss_mlp": 0.01035219, + "balance_loss_clip": 1.03690791, + "balance_loss_mlp": 1.02183735, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 1.739040071629117, + "language_loss": 0.79501945, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81641591, + "num_input_tokens_seen": 332090185, + "step": 15390, + "time_per_iteration": 2.442143678665161 + }, + { + "auxiliary_loss_clip": 0.0107842, + "auxiliary_loss_mlp": 0.01037745, + "balance_loss_clip": 1.03511322, + "balance_loss_mlp": 1.02441788, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 1.827802661885133, + "language_loss": 0.75507581, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77623749, + "num_input_tokens_seen": 332109050, + "step": 15391, + "time_per_iteration": 2.5333926677703857 + }, + { + "auxiliary_loss_clip": 0.01088395, + "auxiliary_loss_mlp": 0.01033852, + "balance_loss_clip": 1.03434896, + "balance_loss_mlp": 1.02050638, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 3.2475814306099227, + "language_loss": 0.52616113, + "learning_rate": 5.80231976856802e-08, + "loss": 0.54738361, + "num_input_tokens_seen": 332131180, + "step": 15392, + "time_per_iteration": 2.6073827743530273 + }, + { + "auxiliary_loss_clip": 0.01106962, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.03467572, + "balance_loss_mlp": 1.01832592, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 1.6647332820548029, + "language_loss": 0.77281868, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79418945, + "num_input_tokens_seen": 332149555, + "step": 15393, + "time_per_iteration": 2.4740684032440186 + }, + { + "auxiliary_loss_clip": 0.01079062, + "auxiliary_loss_mlp": 0.01033495, + "balance_loss_clip": 1.03429866, + "balance_loss_mlp": 1.02105594, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 3.168188403362759, + "language_loss": 0.69358116, + "learning_rate": 5.783708368464357e-08, + "loss": 0.71470678, + "num_input_tokens_seen": 332165830, + "step": 15394, + "time_per_iteration": 2.4587347507476807 + }, + { + "auxiliary_loss_clip": 0.01108071, + "auxiliary_loss_mlp": 0.01030187, + "balance_loss_clip": 1.03610897, + "balance_loss_mlp": 1.01800442, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 1.7171396096182145, + "language_loss": 0.72937101, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.75075364, + "num_input_tokens_seen": 332185130, + "step": 15395, + "time_per_iteration": 2.414682149887085 + }, + { + "auxiliary_loss_clip": 0.01061617, + "auxiliary_loss_mlp": 0.01029732, + "balance_loss_clip": 1.03659248, + "balance_loss_mlp": 1.01893735, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 1.9987575036228042, + "language_loss": 0.71747434, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.73838782, + "num_input_tokens_seen": 332203695, + "step": 15396, + "time_per_iteration": 2.587261199951172 + }, + { + "auxiliary_loss_clip": 0.01106691, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.0353533, + "balance_loss_mlp": 1.02041936, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 1.5923511027497352, + "language_loss": 0.87430573, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89570016, + "num_input_tokens_seen": 332224850, + "step": 15397, + "time_per_iteration": 2.4577910900115967 + }, + { + "auxiliary_loss_clip": 0.01026268, + "auxiliary_loss_mlp": 0.01001077, + "balance_loss_clip": 1.00299048, + "balance_loss_mlp": 0.99989128, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.7977749630599392, + "language_loss": 0.55183834, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57211179, + "num_input_tokens_seen": 332278085, + "step": 15398, + "time_per_iteration": 4.301177740097046 + }, + { + "auxiliary_loss_clip": 0.01090173, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.03441095, + "balance_loss_mlp": 1.01712382, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 2.1055448979059315, + "language_loss": 0.76331234, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78453314, + "num_input_tokens_seen": 332297875, + "step": 15399, + "time_per_iteration": 2.568031072616577 + }, + { + "auxiliary_loss_clip": 0.01078427, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.03120649, + "balance_loss_mlp": 1.02133906, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 1.432741833095188, + "language_loss": 0.78173172, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80284017, + "num_input_tokens_seen": 332318500, + "step": 15400, + "time_per_iteration": 2.5476009845733643 + }, + { + "auxiliary_loss_clip": 0.01020633, + "auxiliary_loss_mlp": 0.01006953, + "balance_loss_clip": 1.00576591, + "balance_loss_mlp": 1.00574303, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.7376527935075847, + "language_loss": 0.51333004, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53360587, + "num_input_tokens_seen": 332381980, + "step": 15401, + "time_per_iteration": 2.999906539916992 + }, + { + "auxiliary_loss_clip": 0.01094064, + "auxiliary_loss_mlp": 0.01032335, + "balance_loss_clip": 1.0359925, + "balance_loss_mlp": 1.02136803, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 1.648203705339853, + "language_loss": 0.8248595, + "learning_rate": 5.709557384259378e-08, + "loss": 0.84612346, + "num_input_tokens_seen": 332399510, + "step": 15402, + "time_per_iteration": 2.4650683403015137 + }, + { + "auxiliary_loss_clip": 0.01026443, + "auxiliary_loss_mlp": 0.00999021, + "balance_loss_clip": 1.00311005, + "balance_loss_mlp": 0.99784702, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 0.7394856537478897, + "language_loss": 0.5111497, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53140438, + "num_input_tokens_seen": 332459130, + "step": 15403, + "time_per_iteration": 3.0901122093200684 + }, + { + "auxiliary_loss_clip": 0.01007574, + "auxiliary_loss_mlp": 0.01000176, + "balance_loss_clip": 1.00430346, + "balance_loss_mlp": 0.99901372, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.7109691665612401, + "language_loss": 0.58673096, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60680842, + "num_input_tokens_seen": 332526555, + "step": 15404, + "time_per_iteration": 3.093224287033081 + }, + { + "auxiliary_loss_clip": 0.01084063, + "auxiliary_loss_mlp": 0.01033748, + "balance_loss_clip": 1.03731906, + "balance_loss_mlp": 1.02038431, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 1.9908823197053547, + "language_loss": 0.71466553, + "learning_rate": 5.681872319494596e-08, + "loss": 0.73584366, + "num_input_tokens_seen": 332544005, + "step": 15405, + "time_per_iteration": 2.497215509414673 + }, + { + "auxiliary_loss_clip": 0.01066939, + "auxiliary_loss_mlp": 0.0103678, + "balance_loss_clip": 1.03861797, + "balance_loss_mlp": 1.0239706, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 1.7509845492085396, + "language_loss": 0.68491793, + "learning_rate": 5.672658701232458e-08, + "loss": 0.70595509, + "num_input_tokens_seen": 332563070, + "step": 15406, + "time_per_iteration": 2.5979366302490234 + }, + { + "auxiliary_loss_clip": 0.01059656, + "auxiliary_loss_mlp": 0.01043146, + "balance_loss_clip": 1.03262186, + "balance_loss_mlp": 1.02848363, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 2.1908920221030717, + "language_loss": 0.76367855, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78470659, + "num_input_tokens_seen": 332579620, + "step": 15407, + "time_per_iteration": 2.5484113693237305 + }, + { + "auxiliary_loss_clip": 0.01079366, + "auxiliary_loss_mlp": 0.01039784, + "balance_loss_clip": 1.03341842, + "balance_loss_mlp": 1.02670646, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 2.03980774399627, + "language_loss": 0.72705436, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.74824584, + "num_input_tokens_seen": 332597795, + "step": 15408, + "time_per_iteration": 2.5642759799957275 + }, + { + "auxiliary_loss_clip": 0.01078234, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.03332162, + "balance_loss_mlp": 1.01673222, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 1.713400715341724, + "language_loss": 0.68380117, + "learning_rate": 5.645062061315675e-08, + "loss": 0.70485771, + "num_input_tokens_seen": 332620375, + "step": 15409, + "time_per_iteration": 2.729883909225464 + }, + { + "auxiliary_loss_clip": 0.01077582, + "auxiliary_loss_mlp": 0.01032374, + "balance_loss_clip": 1.03772426, + "balance_loss_mlp": 1.01908803, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 2.0008724263039963, + "language_loss": 0.75180203, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.77290159, + "num_input_tokens_seen": 332639510, + "step": 15410, + "time_per_iteration": 2.5559113025665283 + }, + { + "auxiliary_loss_clip": 0.01063001, + "auxiliary_loss_mlp": 0.01029847, + "balance_loss_clip": 1.04222751, + "balance_loss_mlp": 1.01805758, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 1.55988184360268, + "language_loss": 0.8210935, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.842022, + "num_input_tokens_seen": 332658350, + "step": 15411, + "time_per_iteration": 4.002631425857544 + }, + { + "auxiliary_loss_clip": 0.01087175, + "auxiliary_loss_mlp": 0.0103635, + "balance_loss_clip": 1.03842044, + "balance_loss_mlp": 1.02399385, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 1.9187542047558481, + "language_loss": 0.75304872, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77428401, + "num_input_tokens_seen": 332676715, + "step": 15412, + "time_per_iteration": 2.49084734916687 + }, + { + "auxiliary_loss_clip": 0.01106077, + "auxiliary_loss_mlp": 0.01029074, + "balance_loss_clip": 1.03397429, + "balance_loss_mlp": 1.0170579, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 2.4915000972917682, + "language_loss": 0.667579, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.68893051, + "num_input_tokens_seen": 332701470, + "step": 15413, + "time_per_iteration": 2.520751953125 + }, + { + "auxiliary_loss_clip": 0.01052902, + "auxiliary_loss_mlp": 0.01034924, + "balance_loss_clip": 1.03452969, + "balance_loss_mlp": 1.02171552, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 1.6661821091506142, + "language_loss": 0.75945085, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.78032917, + "num_input_tokens_seen": 332719060, + "step": 15414, + "time_per_iteration": 2.5820870399475098 + }, + { + "auxiliary_loss_clip": 0.01097174, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.04054904, + "balance_loss_mlp": 1.01831174, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 3.3687921665613607, + "language_loss": 0.81532598, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83659893, + "num_input_tokens_seen": 332736345, + "step": 15415, + "time_per_iteration": 2.462582588195801 + }, + { + "auxiliary_loss_clip": 0.01088383, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.03257966, + "balance_loss_mlp": 1.02236032, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 1.528070412569881, + "language_loss": 0.54161406, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56284845, + "num_input_tokens_seen": 332756270, + "step": 15416, + "time_per_iteration": 2.527031421661377 + }, + { + "auxiliary_loss_clip": 0.01067631, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.03293729, + "balance_loss_mlp": 1.02262402, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 1.561783289443835, + "language_loss": 0.71712327, + "learning_rate": 5.571795325221807e-08, + "loss": 0.73814225, + "num_input_tokens_seen": 332775185, + "step": 15417, + "time_per_iteration": 2.4839494228363037 + }, + { + "auxiliary_loss_clip": 0.01097814, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.04125142, + "balance_loss_mlp": 1.01870775, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 1.978979395658231, + "language_loss": 0.75826991, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.77956223, + "num_input_tokens_seen": 332794320, + "step": 15418, + "time_per_iteration": 2.474437713623047 + }, + { + "auxiliary_loss_clip": 0.01093862, + "auxiliary_loss_mlp": 0.01027861, + "balance_loss_clip": 1.0332557, + "balance_loss_mlp": 1.01535618, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 1.680996694804255, + "language_loss": 0.76037717, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78159446, + "num_input_tokens_seen": 332818095, + "step": 15419, + "time_per_iteration": 2.5229082107543945 + }, + { + "auxiliary_loss_clip": 0.01103368, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.03400052, + "balance_loss_mlp": 1.02096105, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 2.0719449199238285, + "language_loss": 0.75499296, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.77634704, + "num_input_tokens_seen": 332839860, + "step": 15420, + "time_per_iteration": 2.5089504718780518 + }, + { + "auxiliary_loss_clip": 0.01099725, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.03520954, + "balance_loss_mlp": 1.01872241, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 1.6467798556053233, + "language_loss": 0.76746833, + "learning_rate": 5.535338891759389e-08, + "loss": 0.78878164, + "num_input_tokens_seen": 332861155, + "step": 15421, + "time_per_iteration": 2.4884719848632812 + }, + { + "auxiliary_loss_clip": 0.01084161, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.03747427, + "balance_loss_mlp": 1.01845527, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 2.0287045360177576, + "language_loss": 0.72873545, + "learning_rate": 5.526243217829041e-08, + "loss": 0.74988008, + "num_input_tokens_seen": 332881110, + "step": 15422, + "time_per_iteration": 2.5327179431915283 + }, + { + "auxiliary_loss_clip": 0.01101229, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.03587055, + "balance_loss_mlp": 1.02256608, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 2.69121678470813, + "language_loss": 0.77382857, + "learning_rate": 5.517154918363065e-08, + "loss": 0.79519856, + "num_input_tokens_seen": 332899350, + "step": 15423, + "time_per_iteration": 4.01328182220459 + }, + { + "auxiliary_loss_clip": 0.01098809, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.03468525, + "balance_loss_mlp": 1.01838779, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 1.878112804527716, + "language_loss": 0.74910539, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77040124, + "num_input_tokens_seen": 332918105, + "step": 15424, + "time_per_iteration": 2.4632246494293213 + }, + { + "auxiliary_loss_clip": 0.01017843, + "auxiliary_loss_mlp": 0.01000856, + "balance_loss_clip": 1.00346565, + "balance_loss_mlp": 0.99972993, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 0.8260107406150862, + "language_loss": 0.6067549, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62694192, + "num_input_tokens_seen": 332969490, + "step": 15425, + "time_per_iteration": 2.832519054412842 + }, + { + "auxiliary_loss_clip": 0.01086036, + "auxiliary_loss_mlp": 0.00777666, + "balance_loss_clip": 1.03609407, + "balance_loss_mlp": 1.00051129, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 1.4293204488548696, + "language_loss": 0.70631242, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72494942, + "num_input_tokens_seen": 332988805, + "step": 15426, + "time_per_iteration": 3.986506462097168 + }, + { + "auxiliary_loss_clip": 0.0108433, + "auxiliary_loss_mlp": 0.01027385, + "balance_loss_clip": 1.03579593, + "balance_loss_mlp": 1.01600075, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 1.8637235026322043, + "language_loss": 0.83040577, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85152292, + "num_input_tokens_seen": 333007960, + "step": 15427, + "time_per_iteration": 2.498709201812744 + }, + { + "auxiliary_loss_clip": 0.01078012, + "auxiliary_loss_mlp": 0.01036264, + "balance_loss_clip": 1.0363344, + "balance_loss_mlp": 1.02414656, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 1.5858663971430818, + "language_loss": 0.77041841, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79156119, + "num_input_tokens_seen": 333026035, + "step": 15428, + "time_per_iteration": 2.552846670150757 + }, + { + "auxiliary_loss_clip": 0.01068477, + "auxiliary_loss_mlp": 0.01036146, + "balance_loss_clip": 1.0313344, + "balance_loss_mlp": 1.02373028, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 2.192791476306929, + "language_loss": 0.7473281, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.76837432, + "num_input_tokens_seen": 333045590, + "step": 15429, + "time_per_iteration": 2.5365922451019287 + }, + { + "auxiliary_loss_clip": 0.01071255, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.03219485, + "balance_loss_mlp": 1.02290916, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 2.0317123952941687, + "language_loss": 0.74935257, + "learning_rate": 5.45374333601647e-08, + "loss": 0.77041441, + "num_input_tokens_seen": 333063355, + "step": 15430, + "time_per_iteration": 2.5464835166931152 + }, + { + "auxiliary_loss_clip": 0.01097495, + "auxiliary_loss_mlp": 0.0103273, + "balance_loss_clip": 1.03429174, + "balance_loss_mlp": 1.01953959, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 1.4076046374677453, + "language_loss": 0.76501405, + "learning_rate": 5.444714044648391e-08, + "loss": 0.78631634, + "num_input_tokens_seen": 333088045, + "step": 15431, + "time_per_iteration": 2.659334659576416 + }, + { + "auxiliary_loss_clip": 0.01095213, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.03666949, + "balance_loss_mlp": 1.01830387, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 1.7917572881444772, + "language_loss": 0.70806682, + "learning_rate": 5.4356921308363e-08, + "loss": 0.72931516, + "num_input_tokens_seen": 333108005, + "step": 15432, + "time_per_iteration": 2.4995481967926025 + }, + { + "auxiliary_loss_clip": 0.01064126, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.03768492, + "balance_loss_mlp": 1.01943874, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 2.684967764546083, + "language_loss": 0.82635391, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.84730428, + "num_input_tokens_seen": 333124335, + "step": 15433, + "time_per_iteration": 2.534179449081421 + }, + { + "auxiliary_loss_clip": 0.01103334, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.03573394, + "balance_loss_mlp": 1.0204978, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 2.169887132843168, + "language_loss": 0.66724718, + "learning_rate": 5.417670437248056e-08, + "loss": 0.68859482, + "num_input_tokens_seen": 333143995, + "step": 15434, + "time_per_iteration": 2.461414337158203 + }, + { + "auxiliary_loss_clip": 0.0107709, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.03198504, + "balance_loss_mlp": 1.01616263, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 1.7295354941340304, + "language_loss": 0.69416893, + "learning_rate": 5.40867065815529e-08, + "loss": 0.71521389, + "num_input_tokens_seen": 333162805, + "step": 15435, + "time_per_iteration": 2.498135566711426 + }, + { + "auxiliary_loss_clip": 0.01109498, + "auxiliary_loss_mlp": 0.01031799, + "balance_loss_clip": 1.03700686, + "balance_loss_mlp": 1.01915097, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 2.084427558998929, + "language_loss": 0.72078383, + "learning_rate": 5.399678257985263e-08, + "loss": 0.74219686, + "num_input_tokens_seen": 333175770, + "step": 15436, + "time_per_iteration": 2.3884236812591553 + }, + { + "auxiliary_loss_clip": 0.01088432, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.04020202, + "balance_loss_mlp": 1.01942182, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 2.1438101311078173, + "language_loss": 0.67295909, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69416285, + "num_input_tokens_seen": 333194775, + "step": 15437, + "time_per_iteration": 3.955960750579834 + }, + { + "auxiliary_loss_clip": 0.01094901, + "auxiliary_loss_mlp": 0.01035216, + "balance_loss_clip": 1.03456235, + "balance_loss_mlp": 1.02203727, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 2.1008357478539357, + "language_loss": 0.71370578, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73500693, + "num_input_tokens_seen": 333208920, + "step": 15438, + "time_per_iteration": 2.4153122901916504 + }, + { + "auxiliary_loss_clip": 0.01108004, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.03603911, + "balance_loss_mlp": 1.01991212, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 1.9023830460150029, + "language_loss": 0.64581633, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.66721505, + "num_input_tokens_seen": 333229350, + "step": 15439, + "time_per_iteration": 2.430742025375366 + }, + { + "auxiliary_loss_clip": 0.01085967, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.03482318, + "balance_loss_mlp": 1.020648, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 1.8926961774404778, + "language_loss": 0.70585859, + "learning_rate": 5.363782453347876e-08, + "loss": 0.7270425, + "num_input_tokens_seen": 333246125, + "step": 15440, + "time_per_iteration": 2.5614969730377197 + }, + { + "auxiliary_loss_clip": 0.01074655, + "auxiliary_loss_mlp": 0.00779539, + "balance_loss_clip": 1.03399491, + "balance_loss_mlp": 1.00052953, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 1.668914719753688, + "language_loss": 0.76917714, + "learning_rate": 5.354826952900682e-08, + "loss": 0.78771907, + "num_input_tokens_seen": 333263685, + "step": 15441, + "time_per_iteration": 2.5655946731567383 + }, + { + "auxiliary_loss_clip": 0.01090843, + "auxiliary_loss_mlp": 0.01027671, + "balance_loss_clip": 1.03378153, + "balance_loss_mlp": 1.01754999, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 1.5996046528418215, + "language_loss": 0.64071476, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66189992, + "num_input_tokens_seen": 333282435, + "step": 15442, + "time_per_iteration": 2.476954936981201 + }, + { + "auxiliary_loss_clip": 0.01067371, + "auxiliary_loss_mlp": 0.01046393, + "balance_loss_clip": 1.03411341, + "balance_loss_mlp": 1.03224325, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 1.902123153605536, + "language_loss": 0.8069315, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.82806915, + "num_input_tokens_seen": 333300400, + "step": 15443, + "time_per_iteration": 2.5259392261505127 + }, + { + "auxiliary_loss_clip": 0.01098814, + "auxiliary_loss_mlp": 0.00776848, + "balance_loss_clip": 1.03560901, + "balance_loss_mlp": 1.00068557, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 2.0032539680941417, + "language_loss": 0.65435153, + "learning_rate": 5.328004738702896e-08, + "loss": 0.67310816, + "num_input_tokens_seen": 333318980, + "step": 15444, + "time_per_iteration": 2.491889476776123 + }, + { + "auxiliary_loss_clip": 0.01066129, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.03310347, + "balance_loss_mlp": 1.01955163, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 2.1568525035480897, + "language_loss": 0.73673314, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.75771129, + "num_input_tokens_seen": 333334135, + "step": 15445, + "time_per_iteration": 2.5095372200012207 + }, + { + "auxiliary_loss_clip": 0.01094427, + "auxiliary_loss_mlp": 0.01040451, + "balance_loss_clip": 1.03560138, + "balance_loss_mlp": 1.02712333, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 1.6621767982808122, + "language_loss": 0.71453196, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73588073, + "num_input_tokens_seen": 333353325, + "step": 15446, + "time_per_iteration": 2.5193421840667725 + }, + { + "auxiliary_loss_clip": 0.0105718, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.03951323, + "balance_loss_mlp": 1.01845217, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 2.4066996339386373, + "language_loss": 0.69212782, + "learning_rate": 5.301248962337523e-08, + "loss": 0.7130121, + "num_input_tokens_seen": 333371110, + "step": 15447, + "time_per_iteration": 2.79167103767395 + }, + { + "auxiliary_loss_clip": 0.01100476, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.03384805, + "balance_loss_mlp": 1.0185976, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 3.0874767825464438, + "language_loss": 0.72375935, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74505997, + "num_input_tokens_seen": 333391420, + "step": 15448, + "time_per_iteration": 2.8124289512634277 + }, + { + "auxiliary_loss_clip": 0.0110817, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.03638232, + "balance_loss_mlp": 1.01625788, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 1.6792655116160728, + "language_loss": 0.74417901, + "learning_rate": 5.283448692511072e-08, + "loss": 0.7655549, + "num_input_tokens_seen": 333410365, + "step": 15449, + "time_per_iteration": 2.425783157348633 + }, + { + "auxiliary_loss_clip": 0.01108419, + "auxiliary_loss_mlp": 0.00778683, + "balance_loss_clip": 1.03603208, + "balance_loss_mlp": 1.000669, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 2.4044763932747575, + "language_loss": 0.68326062, + "learning_rate": 5.27455963293586e-08, + "loss": 0.70213169, + "num_input_tokens_seen": 333430000, + "step": 15450, + "time_per_iteration": 3.9708917140960693 + }, + { + "auxiliary_loss_clip": 0.01077553, + "auxiliary_loss_mlp": 0.01026466, + "balance_loss_clip": 1.035478, + "balance_loss_mlp": 1.01459849, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 2.151148282932138, + "language_loss": 0.72254449, + "learning_rate": 5.265677957368875e-08, + "loss": 0.74358463, + "num_input_tokens_seen": 333445800, + "step": 15451, + "time_per_iteration": 2.5147781372070312 + }, + { + "auxiliary_loss_clip": 0.01083865, + "auxiliary_loss_mlp": 0.01044625, + "balance_loss_clip": 1.03227448, + "balance_loss_mlp": 1.031178, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 2.22659991935067, + "language_loss": 0.733899, + "learning_rate": 5.25680366614687e-08, + "loss": 0.75518394, + "num_input_tokens_seen": 333461550, + "step": 15452, + "time_per_iteration": 2.451446771621704 + }, + { + "auxiliary_loss_clip": 0.01089613, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.03780735, + "balance_loss_mlp": 1.01789224, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 1.9980537664104125, + "language_loss": 0.74237359, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76357234, + "num_input_tokens_seen": 333478835, + "step": 15453, + "time_per_iteration": 2.4806289672851562 + }, + { + "auxiliary_loss_clip": 0.00993762, + "auxiliary_loss_mlp": 0.01002523, + "balance_loss_clip": 1.00833118, + "balance_loss_mlp": 1.00129497, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 0.8366569357354987, + "language_loss": 0.60545504, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62541789, + "num_input_tokens_seen": 333535250, + "step": 15454, + "time_per_iteration": 3.009436845779419 + }, + { + "auxiliary_loss_clip": 0.01084815, + "auxiliary_loss_mlp": 0.01038311, + "balance_loss_clip": 1.03234315, + "balance_loss_mlp": 1.02506697, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 2.4442229257720878, + "language_loss": 0.69259673, + "learning_rate": 5.230225101914709e-08, + "loss": 0.71382797, + "num_input_tokens_seen": 333553805, + "step": 15455, + "time_per_iteration": 2.504382848739624 + }, + { + "auxiliary_loss_clip": 0.01070581, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.03945839, + "balance_loss_mlp": 1.02101421, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 1.953604330152468, + "language_loss": 0.64637768, + "learning_rate": 5.22138035143509e-08, + "loss": 0.66742754, + "num_input_tokens_seen": 333572800, + "step": 15456, + "time_per_iteration": 2.533820152282715 + }, + { + "auxiliary_loss_clip": 0.01064313, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.03338814, + "balance_loss_mlp": 1.01777363, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 1.6970122656635749, + "language_loss": 0.68070775, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70165503, + "num_input_tokens_seen": 333588520, + "step": 15457, + "time_per_iteration": 2.5490548610687256 + }, + { + "auxiliary_loss_clip": 0.0108731, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.0351429, + "balance_loss_mlp": 1.01724935, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 2.01908303447966, + "language_loss": 0.80747873, + "learning_rate": 5.203713008885291e-08, + "loss": 0.82864344, + "num_input_tokens_seen": 333603435, + "step": 15458, + "time_per_iteration": 2.47458815574646 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.03687668, + "balance_loss_mlp": 1.01976204, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 1.6067819521375135, + "language_loss": 0.72404528, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74535918, + "num_input_tokens_seen": 333623305, + "step": 15459, + "time_per_iteration": 2.5105838775634766 + }, + { + "auxiliary_loss_clip": 0.01070759, + "auxiliary_loss_mlp": 0.01035339, + "balance_loss_clip": 1.03244638, + "balance_loss_mlp": 1.02313161, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 2.5362112922342956, + "language_loss": 0.58851749, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.60957849, + "num_input_tokens_seen": 333641205, + "step": 15460, + "time_per_iteration": 2.4914917945861816 + }, + { + "auxiliary_loss_clip": 0.01065296, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.03677273, + "balance_loss_mlp": 1.0205158, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 1.8098360974399026, + "language_loss": 0.80358499, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82457685, + "num_input_tokens_seen": 333659615, + "step": 15461, + "time_per_iteration": 2.6253297328948975 + }, + { + "auxiliary_loss_clip": 0.01082891, + "auxiliary_loss_mlp": 0.01024947, + "balance_loss_clip": 1.03476703, + "balance_loss_mlp": 1.01325226, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 3.1667069878000436, + "language_loss": 0.78267437, + "learning_rate": 5.168466966796869e-08, + "loss": 0.80375278, + "num_input_tokens_seen": 333678985, + "step": 15462, + "time_per_iteration": 4.083110332489014 + }, + { + "auxiliary_loss_clip": 0.01068215, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.02974606, + "balance_loss_mlp": 1.01460266, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 1.993526244744498, + "language_loss": 0.62729144, + "learning_rate": 5.159673925518282e-08, + "loss": 0.64824474, + "num_input_tokens_seen": 333696410, + "step": 15463, + "time_per_iteration": 2.528648853302002 + }, + { + "auxiliary_loss_clip": 0.01082487, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.03039885, + "balance_loss_mlp": 1.01846337, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 1.4166373336558444, + "language_loss": 0.71032441, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73145056, + "num_input_tokens_seen": 333716615, + "step": 15464, + "time_per_iteration": 2.5781071186065674 + }, + { + "auxiliary_loss_clip": 0.01082983, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.0324055, + "balance_loss_mlp": 1.0188421, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 3.801741486903871, + "language_loss": 0.77272224, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79386002, + "num_input_tokens_seen": 333732800, + "step": 15465, + "time_per_iteration": 3.786510705947876 + }, + { + "auxiliary_loss_clip": 0.00983724, + "auxiliary_loss_mlp": 0.01002404, + "balance_loss_clip": 1.01223147, + "balance_loss_mlp": 1.00128961, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.6904827886620468, + "language_loss": 0.56467164, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58453298, + "num_input_tokens_seen": 333799300, + "step": 15466, + "time_per_iteration": 3.7379150390625 + }, + { + "auxiliary_loss_clip": 0.01086019, + "auxiliary_loss_mlp": 0.01038509, + "balance_loss_clip": 1.03295243, + "balance_loss_mlp": 1.02505636, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 2.2604975623784385, + "language_loss": 0.72988063, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.75112593, + "num_input_tokens_seen": 333820360, + "step": 15467, + "time_per_iteration": 2.692340850830078 + }, + { + "auxiliary_loss_clip": 0.01081577, + "auxiliary_loss_mlp": 0.01035731, + "balance_loss_clip": 1.0340929, + "balance_loss_mlp": 1.02243352, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 1.8040594670182764, + "language_loss": 0.72178626, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.74295932, + "num_input_tokens_seen": 333840415, + "step": 15468, + "time_per_iteration": 2.549288511276245 + }, + { + "auxiliary_loss_clip": 0.0109508, + "auxiliary_loss_mlp": 0.01036433, + "balance_loss_clip": 1.03273034, + "balance_loss_mlp": 1.02202082, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 2.386700572695392, + "language_loss": 0.75737691, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77869201, + "num_input_tokens_seen": 333859910, + "step": 15469, + "time_per_iteration": 2.477156162261963 + }, + { + "auxiliary_loss_clip": 0.01083781, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.0383178, + "balance_loss_mlp": 1.02086711, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 1.8694608400443236, + "language_loss": 0.7560572, + "learning_rate": 5.098329529416379e-08, + "loss": 0.77722621, + "num_input_tokens_seen": 333880495, + "step": 15470, + "time_per_iteration": 2.5394275188446045 + }, + { + "auxiliary_loss_clip": 0.01067936, + "auxiliary_loss_mlp": 0.01029826, + "balance_loss_clip": 1.03418767, + "balance_loss_mlp": 1.01820898, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 2.3229026101196673, + "language_loss": 0.74592817, + "learning_rate": 5.089595604367902e-08, + "loss": 0.76690578, + "num_input_tokens_seen": 333897640, + "step": 15471, + "time_per_iteration": 2.519763231277466 + }, + { + "auxiliary_loss_clip": 0.01097462, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.03992915, + "balance_loss_mlp": 1.01666796, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 2.1208690111533275, + "language_loss": 0.69320154, + "learning_rate": 5.080869070341487e-08, + "loss": 0.71446663, + "num_input_tokens_seen": 333913670, + "step": 15472, + "time_per_iteration": 2.428255319595337 + }, + { + "auxiliary_loss_clip": 0.0107869, + "auxiliary_loss_mlp": 0.01027954, + "balance_loss_clip": 1.03296065, + "balance_loss_mlp": 1.01674223, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 1.583030584337285, + "language_loss": 0.88677102, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.90783751, + "num_input_tokens_seen": 333934105, + "step": 15473, + "time_per_iteration": 2.4800283908843994 + }, + { + "auxiliary_loss_clip": 0.01091787, + "auxiliary_loss_mlp": 0.01036821, + "balance_loss_clip": 1.04079223, + "balance_loss_mlp": 1.02289152, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 1.8727683777325437, + "language_loss": 0.64004958, + "learning_rate": 5.063438176678203e-08, + "loss": 0.66133559, + "num_input_tokens_seen": 333953635, + "step": 15474, + "time_per_iteration": 2.5244028568267822 + }, + { + "auxiliary_loss_clip": 0.01107451, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.0359304, + "balance_loss_mlp": 1.02191031, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 1.9630939908184872, + "language_loss": 0.74694616, + "learning_rate": 5.054733817702339e-08, + "loss": 0.76836526, + "num_input_tokens_seen": 333971825, + "step": 15475, + "time_per_iteration": 2.3867549896240234 + }, + { + "auxiliary_loss_clip": 0.01095039, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.03372502, + "balance_loss_mlp": 1.01756918, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 1.792522944795141, + "language_loss": 0.66487527, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68612003, + "num_input_tokens_seen": 333990120, + "step": 15476, + "time_per_iteration": 2.5563266277313232 + }, + { + "auxiliary_loss_clip": 0.01064642, + "auxiliary_loss_mlp": 0.01032684, + "balance_loss_clip": 1.03600323, + "balance_loss_mlp": 1.02059627, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 2.1017877243024854, + "language_loss": 0.6889832, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.70995641, + "num_input_tokens_seen": 334007970, + "step": 15477, + "time_per_iteration": 4.335944175720215 + }, + { + "auxiliary_loss_clip": 0.01087184, + "auxiliary_loss_mlp": 0.01028746, + "balance_loss_clip": 1.03674841, + "balance_loss_mlp": 1.01734972, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 2.281914771660848, + "language_loss": 0.5868175, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60797679, + "num_input_tokens_seen": 334027120, + "step": 15478, + "time_per_iteration": 2.5847978591918945 + }, + { + "auxiliary_loss_clip": 0.01088781, + "auxiliary_loss_mlp": 0.01035249, + "balance_loss_clip": 1.03825068, + "balance_loss_mlp": 1.02134919, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 2.544947702969849, + "language_loss": 0.78793263, + "learning_rate": 5.01999030853566e-08, + "loss": 0.80917299, + "num_input_tokens_seen": 334042785, + "step": 15479, + "time_per_iteration": 2.4479176998138428 + }, + { + "auxiliary_loss_clip": 0.01107551, + "auxiliary_loss_mlp": 0.01032396, + "balance_loss_clip": 1.03604007, + "balance_loss_mlp": 1.02104771, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 3.2495550543024643, + "language_loss": 0.68676078, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.70816028, + "num_input_tokens_seen": 334063480, + "step": 15480, + "time_per_iteration": 2.589008092880249 + }, + { + "auxiliary_loss_clip": 0.01108015, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.03574753, + "balance_loss_mlp": 1.02133417, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 1.7650715379968978, + "language_loss": 0.67636716, + "learning_rate": 5.002662914604583e-08, + "loss": 0.69778335, + "num_input_tokens_seen": 334082005, + "step": 15481, + "time_per_iteration": 2.4282407760620117 + }, + { + "auxiliary_loss_clip": 0.01082074, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.03271532, + "balance_loss_mlp": 1.02030563, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 1.8048857818145165, + "language_loss": 0.74619389, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76734388, + "num_input_tokens_seen": 334101375, + "step": 15482, + "time_per_iteration": 2.5334062576293945 + }, + { + "auxiliary_loss_clip": 0.01095259, + "auxiliary_loss_mlp": 0.01029864, + "balance_loss_clip": 1.03421164, + "balance_loss_mlp": 1.01835418, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 1.879699555473255, + "language_loss": 0.80382431, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82507551, + "num_input_tokens_seen": 334119460, + "step": 15483, + "time_per_iteration": 2.489919424057007 + }, + { + "auxiliary_loss_clip": 0.01086999, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.04089975, + "balance_loss_mlp": 1.01882374, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 1.9171415517765493, + "language_loss": 0.74757648, + "learning_rate": 4.976727281916782e-08, + "loss": 0.76875609, + "num_input_tokens_seen": 334136065, + "step": 15484, + "time_per_iteration": 2.5602245330810547 + }, + { + "auxiliary_loss_clip": 0.01085267, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.03615117, + "balance_loss_mlp": 1.01936424, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 2.88400152739517, + "language_loss": 0.76046753, + "learning_rate": 4.968096861188087e-08, + "loss": 0.78163958, + "num_input_tokens_seen": 334153690, + "step": 15485, + "time_per_iteration": 2.4711194038391113 + }, + { + "auxiliary_loss_clip": 0.0106505, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.03222167, + "balance_loss_mlp": 1.02288246, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 1.9415761921601662, + "language_loss": 0.78399932, + "learning_rate": 4.959473836088723e-08, + "loss": 0.80501902, + "num_input_tokens_seen": 334171880, + "step": 15486, + "time_per_iteration": 2.6526761054992676 + }, + { + "auxiliary_loss_clip": 0.01079806, + "auxiliary_loss_mlp": 0.01031832, + "balance_loss_clip": 1.03815234, + "balance_loss_mlp": 1.01910007, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 4.271293257904309, + "language_loss": 0.7657373, + "learning_rate": 4.950858206945674e-08, + "loss": 0.78685367, + "num_input_tokens_seen": 334190005, + "step": 15487, + "time_per_iteration": 2.5403778553009033 + }, + { + "auxiliary_loss_clip": 0.01081486, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.03773713, + "balance_loss_mlp": 1.01653361, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 2.1356412034109167, + "language_loss": 0.67213202, + "learning_rate": 4.942249974085633e-08, + "loss": 0.69323993, + "num_input_tokens_seen": 334209545, + "step": 15488, + "time_per_iteration": 2.695751667022705 + }, + { + "auxiliary_loss_clip": 0.01083645, + "auxiliary_loss_mlp": 0.010315, + "balance_loss_clip": 1.03563452, + "balance_loss_mlp": 1.01916766, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 1.80620001062391, + "language_loss": 0.75301814, + "learning_rate": 4.933649137834983e-08, + "loss": 0.77416956, + "num_input_tokens_seen": 334228900, + "step": 15489, + "time_per_iteration": 2.5110411643981934 + }, + { + "auxiliary_loss_clip": 0.01108917, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.03516245, + "balance_loss_mlp": 1.01859236, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 3.1536721303351993, + "language_loss": 0.8091231, + "learning_rate": 4.925055698519931e-08, + "loss": 0.83052319, + "num_input_tokens_seen": 334245500, + "step": 15490, + "time_per_iteration": 3.8750827312469482 + }, + { + "auxiliary_loss_clip": 0.01063332, + "auxiliary_loss_mlp": 0.01030205, + "balance_loss_clip": 1.03656387, + "balance_loss_mlp": 1.01764035, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 1.6445340802629227, + "language_loss": 0.71929359, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.74022895, + "num_input_tokens_seen": 334264370, + "step": 15491, + "time_per_iteration": 2.5750741958618164 + }, + { + "auxiliary_loss_clip": 0.01081884, + "auxiliary_loss_mlp": 0.00775534, + "balance_loss_clip": 1.03263569, + "balance_loss_mlp": 1.00055408, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 2.0325466212529233, + "language_loss": 0.74770039, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76627463, + "num_input_tokens_seen": 334283905, + "step": 15492, + "time_per_iteration": 2.559342384338379 + }, + { + "auxiliary_loss_clip": 0.01017187, + "auxiliary_loss_mlp": 0.01001047, + "balance_loss_clip": 1.00291467, + "balance_loss_mlp": 0.99993211, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.7144307449981582, + "language_loss": 0.53504193, + "learning_rate": 4.899319765445442e-08, + "loss": 0.5552243, + "num_input_tokens_seen": 334339925, + "step": 15493, + "time_per_iteration": 2.9176368713378906 + }, + { + "auxiliary_loss_clip": 0.01096907, + "auxiliary_loss_mlp": 0.0103094, + "balance_loss_clip": 1.03443718, + "balance_loss_mlp": 1.01955581, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 1.7182488735977701, + "language_loss": 0.70730156, + "learning_rate": 4.890755917128531e-08, + "loss": 0.72858006, + "num_input_tokens_seen": 334357225, + "step": 15494, + "time_per_iteration": 2.449857473373413 + }, + { + "auxiliary_loss_clip": 0.01094867, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.03615928, + "balance_loss_mlp": 1.01671147, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 1.6796034889301947, + "language_loss": 0.67958695, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70083117, + "num_input_tokens_seen": 334375945, + "step": 15495, + "time_per_iteration": 2.5220651626586914 + }, + { + "auxiliary_loss_clip": 0.01105308, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.0346545, + "balance_loss_mlp": 1.01931858, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 2.2952903228746386, + "language_loss": 0.61581671, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.63717794, + "num_input_tokens_seen": 334395310, + "step": 15496, + "time_per_iteration": 2.518156051635742 + }, + { + "auxiliary_loss_clip": 0.01098671, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.03665721, + "balance_loss_mlp": 1.0206542, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 2.082112068410919, + "language_loss": 0.76967859, + "learning_rate": 4.865108764847825e-08, + "loss": 0.79099423, + "num_input_tokens_seen": 334416965, + "step": 15497, + "time_per_iteration": 2.5703935623168945 + }, + { + "auxiliary_loss_clip": 0.0110095, + "auxiliary_loss_mlp": 0.00777992, + "balance_loss_clip": 1.03786159, + "balance_loss_mlp": 1.00053096, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 1.6717153950076704, + "language_loss": 0.66419607, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68298548, + "num_input_tokens_seen": 334435620, + "step": 15498, + "time_per_iteration": 2.507507562637329 + }, + { + "auxiliary_loss_clip": 0.01089458, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.0384202, + "balance_loss_mlp": 1.02177179, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 2.0057812095610417, + "language_loss": 0.79853046, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.81976849, + "num_input_tokens_seen": 334456210, + "step": 15499, + "time_per_iteration": 2.5446932315826416 + }, + { + "auxiliary_loss_clip": 0.01065078, + "auxiliary_loss_mlp": 0.01034977, + "balance_loss_clip": 1.03401971, + "balance_loss_mlp": 1.0226568, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 1.7292660640579816, + "language_loss": 0.76937115, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.79037166, + "num_input_tokens_seen": 334475485, + "step": 15500, + "time_per_iteration": 2.643320083618164 + }, + { + "auxiliary_loss_clip": 0.01074722, + "auxiliary_loss_mlp": 0.01025997, + "balance_loss_clip": 1.03414011, + "balance_loss_mlp": 1.014148, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 1.6166371367581198, + "language_loss": 0.71974587, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.74075305, + "num_input_tokens_seen": 334494740, + "step": 15501, + "time_per_iteration": 4.0738205909729 + }, + { + "auxiliary_loss_clip": 0.01110721, + "auxiliary_loss_mlp": 0.01032629, + "balance_loss_clip": 1.03687358, + "balance_loss_mlp": 1.02031493, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 2.0416529338812315, + "language_loss": 0.66339564, + "learning_rate": 4.822511506047666e-08, + "loss": 0.68482912, + "num_input_tokens_seen": 334511910, + "step": 15502, + "time_per_iteration": 2.458104133605957 + }, + { + "auxiliary_loss_clip": 0.01099028, + "auxiliary_loss_mlp": 0.00778359, + "balance_loss_clip": 1.03598976, + "balance_loss_mlp": 1.00064468, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 1.5038883400365644, + "language_loss": 0.6578685, + "learning_rate": 4.814014256446586e-08, + "loss": 0.67664242, + "num_input_tokens_seen": 334533150, + "step": 15503, + "time_per_iteration": 2.513901710510254 + }, + { + "auxiliary_loss_clip": 0.0107399, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.03108621, + "balance_loss_mlp": 1.02259958, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 1.6187201087960292, + "language_loss": 0.75306481, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77416539, + "num_input_tokens_seen": 334550940, + "step": 15504, + "time_per_iteration": 3.913098096847534 + }, + { + "auxiliary_loss_clip": 0.01098879, + "auxiliary_loss_mlp": 0.00778163, + "balance_loss_clip": 1.03680074, + "balance_loss_mlp": 1.00063741, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 2.3447465095268267, + "language_loss": 0.7122519, + "learning_rate": 4.797041961982762e-08, + "loss": 0.73102236, + "num_input_tokens_seen": 334570935, + "step": 15505, + "time_per_iteration": 2.512044906616211 + }, + { + "auxiliary_loss_clip": 0.01089409, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.03642249, + "balance_loss_mlp": 1.0162394, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 2.1821118361571155, + "language_loss": 0.75367069, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77485484, + "num_input_tokens_seen": 334589315, + "step": 15506, + "time_per_iteration": 2.5063626766204834 + }, + { + "auxiliary_loss_clip": 0.0106931, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.03388917, + "balance_loss_mlp": 1.01650798, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 1.9956704778075665, + "language_loss": 0.83133352, + "learning_rate": 4.780099275981597e-08, + "loss": 0.85230523, + "num_input_tokens_seen": 334608990, + "step": 15507, + "time_per_iteration": 2.588789939880371 + }, + { + "auxiliary_loss_clip": 0.01109487, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.03622961, + "balance_loss_mlp": 1.01806331, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 1.595504511682712, + "language_loss": 0.67881495, + "learning_rate": 4.771639036957742e-08, + "loss": 0.70021331, + "num_input_tokens_seen": 334628655, + "step": 15508, + "time_per_iteration": 2.4733479022979736 + }, + { + "auxiliary_loss_clip": 0.01074507, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.03548026, + "balance_loss_mlp": 1.02132535, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 1.820916611507995, + "language_loss": 0.72535145, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.74642932, + "num_input_tokens_seen": 334648295, + "step": 15509, + "time_per_iteration": 2.5809009075164795 + }, + { + "auxiliary_loss_clip": 0.01097004, + "auxiliary_loss_mlp": 0.01031581, + "balance_loss_clip": 1.03505874, + "balance_loss_mlp": 1.01958275, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 1.8431415955335153, + "language_loss": 0.74590433, + "learning_rate": 4.754740768467624e-08, + "loss": 0.76719022, + "num_input_tokens_seen": 334666280, + "step": 15510, + "time_per_iteration": 2.4979965686798096 + }, + { + "auxiliary_loss_clip": 0.0109943, + "auxiliary_loss_mlp": 0.01027277, + "balance_loss_clip": 1.03371453, + "balance_loss_mlp": 1.01542807, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 4.768747303649567, + "language_loss": 0.70200503, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72327209, + "num_input_tokens_seen": 334688830, + "step": 15511, + "time_per_iteration": 2.554572105407715 + }, + { + "auxiliary_loss_clip": 0.01083211, + "auxiliary_loss_mlp": 0.01037174, + "balance_loss_clip": 1.0355041, + "balance_loss_mlp": 1.02455544, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 2.2774201669916887, + "language_loss": 0.78659999, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80780387, + "num_input_tokens_seen": 334705205, + "step": 15512, + "time_per_iteration": 2.5117998123168945 + }, + { + "auxiliary_loss_clip": 0.01106497, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.03497267, + "balance_loss_mlp": 1.01687491, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 1.4905332622868082, + "language_loss": 0.80402249, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.82538033, + "num_input_tokens_seen": 334723830, + "step": 15513, + "time_per_iteration": 2.4647395610809326 + }, + { + "auxiliary_loss_clip": 0.01088551, + "auxiliary_loss_mlp": 0.01029735, + "balance_loss_clip": 1.03889966, + "balance_loss_mlp": 1.01620507, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 2.4968729941838137, + "language_loss": 0.79785609, + "learning_rate": 4.721033078682768e-08, + "loss": 0.81903899, + "num_input_tokens_seen": 334740825, + "step": 15514, + "time_per_iteration": 2.4878666400909424 + }, + { + "auxiliary_loss_clip": 0.01080645, + "auxiliary_loss_mlp": 0.01035743, + "balance_loss_clip": 1.04093122, + "balance_loss_mlp": 1.02376842, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 6.589190737471826, + "language_loss": 0.71464437, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.73580825, + "num_input_tokens_seen": 334765825, + "step": 15515, + "time_per_iteration": 2.707303524017334 + }, + { + "auxiliary_loss_clip": 0.01093476, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.03695965, + "balance_loss_mlp": 1.02186167, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 2.20549786518613, + "language_loss": 0.80914283, + "learning_rate": 4.704223662500806e-08, + "loss": 0.83043081, + "num_input_tokens_seen": 334782680, + "step": 15516, + "time_per_iteration": 3.9295542240142822 + }, + { + "auxiliary_loss_clip": 0.01072772, + "auxiliary_loss_mlp": 0.01038849, + "balance_loss_clip": 1.03128433, + "balance_loss_mlp": 1.02493691, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 1.6182650418585305, + "language_loss": 0.80672204, + "learning_rate": 4.695830062703643e-08, + "loss": 0.82783818, + "num_input_tokens_seen": 334800160, + "step": 15517, + "time_per_iteration": 2.5188121795654297 + }, + { + "auxiliary_loss_clip": 0.01087272, + "auxiliary_loss_mlp": 0.01034455, + "balance_loss_clip": 1.03453541, + "balance_loss_mlp": 1.02060854, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 1.9346092467921217, + "language_loss": 0.74871421, + "learning_rate": 4.687443868860219e-08, + "loss": 0.76993144, + "num_input_tokens_seen": 334815840, + "step": 15518, + "time_per_iteration": 2.4727530479431152 + }, + { + "auxiliary_loss_clip": 0.01084872, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.03401947, + "balance_loss_mlp": 1.02466154, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 2.2722368558494286, + "language_loss": 0.75822949, + "learning_rate": 4.679065081288458e-08, + "loss": 0.77945006, + "num_input_tokens_seen": 334834735, + "step": 15519, + "time_per_iteration": 2.4914474487304688 + }, + { + "auxiliary_loss_clip": 0.01056154, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.03353858, + "balance_loss_mlp": 1.02093494, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 2.9601603124170945, + "language_loss": 0.83353114, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.85442817, + "num_input_tokens_seen": 334853490, + "step": 15520, + "time_per_iteration": 2.591661214828491 + }, + { + "auxiliary_loss_clip": 0.01095206, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.03361583, + "balance_loss_mlp": 1.01784825, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 1.6912283284367278, + "language_loss": 0.76530594, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78655624, + "num_input_tokens_seen": 334873675, + "step": 15521, + "time_per_iteration": 2.484238624572754 + }, + { + "auxiliary_loss_clip": 0.01100285, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.03785098, + "balance_loss_mlp": 1.0205245, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 1.9309757806799885, + "language_loss": 0.77626503, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.79759425, + "num_input_tokens_seen": 334890970, + "step": 15522, + "time_per_iteration": 2.4720571041107178 + }, + { + "auxiliary_loss_clip": 0.01076973, + "auxiliary_loss_mlp": 0.00778555, + "balance_loss_clip": 1.03548336, + "balance_loss_mlp": 1.00063264, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 2.4391168414672983, + "language_loss": 0.63160127, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.6501565, + "num_input_tokens_seen": 334906635, + "step": 15523, + "time_per_iteration": 2.559946298599243 + }, + { + "auxiliary_loss_clip": 0.01085706, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.03464246, + "balance_loss_mlp": 1.01883006, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 1.6205124379887645, + "language_loss": 0.68058121, + "learning_rate": 4.63728224861577e-08, + "loss": 0.70174706, + "num_input_tokens_seen": 334926230, + "step": 15524, + "time_per_iteration": 2.5241055488586426 + }, + { + "auxiliary_loss_clip": 0.01068604, + "auxiliary_loss_mlp": 0.01035448, + "balance_loss_clip": 1.03486359, + "balance_loss_mlp": 1.02293086, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 1.6675289935542903, + "language_loss": 0.74068153, + "learning_rate": 4.628947905336589e-08, + "loss": 0.76172203, + "num_input_tokens_seen": 334946680, + "step": 15525, + "time_per_iteration": 2.6392576694488525 + }, + { + "auxiliary_loss_clip": 0.01059343, + "auxiliary_loss_mlp": 0.01038633, + "balance_loss_clip": 1.03667128, + "balance_loss_mlp": 1.02627707, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 1.7250597454279821, + "language_loss": 0.8414799, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.86245966, + "num_input_tokens_seen": 334964785, + "step": 15526, + "time_per_iteration": 2.5868260860443115 + }, + { + "auxiliary_loss_clip": 0.01064383, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.03442121, + "balance_loss_mlp": 1.01534557, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 1.9705358839193445, + "language_loss": 0.6892997, + "learning_rate": 4.61230144456366e-08, + "loss": 0.71022308, + "num_input_tokens_seen": 334982400, + "step": 15527, + "time_per_iteration": 2.5586447715759277 + }, + { + "auxiliary_loss_clip": 0.01112428, + "auxiliary_loss_mlp": 0.01029162, + "balance_loss_clip": 1.0378927, + "balance_loss_mlp": 1.01505327, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 1.9817010046181196, + "language_loss": 0.65147996, + "learning_rate": 4.603989327701141e-08, + "loss": 0.67289585, + "num_input_tokens_seen": 334999685, + "step": 15528, + "time_per_iteration": 3.838190793991089 + }, + { + "auxiliary_loss_clip": 0.01109983, + "auxiliary_loss_mlp": 0.01035184, + "balance_loss_clip": 1.03576827, + "balance_loss_mlp": 1.02216673, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 2.181020618454941, + "language_loss": 0.75112545, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.77257717, + "num_input_tokens_seen": 335019160, + "step": 15529, + "time_per_iteration": 2.4113893508911133 + }, + { + "auxiliary_loss_clip": 0.0106182, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.03137493, + "balance_loss_mlp": 1.01724255, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 1.6867123125550367, + "language_loss": 0.62626684, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.64717418, + "num_input_tokens_seen": 335037350, + "step": 15530, + "time_per_iteration": 2.576155424118042 + }, + { + "auxiliary_loss_clip": 0.01085647, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.03672457, + "balance_loss_mlp": 1.01731515, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 1.8354884435963859, + "language_loss": 0.72610438, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74725032, + "num_input_tokens_seen": 335056060, + "step": 15531, + "time_per_iteration": 2.5093743801116943 + }, + { + "auxiliary_loss_clip": 0.01088851, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.03580058, + "balance_loss_mlp": 1.02031696, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 1.7418869216854247, + "language_loss": 0.7104404, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.73165774, + "num_input_tokens_seen": 335075410, + "step": 15532, + "time_per_iteration": 2.559561252593994 + }, + { + "auxiliary_loss_clip": 0.01109947, + "auxiliary_loss_mlp": 0.00777733, + "balance_loss_clip": 1.03668678, + "balance_loss_mlp": 1.00062585, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 1.5666187352345482, + "language_loss": 0.73217052, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.75104731, + "num_input_tokens_seen": 335095190, + "step": 15533, + "time_per_iteration": 2.441983222961426 + }, + { + "auxiliary_loss_clip": 0.01074218, + "auxiliary_loss_mlp": 0.01028583, + "balance_loss_clip": 1.03394294, + "balance_loss_mlp": 1.0166738, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 1.894808408261455, + "language_loss": 0.79594874, + "learning_rate": 4.554272235700507e-08, + "loss": 0.81697679, + "num_input_tokens_seen": 335113825, + "step": 15534, + "time_per_iteration": 2.5227549076080322 + }, + { + "auxiliary_loss_clip": 0.01102067, + "auxiliary_loss_mlp": 0.01025754, + "balance_loss_clip": 1.03629041, + "balance_loss_mlp": 1.01485205, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 1.8657682122149222, + "language_loss": 0.74091554, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76219368, + "num_input_tokens_seen": 335136425, + "step": 15535, + "time_per_iteration": 2.4930713176727295 + }, + { + "auxiliary_loss_clip": 0.01095926, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.03911006, + "balance_loss_mlp": 1.01835442, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 1.8805905442345283, + "language_loss": 0.77525973, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79652488, + "num_input_tokens_seen": 335157925, + "step": 15536, + "time_per_iteration": 2.5130815505981445 + }, + { + "auxiliary_loss_clip": 0.01081522, + "auxiliary_loss_mlp": 0.01027884, + "balance_loss_clip": 1.03746378, + "balance_loss_mlp": 1.01592755, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 1.5502133378632834, + "language_loss": 0.80766779, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.82876182, + "num_input_tokens_seen": 335177840, + "step": 15537, + "time_per_iteration": 2.596482038497925 + }, + { + "auxiliary_loss_clip": 0.01089859, + "auxiliary_loss_mlp": 0.01033789, + "balance_loss_clip": 1.03768873, + "balance_loss_mlp": 1.02161813, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 3.126815409978637, + "language_loss": 0.77663577, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.7978723, + "num_input_tokens_seen": 335199470, + "step": 15538, + "time_per_iteration": 2.550943613052368 + }, + { + "auxiliary_loss_clip": 0.01085248, + "auxiliary_loss_mlp": 0.01025844, + "balance_loss_clip": 1.03992915, + "balance_loss_mlp": 1.01391149, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 1.5631530043256565, + "language_loss": 0.73303223, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75414324, + "num_input_tokens_seen": 335218885, + "step": 15539, + "time_per_iteration": 2.5527899265289307 + }, + { + "auxiliary_loss_clip": 0.01061547, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.03596056, + "balance_loss_mlp": 1.01787055, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 1.436066485166516, + "language_loss": 0.64860165, + "learning_rate": 4.504821951247373e-08, + "loss": 0.66950333, + "num_input_tokens_seen": 335239485, + "step": 15540, + "time_per_iteration": 2.689180374145508 + }, + { + "auxiliary_loss_clip": 0.0109681, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.03607798, + "balance_loss_mlp": 1.01762033, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 1.97597647603906, + "language_loss": 0.76617229, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78743321, + "num_input_tokens_seen": 335258355, + "step": 15541, + "time_per_iteration": 3.976714849472046 + }, + { + "auxiliary_loss_clip": 0.01099973, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.04189479, + "balance_loss_mlp": 1.0217303, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 2.1047114670440132, + "language_loss": 0.67106587, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.6924063, + "num_input_tokens_seen": 335276835, + "step": 15542, + "time_per_iteration": 2.529348850250244 + }, + { + "auxiliary_loss_clip": 0.01066445, + "auxiliary_loss_mlp": 0.01029882, + "balance_loss_clip": 1.03388417, + "balance_loss_mlp": 1.01700187, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 1.742923444003615, + "language_loss": 0.69830322, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71926653, + "num_input_tokens_seen": 335296220, + "step": 15543, + "time_per_iteration": 2.5117247104644775 + }, + { + "auxiliary_loss_clip": 0.0109951, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.03390193, + "balance_loss_mlp": 1.01788533, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 1.985769558351653, + "language_loss": 0.69619203, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.71750045, + "num_input_tokens_seen": 335316335, + "step": 15544, + "time_per_iteration": 3.8678243160247803 + }, + { + "auxiliary_loss_clip": 0.01096014, + "auxiliary_loss_mlp": 0.01045995, + "balance_loss_clip": 1.03450191, + "balance_loss_mlp": 1.03220236, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 2.1224486232050226, + "language_loss": 0.76754034, + "learning_rate": 4.463817240903789e-08, + "loss": 0.7889604, + "num_input_tokens_seen": 335335545, + "step": 15545, + "time_per_iteration": 2.5133230686187744 + }, + { + "auxiliary_loss_clip": 0.01100533, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.03827643, + "balance_loss_mlp": 1.01723099, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 1.558512224198438, + "language_loss": 0.69006097, + "learning_rate": 4.455638541847495e-08, + "loss": 0.71135247, + "num_input_tokens_seen": 335355350, + "step": 15546, + "time_per_iteration": 2.4695944786071777 + }, + { + "auxiliary_loss_clip": 0.01069897, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.03304172, + "balance_loss_mlp": 1.01801288, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 2.2317799584577966, + "language_loss": 0.8229534, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84395021, + "num_input_tokens_seen": 335375160, + "step": 15547, + "time_per_iteration": 2.601797103881836 + }, + { + "auxiliary_loss_clip": 0.01091039, + "auxiliary_loss_mlp": 0.01039644, + "balance_loss_clip": 1.03237092, + "balance_loss_mlp": 1.02695441, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 1.9221214062318317, + "language_loss": 0.83663857, + "learning_rate": 4.439303389230087e-08, + "loss": 0.85794544, + "num_input_tokens_seen": 335394080, + "step": 15548, + "time_per_iteration": 2.469444513320923 + }, + { + "auxiliary_loss_clip": 0.01101193, + "auxiliary_loss_mlp": 0.01035654, + "balance_loss_clip": 1.03666389, + "balance_loss_mlp": 1.02237368, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 1.6311798608655743, + "language_loss": 0.65250409, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.67387259, + "num_input_tokens_seen": 335414230, + "step": 15549, + "time_per_iteration": 2.6033146381378174 + }, + { + "auxiliary_loss_clip": 0.01100629, + "auxiliary_loss_mlp": 0.01034521, + "balance_loss_clip": 1.03824723, + "balance_loss_mlp": 1.02150965, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 1.70913680479929, + "language_loss": 0.80154145, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82289296, + "num_input_tokens_seen": 335432890, + "step": 15550, + "time_per_iteration": 2.4941141605377197 + }, + { + "auxiliary_loss_clip": 0.01094552, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.03720188, + "balance_loss_mlp": 1.01920521, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 1.6197601717853618, + "language_loss": 0.75523782, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.77649057, + "num_input_tokens_seen": 335452085, + "step": 15551, + "time_per_iteration": 2.4685089588165283 + }, + { + "auxiliary_loss_clip": 0.01053465, + "auxiliary_loss_mlp": 0.0102976, + "balance_loss_clip": 1.03276992, + "balance_loss_mlp": 1.01937699, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 1.5851065106371325, + "language_loss": 0.73634553, + "learning_rate": 4.406722074642255e-08, + "loss": 0.75717771, + "num_input_tokens_seen": 335472130, + "step": 15552, + "time_per_iteration": 2.6536097526550293 + }, + { + "auxiliary_loss_clip": 0.01061532, + "auxiliary_loss_mlp": 0.01037628, + "balance_loss_clip": 1.0315305, + "balance_loss_mlp": 1.02448511, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 1.6776309829565768, + "language_loss": 0.77160466, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79259622, + "num_input_tokens_seen": 335489970, + "step": 15553, + "time_per_iteration": 2.5507595539093018 + }, + { + "auxiliary_loss_clip": 0.01081648, + "auxiliary_loss_mlp": 0.01038465, + "balance_loss_clip": 1.03880966, + "balance_loss_mlp": 1.02493477, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 1.5804646294738174, + "language_loss": 0.78287649, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80407763, + "num_input_tokens_seen": 335509125, + "step": 15554, + "time_per_iteration": 2.5279855728149414 + }, + { + "auxiliary_loss_clip": 0.01076868, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.0311166, + "balance_loss_mlp": 1.02151155, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 1.5862709968507351, + "language_loss": 0.69358146, + "learning_rate": 4.382363965244695e-08, + "loss": 0.71467853, + "num_input_tokens_seen": 335525620, + "step": 15555, + "time_per_iteration": 3.954073905944824 + }, + { + "auxiliary_loss_clip": 0.01018244, + "auxiliary_loss_mlp": 0.01049511, + "balance_loss_clip": 1.02980185, + "balance_loss_mlp": 1.03593254, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 1.6381430112077053, + "language_loss": 0.75605071, + "learning_rate": 4.374259430715965e-08, + "loss": 0.77672827, + "num_input_tokens_seen": 335547565, + "step": 15556, + "time_per_iteration": 2.834887981414795 + }, + { + "auxiliary_loss_clip": 0.01087131, + "auxiliary_loss_mlp": 0.01030732, + "balance_loss_clip": 1.03520823, + "balance_loss_mlp": 1.01912093, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 1.621452376735092, + "language_loss": 0.72162628, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74280488, + "num_input_tokens_seen": 335570285, + "step": 15557, + "time_per_iteration": 2.912574052810669 + }, + { + "auxiliary_loss_clip": 0.01108152, + "auxiliary_loss_mlp": 0.01034191, + "balance_loss_clip": 1.03561997, + "balance_loss_mlp": 1.02138197, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 1.6055547029352693, + "language_loss": 0.63287687, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65430033, + "num_input_tokens_seen": 335588600, + "step": 15558, + "time_per_iteration": 2.4309613704681396 + }, + { + "auxiliary_loss_clip": 0.01086131, + "auxiliary_loss_mlp": 0.01033661, + "balance_loss_clip": 1.03691351, + "balance_loss_mlp": 1.02069736, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 2.341760240024036, + "language_loss": 0.73567748, + "learning_rate": 4.34999033724388e-08, + "loss": 0.7568754, + "num_input_tokens_seen": 335606235, + "step": 15559, + "time_per_iteration": 2.504533529281616 + }, + { + "auxiliary_loss_clip": 0.01054909, + "auxiliary_loss_mlp": 0.00776077, + "balance_loss_clip": 1.03101993, + "balance_loss_mlp": 1.00052118, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 1.5547119990580522, + "language_loss": 0.63324183, + "learning_rate": 4.341915477147062e-08, + "loss": 0.65155166, + "num_input_tokens_seen": 335628240, + "step": 15560, + "time_per_iteration": 2.7083990573883057 + }, + { + "auxiliary_loss_clip": 0.01051617, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.03766751, + "balance_loss_mlp": 1.0206455, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 2.9246536248046917, + "language_loss": 0.6370672, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.65793347, + "num_input_tokens_seen": 335643755, + "step": 15561, + "time_per_iteration": -0.037011146545410156 + }, + { + "auxiliary_loss_clip": 0.01109456, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.03814304, + "balance_loss_mlp": 1.02158165, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 1.7884165220658612, + "language_loss": 0.75516975, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77661014, + "num_input_tokens_seen": 335665160, + "step": 15562, + "time_per_iteration": 2.4564132690429688 + }, + { + "auxiliary_loss_clip": 0.01018914, + "auxiliary_loss_mlp": 0.00999088, + "balance_loss_clip": 1.00629723, + "balance_loss_mlp": 0.99785978, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 0.941847763334074, + "language_loss": 0.62382817, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64400822, + "num_input_tokens_seen": 335715240, + "step": 15563, + "time_per_iteration": 2.875746250152588 + }, + { + "auxiliary_loss_clip": 0.01058597, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.03523409, + "balance_loss_mlp": 1.02176833, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 6.987721702566407, + "language_loss": 0.78427047, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80519283, + "num_input_tokens_seen": 335734970, + "step": 15564, + "time_per_iteration": 2.613420248031616 + }, + { + "auxiliary_loss_clip": 0.01109569, + "auxiliary_loss_mlp": 0.01033535, + "balance_loss_clip": 1.03508353, + "balance_loss_mlp": 1.01968288, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 2.006083461707491, + "language_loss": 0.78163934, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80307037, + "num_input_tokens_seen": 335753435, + "step": 15565, + "time_per_iteration": 2.466670036315918 + }, + { + "auxiliary_loss_clip": 0.01093994, + "auxiliary_loss_mlp": 0.0103136, + "balance_loss_clip": 1.03442311, + "balance_loss_mlp": 1.01923656, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 1.9385497777459664, + "language_loss": 0.7167902, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.73804373, + "num_input_tokens_seen": 335772105, + "step": 15566, + "time_per_iteration": 2.4554226398468018 + }, + { + "auxiliary_loss_clip": 0.01070508, + "auxiliary_loss_mlp": 0.00779553, + "balance_loss_clip": 1.03212762, + "balance_loss_mlp": 1.00059152, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 1.8453126219354152, + "language_loss": 0.67908007, + "learning_rate": 4.285599216057889e-08, + "loss": 0.69758064, + "num_input_tokens_seen": 335789125, + "step": 15567, + "time_per_iteration": 2.555123805999756 + }, + { + "auxiliary_loss_clip": 0.01083554, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.03950524, + "balance_loss_mlp": 1.02066135, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 1.8511029718734175, + "language_loss": 0.62270272, + "learning_rate": 4.277583719504418e-08, + "loss": 0.64386868, + "num_input_tokens_seen": 335810995, + "step": 15568, + "time_per_iteration": 4.039544582366943 + }, + { + "auxiliary_loss_clip": 0.01084117, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.03104222, + "balance_loss_mlp": 1.02257049, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 1.7104352386836639, + "language_loss": 0.78809941, + "learning_rate": 4.269575644764556e-08, + "loss": 0.80928266, + "num_input_tokens_seen": 335830580, + "step": 15569, + "time_per_iteration": 2.5174639225006104 + }, + { + "auxiliary_loss_clip": 0.01091692, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.03737164, + "balance_loss_mlp": 1.02202058, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 2.340161490493661, + "language_loss": 0.69628197, + "learning_rate": 4.261574992142014e-08, + "loss": 0.71754539, + "num_input_tokens_seen": 335846515, + "step": 15570, + "time_per_iteration": 2.4926228523254395 + }, + { + "auxiliary_loss_clip": 0.01094539, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.03753328, + "balance_loss_mlp": 1.01739681, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 1.9949637020642081, + "language_loss": 0.78919244, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.81043506, + "num_input_tokens_seen": 335863350, + "step": 15571, + "time_per_iteration": 2.4452483654022217 + }, + { + "auxiliary_loss_clip": 0.01076429, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.03326714, + "balance_loss_mlp": 1.0197438, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 2.361022182262002, + "language_loss": 0.77903825, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.80012441, + "num_input_tokens_seen": 335880510, + "step": 15572, + "time_per_iteration": 2.5101170539855957 + }, + { + "auxiliary_loss_clip": 0.01083747, + "auxiliary_loss_mlp": 0.01040548, + "balance_loss_clip": 1.03323781, + "balance_loss_mlp": 1.02857399, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 1.980747653661365, + "language_loss": 0.77808285, + "learning_rate": 4.237617570010688e-08, + "loss": 0.79932576, + "num_input_tokens_seen": 335899440, + "step": 15573, + "time_per_iteration": 2.4929497241973877 + }, + { + "auxiliary_loss_clip": 0.0107567, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.03325677, + "balance_loss_mlp": 1.01696396, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 1.5637587109383753, + "language_loss": 0.74445462, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.76550227, + "num_input_tokens_seen": 335919540, + "step": 15574, + "time_per_iteration": 2.5712833404541016 + }, + { + "auxiliary_loss_clip": 0.01051304, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.03298283, + "balance_loss_mlp": 1.02328253, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 1.8806037084309328, + "language_loss": 0.68027079, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70114285, + "num_input_tokens_seen": 335939665, + "step": 15575, + "time_per_iteration": 2.605764865875244 + }, + { + "auxiliary_loss_clip": 0.01080105, + "auxiliary_loss_mlp": 0.01036353, + "balance_loss_clip": 1.03335619, + "balance_loss_mlp": 1.02414584, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 1.904634727769336, + "language_loss": 0.65378106, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67494559, + "num_input_tokens_seen": 335958580, + "step": 15576, + "time_per_iteration": 2.486945152282715 + }, + { + "auxiliary_loss_clip": 0.01093534, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.03099871, + "balance_loss_mlp": 1.01592875, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 2.225953001997501, + "language_loss": 0.75700963, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.77824086, + "num_input_tokens_seen": 335974965, + "step": 15577, + "time_per_iteration": 2.4200172424316406 + }, + { + "auxiliary_loss_clip": 0.01060779, + "auxiliary_loss_mlp": 0.0102999, + "balance_loss_clip": 1.03073418, + "balance_loss_mlp": 1.0175029, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 2.0249030414757208, + "language_loss": 0.52105045, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.54195815, + "num_input_tokens_seen": 335996575, + "step": 15578, + "time_per_iteration": 2.640340566635132 + }, + { + "auxiliary_loss_clip": 0.01047863, + "auxiliary_loss_mlp": 0.01037328, + "balance_loss_clip": 1.02970552, + "balance_loss_mlp": 1.02505541, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 1.8610566749660065, + "language_loss": 0.70760578, + "learning_rate": 4.189903163783692e-08, + "loss": 0.72845769, + "num_input_tokens_seen": 336017265, + "step": 15579, + "time_per_iteration": 2.6088812351226807 + }, + { + "auxiliary_loss_clip": 0.0108369, + "auxiliary_loss_mlp": 0.01028881, + "balance_loss_clip": 1.03377652, + "balance_loss_mlp": 1.0168891, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 1.8729338521518526, + "language_loss": 0.76280129, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78392696, + "num_input_tokens_seen": 336035905, + "step": 15580, + "time_per_iteration": 4.653178453445435 + }, + { + "auxiliary_loss_clip": 0.01098472, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.03526962, + "balance_loss_mlp": 1.02264118, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 1.8479202947301758, + "language_loss": 0.66599977, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68734479, + "num_input_tokens_seen": 336055585, + "step": 15581, + "time_per_iteration": 2.482905864715576 + }, + { + "auxiliary_loss_clip": 0.01098193, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.03673935, + "balance_loss_mlp": 1.01726794, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 1.7262854634846965, + "language_loss": 0.7678411, + "learning_rate": 4.166146195972042e-08, + "loss": 0.78912044, + "num_input_tokens_seen": 336076695, + "step": 15582, + "time_per_iteration": 2.4836268424987793 + }, + { + "auxiliary_loss_clip": 0.01039151, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.03454185, + "balance_loss_mlp": 1.02061415, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 1.6790922064105922, + "language_loss": 0.7361722, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.75689948, + "num_input_tokens_seen": 336094740, + "step": 15583, + "time_per_iteration": 3.9825563430786133 + }, + { + "auxiliary_loss_clip": 0.01113271, + "auxiliary_loss_mlp": 0.01036551, + "balance_loss_clip": 1.03780103, + "balance_loss_mlp": 1.02295506, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 2.2211089530414165, + "language_loss": 0.84165597, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.86315417, + "num_input_tokens_seen": 336113985, + "step": 15584, + "time_per_iteration": 2.4703614711761475 + }, + { + "auxiliary_loss_clip": 0.01103201, + "auxiliary_loss_mlp": 0.00779876, + "balance_loss_clip": 1.04146457, + "balance_loss_mlp": 1.00072169, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 1.449078726362711, + "language_loss": 0.72003233, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.73886317, + "num_input_tokens_seen": 336136395, + "step": 15585, + "time_per_iteration": 2.625683546066284 + }, + { + "auxiliary_loss_clip": 0.01075172, + "auxiliary_loss_mlp": 0.01024218, + "balance_loss_clip": 1.03486156, + "balance_loss_mlp": 1.01324451, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 2.269762162629465, + "language_loss": 0.80545771, + "learning_rate": 4.134574204836316e-08, + "loss": 0.8264516, + "num_input_tokens_seen": 336156345, + "step": 15586, + "time_per_iteration": 2.5495667457580566 + }, + { + "auxiliary_loss_clip": 0.01068828, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.03422678, + "balance_loss_mlp": 1.02311289, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 1.6835051845835272, + "language_loss": 0.76533622, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78638113, + "num_input_tokens_seen": 336176760, + "step": 15587, + "time_per_iteration": 2.548109292984009 + }, + { + "auxiliary_loss_clip": 0.01089464, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.0346694, + "balance_loss_mlp": 1.01740217, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 1.9088945803025732, + "language_loss": 0.87627387, + "learning_rate": 4.118832771491387e-08, + "loss": 0.89746505, + "num_input_tokens_seen": 336193285, + "step": 15588, + "time_per_iteration": 2.4612457752227783 + }, + { + "auxiliary_loss_clip": 0.01104706, + "auxiliary_loss_mlp": 0.00777332, + "balance_loss_clip": 1.0356884, + "balance_loss_mlp": 1.00059915, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 1.809504358448194, + "language_loss": 0.78244281, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80126321, + "num_input_tokens_seen": 336211425, + "step": 15589, + "time_per_iteration": 2.406064510345459 + }, + { + "auxiliary_loss_clip": 0.01107015, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.03705215, + "balance_loss_mlp": 1.02236843, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 1.837058965106575, + "language_loss": 0.78124261, + "learning_rate": 4.103121049480163e-08, + "loss": 0.8026607, + "num_input_tokens_seen": 336230205, + "step": 15590, + "time_per_iteration": 2.4036991596221924 + }, + { + "auxiliary_loss_clip": 0.01080587, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.03286612, + "balance_loss_mlp": 1.02630377, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 1.7334540737600295, + "language_loss": 0.71507108, + "learning_rate": 4.095276330969577e-08, + "loss": 0.73627961, + "num_input_tokens_seen": 336252440, + "step": 15591, + "time_per_iteration": 2.52824330329895 + }, + { + "auxiliary_loss_clip": 0.01103889, + "auxiliary_loss_mlp": 0.00779268, + "balance_loss_clip": 1.04232049, + "balance_loss_mlp": 1.0007143, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 2.307018459511826, + "language_loss": 0.53759432, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.55642581, + "num_input_tokens_seen": 336273845, + "step": 15592, + "time_per_iteration": 2.5134024620056152 + }, + { + "auxiliary_loss_clip": 0.01091906, + "auxiliary_loss_mlp": 0.0102795, + "balance_loss_clip": 1.03726983, + "balance_loss_mlp": 1.01652336, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 2.541734206991857, + "language_loss": 0.67237896, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69357747, + "num_input_tokens_seen": 336292790, + "step": 15593, + "time_per_iteration": 2.4567477703094482 + }, + { + "auxiliary_loss_clip": 0.01086488, + "auxiliary_loss_mlp": 0.01027135, + "balance_loss_clip": 1.03415728, + "balance_loss_mlp": 1.0154283, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 1.526499578295182, + "language_loss": 0.74141711, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76255333, + "num_input_tokens_seen": 336312600, + "step": 15594, + "time_per_iteration": 4.032057046890259 + }, + { + "auxiliary_loss_clip": 0.01094581, + "auxiliary_loss_mlp": 0.01028331, + "balance_loss_clip": 1.03446114, + "balance_loss_mlp": 1.01703548, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 1.617917219960583, + "language_loss": 0.73751909, + "learning_rate": 4.063971747165351e-08, + "loss": 0.75874823, + "num_input_tokens_seen": 336332770, + "step": 15595, + "time_per_iteration": 2.488760471343994 + }, + { + "auxiliary_loss_clip": 0.01082463, + "auxiliary_loss_mlp": 0.01032448, + "balance_loss_clip": 1.03548717, + "balance_loss_mlp": 1.02005029, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 2.1428562946679826, + "language_loss": 0.75948191, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78063107, + "num_input_tokens_seen": 336351445, + "step": 15596, + "time_per_iteration": 2.487480640411377 + }, + { + "auxiliary_loss_clip": 0.01081361, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.0356617, + "balance_loss_mlp": 1.02064753, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 1.6530740205678145, + "language_loss": 0.78803462, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.80917454, + "num_input_tokens_seen": 336368690, + "step": 15597, + "time_per_iteration": 2.4761788845062256 + }, + { + "auxiliary_loss_clip": 0.01111343, + "auxiliary_loss_mlp": 0.01031839, + "balance_loss_clip": 1.03643489, + "balance_loss_mlp": 1.018857, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 1.5189141792979242, + "language_loss": 0.80788136, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.82931316, + "num_input_tokens_seen": 336388165, + "step": 15598, + "time_per_iteration": 2.402562379837036 + }, + { + "auxiliary_loss_clip": 0.01076594, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.03464365, + "balance_loss_mlp": 1.0171423, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 2.108974304999322, + "language_loss": 0.62984997, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.65091455, + "num_input_tokens_seen": 336406475, + "step": 15599, + "time_per_iteration": 2.5272300243377686 + }, + { + "auxiliary_loss_clip": 0.01070338, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.03496563, + "balance_loss_mlp": 1.01887512, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 1.8054635891412272, + "language_loss": 0.73499566, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75601435, + "num_input_tokens_seen": 336424690, + "step": 15600, + "time_per_iteration": 2.5131051540374756 + }, + { + "auxiliary_loss_clip": 0.01080926, + "auxiliary_loss_mlp": 0.01028388, + "balance_loss_clip": 1.03509212, + "balance_loss_mlp": 1.0171349, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 2.7117168905148876, + "language_loss": 0.69516933, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.71626246, + "num_input_tokens_seen": 336443055, + "step": 15601, + "time_per_iteration": 2.4776718616485596 + }, + { + "auxiliary_loss_clip": 0.01017957, + "auxiliary_loss_mlp": 0.01002288, + "balance_loss_clip": 1.00366092, + "balance_loss_mlp": 1.00101817, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.7571126186505602, + "language_loss": 0.5817138, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60191625, + "num_input_tokens_seen": 336510190, + "step": 15602, + "time_per_iteration": 3.198756456375122 + }, + { + "auxiliary_loss_clip": 0.01038026, + "auxiliary_loss_mlp": 0.01036189, + "balance_loss_clip": 1.0345782, + "balance_loss_mlp": 1.02345777, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 2.0283043548194186, + "language_loss": 0.72162598, + "learning_rate": 4.001719234324663e-08, + "loss": 0.7423681, + "num_input_tokens_seen": 336529250, + "step": 15603, + "time_per_iteration": 2.646679401397705 + }, + { + "auxiliary_loss_clip": 0.01099374, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.03292358, + "balance_loss_mlp": 1.01764083, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 1.633691961308071, + "language_loss": 0.76321101, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78449005, + "num_input_tokens_seen": 336548530, + "step": 15604, + "time_per_iteration": 2.4217472076416016 + }, + { + "auxiliary_loss_clip": 0.01084684, + "auxiliary_loss_mlp": 0.01036691, + "balance_loss_clip": 1.03255427, + "balance_loss_mlp": 1.02159953, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 2.3380479061267185, + "language_loss": 0.65507501, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.67628872, + "num_input_tokens_seen": 336568510, + "step": 15605, + "time_per_iteration": 2.5233893394470215 + }, + { + "auxiliary_loss_clip": 0.01081308, + "auxiliary_loss_mlp": 0.00779132, + "balance_loss_clip": 1.03996253, + "balance_loss_mlp": 1.00061095, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 1.9389972632343142, + "language_loss": 0.67239386, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.69099826, + "num_input_tokens_seen": 336592020, + "step": 15606, + "time_per_iteration": 2.7383737564086914 + }, + { + "auxiliary_loss_clip": 0.01090901, + "auxiliary_loss_mlp": 0.01026476, + "balance_loss_clip": 1.0330373, + "balance_loss_mlp": 1.01496017, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 2.325199678331552, + "language_loss": 0.77668095, + "learning_rate": 3.970771343058166e-08, + "loss": 0.79785466, + "num_input_tokens_seen": 336610010, + "step": 15607, + "time_per_iteration": 2.4468932151794434 + }, + { + "auxiliary_loss_clip": 0.01099612, + "auxiliary_loss_mlp": 0.01031309, + "balance_loss_clip": 1.03565907, + "balance_loss_mlp": 1.01982915, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 3.3114232092504547, + "language_loss": 0.82948768, + "learning_rate": 3.963052953128776e-08, + "loss": 0.85079688, + "num_input_tokens_seen": 336628520, + "step": 15608, + "time_per_iteration": 3.880030393600464 + }, + { + "auxiliary_loss_clip": 0.01101053, + "auxiliary_loss_mlp": 0.01038849, + "balance_loss_clip": 1.03912902, + "balance_loss_mlp": 1.02639794, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 1.7920536039819985, + "language_loss": 0.68787742, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.70927644, + "num_input_tokens_seen": 336647365, + "step": 15609, + "time_per_iteration": 2.437242269515991 + }, + { + "auxiliary_loss_clip": 0.01079459, + "auxiliary_loss_mlp": 0.01029329, + "balance_loss_clip": 1.03512859, + "balance_loss_mlp": 1.01659799, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 2.2605390479802483, + "language_loss": 0.75374818, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77483606, + "num_input_tokens_seen": 336667165, + "step": 15610, + "time_per_iteration": 2.5468623638153076 + }, + { + "auxiliary_loss_clip": 0.01051426, + "auxiliary_loss_mlp": 0.01027955, + "balance_loss_clip": 1.0407815, + "balance_loss_mlp": 1.01646924, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 2.9851808157569413, + "language_loss": 0.75217509, + "learning_rate": 3.939942386953987e-08, + "loss": 0.77296889, + "num_input_tokens_seen": 336684130, + "step": 15611, + "time_per_iteration": 2.5791547298431396 + }, + { + "auxiliary_loss_clip": 0.01069463, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.03730786, + "balance_loss_mlp": 1.02153969, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 2.1989989098783767, + "language_loss": 0.66467786, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.68571198, + "num_input_tokens_seen": 336701520, + "step": 15612, + "time_per_iteration": 2.5250468254089355 + }, + { + "auxiliary_loss_clip": 0.01094915, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.03588092, + "balance_loss_mlp": 1.01749635, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 1.6311374425068745, + "language_loss": 0.57225245, + "learning_rate": 3.924572515435742e-08, + "loss": 0.59349334, + "num_input_tokens_seen": 336720675, + "step": 15613, + "time_per_iteration": 2.4672820568084717 + }, + { + "auxiliary_loss_clip": 0.01084461, + "auxiliary_loss_mlp": 0.01039651, + "balance_loss_clip": 1.0333811, + "balance_loss_mlp": 1.02759874, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 2.1053597773163553, + "language_loss": 0.70781082, + "learning_rate": 3.916898732330764e-08, + "loss": 0.72905195, + "num_input_tokens_seen": 336741005, + "step": 15614, + "time_per_iteration": 2.54583740234375 + }, + { + "auxiliary_loss_clip": 0.01100831, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.03716052, + "balance_loss_mlp": 1.01926994, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 1.8712765880518167, + "language_loss": 0.81082249, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83215207, + "num_input_tokens_seen": 336757990, + "step": 15615, + "time_per_iteration": 2.4373414516448975 + }, + { + "auxiliary_loss_clip": 0.01080897, + "auxiliary_loss_mlp": 0.01028848, + "balance_loss_clip": 1.03230429, + "balance_loss_mlp": 1.01687956, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 1.7263729609351175, + "language_loss": 0.71640551, + "learning_rate": 3.901573472884134e-08, + "loss": 0.73750293, + "num_input_tokens_seen": 336777705, + "step": 15616, + "time_per_iteration": 2.520124912261963 + }, + { + "auxiliary_loss_clip": 0.01108111, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.03662705, + "balance_loss_mlp": 1.0174886, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 1.8147592933078531, + "language_loss": 0.65712845, + "learning_rate": 3.89392199712355e-08, + "loss": 0.67850602, + "num_input_tokens_seen": 336798275, + "step": 15617, + "time_per_iteration": 2.4143147468566895 + }, + { + "auxiliary_loss_clip": 0.01102069, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.03764534, + "balance_loss_mlp": 1.02152443, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 2.013966606846006, + "language_loss": 0.73568499, + "learning_rate": 3.886277957725092e-08, + "loss": 0.75705719, + "num_input_tokens_seen": 336813835, + "step": 15618, + "time_per_iteration": 2.441922664642334 + }, + { + "auxiliary_loss_clip": 0.01113696, + "auxiliary_loss_mlp": 0.01037649, + "balance_loss_clip": 1.03746164, + "balance_loss_mlp": 1.02314186, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 2.2720135964164068, + "language_loss": 0.69661099, + "learning_rate": 3.878641354978662e-08, + "loss": 0.71812439, + "num_input_tokens_seen": 336832210, + "step": 15619, + "time_per_iteration": 3.9831550121307373 + }, + { + "auxiliary_loss_clip": 0.01085301, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.03331661, + "balance_loss_mlp": 1.01900971, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 1.7764241310859075, + "language_loss": 0.77592683, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.79709852, + "num_input_tokens_seen": 336851380, + "step": 15620, + "time_per_iteration": 2.519092082977295 + }, + { + "auxiliary_loss_clip": 0.01092566, + "auxiliary_loss_mlp": 0.01026101, + "balance_loss_clip": 1.03475261, + "balance_loss_mlp": 1.01483583, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 2.9125994512987203, + "language_loss": 0.74210131, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.76328796, + "num_input_tokens_seen": 336868525, + "step": 15621, + "time_per_iteration": 2.425565004348755 + }, + { + "auxiliary_loss_clip": 0.0108005, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.03376591, + "balance_loss_mlp": 1.01753342, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 2.4009981882790674, + "language_loss": 0.66311985, + "learning_rate": 3.855776169545688e-08, + "loss": 0.68422788, + "num_input_tokens_seen": 336886200, + "step": 15622, + "time_per_iteration": 2.502143621444702 + }, + { + "auxiliary_loss_clip": 0.0108193, + "auxiliary_loss_mlp": 0.0103721, + "balance_loss_clip": 1.03175032, + "balance_loss_mlp": 1.02356696, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 2.210223816833379, + "language_loss": 0.71635276, + "learning_rate": 3.848169316300209e-08, + "loss": 0.73754418, + "num_input_tokens_seen": 336905815, + "step": 15623, + "time_per_iteration": 3.988457679748535 + }, + { + "auxiliary_loss_clip": 0.01101407, + "auxiliary_loss_mlp": 0.01032402, + "balance_loss_clip": 1.03881097, + "balance_loss_mlp": 1.02018905, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 2.122981069964808, + "language_loss": 0.72712338, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74846148, + "num_input_tokens_seen": 336928460, + "step": 15624, + "time_per_iteration": 2.580709457397461 + }, + { + "auxiliary_loss_clip": 0.01073783, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.034518, + "balance_loss_mlp": 1.01960206, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 1.9356204322089527, + "language_loss": 0.89546514, + "learning_rate": 3.832977924388614e-08, + "loss": 0.91652042, + "num_input_tokens_seen": 336948320, + "step": 15625, + "time_per_iteration": 2.562150478363037 + }, + { + "auxiliary_loss_clip": 0.01098046, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.03647351, + "balance_loss_mlp": 1.02033329, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 2.670337931773544, + "language_loss": 0.83957005, + "learning_rate": 3.825393386298592e-08, + "loss": 0.86088532, + "num_input_tokens_seen": 336967670, + "step": 15626, + "time_per_iteration": 2.482745885848999 + }, + { + "auxiliary_loss_clip": 0.01010027, + "auxiliary_loss_mlp": 0.01001894, + "balance_loss_clip": 1.00449753, + "balance_loss_mlp": 1.00071955, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 0.7746879883974805, + "language_loss": 0.5611853, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58130455, + "num_input_tokens_seen": 337028395, + "step": 15627, + "time_per_iteration": 3.020810842514038 + }, + { + "auxiliary_loss_clip": 0.01061908, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.03433263, + "balance_loss_mlp": 1.02023602, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 1.402250734784148, + "language_loss": 0.69717598, + "learning_rate": 3.810246627288105e-08, + "loss": 0.71812946, + "num_input_tokens_seen": 337048150, + "step": 15628, + "time_per_iteration": 2.5808098316192627 + }, + { + "auxiliary_loss_clip": 0.01096109, + "auxiliary_loss_mlp": 0.01028501, + "balance_loss_clip": 1.03589439, + "balance_loss_mlp": 1.01610351, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 1.5350148057308328, + "language_loss": 0.75353307, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.7747792, + "num_input_tokens_seen": 337069315, + "step": 15629, + "time_per_iteration": 2.5734450817108154 + }, + { + "auxiliary_loss_clip": 0.01048426, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.03157032, + "balance_loss_mlp": 1.02354658, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 1.6860974447861938, + "language_loss": 0.7437368, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76458651, + "num_input_tokens_seen": 337087765, + "step": 15630, + "time_per_iteration": 2.6006577014923096 + }, + { + "auxiliary_loss_clip": 0.01074698, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.03527367, + "balance_loss_mlp": 1.02008212, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 2.3609651457881076, + "language_loss": 0.69301766, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71408677, + "num_input_tokens_seen": 337106265, + "step": 15631, + "time_per_iteration": 2.4778285026550293 + }, + { + "auxiliary_loss_clip": 0.0106032, + "auxiliary_loss_mlp": 0.01035527, + "balance_loss_clip": 1.03539896, + "balance_loss_mlp": 1.02390397, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 2.0809787110981524, + "language_loss": 0.75497323, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77593172, + "num_input_tokens_seen": 337126090, + "step": 15632, + "time_per_iteration": 2.593066453933716 + }, + { + "auxiliary_loss_clip": 0.01104182, + "auxiliary_loss_mlp": 0.01033672, + "balance_loss_clip": 1.03741646, + "balance_loss_mlp": 1.01970625, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 1.5695631772182002, + "language_loss": 0.74599671, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76737523, + "num_input_tokens_seen": 337145655, + "step": 15633, + "time_per_iteration": 2.4968926906585693 + }, + { + "auxiliary_loss_clip": 0.01110094, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.0362829, + "balance_loss_mlp": 1.02351916, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 2.8743328504904784, + "language_loss": 0.72448915, + "learning_rate": 3.764984908264823e-08, + "loss": 0.74595916, + "num_input_tokens_seen": 337164805, + "step": 15634, + "time_per_iteration": 3.9701569080352783 + }, + { + "auxiliary_loss_clip": 0.01098842, + "auxiliary_loss_mlp": 0.01028471, + "balance_loss_clip": 1.03310621, + "balance_loss_mlp": 1.01558471, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 2.1784146288989925, + "language_loss": 0.68645394, + "learning_rate": 3.75746733114144e-08, + "loss": 0.70772702, + "num_input_tokens_seen": 337182280, + "step": 15635, + "time_per_iteration": 2.44036865234375 + }, + { + "auxiliary_loss_clip": 0.01058662, + "auxiliary_loss_mlp": 0.01026686, + "balance_loss_clip": 1.03761148, + "balance_loss_mlp": 1.01514041, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 1.590453517067877, + "language_loss": 0.74402547, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76487899, + "num_input_tokens_seen": 337203495, + "step": 15636, + "time_per_iteration": 2.568079948425293 + }, + { + "auxiliary_loss_clip": 0.01099973, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.03833055, + "balance_loss_mlp": 1.01933384, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 2.1909414651686925, + "language_loss": 0.82998121, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.85129881, + "num_input_tokens_seen": 337220435, + "step": 15637, + "time_per_iteration": 2.436159372329712 + }, + { + "auxiliary_loss_clip": 0.01064113, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.0365907, + "balance_loss_mlp": 1.01878095, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 2.359239742258821, + "language_loss": 0.69583583, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.71678889, + "num_input_tokens_seen": 337238095, + "step": 15638, + "time_per_iteration": 2.5541141033172607 + }, + { + "auxiliary_loss_clip": 0.01091724, + "auxiliary_loss_mlp": 0.01035308, + "balance_loss_clip": 1.034778, + "balance_loss_mlp": 1.02485347, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 1.627018170537538, + "language_loss": 0.84668088, + "learning_rate": 3.727471440859498e-08, + "loss": 0.86795121, + "num_input_tokens_seen": 337256645, + "step": 15639, + "time_per_iteration": 2.489975690841675 + }, + { + "auxiliary_loss_clip": 0.01084089, + "auxiliary_loss_mlp": 0.00777433, + "balance_loss_clip": 1.03219366, + "balance_loss_mlp": 1.00049496, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 1.4726232597313378, + "language_loss": 0.7840538, + "learning_rate": 3.719991074263662e-08, + "loss": 0.80266905, + "num_input_tokens_seen": 337278360, + "step": 15640, + "time_per_iteration": 2.5457215309143066 + }, + { + "auxiliary_loss_clip": 0.01100541, + "auxiliary_loss_mlp": 0.01032459, + "balance_loss_clip": 1.03603816, + "balance_loss_mlp": 1.01988804, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 1.4567393889429685, + "language_loss": 0.74137139, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76270139, + "num_input_tokens_seen": 337302480, + "step": 15641, + "time_per_iteration": 2.538071870803833 + }, + { + "auxiliary_loss_clip": 0.01101512, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.0359515, + "balance_loss_mlp": 1.02016604, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 1.9570137824914744, + "language_loss": 0.82700098, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84835899, + "num_input_tokens_seen": 337316600, + "step": 15642, + "time_per_iteration": 2.42134428024292 + }, + { + "auxiliary_loss_clip": 0.01094692, + "auxiliary_loss_mlp": 0.0102952, + "balance_loss_clip": 1.03931761, + "balance_loss_mlp": 1.01805854, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 2.200335226708085, + "language_loss": 0.68333745, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70457959, + "num_input_tokens_seen": 337336895, + "step": 15643, + "time_per_iteration": 2.504387140274048 + }, + { + "auxiliary_loss_clip": 0.01099169, + "auxiliary_loss_mlp": 0.01038132, + "balance_loss_clip": 1.03561687, + "balance_loss_mlp": 1.02521539, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 1.8597751504952111, + "language_loss": 0.76763237, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.78900534, + "num_input_tokens_seen": 337355105, + "step": 15644, + "time_per_iteration": 2.436079740524292 + }, + { + "auxiliary_loss_clip": 0.01089473, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.03311968, + "balance_loss_mlp": 1.02084947, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 1.5183894994167717, + "language_loss": 0.67574942, + "learning_rate": 3.682700891311974e-08, + "loss": 0.6969682, + "num_input_tokens_seen": 337374905, + "step": 15645, + "time_per_iteration": 2.473172187805176 + }, + { + "auxiliary_loss_clip": 0.01077243, + "auxiliary_loss_mlp": 0.00777659, + "balance_loss_clip": 1.03401065, + "balance_loss_mlp": 1.00048435, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 1.4622398427077559, + "language_loss": 0.70352721, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.7220763, + "num_input_tokens_seen": 337397130, + "step": 15646, + "time_per_iteration": 2.539458990097046 + }, + { + "auxiliary_loss_clip": 0.01091226, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.03237534, + "balance_loss_mlp": 1.01914763, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 1.67042227649491, + "language_loss": 0.74043739, + "learning_rate": 3.667836926755208e-08, + "loss": 0.76166016, + "num_input_tokens_seen": 337418660, + "step": 15647, + "time_per_iteration": 3.9061765670776367 + }, + { + "auxiliary_loss_clip": 0.01011734, + "auxiliary_loss_mlp": 0.01004684, + "balance_loss_clip": 1.00765681, + "balance_loss_mlp": 1.00347948, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 0.8887984861731415, + "language_loss": 0.63537812, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65554231, + "num_input_tokens_seen": 337478055, + "step": 15648, + "time_per_iteration": 3.167670726776123 + }, + { + "auxiliary_loss_clip": 0.01104762, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.03631246, + "balance_loss_mlp": 1.02078056, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 1.510225791084952, + "language_loss": 0.66489482, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68625975, + "num_input_tokens_seen": 337499405, + "step": 15649, + "time_per_iteration": 2.4522366523742676 + }, + { + "auxiliary_loss_clip": 0.01076693, + "auxiliary_loss_mlp": 0.01030217, + "balance_loss_clip": 1.03301036, + "balance_loss_mlp": 1.01802802, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 2.2824695274709392, + "language_loss": 0.77685249, + "learning_rate": 3.645596817637586e-08, + "loss": 0.7979216, + "num_input_tokens_seen": 337517195, + "step": 15650, + "time_per_iteration": 2.5080599784851074 + }, + { + "auxiliary_loss_clip": 0.01063206, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.03581667, + "balance_loss_mlp": 1.02101564, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 8.197882139317155, + "language_loss": 0.74140316, + "learning_rate": 3.638198339114451e-08, + "loss": 0.76236254, + "num_input_tokens_seen": 337535245, + "step": 15651, + "time_per_iteration": 2.6139330863952637 + }, + { + "auxiliary_loss_clip": 0.01106355, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.035151, + "balance_loss_mlp": 1.01861954, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 1.8220810036397577, + "language_loss": 0.72315204, + "learning_rate": 3.630807306650507e-08, + "loss": 0.74452817, + "num_input_tokens_seen": 337553040, + "step": 15652, + "time_per_iteration": 2.426327705383301 + }, + { + "auxiliary_loss_clip": 0.01075423, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.03588164, + "balance_loss_mlp": 1.02170575, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 2.5593379819022704, + "language_loss": 0.66328442, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68438697, + "num_input_tokens_seen": 337574580, + "step": 15653, + "time_per_iteration": 2.604823112487793 + }, + { + "auxiliary_loss_clip": 0.01109503, + "auxiliary_loss_mlp": 0.01037226, + "balance_loss_clip": 1.03631806, + "balance_loss_mlp": 1.02417827, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 1.8913990695448049, + "language_loss": 0.77817786, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.79964513, + "num_input_tokens_seen": 337593010, + "step": 15654, + "time_per_iteration": 2.4077870845794678 + }, + { + "auxiliary_loss_clip": 0.01104093, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.0365994, + "balance_loss_mlp": 1.01806343, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 1.5694602114961682, + "language_loss": 0.69938827, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.72073555, + "num_input_tokens_seen": 337616170, + "step": 15655, + "time_per_iteration": 2.6046903133392334 + }, + { + "auxiliary_loss_clip": 0.01108396, + "auxiliary_loss_mlp": 0.01038298, + "balance_loss_clip": 1.03630042, + "balance_loss_mlp": 1.02491117, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 1.7361848976703205, + "language_loss": 0.72025585, + "learning_rate": 3.601317642987944e-08, + "loss": 0.74172282, + "num_input_tokens_seen": 337635215, + "step": 15656, + "time_per_iteration": 2.3917629718780518 + }, + { + "auxiliary_loss_clip": 0.01074283, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.03292978, + "balance_loss_mlp": 1.01648879, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 1.8662066492602039, + "language_loss": 0.77819186, + "learning_rate": 3.593963845018377e-08, + "loss": 0.79921663, + "num_input_tokens_seen": 337654195, + "step": 15657, + "time_per_iteration": 2.501307487487793 + }, + { + "auxiliary_loss_clip": 0.010733, + "auxiliary_loss_mlp": 0.01028026, + "balance_loss_clip": 1.03435433, + "balance_loss_mlp": 1.01533067, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 2.2510489327632506, + "language_loss": 0.84488046, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86589372, + "num_input_tokens_seen": 337671810, + "step": 15658, + "time_per_iteration": 2.4867117404937744 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01032946, + "balance_loss_clip": 1.0382334, + "balance_loss_mlp": 1.01890337, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 1.9014283763645892, + "language_loss": 0.71046281, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.73193479, + "num_input_tokens_seen": 337689410, + "step": 15659, + "time_per_iteration": 3.8786754608154297 + }, + { + "auxiliary_loss_clip": 0.01084498, + "auxiliary_loss_mlp": 0.01039836, + "balance_loss_clip": 1.03902662, + "balance_loss_mlp": 1.02854705, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 1.7260830500840116, + "language_loss": 0.79533458, + "learning_rate": 3.571947138643172e-08, + "loss": 0.81657791, + "num_input_tokens_seen": 337709950, + "step": 15660, + "time_per_iteration": 2.55120587348938 + }, + { + "auxiliary_loss_clip": 0.01071713, + "auxiliary_loss_mlp": 0.01027266, + "balance_loss_clip": 1.03237081, + "balance_loss_mlp": 1.01578641, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 1.4599031809405243, + "language_loss": 0.68090773, + "learning_rate": 3.564623133290201e-08, + "loss": 0.7018975, + "num_input_tokens_seen": 337731320, + "step": 15661, + "time_per_iteration": 2.5893383026123047 + }, + { + "auxiliary_loss_clip": 0.01093394, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.03237331, + "balance_loss_mlp": 1.01876402, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 4.872116358088935, + "language_loss": 0.66470611, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68595123, + "num_input_tokens_seen": 337747720, + "step": 15662, + "time_per_iteration": 3.884908437728882 + }, + { + "auxiliary_loss_clip": 0.01011043, + "auxiliary_loss_mlp": 0.01002433, + "balance_loss_clip": 1.00695801, + "balance_loss_mlp": 1.00122929, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7771884906503609, + "language_loss": 0.59256935, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61270416, + "num_input_tokens_seen": 337806930, + "step": 15663, + "time_per_iteration": 3.133052349090576 + }, + { + "auxiliary_loss_clip": 0.01104981, + "auxiliary_loss_mlp": 0.01038099, + "balance_loss_clip": 1.03780043, + "balance_loss_mlp": 1.0244019, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 1.8646162835072455, + "language_loss": 0.66943252, + "learning_rate": 3.542695811435914e-08, + "loss": 0.69086331, + "num_input_tokens_seen": 337828100, + "step": 15664, + "time_per_iteration": 2.5835442543029785 + }, + { + "auxiliary_loss_clip": 0.01081767, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.03611612, + "balance_loss_mlp": 1.02008677, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 1.9314648904175846, + "language_loss": 0.73366022, + "learning_rate": 3.535401603143207e-08, + "loss": 0.75479144, + "num_input_tokens_seen": 337844805, + "step": 15665, + "time_per_iteration": 2.4839887619018555 + }, + { + "auxiliary_loss_clip": 0.01106234, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.03705251, + "balance_loss_mlp": 1.0193212, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 2.0086123251582118, + "language_loss": 0.63835579, + "learning_rate": 3.528114844807773e-08, + "loss": 0.65973032, + "num_input_tokens_seen": 337860490, + "step": 15666, + "time_per_iteration": 2.391085386276245 + }, + { + "auxiliary_loss_clip": 0.0107945, + "auxiliary_loss_mlp": 0.0103114, + "balance_loss_clip": 1.04112458, + "balance_loss_mlp": 1.0186826, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 1.6395150432167964, + "language_loss": 0.78971577, + "learning_rate": 3.520835536705902e-08, + "loss": 0.81082165, + "num_input_tokens_seen": 337878360, + "step": 15667, + "time_per_iteration": 2.516324520111084 + }, + { + "auxiliary_loss_clip": 0.01104704, + "auxiliary_loss_mlp": 0.0103264, + "balance_loss_clip": 1.03465998, + "balance_loss_mlp": 1.02141082, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 1.8114908782751609, + "language_loss": 0.74860924, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.7699827, + "num_input_tokens_seen": 337895635, + "step": 15668, + "time_per_iteration": 2.4189746379852295 + }, + { + "auxiliary_loss_clip": 0.01062558, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.04072833, + "balance_loss_mlp": 1.01694655, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 2.96934084892423, + "language_loss": 0.58978796, + "learning_rate": 3.506299272306723e-08, + "loss": 0.61070961, + "num_input_tokens_seen": 337913940, + "step": 15669, + "time_per_iteration": 2.5695385932922363 + }, + { + "auxiliary_loss_clip": 0.01066813, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.03207135, + "balance_loss_mlp": 1.01756322, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 2.1852636051979073, + "language_loss": 0.77043319, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.79139113, + "num_input_tokens_seen": 337932015, + "step": 15670, + "time_per_iteration": 2.522922992706299 + }, + { + "auxiliary_loss_clip": 0.01110149, + "auxiliary_loss_mlp": 0.01037531, + "balance_loss_clip": 1.03790998, + "balance_loss_mlp": 1.02514505, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 1.802961443333546, + "language_loss": 0.65206677, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67354351, + "num_input_tokens_seen": 337953345, + "step": 15671, + "time_per_iteration": 2.517096996307373 + }, + { + "auxiliary_loss_clip": 0.01082913, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.03318965, + "balance_loss_mlp": 1.02089155, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 1.6421594250388147, + "language_loss": 0.79417288, + "learning_rate": 3.48455075935139e-08, + "loss": 0.81533611, + "num_input_tokens_seen": 337973685, + "step": 15672, + "time_per_iteration": 2.5033950805664062 + }, + { + "auxiliary_loss_clip": 0.01075081, + "auxiliary_loss_mlp": 0.01035511, + "balance_loss_clip": 1.03315103, + "balance_loss_mlp": 1.02201664, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 2.3069831465743817, + "language_loss": 0.73467517, + "learning_rate": 3.47731615843776e-08, + "loss": 0.75578111, + "num_input_tokens_seen": 337989175, + "step": 15673, + "time_per_iteration": 3.993823289871216 + }, + { + "auxiliary_loss_clip": 0.01091618, + "auxiliary_loss_mlp": 0.01031582, + "balance_loss_clip": 1.03341854, + "balance_loss_mlp": 1.01876688, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 1.6163065913068064, + "language_loss": 0.70113546, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72236747, + "num_input_tokens_seen": 338011800, + "step": 15674, + "time_per_iteration": 2.5348711013793945 + }, + { + "auxiliary_loss_clip": 0.01106556, + "auxiliary_loss_mlp": 0.01026124, + "balance_loss_clip": 1.03490436, + "balance_loss_mlp": 1.01425695, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 1.8011922376746723, + "language_loss": 0.81302786, + "learning_rate": 3.462869313364125e-08, + "loss": 0.83435464, + "num_input_tokens_seen": 338032120, + "step": 15675, + "time_per_iteration": 2.416759729385376 + }, + { + "auxiliary_loss_clip": 0.01081335, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.03388023, + "balance_loss_mlp": 1.01795101, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 1.7294832128742514, + "language_loss": 0.62403613, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.64514869, + "num_input_tokens_seen": 338051880, + "step": 15676, + "time_per_iteration": 2.4763545989990234 + }, + { + "auxiliary_loss_clip": 0.01091495, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.03752327, + "balance_loss_mlp": 1.02665889, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 1.7899691395950026, + "language_loss": 0.6713261, + "learning_rate": 3.448452279120984e-08, + "loss": 0.69263047, + "num_input_tokens_seen": 338069665, + "step": 15677, + "time_per_iteration": 2.4881410598754883 + }, + { + "auxiliary_loss_clip": 0.01073004, + "auxiliary_loss_mlp": 0.01035685, + "balance_loss_clip": 1.03210771, + "balance_loss_mlp": 1.02174902, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 2.071884344280398, + "language_loss": 0.64130652, + "learning_rate": 3.441254941744387e-08, + "loss": 0.66239345, + "num_input_tokens_seen": 338090490, + "step": 15678, + "time_per_iteration": 2.594585418701172 + }, + { + "auxiliary_loss_clip": 0.0108299, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.039693, + "balance_loss_mlp": 1.01621139, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 1.5383390450498426, + "language_loss": 0.74409246, + "learning_rate": 3.434065057895097e-08, + "loss": 0.76520634, + "num_input_tokens_seen": 338109825, + "step": 15679, + "time_per_iteration": 2.5800459384918213 + }, + { + "auxiliary_loss_clip": 0.01091825, + "auxiliary_loss_mlp": 0.010366, + "balance_loss_clip": 1.03703976, + "balance_loss_mlp": 1.02373719, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 2.405704746994103, + "language_loss": 0.77099121, + "learning_rate": 3.426882627845762e-08, + "loss": 0.79227543, + "num_input_tokens_seen": 338125790, + "step": 15680, + "time_per_iteration": 2.4730987548828125 + }, + { + "auxiliary_loss_clip": 0.01098195, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.03717542, + "balance_loss_mlp": 1.02038622, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 1.8506049868614198, + "language_loss": 0.75235516, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77365834, + "num_input_tokens_seen": 338145610, + "step": 15681, + "time_per_iteration": 2.4686644077301025 + }, + { + "auxiliary_loss_clip": 0.01082347, + "auxiliary_loss_mlp": 0.01036946, + "balance_loss_clip": 1.03564215, + "balance_loss_mlp": 1.02417827, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 2.7402473878143088, + "language_loss": 0.65505618, + "learning_rate": 3.412540130236086e-08, + "loss": 0.67624909, + "num_input_tokens_seen": 338165960, + "step": 15682, + "time_per_iteration": 2.4970569610595703 + }, + { + "auxiliary_loss_clip": 0.01072364, + "auxiliary_loss_mlp": 0.01026849, + "balance_loss_clip": 1.03289449, + "balance_loss_mlp": 1.01503503, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 1.8388056711046852, + "language_loss": 0.76759601, + "learning_rate": 3.405380063219665e-08, + "loss": 0.78858817, + "num_input_tokens_seen": 338187215, + "step": 15683, + "time_per_iteration": 2.590883255004883 + }, + { + "auxiliary_loss_clip": 0.01101568, + "auxiliary_loss_mlp": 0.01040321, + "balance_loss_clip": 1.03643537, + "balance_loss_mlp": 1.0265286, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 3.7685844932584813, + "language_loss": 0.7541073, + "learning_rate": 3.398227451090885e-08, + "loss": 0.77552617, + "num_input_tokens_seen": 338201825, + "step": 15684, + "time_per_iteration": 2.4198784828186035 + }, + { + "auxiliary_loss_clip": 0.01103887, + "auxiliary_loss_mlp": 0.01024855, + "balance_loss_clip": 1.0349741, + "balance_loss_mlp": 1.01355958, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 1.6289682640211376, + "language_loss": 0.76995218, + "learning_rate": 3.391082294121017e-08, + "loss": 0.79123962, + "num_input_tokens_seen": 338220865, + "step": 15685, + "time_per_iteration": 2.4761247634887695 + }, + { + "auxiliary_loss_clip": 0.01092748, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.03439164, + "balance_loss_mlp": 1.01768303, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 1.7601720820186666, + "language_loss": 0.75544024, + "learning_rate": 3.383944592581023e-08, + "loss": 0.77665973, + "num_input_tokens_seen": 338240160, + "step": 15686, + "time_per_iteration": 3.945713996887207 + }, + { + "auxiliary_loss_clip": 0.01098546, + "auxiliary_loss_mlp": 0.01028911, + "balance_loss_clip": 1.03546894, + "balance_loss_mlp": 1.01615, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 1.7748138018461068, + "language_loss": 0.80542624, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82670081, + "num_input_tokens_seen": 338259305, + "step": 15687, + "time_per_iteration": 2.459555149078369 + }, + { + "auxiliary_loss_clip": 0.01091393, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.03778124, + "balance_loss_mlp": 1.02146316, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 2.199309677477579, + "language_loss": 0.75801909, + "learning_rate": 3.369691556873011e-08, + "loss": 0.7792815, + "num_input_tokens_seen": 338274950, + "step": 15688, + "time_per_iteration": 2.4848060607910156 + }, + { + "auxiliary_loss_clip": 0.01079437, + "auxiliary_loss_mlp": 0.01026516, + "balance_loss_clip": 1.0326004, + "balance_loss_mlp": 1.01358747, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 1.8428696703603165, + "language_loss": 0.68757099, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.70863056, + "num_input_tokens_seen": 338295585, + "step": 15689, + "time_per_iteration": 2.5721821784973145 + }, + { + "auxiliary_loss_clip": 0.01096691, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.0360862, + "balance_loss_mlp": 1.0235616, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 1.767743992684186, + "language_loss": 0.80328572, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82459319, + "num_input_tokens_seen": 338314555, + "step": 15690, + "time_per_iteration": 2.468907117843628 + }, + { + "auxiliary_loss_clip": 0.01096477, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.03653646, + "balance_loss_mlp": 1.01642418, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 1.8295785900088477, + "language_loss": 0.59909099, + "learning_rate": 3.348367925792317e-08, + "loss": 0.62033719, + "num_input_tokens_seen": 338336260, + "step": 15691, + "time_per_iteration": 2.5894172191619873 + }, + { + "auxiliary_loss_clip": 0.0107225, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.03426039, + "balance_loss_mlp": 1.01814723, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 1.6361454050064903, + "language_loss": 0.66907418, + "learning_rate": 3.341274962505514e-08, + "loss": 0.69010687, + "num_input_tokens_seen": 338354680, + "step": 15692, + "time_per_iteration": 2.556621551513672 + }, + { + "auxiliary_loss_clip": 0.01098618, + "auxiliary_loss_mlp": 0.01031886, + "balance_loss_clip": 1.04048944, + "balance_loss_mlp": 1.0197444, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 2.5060837528652486, + "language_loss": 0.74594164, + "learning_rate": 3.334189456537251e-08, + "loss": 0.76724672, + "num_input_tokens_seen": 338372490, + "step": 15693, + "time_per_iteration": 2.4963395595550537 + }, + { + "auxiliary_loss_clip": 0.01075861, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.03393674, + "balance_loss_mlp": 1.02174723, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 1.742102792335418, + "language_loss": 0.73274004, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75384456, + "num_input_tokens_seen": 338390870, + "step": 15694, + "time_per_iteration": 2.6138579845428467 + }, + { + "auxiliary_loss_clip": 0.00995377, + "auxiliary_loss_mlp": 0.01004929, + "balance_loss_clip": 1.00693023, + "balance_loss_mlp": 1.00366497, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 0.6897704436195069, + "language_loss": 0.50578821, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52579129, + "num_input_tokens_seen": 338453075, + "step": 15695, + "time_per_iteration": 3.159670114517212 + }, + { + "auxiliary_loss_clip": 0.01081925, + "auxiliary_loss_mlp": 0.01035622, + "balance_loss_clip": 1.03241253, + "balance_loss_mlp": 1.02365947, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 1.6582472391405034, + "language_loss": 0.6525017, + "learning_rate": 3.312977685229335e-08, + "loss": 0.67367721, + "num_input_tokens_seen": 338471770, + "step": 15696, + "time_per_iteration": 2.5442445278167725 + }, + { + "auxiliary_loss_clip": 0.01097558, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.03617954, + "balance_loss_mlp": 1.01831484, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 1.698435798031903, + "language_loss": 0.66387928, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68515396, + "num_input_tokens_seen": 338492190, + "step": 15697, + "time_per_iteration": 2.513833522796631 + }, + { + "auxiliary_loss_clip": 0.01000424, + "auxiliary_loss_mlp": 0.01003526, + "balance_loss_clip": 1.00465596, + "balance_loss_mlp": 1.00215554, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.8466258045416853, + "language_loss": 0.63408512, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65412462, + "num_input_tokens_seen": 338552560, + "step": 15698, + "time_per_iteration": 3.030127763748169 + }, + { + "auxiliary_loss_clip": 0.01090196, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.03763604, + "balance_loss_mlp": 1.02537918, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 1.9136284327953987, + "language_loss": 0.70024651, + "learning_rate": 3.291833039444092e-08, + "loss": 0.72153246, + "num_input_tokens_seen": 338571770, + "step": 15699, + "time_per_iteration": 5.433840751647949 + }, + { + "auxiliary_loss_clip": 0.01069837, + "auxiliary_loss_mlp": 0.01027682, + "balance_loss_clip": 1.03229427, + "balance_loss_mlp": 1.01595199, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 2.5339898995113552, + "language_loss": 0.73928761, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76026273, + "num_input_tokens_seen": 338587310, + "step": 15700, + "time_per_iteration": 2.542036533355713 + }, + { + "auxiliary_loss_clip": 0.01037682, + "auxiliary_loss_mlp": 0.01035512, + "balance_loss_clip": 1.03159976, + "balance_loss_mlp": 1.02377605, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 1.6041075782488718, + "language_loss": 0.70709091, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.72782284, + "num_input_tokens_seen": 338606235, + "step": 15701, + "time_per_iteration": 4.003664255142212 + }, + { + "auxiliary_loss_clip": 0.01071208, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.03547263, + "balance_loss_mlp": 1.01728201, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 1.7777763872147014, + "language_loss": 0.7813884, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.80239296, + "num_input_tokens_seen": 338624090, + "step": 15702, + "time_per_iteration": 2.565476655960083 + }, + { + "auxiliary_loss_clip": 0.01095983, + "auxiliary_loss_mlp": 0.01040611, + "balance_loss_clip": 1.03446436, + "balance_loss_mlp": 1.0273726, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 1.7181530023071088, + "language_loss": 0.66875494, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.69012088, + "num_input_tokens_seen": 338643695, + "step": 15703, + "time_per_iteration": 2.484905958175659 + }, + { + "auxiliary_loss_clip": 0.01097093, + "auxiliary_loss_mlp": 0.01028718, + "balance_loss_clip": 1.03690851, + "balance_loss_mlp": 1.01540852, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 1.6388449873706283, + "language_loss": 0.73269153, + "learning_rate": 3.256741150552833e-08, + "loss": 0.75394964, + "num_input_tokens_seen": 338664725, + "step": 15704, + "time_per_iteration": 2.539900302886963 + }, + { + "auxiliary_loss_clip": 0.01094492, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.0361743, + "balance_loss_mlp": 1.02149272, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 2.09207718416339, + "language_loss": 0.74151194, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76279581, + "num_input_tokens_seen": 338683990, + "step": 15705, + "time_per_iteration": 2.5081191062927246 + }, + { + "auxiliary_loss_clip": 0.01085553, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.0365665, + "balance_loss_mlp": 1.01973343, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 1.9775661397532938, + "language_loss": 0.77337617, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.79453802, + "num_input_tokens_seen": 338702025, + "step": 15706, + "time_per_iteration": 2.471547842025757 + }, + { + "auxiliary_loss_clip": 0.01091727, + "auxiliary_loss_mlp": 0.01026634, + "balance_loss_clip": 1.03302276, + "balance_loss_mlp": 1.01521349, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 1.420226729107484, + "language_loss": 0.69252372, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71370733, + "num_input_tokens_seen": 338720920, + "step": 15707, + "time_per_iteration": 2.4880125522613525 + }, + { + "auxiliary_loss_clip": 0.01099846, + "auxiliary_loss_mlp": 0.01029673, + "balance_loss_clip": 1.03167045, + "balance_loss_mlp": 1.01870561, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 1.7059980856695283, + "language_loss": 0.69483137, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71612656, + "num_input_tokens_seen": 338739590, + "step": 15708, + "time_per_iteration": 2.400751829147339 + }, + { + "auxiliary_loss_clip": 0.01098004, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.03687477, + "balance_loss_mlp": 1.01869774, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 2.9552621549957117, + "language_loss": 0.70900935, + "learning_rate": 3.221835774749748e-08, + "loss": 0.73029631, + "num_input_tokens_seen": 338757240, + "step": 15709, + "time_per_iteration": 2.4467544555664062 + }, + { + "auxiliary_loss_clip": 0.01070457, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.03931046, + "balance_loss_mlp": 1.01931667, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 2.0632945501212476, + "language_loss": 0.85164124, + "learning_rate": 3.214877084074774e-08, + "loss": 0.8726511, + "num_input_tokens_seen": 338773750, + "step": 15710, + "time_per_iteration": 2.5920159816741943 + }, + { + "auxiliary_loss_clip": 0.01085075, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.03699267, + "balance_loss_mlp": 1.01891971, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 1.7582399729445743, + "language_loss": 0.71678174, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73795301, + "num_input_tokens_seen": 338792115, + "step": 15711, + "time_per_iteration": 2.5541958808898926 + }, + { + "auxiliary_loss_clip": 0.0109815, + "auxiliary_loss_mlp": 0.01029976, + "balance_loss_clip": 1.03633368, + "balance_loss_mlp": 1.01750708, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 2.0634507319792674, + "language_loss": 0.68763864, + "learning_rate": 3.200982089323179e-08, + "loss": 0.70891988, + "num_input_tokens_seen": 338812480, + "step": 15712, + "time_per_iteration": 2.5461037158966064 + }, + { + "auxiliary_loss_clip": 0.01101383, + "auxiliary_loss_mlp": 0.01038037, + "balance_loss_clip": 1.03760195, + "balance_loss_mlp": 1.02464402, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 2.3478145363562453, + "language_loss": 0.70957851, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.73097277, + "num_input_tokens_seen": 338829105, + "step": 15713, + "time_per_iteration": 3.9969615936279297 + }, + { + "auxiliary_loss_clip": 0.0108224, + "auxiliary_loss_mlp": 0.01034648, + "balance_loss_clip": 1.03231072, + "balance_loss_mlp": 1.021595, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 1.5064118748003936, + "language_loss": 0.76533687, + "learning_rate": 3.187116945125212e-08, + "loss": 0.7865057, + "num_input_tokens_seen": 338850670, + "step": 15714, + "time_per_iteration": 2.5565414428710938 + }, + { + "auxiliary_loss_clip": 0.01081521, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.0363481, + "balance_loss_mlp": 1.01904857, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 1.8832603546545608, + "language_loss": 0.67586243, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.69699407, + "num_input_tokens_seen": 338867795, + "step": 15715, + "time_per_iteration": 2.533360004425049 + }, + { + "auxiliary_loss_clip": 0.01078221, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.03641355, + "balance_loss_mlp": 1.01990223, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 2.146001794419062, + "language_loss": 0.74405545, + "learning_rate": 3.173281653583948e-08, + "loss": 0.76516777, + "num_input_tokens_seen": 338887205, + "step": 15716, + "time_per_iteration": 2.6322333812713623 + }, + { + "auxiliary_loss_clip": 0.01092502, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.0397073, + "balance_loss_mlp": 1.01973975, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 1.7649830238437765, + "language_loss": 0.62682813, + "learning_rate": 3.166375203215565e-08, + "loss": 0.6480751, + "num_input_tokens_seen": 338906130, + "step": 15717, + "time_per_iteration": 2.5555386543273926 + }, + { + "auxiliary_loss_clip": 0.01093523, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.03809476, + "balance_loss_mlp": 1.02023888, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 1.6645560791941838, + "language_loss": 0.79157126, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.81282651, + "num_input_tokens_seen": 338923045, + "step": 15718, + "time_per_iteration": 2.4422078132629395 + }, + { + "auxiliary_loss_clip": 0.01019406, + "auxiliary_loss_mlp": 0.01000314, + "balance_loss_clip": 1.00573111, + "balance_loss_mlp": 0.99907476, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.6987672473808519, + "language_loss": 0.57826096, + "learning_rate": 3.152584694592719e-08, + "loss": 0.59845817, + "num_input_tokens_seen": 338987545, + "step": 15719, + "time_per_iteration": 3.084245443344116 + }, + { + "auxiliary_loss_clip": 0.0106934, + "auxiliary_loss_mlp": 0.00777772, + "balance_loss_clip": 1.0342021, + "balance_loss_mlp": 1.00060678, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 1.6748659721847656, + "language_loss": 0.7606715, + "learning_rate": 3.145700636861193e-08, + "loss": 0.77914262, + "num_input_tokens_seen": 339007830, + "step": 15720, + "time_per_iteration": 2.622236728668213 + }, + { + "auxiliary_loss_clip": 0.01092917, + "auxiliary_loss_mlp": 0.01028298, + "balance_loss_clip": 1.03524292, + "balance_loss_mlp": 1.01752782, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 1.762349334440659, + "language_loss": 0.72854638, + "learning_rate": 3.138824043864452e-08, + "loss": 0.74975854, + "num_input_tokens_seen": 339028980, + "step": 15721, + "time_per_iteration": 2.522859811782837 + }, + { + "auxiliary_loss_clip": 0.01064772, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.03242218, + "balance_loss_mlp": 1.02323687, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 2.8435946727636536, + "language_loss": 0.85306406, + "learning_rate": 3.131954915863244e-08, + "loss": 0.87406981, + "num_input_tokens_seen": 339047950, + "step": 15722, + "time_per_iteration": 2.6081912517547607 + }, + { + "auxiliary_loss_clip": 0.01009689, + "auxiliary_loss_mlp": 0.00998958, + "balance_loss_clip": 1.00651765, + "balance_loss_mlp": 0.99778986, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.897613778414943, + "language_loss": 0.64523888, + "learning_rate": 3.125093253118005e-08, + "loss": 0.6653254, + "num_input_tokens_seen": 339104535, + "step": 15723, + "time_per_iteration": 3.0152461528778076 + }, + { + "auxiliary_loss_clip": 0.01076177, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.04241979, + "balance_loss_mlp": 1.01708758, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 2.0961335048370517, + "language_loss": 0.72761232, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.74867189, + "num_input_tokens_seen": 339122050, + "step": 15724, + "time_per_iteration": 2.5180556774139404 + }, + { + "auxiliary_loss_clip": 0.01076549, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.03528547, + "balance_loss_mlp": 1.0168978, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 2.983325172248873, + "language_loss": 0.84947586, + "learning_rate": 3.111392324436024e-08, + "loss": 0.87053072, + "num_input_tokens_seen": 339138940, + "step": 15725, + "time_per_iteration": 2.5628128051757812 + }, + { + "auxiliary_loss_clip": 0.01090884, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.03837478, + "balance_loss_mlp": 1.01437235, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 5.5487568157894325, + "language_loss": 0.71017063, + "learning_rate": 3.104553059018822e-08, + "loss": 0.73134029, + "num_input_tokens_seen": 339158245, + "step": 15726, + "time_per_iteration": 4.001742124557495 + }, + { + "auxiliary_loss_clip": 0.01083189, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.03387356, + "balance_loss_mlp": 1.02099741, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 1.6783132246530927, + "language_loss": 0.60449898, + "learning_rate": 3.097721259896735e-08, + "loss": 0.62567198, + "num_input_tokens_seen": 339178200, + "step": 15727, + "time_per_iteration": 2.562041759490967 + }, + { + "auxiliary_loss_clip": 0.0109453, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.03400517, + "balance_loss_mlp": 1.02165318, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 1.6773164442812016, + "language_loss": 0.81838679, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.83966368, + "num_input_tokens_seen": 339193950, + "step": 15728, + "time_per_iteration": 2.488950729370117 + }, + { + "auxiliary_loss_clip": 0.00986731, + "auxiliary_loss_mlp": 0.01017836, + "balance_loss_clip": 1.00841892, + "balance_loss_mlp": 1.01671505, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.7439156604821978, + "language_loss": 0.5898841, + "learning_rate": 3.08408006157368e-08, + "loss": 0.60992974, + "num_input_tokens_seen": 339252330, + "step": 15729, + "time_per_iteration": 3.3259732723236084 + }, + { + "auxiliary_loss_clip": 0.01105088, + "auxiliary_loss_mlp": 0.01026742, + "balance_loss_clip": 1.03482604, + "balance_loss_mlp": 1.01446986, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 1.8695325978958715, + "language_loss": 0.76108038, + "learning_rate": 3.077270662890052e-08, + "loss": 0.7823987, + "num_input_tokens_seen": 339270325, + "step": 15730, + "time_per_iteration": 3.0792698860168457 + }, + { + "auxiliary_loss_clip": 0.01089948, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.04169691, + "balance_loss_mlp": 1.01905155, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 1.444517080553524, + "language_loss": 0.62470573, + "learning_rate": 3.070468731536047e-08, + "loss": 0.64592302, + "num_input_tokens_seen": 339291980, + "step": 15731, + "time_per_iteration": 2.7393221855163574 + }, + { + "auxiliary_loss_clip": 0.01102191, + "auxiliary_loss_mlp": 0.01026824, + "balance_loss_clip": 1.03641844, + "balance_loss_mlp": 1.01387215, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 2.4457791095580244, + "language_loss": 0.63752186, + "learning_rate": 3.063674267769589e-08, + "loss": 0.65881199, + "num_input_tokens_seen": 339311795, + "step": 15732, + "time_per_iteration": 2.5497682094573975 + }, + { + "auxiliary_loss_clip": 0.01097226, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.03727996, + "balance_loss_mlp": 1.01889288, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 1.9303064505597765, + "language_loss": 0.83996999, + "learning_rate": 3.056887271848363e-08, + "loss": 0.86126292, + "num_input_tokens_seen": 339327745, + "step": 15733, + "time_per_iteration": 2.4949190616607666 + }, + { + "auxiliary_loss_clip": 0.01095395, + "auxiliary_loss_mlp": 0.01028957, + "balance_loss_clip": 1.03471589, + "balance_loss_mlp": 1.0177691, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 1.5199966167229428, + "language_loss": 0.72215176, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74339527, + "num_input_tokens_seen": 339346445, + "step": 15734, + "time_per_iteration": 2.4605014324188232 + }, + { + "auxiliary_loss_clip": 0.010901, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.03272891, + "balance_loss_mlp": 1.02128899, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 1.4463097472938087, + "language_loss": 0.86694956, + "learning_rate": 3.043335684570692e-08, + "loss": 0.88816565, + "num_input_tokens_seen": 339367945, + "step": 15735, + "time_per_iteration": 2.5474069118499756 + }, + { + "auxiliary_loss_clip": 0.01089565, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.03571403, + "balance_loss_mlp": 1.01868796, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 1.9895002707888296, + "language_loss": 0.67188972, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69308633, + "num_input_tokens_seen": 339386060, + "step": 15736, + "time_per_iteration": 2.497803211212158 + }, + { + "auxiliary_loss_clip": 0.00996506, + "auxiliary_loss_mlp": 0.01008523, + "balance_loss_clip": 1.01836276, + "balance_loss_mlp": 1.0073905, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.8638811196841726, + "language_loss": 0.65300882, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67305911, + "num_input_tokens_seen": 339446695, + "step": 15737, + "time_per_iteration": 3.2024567127227783 + }, + { + "auxiliary_loss_clip": 0.0101605, + "auxiliary_loss_mlp": 0.01004208, + "balance_loss_clip": 1.00570083, + "balance_loss_mlp": 1.00300431, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.7993399983287941, + "language_loss": 0.58762622, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.6078288, + "num_input_tokens_seen": 339510080, + "step": 15738, + "time_per_iteration": 5.330617904663086 + }, + { + "auxiliary_loss_clip": 0.01093998, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.03322184, + "balance_loss_mlp": 1.01900959, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 1.6541183688717949, + "language_loss": 0.71353006, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73477256, + "num_input_tokens_seen": 339529335, + "step": 15739, + "time_per_iteration": 2.527710199356079 + }, + { + "auxiliary_loss_clip": 0.01095159, + "auxiliary_loss_mlp": 0.01031491, + "balance_loss_clip": 1.03281081, + "balance_loss_mlp": 1.01871753, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 2.3599263430478756, + "language_loss": 0.6432389, + "learning_rate": 3.009587421648363e-08, + "loss": 0.66450542, + "num_input_tokens_seen": 339548820, + "step": 15740, + "time_per_iteration": 3.9686853885650635 + }, + { + "auxiliary_loss_clip": 0.01085341, + "auxiliary_loss_mlp": 0.01029939, + "balance_loss_clip": 1.03741813, + "balance_loss_mlp": 1.01807165, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 1.6406759426349837, + "language_loss": 0.66576421, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.68691695, + "num_input_tokens_seen": 339566775, + "step": 15741, + "time_per_iteration": 2.5667598247528076 + }, + { + "auxiliary_loss_clip": 0.01097461, + "auxiliary_loss_mlp": 0.0102902, + "balance_loss_clip": 1.03628635, + "balance_loss_mlp": 1.01718855, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 2.1427111245986734, + "language_loss": 0.75624472, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.77750957, + "num_input_tokens_seen": 339581905, + "step": 15742, + "time_per_iteration": 2.4763681888580322 + }, + { + "auxiliary_loss_clip": 0.01094903, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.03432405, + "balance_loss_mlp": 1.01880169, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 2.5962827104135693, + "language_loss": 0.72217101, + "learning_rate": 2.989428100602187e-08, + "loss": 0.74342382, + "num_input_tokens_seen": 339599870, + "step": 15743, + "time_per_iteration": 2.505790948867798 + }, + { + "auxiliary_loss_clip": 0.01075822, + "auxiliary_loss_mlp": 0.01030949, + "balance_loss_clip": 1.03816998, + "balance_loss_mlp": 1.01803267, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 2.971221011989486, + "language_loss": 0.7965551, + "learning_rate": 2.982723267901943e-08, + "loss": 0.81762284, + "num_input_tokens_seen": 339620250, + "step": 15744, + "time_per_iteration": 2.614361047744751 + }, + { + "auxiliary_loss_clip": 0.01085277, + "auxiliary_loss_mlp": 0.01038514, + "balance_loss_clip": 1.03534365, + "balance_loss_mlp": 1.02563953, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 1.6565822590547106, + "language_loss": 0.78224504, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.80348301, + "num_input_tokens_seen": 339639900, + "step": 15745, + "time_per_iteration": 2.565837860107422 + }, + { + "auxiliary_loss_clip": 0.01083309, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.03219163, + "balance_loss_mlp": 1.01839566, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 1.7459590309100481, + "language_loss": 0.70280373, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.72394741, + "num_input_tokens_seen": 339658970, + "step": 15746, + "time_per_iteration": 2.560877561569214 + }, + { + "auxiliary_loss_clip": 0.01088878, + "auxiliary_loss_mlp": 0.0102912, + "balance_loss_clip": 1.04208326, + "balance_loss_mlp": 1.01623988, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 2.2039192887653782, + "language_loss": 0.56540537, + "learning_rate": 2.962653596305964e-08, + "loss": 0.5865854, + "num_input_tokens_seen": 339675600, + "step": 15747, + "time_per_iteration": 2.531754732131958 + }, + { + "auxiliary_loss_clip": 0.00977018, + "auxiliary_loss_mlp": 0.01007116, + "balance_loss_clip": 1.00918508, + "balance_loss_mlp": 1.00578737, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.6694501730513387, + "language_loss": 0.53207946, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55192077, + "num_input_tokens_seen": 339744505, + "step": 15748, + "time_per_iteration": 3.5835442543029785 + }, + { + "auxiliary_loss_clip": 0.01087599, + "auxiliary_loss_mlp": 0.0103713, + "balance_loss_clip": 1.03491163, + "balance_loss_mlp": 1.0248872, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 2.743411129565363, + "language_loss": 0.66668421, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68793154, + "num_input_tokens_seen": 339765810, + "step": 15749, + "time_per_iteration": 3.0480875968933105 + }, + { + "auxiliary_loss_clip": 0.0107836, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.03400183, + "balance_loss_mlp": 1.0168829, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 1.9723866442057583, + "language_loss": 0.76095819, + "learning_rate": 2.942651169791621e-08, + "loss": 0.78204215, + "num_input_tokens_seen": 339784125, + "step": 15750, + "time_per_iteration": 2.5725502967834473 + }, + { + "auxiliary_loss_clip": 0.01097154, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.03723431, + "balance_loss_mlp": 1.01774645, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 1.6395055534563514, + "language_loss": 0.680893, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.70216334, + "num_input_tokens_seen": 339803450, + "step": 15751, + "time_per_iteration": 2.521315336227417 + }, + { + "auxiliary_loss_clip": 0.0107711, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.03390622, + "balance_loss_mlp": 1.01779532, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 1.5647775793909666, + "language_loss": 0.65164518, + "learning_rate": 2.929353580532723e-08, + "loss": 0.67271125, + "num_input_tokens_seen": 339823215, + "step": 15752, + "time_per_iteration": 4.35968542098999 + }, + { + "auxiliary_loss_clip": 0.01093688, + "auxiliary_loss_mlp": 0.01041059, + "balance_loss_clip": 1.03412402, + "balance_loss_mlp": 1.02724826, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 2.0944461714404397, + "language_loss": 0.71822965, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.73957717, + "num_input_tokens_seen": 339842230, + "step": 15753, + "time_per_iteration": 2.58721661567688 + }, + { + "auxiliary_loss_clip": 0.01109801, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.03492725, + "balance_loss_mlp": 1.01926494, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 1.9825597515359086, + "language_loss": 0.70121193, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72264552, + "num_input_tokens_seen": 339861640, + "step": 15754, + "time_per_iteration": 2.5178780555725098 + }, + { + "auxiliary_loss_clip": 0.01109795, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.03572941, + "balance_loss_mlp": 1.01948762, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 2.2543441760041203, + "language_loss": 0.78676683, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.80817533, + "num_input_tokens_seen": 339878210, + "step": 15755, + "time_per_iteration": 2.418285846710205 + }, + { + "auxiliary_loss_clip": 0.01074697, + "auxiliary_loss_mlp": 0.01037161, + "balance_loss_clip": 1.03687978, + "balance_loss_mlp": 1.02199721, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 2.802634963904495, + "language_loss": 0.74431622, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.76543474, + "num_input_tokens_seen": 339894255, + "step": 15756, + "time_per_iteration": 2.582991361618042 + }, + { + "auxiliary_loss_clip": 0.01086337, + "auxiliary_loss_mlp": 0.01035777, + "balance_loss_clip": 1.03235304, + "balance_loss_mlp": 1.02344465, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 2.1415191690079807, + "language_loss": 0.74695039, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.76817155, + "num_input_tokens_seen": 339912425, + "step": 15757, + "time_per_iteration": 2.5182487964630127 + }, + { + "auxiliary_loss_clip": 0.0109161, + "auxiliary_loss_mlp": 0.01030187, + "balance_loss_clip": 1.03654218, + "balance_loss_mlp": 1.01716328, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 2.276870431413278, + "language_loss": 0.7970224, + "learning_rate": 2.889640171327512e-08, + "loss": 0.8182404, + "num_input_tokens_seen": 339929635, + "step": 15758, + "time_per_iteration": 2.566288709640503 + }, + { + "auxiliary_loss_clip": 0.01080333, + "auxiliary_loss_mlp": 0.00777612, + "balance_loss_clip": 1.03550923, + "balance_loss_mlp": 1.00052285, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 1.5583090325235482, + "language_loss": 0.718413, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.73699248, + "num_input_tokens_seen": 339951200, + "step": 15759, + "time_per_iteration": 2.6160149574279785 + }, + { + "auxiliary_loss_clip": 0.01090129, + "auxiliary_loss_mlp": 0.0102814, + "balance_loss_clip": 1.03645563, + "balance_loss_mlp": 1.01808488, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 1.675276057799933, + "language_loss": 0.75384927, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77503198, + "num_input_tokens_seen": 339971820, + "step": 15760, + "time_per_iteration": 2.545927047729492 + }, + { + "auxiliary_loss_clip": 0.01108582, + "auxiliary_loss_mlp": 0.00776706, + "balance_loss_clip": 1.03751373, + "balance_loss_mlp": 1.00061715, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 1.9374313433145651, + "language_loss": 0.73068625, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.74953914, + "num_input_tokens_seen": 339989420, + "step": 15761, + "time_per_iteration": 2.5029962062835693 + }, + { + "auxiliary_loss_clip": 0.01086279, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.03798461, + "balance_loss_mlp": 1.02407789, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 2.3954782291816596, + "language_loss": 0.71953118, + "learning_rate": 2.863314050734722e-08, + "loss": 0.74075019, + "num_input_tokens_seen": 340006690, + "step": 15762, + "time_per_iteration": 2.4827866554260254 + }, + { + "auxiliary_loss_clip": 0.01111684, + "auxiliary_loss_mlp": 0.01039123, + "balance_loss_clip": 1.03537846, + "balance_loss_mlp": 1.02514553, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 2.0006038808818127, + "language_loss": 0.6700657, + "learning_rate": 2.856751208570518e-08, + "loss": 0.69157374, + "num_input_tokens_seen": 340025480, + "step": 15763, + "time_per_iteration": 2.4549410343170166 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.03527164, + "balance_loss_mlp": 1.01995945, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 2.0776289795664944, + "language_loss": 0.69847381, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.71988404, + "num_input_tokens_seen": 340043785, + "step": 15764, + "time_per_iteration": 2.461200475692749 + }, + { + "auxiliary_loss_clip": 0.01094772, + "auxiliary_loss_mlp": 0.00775789, + "balance_loss_clip": 1.03742266, + "balance_loss_mlp": 1.00053704, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 1.6456379589976342, + "language_loss": 0.7096585, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.72836411, + "num_input_tokens_seen": 340064360, + "step": 15765, + "time_per_iteration": 4.354465007781982 + }, + { + "auxiliary_loss_clip": 0.01007865, + "auxiliary_loss_mlp": 0.01003449, + "balance_loss_clip": 1.00326157, + "balance_loss_mlp": 1.00238216, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 0.8411795384798554, + "language_loss": 0.59104902, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61116213, + "num_input_tokens_seen": 340114425, + "step": 15766, + "time_per_iteration": 2.84167742729187 + }, + { + "auxiliary_loss_clip": 0.01057925, + "auxiliary_loss_mlp": 0.01041689, + "balance_loss_clip": 1.03331482, + "balance_loss_mlp": 1.02917218, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 1.9038268407039263, + "language_loss": 0.74519986, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.76619601, + "num_input_tokens_seen": 340132200, + "step": 15767, + "time_per_iteration": 2.59185528755188 + }, + { + "auxiliary_loss_clip": 0.01082837, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.03691757, + "balance_loss_mlp": 1.02128839, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 2.1927902657197493, + "language_loss": 0.73241544, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.75358725, + "num_input_tokens_seen": 340149175, + "step": 15768, + "time_per_iteration": 2.5485620498657227 + }, + { + "auxiliary_loss_clip": 0.00995458, + "auxiliary_loss_mlp": 0.00999686, + "balance_loss_clip": 1.01050425, + "balance_loss_mlp": 0.99842197, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.7348751715297173, + "language_loss": 0.55266327, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57261473, + "num_input_tokens_seen": 340208155, + "step": 15769, + "time_per_iteration": 3.2267913818359375 + }, + { + "auxiliary_loss_clip": 0.01061795, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.03350163, + "balance_loss_mlp": 1.01691067, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 1.5000840021979511, + "language_loss": 0.77441978, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.79532343, + "num_input_tokens_seen": 340229275, + "step": 15770, + "time_per_iteration": 2.6468636989593506 + }, + { + "auxiliary_loss_clip": 0.01092302, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.04026127, + "balance_loss_mlp": 1.02268386, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 2.0336126958646425, + "language_loss": 0.80046177, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.82174319, + "num_input_tokens_seen": 340248920, + "step": 15771, + "time_per_iteration": 2.5904927253723145 + }, + { + "auxiliary_loss_clip": 0.01075994, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.03361821, + "balance_loss_mlp": 1.0202843, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 1.9442200015309496, + "language_loss": 0.696491, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71757376, + "num_input_tokens_seen": 340266775, + "step": 15772, + "time_per_iteration": 2.5835258960723877 + }, + { + "auxiliary_loss_clip": 0.01091884, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.03367472, + "balance_loss_mlp": 1.02080488, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 1.7132767073726214, + "language_loss": 0.73785782, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.75911599, + "num_input_tokens_seen": 340285295, + "step": 15773, + "time_per_iteration": 2.5306923389434814 + }, + { + "auxiliary_loss_clip": 0.01074128, + "auxiliary_loss_mlp": 0.01033888, + "balance_loss_clip": 1.03361452, + "balance_loss_mlp": 1.02120996, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 2.1147093585274046, + "language_loss": 0.62922341, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.6503036, + "num_input_tokens_seen": 340304265, + "step": 15774, + "time_per_iteration": 2.5652196407318115 + }, + { + "auxiliary_loss_clip": 0.01108917, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.03559291, + "balance_loss_mlp": 1.02370167, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 1.700693545211102, + "language_loss": 0.59732282, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.61877084, + "num_input_tokens_seen": 340323690, + "step": 15775, + "time_per_iteration": 2.454909563064575 + }, + { + "auxiliary_loss_clip": 0.01088927, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.03693449, + "balance_loss_mlp": 1.01849651, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 1.5906808315224479, + "language_loss": 0.61829555, + "learning_rate": 2.772114638584555e-08, + "loss": 0.6394977, + "num_input_tokens_seen": 340345830, + "step": 15776, + "time_per_iteration": 2.631775379180908 + }, + { + "auxiliary_loss_clip": 0.01085117, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.03682613, + "balance_loss_mlp": 1.01713538, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 1.748281648640852, + "language_loss": 0.73877847, + "learning_rate": 2.765656478622458e-08, + "loss": 0.7599287, + "num_input_tokens_seen": 340365910, + "step": 15777, + "time_per_iteration": 4.193332672119141 + }, + { + "auxiliary_loss_clip": 0.01109159, + "auxiliary_loss_mlp": 0.01039284, + "balance_loss_clip": 1.03923559, + "balance_loss_mlp": 1.0260632, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 2.5783138231015217, + "language_loss": 0.7234599, + "learning_rate": 2.759205797806441e-08, + "loss": 0.74494433, + "num_input_tokens_seen": 340383935, + "step": 15778, + "time_per_iteration": 2.5348329544067383 + }, + { + "auxiliary_loss_clip": 0.01093715, + "auxiliary_loss_mlp": 0.00775282, + "balance_loss_clip": 1.03680313, + "balance_loss_mlp": 1.000561, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 2.0348914648667304, + "language_loss": 0.70261645, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.72130632, + "num_input_tokens_seen": 340402760, + "step": 15779, + "time_per_iteration": 2.503068208694458 + }, + { + "auxiliary_loss_clip": 0.01108828, + "auxiliary_loss_mlp": 0.01031856, + "balance_loss_clip": 1.03645718, + "balance_loss_mlp": 1.01900494, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 1.9859037625246427, + "language_loss": 0.78495026, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.80635715, + "num_input_tokens_seen": 340422105, + "step": 15780, + "time_per_iteration": 3.8847689628601074 + }, + { + "auxiliary_loss_clip": 0.01084048, + "auxiliary_loss_mlp": 0.00776904, + "balance_loss_clip": 1.03872085, + "balance_loss_mlp": 1.00055707, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 1.9537041848236003, + "language_loss": 0.66323853, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68184805, + "num_input_tokens_seen": 340441160, + "step": 15781, + "time_per_iteration": 2.531970262527466 + }, + { + "auxiliary_loss_clip": 0.01106898, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.03616142, + "balance_loss_mlp": 1.02185798, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 2.06864687771105, + "language_loss": 0.79997337, + "learning_rate": 2.733477870890999e-08, + "loss": 0.82138729, + "num_input_tokens_seen": 340458200, + "step": 15782, + "time_per_iteration": 2.4476664066314697 + }, + { + "auxiliary_loss_clip": 0.01017805, + "auxiliary_loss_mlp": 0.00998275, + "balance_loss_clip": 1.00422215, + "balance_loss_mlp": 0.99716061, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 0.7206287638128656, + "language_loss": 0.59813058, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.61829138, + "num_input_tokens_seen": 340526420, + "step": 15783, + "time_per_iteration": 3.155012369155884 + }, + { + "auxiliary_loss_clip": 0.01096855, + "auxiliary_loss_mlp": 0.01034701, + "balance_loss_clip": 1.03433943, + "balance_loss_mlp": 1.02166557, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 1.7984631180972017, + "language_loss": 0.74157667, + "learning_rate": 2.720658788656105e-08, + "loss": 0.76289225, + "num_input_tokens_seen": 340546325, + "step": 15784, + "time_per_iteration": 2.528024435043335 + }, + { + "auxiliary_loss_clip": 0.01061281, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.03535485, + "balance_loss_mlp": 1.01796865, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 1.764120042699696, + "language_loss": 0.69478387, + "learning_rate": 2.714260468695806e-08, + "loss": 0.71571493, + "num_input_tokens_seen": 340565145, + "step": 15785, + "time_per_iteration": 2.6472809314727783 + }, + { + "auxiliary_loss_clip": 0.01110207, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.03604198, + "balance_loss_mlp": 1.01731932, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 1.4478552782447547, + "language_loss": 0.76327592, + "learning_rate": 2.707869629830495e-08, + "loss": 0.78467399, + "num_input_tokens_seen": 340585465, + "step": 15786, + "time_per_iteration": 2.475548267364502 + }, + { + "auxiliary_loss_clip": 0.01075373, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.03718472, + "balance_loss_mlp": 1.01933646, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 1.6128532165018563, + "language_loss": 0.7915976, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.81265545, + "num_input_tokens_seen": 340606010, + "step": 15787, + "time_per_iteration": 2.591226577758789 + }, + { + "auxiliary_loss_clip": 0.01097197, + "auxiliary_loss_mlp": 0.01026801, + "balance_loss_clip": 1.03818011, + "balance_loss_mlp": 1.01578617, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 1.624400479221709, + "language_loss": 0.76279318, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78403318, + "num_input_tokens_seen": 340626135, + "step": 15788, + "time_per_iteration": 2.506673812866211 + }, + { + "auxiliary_loss_clip": 0.01098954, + "auxiliary_loss_mlp": 0.01032346, + "balance_loss_clip": 1.03631616, + "balance_loss_mlp": 1.01926827, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 2.50739894271438, + "language_loss": 0.7163862, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.73769921, + "num_input_tokens_seen": 340644870, + "step": 15789, + "time_per_iteration": 2.500074625015259 + }, + { + "auxiliary_loss_clip": 0.01066018, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.03724146, + "balance_loss_mlp": 1.01579142, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 1.9521981752601079, + "language_loss": 0.7343455, + "learning_rate": 2.682381090161989e-08, + "loss": 0.75529379, + "num_input_tokens_seen": 340663695, + "step": 15790, + "time_per_iteration": 2.570080041885376 + }, + { + "auxiliary_loss_clip": 0.01077029, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.03536212, + "balance_loss_mlp": 1.02098703, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 1.771945432883534, + "language_loss": 0.77727866, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.79839486, + "num_input_tokens_seen": 340682970, + "step": 15791, + "time_per_iteration": 2.5801494121551514 + }, + { + "auxiliary_loss_clip": 0.01100861, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.03615963, + "balance_loss_mlp": 1.01884341, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 2.4710052574998227, + "language_loss": 0.73877186, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.7600984, + "num_input_tokens_seen": 340702275, + "step": 15792, + "time_per_iteration": 4.167632818222046 + }, + { + "auxiliary_loss_clip": 0.01096515, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.03368616, + "balance_loss_mlp": 1.02249146, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 1.8936886718870003, + "language_loss": 0.78340304, + "learning_rate": 2.663343248754679e-08, + "loss": 0.80471492, + "num_input_tokens_seen": 340719060, + "step": 15793, + "time_per_iteration": 2.500256061553955 + }, + { + "auxiliary_loss_clip": 0.01080354, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.03322268, + "balance_loss_mlp": 1.0192591, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 1.7823506195004684, + "language_loss": 0.77147734, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.79259247, + "num_input_tokens_seen": 340737815, + "step": 15794, + "time_per_iteration": 2.5248236656188965 + }, + { + "auxiliary_loss_clip": 0.01079541, + "auxiliary_loss_mlp": 0.00777684, + "balance_loss_clip": 1.03531408, + "balance_loss_mlp": 1.00067282, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 2.001276472969552, + "language_loss": 0.61031771, + "learning_rate": 2.650688769211107e-08, + "loss": 0.62889004, + "num_input_tokens_seen": 340756035, + "step": 15795, + "time_per_iteration": 2.558912992477417 + }, + { + "auxiliary_loss_clip": 0.01095, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.03562105, + "balance_loss_mlp": 1.02304363, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 1.6790241675163953, + "language_loss": 0.79072344, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81202769, + "num_input_tokens_seen": 340775620, + "step": 15796, + "time_per_iteration": 2.5276074409484863 + }, + { + "auxiliary_loss_clip": 0.01097696, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.03551984, + "balance_loss_mlp": 1.01741886, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 2.8508222233032527, + "language_loss": 0.75768244, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.77896661, + "num_input_tokens_seen": 340794510, + "step": 15797, + "time_per_iteration": 2.5114829540252686 + }, + { + "auxiliary_loss_clip": 0.01077368, + "auxiliary_loss_mlp": 0.00778461, + "balance_loss_clip": 1.03510678, + "balance_loss_mlp": 1.00064397, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 3.05921963048384, + "language_loss": 0.6588434, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.67740178, + "num_input_tokens_seen": 340812955, + "step": 15798, + "time_per_iteration": 2.5576255321502686 + }, + { + "auxiliary_loss_clip": 0.01099294, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.03722334, + "balance_loss_mlp": 1.01759362, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 3.0626630024234167, + "language_loss": 0.77433395, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79562104, + "num_input_tokens_seen": 340829200, + "step": 15799, + "time_per_iteration": 2.48476243019104 + }, + { + "auxiliary_loss_clip": 0.01092791, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.03415442, + "balance_loss_mlp": 1.02267802, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 1.8400223173309942, + "language_loss": 0.71094888, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.73222512, + "num_input_tokens_seen": 340848035, + "step": 15800, + "time_per_iteration": 2.4987499713897705 + }, + { + "auxiliary_loss_clip": 0.0108176, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.03156936, + "balance_loss_mlp": 1.01594329, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 2.2100502200892027, + "language_loss": 0.72025657, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.74135864, + "num_input_tokens_seen": 340870025, + "step": 15801, + "time_per_iteration": 2.55367374420166 + }, + { + "auxiliary_loss_clip": 0.01097903, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.03476942, + "balance_loss_mlp": 1.01901519, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 1.5129477834604383, + "language_loss": 0.81057829, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83186877, + "num_input_tokens_seen": 340892290, + "step": 15802, + "time_per_iteration": 2.5502514839172363 + }, + { + "auxiliary_loss_clip": 0.01110774, + "auxiliary_loss_mlp": 0.01031439, + "balance_loss_clip": 1.03734481, + "balance_loss_mlp": 1.01944041, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 2.0300868427243537, + "language_loss": 0.67654502, + "learning_rate": 2.60037021038646e-08, + "loss": 0.69796717, + "num_input_tokens_seen": 340912260, + "step": 15803, + "time_per_iteration": 2.528474807739258 + }, + { + "auxiliary_loss_clip": 0.01082918, + "auxiliary_loss_mlp": 0.01031201, + "balance_loss_clip": 1.03386354, + "balance_loss_mlp": 1.01917922, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 2.012345399296308, + "language_loss": 0.76178026, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.78292149, + "num_input_tokens_seen": 340928930, + "step": 15804, + "time_per_iteration": 3.9794113636016846 + }, + { + "auxiliary_loss_clip": 0.01096186, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.03598857, + "balance_loss_mlp": 1.02260447, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 1.8870206740952855, + "language_loss": 0.73175359, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75307083, + "num_input_tokens_seen": 340946615, + "step": 15805, + "time_per_iteration": 2.4682364463806152 + }, + { + "auxiliary_loss_clip": 0.01084839, + "auxiliary_loss_mlp": 0.01035801, + "balance_loss_clip": 1.03682375, + "balance_loss_mlp": 1.02354026, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 1.6089237855790035, + "language_loss": 0.80304843, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.82425481, + "num_input_tokens_seen": 340967545, + "step": 15806, + "time_per_iteration": 2.5675628185272217 + }, + { + "auxiliary_loss_clip": 0.01073492, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.03512478, + "balance_loss_mlp": 1.0213089, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 1.9546647866079998, + "language_loss": 0.82346016, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.84453034, + "num_input_tokens_seen": 340984955, + "step": 15807, + "time_per_iteration": 2.5421035289764404 + }, + { + "auxiliary_loss_clip": 0.0109115, + "auxiliary_loss_mlp": 0.01033764, + "balance_loss_clip": 1.03305256, + "balance_loss_mlp": 1.02150357, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 1.6027053123639383, + "language_loss": 0.71850604, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.73975521, + "num_input_tokens_seen": 341007300, + "step": 15808, + "time_per_iteration": 2.538344621658325 + }, + { + "auxiliary_loss_clip": 0.01096355, + "auxiliary_loss_mlp": 0.01030036, + "balance_loss_clip": 1.03479266, + "balance_loss_mlp": 1.01819873, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 1.550829860269831, + "language_loss": 0.69843495, + "learning_rate": 2.562945671948058e-08, + "loss": 0.71969891, + "num_input_tokens_seen": 341026695, + "step": 15809, + "time_per_iteration": 2.503741502761841 + }, + { + "auxiliary_loss_clip": 0.0108485, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.03317261, + "balance_loss_mlp": 1.0147301, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 1.8074374610771349, + "language_loss": 0.75312316, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.77424753, + "num_input_tokens_seen": 341047080, + "step": 15810, + "time_per_iteration": 2.5693533420562744 + }, + { + "auxiliary_loss_clip": 0.0107608, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.03576279, + "balance_loss_mlp": 1.03203702, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 1.5276677436181707, + "language_loss": 0.80021369, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.82142466, + "num_input_tokens_seen": 341067310, + "step": 15811, + "time_per_iteration": 2.601470470428467 + }, + { + "auxiliary_loss_clip": 0.01084235, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.03412664, + "balance_loss_mlp": 1.02356768, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 2.497039776888102, + "language_loss": 0.70062852, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.72183251, + "num_input_tokens_seen": 341085110, + "step": 15812, + "time_per_iteration": 2.628068685531616 + }, + { + "auxiliary_loss_clip": 0.01072513, + "auxiliary_loss_mlp": 0.01039065, + "balance_loss_clip": 1.03451478, + "balance_loss_mlp": 1.02533221, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 5.10156696770877, + "language_loss": 0.65251714, + "learning_rate": 2.538145713158446e-08, + "loss": 0.67363292, + "num_input_tokens_seen": 341103190, + "step": 15813, + "time_per_iteration": 2.6855626106262207 + }, + { + "auxiliary_loss_clip": 0.01099662, + "auxiliary_loss_mlp": 0.01037291, + "balance_loss_clip": 1.03581107, + "balance_loss_mlp": 1.02460766, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 1.9163495088670668, + "language_loss": 0.70547396, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72684348, + "num_input_tokens_seen": 341125695, + "step": 15814, + "time_per_iteration": 2.612065315246582 + }, + { + "auxiliary_loss_clip": 0.01095004, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.03455257, + "balance_loss_mlp": 1.01726341, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 1.9909199181127546, + "language_loss": 0.63661313, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65785027, + "num_input_tokens_seen": 341143930, + "step": 15815, + "time_per_iteration": 2.5192227363586426 + }, + { + "auxiliary_loss_clip": 0.01083539, + "auxiliary_loss_mlp": 0.01029563, + "balance_loss_clip": 1.03408504, + "balance_loss_mlp": 1.01803505, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 3.2053609104331606, + "language_loss": 0.58711207, + "learning_rate": 2.519624364862061e-08, + "loss": 0.60824311, + "num_input_tokens_seen": 341164280, + "step": 15816, + "time_per_iteration": 4.145687580108643 + }, + { + "auxiliary_loss_clip": 0.01107216, + "auxiliary_loss_mlp": 0.01037868, + "balance_loss_clip": 1.03598547, + "balance_loss_mlp": 1.02589297, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 1.6086972007746831, + "language_loss": 0.7365334, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75798416, + "num_input_tokens_seen": 341183670, + "step": 15817, + "time_per_iteration": 2.51086163520813 + }, + { + "auxiliary_loss_clip": 0.01090379, + "auxiliary_loss_mlp": 0.01036221, + "balance_loss_clip": 1.03778577, + "balance_loss_mlp": 1.02175486, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 1.6883854529269269, + "language_loss": 0.59678882, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.61805481, + "num_input_tokens_seen": 341201900, + "step": 15818, + "time_per_iteration": 2.5001633167266846 + }, + { + "auxiliary_loss_clip": 0.01108695, + "auxiliary_loss_mlp": 0.01036863, + "balance_loss_clip": 1.03712463, + "balance_loss_mlp": 1.02403617, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 1.7335393908466814, + "language_loss": 0.69296777, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71442342, + "num_input_tokens_seen": 341218340, + "step": 15819, + "time_per_iteration": 2.4228479862213135 + }, + { + "auxiliary_loss_clip": 0.01071133, + "auxiliary_loss_mlp": 0.01025143, + "balance_loss_clip": 1.03839386, + "balance_loss_mlp": 1.01315677, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 2.487538543238732, + "language_loss": 0.74245334, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76341611, + "num_input_tokens_seen": 341235885, + "step": 15820, + "time_per_iteration": 4.0089967250823975 + }, + { + "auxiliary_loss_clip": 0.01091874, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.03793931, + "balance_loss_mlp": 1.02226925, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 1.8773079887143156, + "language_loss": 0.78760767, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.80887401, + "num_input_tokens_seen": 341255280, + "step": 15821, + "time_per_iteration": 2.5123438835144043 + }, + { + "auxiliary_loss_clip": 0.01070868, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.03519058, + "balance_loss_mlp": 1.01711583, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 1.4573942330218743, + "language_loss": 0.7120918, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73310447, + "num_input_tokens_seen": 341279055, + "step": 15822, + "time_per_iteration": 2.688849925994873 + }, + { + "auxiliary_loss_clip": 0.01094956, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.03760958, + "balance_loss_mlp": 1.02363658, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 1.6644260598119114, + "language_loss": 0.65993661, + "learning_rate": 2.47666999302647e-08, + "loss": 0.68123627, + "num_input_tokens_seen": 341298560, + "step": 15823, + "time_per_iteration": 2.5281388759613037 + }, + { + "auxiliary_loss_clip": 0.01091205, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.03488016, + "balance_loss_mlp": 1.02116942, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 1.8768645215149116, + "language_loss": 0.77309293, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79434061, + "num_input_tokens_seen": 341316650, + "step": 15824, + "time_per_iteration": 2.507711410522461 + }, + { + "auxiliary_loss_clip": 0.01112006, + "auxiliary_loss_mlp": 0.01028172, + "balance_loss_clip": 1.03603816, + "balance_loss_mlp": 1.01527941, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 2.0001582331022925, + "language_loss": 0.73359436, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.75499612, + "num_input_tokens_seen": 341336185, + "step": 15825, + "time_per_iteration": 2.5562942028045654 + }, + { + "auxiliary_loss_clip": 0.01025221, + "auxiliary_loss_mlp": 0.01001172, + "balance_loss_clip": 1.01643491, + "balance_loss_mlp": 0.99999154, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.8315715803248855, + "language_loss": 0.5341022, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55436611, + "num_input_tokens_seen": 341395795, + "step": 15826, + "time_per_iteration": 2.993849515914917 + }, + { + "auxiliary_loss_clip": 0.01085695, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.03650248, + "balance_loss_mlp": 1.02080584, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 1.7780857101739427, + "language_loss": 0.72287774, + "learning_rate": 2.452289414874076e-08, + "loss": 0.74406451, + "num_input_tokens_seen": 341415675, + "step": 15827, + "time_per_iteration": 2.5793914794921875 + }, + { + "auxiliary_loss_clip": 0.01086308, + "auxiliary_loss_mlp": 0.01028418, + "balance_loss_clip": 1.03569174, + "balance_loss_mlp": 1.01662791, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 1.8912334977648821, + "language_loss": 0.7484504, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.76959765, + "num_input_tokens_seen": 341432990, + "step": 15828, + "time_per_iteration": 2.5674304962158203 + }, + { + "auxiliary_loss_clip": 0.01068915, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.03680897, + "balance_loss_mlp": 1.02202916, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 1.8757740637124638, + "language_loss": 0.72847813, + "learning_rate": 2.440144071047978e-08, + "loss": 0.74950272, + "num_input_tokens_seen": 341454100, + "step": 15829, + "time_per_iteration": 2.632664203643799 + }, + { + "auxiliary_loss_clip": 0.01094428, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.03359723, + "balance_loss_mlp": 1.02476096, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 1.7471261911611191, + "language_loss": 0.61564231, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.63696808, + "num_input_tokens_seen": 341472955, + "step": 15830, + "time_per_iteration": 2.489729404449463 + }, + { + "auxiliary_loss_clip": 0.0109637, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.03805041, + "balance_loss_mlp": 1.01766944, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 1.954096321626414, + "language_loss": 0.72603357, + "learning_rate": 2.428028693179729e-08, + "loss": 0.74731404, + "num_input_tokens_seen": 341490165, + "step": 15831, + "time_per_iteration": 2.463282585144043 + }, + { + "auxiliary_loss_clip": 0.01058541, + "auxiliary_loss_mlp": 0.01025453, + "balance_loss_clip": 1.03363657, + "balance_loss_mlp": 1.014516, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 1.6649128665797606, + "language_loss": 0.65859783, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67943776, + "num_input_tokens_seen": 341508055, + "step": 15832, + "time_per_iteration": 4.063828945159912 + }, + { + "auxiliary_loss_clip": 0.0109437, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.03738213, + "balance_loss_mlp": 1.02085292, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 1.7096513486417546, + "language_loss": 0.77873039, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.80000782, + "num_input_tokens_seen": 341526155, + "step": 15833, + "time_per_iteration": 2.477153778076172 + }, + { + "auxiliary_loss_clip": 0.01075731, + "auxiliary_loss_mlp": 0.0102798, + "balance_loss_clip": 1.0386852, + "balance_loss_mlp": 1.01592255, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 3.358148338230838, + "language_loss": 0.74761569, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.7686528, + "num_input_tokens_seen": 341540450, + "step": 15834, + "time_per_iteration": 2.545332431793213 + }, + { + "auxiliary_loss_clip": 0.01098857, + "auxiliary_loss_mlp": 0.01037319, + "balance_loss_clip": 1.03860784, + "balance_loss_mlp": 1.02375889, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 2.002713195943177, + "language_loss": 0.7622509, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78361261, + "num_input_tokens_seen": 341557865, + "step": 15835, + "time_per_iteration": 2.5115878582000732 + }, + { + "auxiliary_loss_clip": 0.01087643, + "auxiliary_loss_mlp": 0.01033518, + "balance_loss_clip": 1.03460896, + "balance_loss_mlp": 1.02075064, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 1.9280873696081542, + "language_loss": 0.65988964, + "learning_rate": 2.397871361623238e-08, + "loss": 0.68110126, + "num_input_tokens_seen": 341573890, + "step": 15836, + "time_per_iteration": 2.555931568145752 + }, + { + "auxiliary_loss_clip": 0.01074614, + "auxiliary_loss_mlp": 0.01025429, + "balance_loss_clip": 1.0358243, + "balance_loss_mlp": 1.01344872, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 1.9998270593159597, + "language_loss": 0.70561171, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72661209, + "num_input_tokens_seen": 341593770, + "step": 15837, + "time_per_iteration": 2.639522075653076 + }, + { + "auxiliary_loss_clip": 0.01111029, + "auxiliary_loss_mlp": 0.0103721, + "balance_loss_clip": 1.03666401, + "balance_loss_mlp": 1.02307153, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 2.2585693337199584, + "language_loss": 0.73410219, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.7555846, + "num_input_tokens_seen": 341612065, + "step": 15838, + "time_per_iteration": 2.4767062664031982 + }, + { + "auxiliary_loss_clip": 0.01081147, + "auxiliary_loss_mlp": 0.01030162, + "balance_loss_clip": 1.03557074, + "balance_loss_mlp": 1.01751363, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 1.739525384012475, + "language_loss": 0.77910686, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80021995, + "num_input_tokens_seen": 341631365, + "step": 15839, + "time_per_iteration": 2.6197400093078613 + }, + { + "auxiliary_loss_clip": 0.0108747, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.04161906, + "balance_loss_mlp": 1.01878393, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 1.8630371987207024, + "language_loss": 0.80306053, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.8242451, + "num_input_tokens_seen": 341650300, + "step": 15840, + "time_per_iteration": 2.583000421524048 + }, + { + "auxiliary_loss_clip": 0.01077351, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.03377461, + "balance_loss_mlp": 1.02223861, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 2.013581964073535, + "language_loss": 0.72865677, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.74975812, + "num_input_tokens_seen": 341667680, + "step": 15841, + "time_per_iteration": 2.5334455966949463 + }, + { + "auxiliary_loss_clip": 0.01079812, + "auxiliary_loss_mlp": 0.01026846, + "balance_loss_clip": 1.03529644, + "balance_loss_mlp": 1.01603985, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 1.9978264340845118, + "language_loss": 0.7905376, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.8116042, + "num_input_tokens_seen": 341685760, + "step": 15842, + "time_per_iteration": 2.568164348602295 + }, + { + "auxiliary_loss_clip": 0.01085946, + "auxiliary_loss_mlp": 0.01031188, + "balance_loss_clip": 1.03691149, + "balance_loss_mlp": 1.01907647, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 1.6250887722449079, + "language_loss": 0.72242832, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74359965, + "num_input_tokens_seen": 341705300, + "step": 15843, + "time_per_iteration": 4.069530248641968 + }, + { + "auxiliary_loss_clip": 0.01077896, + "auxiliary_loss_mlp": 0.00778146, + "balance_loss_clip": 1.03487909, + "balance_loss_mlp": 1.00061631, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 1.840068593628509, + "language_loss": 0.78370434, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.80226475, + "num_input_tokens_seen": 341724565, + "step": 15844, + "time_per_iteration": 2.5507514476776123 + }, + { + "auxiliary_loss_clip": 0.01074696, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.03409362, + "balance_loss_mlp": 1.02065444, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 2.0575380764041635, + "language_loss": 0.70175296, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.72285247, + "num_input_tokens_seen": 341743605, + "step": 15845, + "time_per_iteration": 2.5959062576293945 + }, + { + "auxiliary_loss_clip": 0.01073672, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.03676486, + "balance_loss_mlp": 1.02141953, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 1.4667170740780835, + "language_loss": 0.75371844, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77479064, + "num_input_tokens_seen": 341763475, + "step": 15846, + "time_per_iteration": 2.585015058517456 + }, + { + "auxiliary_loss_clip": 0.01078619, + "auxiliary_loss_mlp": 0.01025861, + "balance_loss_clip": 1.03667748, + "balance_loss_mlp": 1.01410651, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 1.9406358110006772, + "language_loss": 0.78060526, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.80165005, + "num_input_tokens_seen": 341781265, + "step": 15847, + "time_per_iteration": 2.5930967330932617 + }, + { + "auxiliary_loss_clip": 0.01068823, + "auxiliary_loss_mlp": 0.01036798, + "balance_loss_clip": 1.03638422, + "balance_loss_mlp": 1.0254792, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 1.4890973114298307, + "language_loss": 0.77821904, + "learning_rate": 2.326258115328672e-08, + "loss": 0.79927528, + "num_input_tokens_seen": 341798825, + "step": 15848, + "time_per_iteration": 2.59012770652771 + }, + { + "auxiliary_loss_clip": 0.01090238, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_clip": 1.03587317, + "balance_loss_mlp": 1.02897918, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 1.7621844770045554, + "language_loss": 0.71821177, + "learning_rate": 2.320339062183674e-08, + "loss": 0.73953688, + "num_input_tokens_seen": 341819480, + "step": 15849, + "time_per_iteration": 2.559865713119507 + }, + { + "auxiliary_loss_clip": 0.01105457, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.03940678, + "balance_loss_mlp": 1.0224452, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 1.5474430210818575, + "language_loss": 0.75083327, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77224058, + "num_input_tokens_seen": 341838035, + "step": 15850, + "time_per_iteration": 2.4977314472198486 + }, + { + "auxiliary_loss_clip": 0.01082786, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.03572559, + "balance_loss_mlp": 1.02135372, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 2.253326495019794, + "language_loss": 0.72515482, + "learning_rate": 2.308523444215482e-08, + "loss": 0.7463131, + "num_input_tokens_seen": 341855895, + "step": 15851, + "time_per_iteration": 2.584078311920166 + }, + { + "auxiliary_loss_clip": 0.01086122, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.03911602, + "balance_loss_mlp": 1.01731873, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 4.85222751542961, + "language_loss": 0.80131435, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.82247031, + "num_input_tokens_seen": 341875240, + "step": 15852, + "time_per_iteration": 2.6099741458892822 + }, + { + "auxiliary_loss_clip": 0.01098918, + "auxiliary_loss_mlp": 0.01035808, + "balance_loss_clip": 1.03498673, + "balance_loss_mlp": 1.02300489, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 1.5232147562558838, + "language_loss": 0.59816861, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.6195159, + "num_input_tokens_seen": 341901020, + "step": 15853, + "time_per_iteration": 2.7324066162109375 + }, + { + "auxiliary_loss_clip": 0.0108392, + "auxiliary_loss_mlp": 0.01028552, + "balance_loss_clip": 1.03343463, + "balance_loss_mlp": 1.01658368, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 1.8042973289841655, + "language_loss": 0.72593129, + "learning_rate": 2.290856241425998e-08, + "loss": 0.74705601, + "num_input_tokens_seen": 341919365, + "step": 15854, + "time_per_iteration": 2.5482676029205322 + }, + { + "auxiliary_loss_clip": 0.01089074, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.04029274, + "balance_loss_mlp": 1.01884747, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 2.983554732208776, + "language_loss": 0.67759717, + "learning_rate": 2.284982167833127e-08, + "loss": 0.69879562, + "num_input_tokens_seen": 341939985, + "step": 15855, + "time_per_iteration": 4.9167938232421875 + }, + { + "auxiliary_loss_clip": 0.01108274, + "auxiliary_loss_mlp": 0.01032078, + "balance_loss_clip": 1.03591943, + "balance_loss_mlp": 1.02024007, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 1.772261116803213, + "language_loss": 0.76592416, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78732765, + "num_input_tokens_seen": 341959255, + "step": 15856, + "time_per_iteration": 2.5045676231384277 + }, + { + "auxiliary_loss_clip": 0.01082859, + "auxiliary_loss_mlp": 0.01031925, + "balance_loss_clip": 1.03121352, + "balance_loss_mlp": 1.02024293, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 1.6899261994668697, + "language_loss": 0.77964187, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.80078971, + "num_input_tokens_seen": 341977205, + "step": 15857, + "time_per_iteration": 2.5741686820983887 + }, + { + "auxiliary_loss_clip": 0.01020577, + "auxiliary_loss_mlp": 0.00999827, + "balance_loss_clip": 1.00683153, + "balance_loss_mlp": 0.99855119, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.7060131254512264, + "language_loss": 0.62617135, + "learning_rate": 2.267404932183803e-08, + "loss": 0.6463753, + "num_input_tokens_seen": 342038545, + "step": 15858, + "time_per_iteration": 3.043203830718994 + }, + { + "auxiliary_loss_clip": 0.01059082, + "auxiliary_loss_mlp": 0.01028668, + "balance_loss_clip": 1.03301525, + "balance_loss_mlp": 1.01716447, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 1.4554454352855384, + "language_loss": 0.56629503, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.58717251, + "num_input_tokens_seen": 342058195, + "step": 15859, + "time_per_iteration": 4.095558166503906 + }, + { + "auxiliary_loss_clip": 0.01104495, + "auxiliary_loss_mlp": 0.01027917, + "balance_loss_clip": 1.03544116, + "balance_loss_mlp": 1.01659763, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 3.0020052978356055, + "language_loss": 0.81993783, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.84126198, + "num_input_tokens_seen": 342075025, + "step": 15860, + "time_per_iteration": 2.4557607173919678 + }, + { + "auxiliary_loss_clip": 0.01061958, + "auxiliary_loss_mlp": 0.00775798, + "balance_loss_clip": 1.03729093, + "balance_loss_mlp": 1.00058722, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 2.039126307711963, + "language_loss": 0.66867691, + "learning_rate": 2.249895178891159e-08, + "loss": 0.68705451, + "num_input_tokens_seen": 342094595, + "step": 15861, + "time_per_iteration": 2.6647086143493652 + }, + { + "auxiliary_loss_clip": 0.01097706, + "auxiliary_loss_mlp": 0.0103665, + "balance_loss_clip": 1.03436649, + "balance_loss_mlp": 1.02390647, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 1.6282507358578373, + "language_loss": 0.65845025, + "learning_rate": 2.244073591573037e-08, + "loss": 0.67979383, + "num_input_tokens_seen": 342115970, + "step": 15862, + "time_per_iteration": 2.5935373306274414 + }, + { + "auxiliary_loss_clip": 0.01069157, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.03336632, + "balance_loss_mlp": 1.01941252, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 1.5692315749514076, + "language_loss": 0.67400444, + "learning_rate": 2.238259503179485e-08, + "loss": 0.69500309, + "num_input_tokens_seen": 342134080, + "step": 15863, + "time_per_iteration": 2.6228888034820557 + }, + { + "auxiliary_loss_clip": 0.01087372, + "auxiliary_loss_mlp": 0.01026855, + "balance_loss_clip": 1.03262019, + "balance_loss_mlp": 1.01440966, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 2.5268115649803677, + "language_loss": 0.78048193, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80162424, + "num_input_tokens_seen": 342154725, + "step": 15864, + "time_per_iteration": 2.6198902130126953 + }, + { + "auxiliary_loss_clip": 0.0107692, + "auxiliary_loss_mlp": 0.01025184, + "balance_loss_clip": 1.03593719, + "balance_loss_mlp": 1.01334059, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 2.1203163541152703, + "language_loss": 0.5986222, + "learning_rate": 2.226653824047586e-08, + "loss": 0.61964321, + "num_input_tokens_seen": 342172275, + "step": 15865, + "time_per_iteration": 2.600250482559204 + }, + { + "auxiliary_loss_clip": 0.01067628, + "auxiliary_loss_mlp": 0.01032934, + "balance_loss_clip": 1.03446901, + "balance_loss_mlp": 1.01939142, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 1.8601285122714641, + "language_loss": 0.69755864, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.71856427, + "num_input_tokens_seen": 342190880, + "step": 15866, + "time_per_iteration": 2.599271535873413 + }, + { + "auxiliary_loss_clip": 0.01085061, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.03333724, + "balance_loss_mlp": 1.02458024, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 6.86866405846283, + "language_loss": 0.8511169, + "learning_rate": 2.215078143255855e-08, + "loss": 0.87234914, + "num_input_tokens_seen": 342208165, + "step": 15867, + "time_per_iteration": 2.5935394763946533 + }, + { + "auxiliary_loss_clip": 0.01015207, + "auxiliary_loss_mlp": 0.01003458, + "balance_loss_clip": 1.00500607, + "balance_loss_mlp": 1.00227165, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.7512716726788436, + "language_loss": 0.61770636, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.63789296, + "num_input_tokens_seen": 342277110, + "step": 15868, + "time_per_iteration": 3.1440842151641846 + }, + { + "auxiliary_loss_clip": 0.01077215, + "auxiliary_loss_mlp": 0.01025053, + "balance_loss_clip": 1.04085541, + "balance_loss_mlp": 1.0128696, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 2.8739220223563993, + "language_loss": 0.5989182, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.61994088, + "num_input_tokens_seen": 342294695, + "step": 15869, + "time_per_iteration": 2.6335887908935547 + }, + { + "auxiliary_loss_clip": 0.01071533, + "auxiliary_loss_mlp": 0.00781603, + "balance_loss_clip": 1.03202498, + "balance_loss_mlp": 1.00062656, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 1.7102602205819069, + "language_loss": 0.70987159, + "learning_rate": 2.197770872795579e-08, + "loss": 0.72840291, + "num_input_tokens_seen": 342314970, + "step": 15870, + "time_per_iteration": 2.6591169834136963 + }, + { + "auxiliary_loss_clip": 0.01073947, + "auxiliary_loss_mlp": 0.01029136, + "balance_loss_clip": 1.03947616, + "balance_loss_mlp": 1.01672077, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 2.526809091814502, + "language_loss": 0.76872367, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.78975451, + "num_input_tokens_seen": 342334255, + "step": 15871, + "time_per_iteration": 4.148014783859253 + }, + { + "auxiliary_loss_clip": 0.0109725, + "auxiliary_loss_mlp": 0.0102996, + "balance_loss_clip": 1.0342046, + "balance_loss_mlp": 1.01701391, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 2.680183320446203, + "language_loss": 0.58072257, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60199463, + "num_input_tokens_seen": 342354730, + "step": 15872, + "time_per_iteration": 2.6609761714935303 + }, + { + "auxiliary_loss_clip": 0.01085277, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.03226376, + "balance_loss_mlp": 1.02065563, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 1.625939202836473, + "language_loss": 0.74731898, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.76851797, + "num_input_tokens_seen": 342374565, + "step": 15873, + "time_per_iteration": 2.558466672897339 + }, + { + "auxiliary_loss_clip": 0.01110801, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.03776908, + "balance_loss_mlp": 1.02102029, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 2.0008508382063708, + "language_loss": 0.62484479, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.64629543, + "num_input_tokens_seen": 342394590, + "step": 15874, + "time_per_iteration": 2.522606611251831 + }, + { + "auxiliary_loss_clip": 0.01083733, + "auxiliary_loss_mlp": 0.01034197, + "balance_loss_clip": 1.03348815, + "balance_loss_mlp": 1.02182329, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 1.9849199740736199, + "language_loss": 0.89549279, + "learning_rate": 2.169075438538104e-08, + "loss": 0.91667199, + "num_input_tokens_seen": 342410445, + "step": 15875, + "time_per_iteration": 2.534545421600342 + }, + { + "auxiliary_loss_clip": 0.01112933, + "auxiliary_loss_mlp": 0.01034268, + "balance_loss_clip": 1.03801334, + "balance_loss_mlp": 1.02120256, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 1.6592332933469522, + "language_loss": 0.67751485, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.69898689, + "num_input_tokens_seen": 342430970, + "step": 15876, + "time_per_iteration": 2.5097570419311523 + }, + { + "auxiliary_loss_clip": 0.0109738, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.03543067, + "balance_loss_mlp": 1.01910269, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 2.8606430048970277, + "language_loss": 0.69389307, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.7151897, + "num_input_tokens_seen": 342449505, + "step": 15877, + "time_per_iteration": 2.5322206020355225 + }, + { + "auxiliary_loss_clip": 0.01073203, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.03794515, + "balance_loss_mlp": 1.01792479, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 1.7678869890523268, + "language_loss": 0.71098566, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.7320298, + "num_input_tokens_seen": 342470390, + "step": 15878, + "time_per_iteration": 2.6009104251861572 + }, + { + "auxiliary_loss_clip": 0.01105887, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.03551853, + "balance_loss_mlp": 1.01689684, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 1.7046372491770572, + "language_loss": 0.68271887, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70407295, + "num_input_tokens_seen": 342492560, + "step": 15879, + "time_per_iteration": 2.5233154296875 + }, + { + "auxiliary_loss_clip": 0.01073002, + "auxiliary_loss_mlp": 0.00776161, + "balance_loss_clip": 1.03398716, + "balance_loss_mlp": 1.00051773, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 1.9780653158529926, + "language_loss": 0.85265887, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.87115049, + "num_input_tokens_seen": 342512315, + "step": 15880, + "time_per_iteration": 2.662473678588867 + }, + { + "auxiliary_loss_clip": 0.01046029, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.03231049, + "balance_loss_mlp": 1.02184772, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 1.8322088607718172, + "language_loss": 0.71821195, + "learning_rate": 2.134888478151753e-08, + "loss": 0.739025, + "num_input_tokens_seen": 342533060, + "step": 15881, + "time_per_iteration": 2.7417566776275635 + }, + { + "auxiliary_loss_clip": 0.01098087, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.03791356, + "balance_loss_mlp": 1.02457988, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 2.1249931169728407, + "language_loss": 0.71503854, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.73638856, + "num_input_tokens_seen": 342550830, + "step": 15882, + "time_per_iteration": 2.5006585121154785 + }, + { + "auxiliary_loss_clip": 0.0108806, + "auxiliary_loss_mlp": 0.01030542, + "balance_loss_clip": 1.03579307, + "balance_loss_mlp": 1.01870465, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 4.271439164315556, + "language_loss": 0.66101772, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.68220377, + "num_input_tokens_seen": 342575070, + "step": 15883, + "time_per_iteration": 4.244631052017212 + }, + { + "auxiliary_loss_clip": 0.01099627, + "auxiliary_loss_mlp": 0.01034729, + "balance_loss_clip": 1.0383426, + "balance_loss_mlp": 1.0221343, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 2.351517523566966, + "language_loss": 0.78052735, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.80187094, + "num_input_tokens_seen": 342592215, + "step": 15884, + "time_per_iteration": 2.4813807010650635 + }, + { + "auxiliary_loss_clip": 0.01109689, + "auxiliary_loss_mlp": 0.0102977, + "balance_loss_clip": 1.03551614, + "balance_loss_mlp": 1.01708019, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 1.622893307224941, + "language_loss": 0.77763969, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.79903424, + "num_input_tokens_seen": 342610030, + "step": 15885, + "time_per_iteration": 2.468430995941162 + }, + { + "auxiliary_loss_clip": 0.01108724, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.035326, + "balance_loss_mlp": 1.020576, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 1.648931644517569, + "language_loss": 0.70066124, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.72207493, + "num_input_tokens_seen": 342626475, + "step": 15886, + "time_per_iteration": 2.51835036277771 + }, + { + "auxiliary_loss_clip": 0.01081362, + "auxiliary_loss_mlp": 0.01036535, + "balance_loss_clip": 1.04104376, + "balance_loss_mlp": 1.022856, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 2.2436127210381347, + "language_loss": 0.72614205, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.74732107, + "num_input_tokens_seen": 342646645, + "step": 15887, + "time_per_iteration": 2.605184555053711 + }, + { + "auxiliary_loss_clip": 0.01085652, + "auxiliary_loss_mlp": 0.01027855, + "balance_loss_clip": 1.03434253, + "balance_loss_mlp": 1.01596427, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 1.8635508932875378, + "language_loss": 0.5706287, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.59176373, + "num_input_tokens_seen": 342663615, + "step": 15888, + "time_per_iteration": 2.534641742706299 + }, + { + "auxiliary_loss_clip": 0.01018949, + "auxiliary_loss_mlp": 0.01003655, + "balance_loss_clip": 1.00526714, + "balance_loss_mlp": 1.00255823, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.7050322743377054, + "language_loss": 0.57889342, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.59911942, + "num_input_tokens_seen": 342728275, + "step": 15889, + "time_per_iteration": 3.1433188915252686 + }, + { + "auxiliary_loss_clip": 0.01108989, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.03486419, + "balance_loss_mlp": 1.01626658, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 1.411896463555948, + "language_loss": 0.66901934, + "learning_rate": 2.084114508877466e-08, + "loss": 0.69039834, + "num_input_tokens_seen": 342748860, + "step": 15890, + "time_per_iteration": 2.4914488792419434 + }, + { + "auxiliary_loss_clip": 0.01106985, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.03583527, + "balance_loss_mlp": 1.01987123, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 1.4311156980678499, + "language_loss": 0.73770308, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.75908911, + "num_input_tokens_seen": 342769705, + "step": 15891, + "time_per_iteration": 2.544517993927002 + }, + { + "auxiliary_loss_clip": 0.01071398, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.03340936, + "balance_loss_mlp": 1.02107215, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 1.7685942312858918, + "language_loss": 0.7801854, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80122125, + "num_input_tokens_seen": 342787000, + "step": 15892, + "time_per_iteration": 2.604252576828003 + }, + { + "auxiliary_loss_clip": 0.01106306, + "auxiliary_loss_mlp": 0.01032863, + "balance_loss_clip": 1.03529406, + "balance_loss_mlp": 1.02011395, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 1.6431753204463833, + "language_loss": 0.69919467, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.72058642, + "num_input_tokens_seen": 342807795, + "step": 15893, + "time_per_iteration": 2.523252010345459 + }, + { + "auxiliary_loss_clip": 0.01088899, + "auxiliary_loss_mlp": 0.00777594, + "balance_loss_clip": 1.03785992, + "balance_loss_mlp": 1.00059891, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 2.144941039815582, + "language_loss": 0.66459036, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.68325526, + "num_input_tokens_seen": 342825490, + "step": 15894, + "time_per_iteration": 2.524742364883423 + }, + { + "auxiliary_loss_clip": 0.01098239, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.03609514, + "balance_loss_mlp": 1.01826668, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 1.8636792415973784, + "language_loss": 0.81519061, + "learning_rate": 2.056169412853581e-08, + "loss": 0.83648378, + "num_input_tokens_seen": 342844965, + "step": 15895, + "time_per_iteration": 4.509793281555176 + }, + { + "auxiliary_loss_clip": 0.01088912, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.04098964, + "balance_loss_mlp": 1.01849186, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 1.5779512492257883, + "language_loss": 0.72413933, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74533844, + "num_input_tokens_seen": 342865915, + "step": 15896, + "time_per_iteration": 2.6039867401123047 + }, + { + "auxiliary_loss_clip": 0.01105836, + "auxiliary_loss_mlp": 0.01033672, + "balance_loss_clip": 1.03461015, + "balance_loss_mlp": 1.02136374, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 2.117049571984182, + "language_loss": 0.79175395, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81314909, + "num_input_tokens_seen": 342884000, + "step": 15897, + "time_per_iteration": 2.4605867862701416 + }, + { + "auxiliary_loss_clip": 0.01082832, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.03160048, + "balance_loss_mlp": 1.0176847, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 1.8952771859857116, + "language_loss": 0.72822857, + "learning_rate": 2.03949242614303e-08, + "loss": 0.74936748, + "num_input_tokens_seen": 342903095, + "step": 15898, + "time_per_iteration": 3.9083003997802734 + }, + { + "auxiliary_loss_clip": 0.01003504, + "auxiliary_loss_mlp": 0.01004437, + "balance_loss_clip": 1.00876963, + "balance_loss_mlp": 1.00323319, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.9198946341465697, + "language_loss": 0.52353102, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54361045, + "num_input_tokens_seen": 342958155, + "step": 15899, + "time_per_iteration": 3.063084840774536 + }, + { + "auxiliary_loss_clip": 0.01103488, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.03785002, + "balance_loss_mlp": 1.02015805, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 2.2423281099034655, + "language_loss": 0.68739605, + "learning_rate": 2.028411968062782e-08, + "loss": 0.70876974, + "num_input_tokens_seen": 342972500, + "step": 15900, + "time_per_iteration": 2.4681642055511475 + }, + { + "auxiliary_loss_clip": 0.01100182, + "auxiliary_loss_mlp": 0.0077746, + "balance_loss_clip": 1.03576756, + "balance_loss_mlp": 1.00063813, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 2.2949154700341396, + "language_loss": 0.83002853, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.84880495, + "num_input_tokens_seen": 342989035, + "step": 15901, + "time_per_iteration": 2.4496679306030273 + }, + { + "auxiliary_loss_clip": 0.01010051, + "auxiliary_loss_mlp": 0.01007467, + "balance_loss_clip": 1.01854372, + "balance_loss_mlp": 1.00635231, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.7082201739841292, + "language_loss": 0.54291403, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56308919, + "num_input_tokens_seen": 343051675, + "step": 15902, + "time_per_iteration": 3.1806771755218506 + }, + { + "auxiliary_loss_clip": 0.0108452, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.03569722, + "balance_loss_mlp": 1.01991022, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 1.7245773763823402, + "language_loss": 0.85312819, + "learning_rate": 2.01184758473425e-08, + "loss": 0.8742733, + "num_input_tokens_seen": 343068895, + "step": 15903, + "time_per_iteration": 2.523350477218628 + }, + { + "auxiliary_loss_clip": 0.01082815, + "auxiliary_loss_mlp": 0.00780528, + "balance_loss_clip": 1.03426957, + "balance_loss_mlp": 1.00060844, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 1.9513277132541633, + "language_loss": 0.80499357, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.823627, + "num_input_tokens_seen": 343087115, + "step": 15904, + "time_per_iteration": 2.4882187843322754 + }, + { + "auxiliary_loss_clip": 0.01100499, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.03611183, + "balance_loss_mlp": 1.0214026, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 5.508559807928457, + "language_loss": 0.59874624, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.62009662, + "num_input_tokens_seen": 343105575, + "step": 15905, + "time_per_iteration": 2.513645887374878 + }, + { + "auxiliary_loss_clip": 0.01097111, + "auxiliary_loss_mlp": 0.01029659, + "balance_loss_clip": 1.03660119, + "balance_loss_mlp": 1.01786947, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 1.8082307791323127, + "language_loss": 0.70304704, + "learning_rate": 1.995350770979254e-08, + "loss": 0.72431475, + "num_input_tokens_seen": 343123025, + "step": 15906, + "time_per_iteration": 2.510148286819458 + }, + { + "auxiliary_loss_clip": 0.01059405, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.0335989, + "balance_loss_mlp": 1.01631236, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 1.8301769398105499, + "language_loss": 0.70355928, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.72444677, + "num_input_tokens_seen": 343141625, + "step": 15907, + "time_per_iteration": 2.584907054901123 + }, + { + "auxiliary_loss_clip": 0.01067881, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.0327332, + "balance_loss_mlp": 1.02033973, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 2.070395254336525, + "language_loss": 0.70154977, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.72255504, + "num_input_tokens_seen": 343161300, + "step": 15908, + "time_per_iteration": 2.591716766357422 + }, + { + "auxiliary_loss_clip": 0.01086379, + "auxiliary_loss_mlp": 0.00777408, + "balance_loss_clip": 1.03644991, + "balance_loss_mlp": 1.00056863, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 1.7778109995939144, + "language_loss": 0.83100522, + "learning_rate": 1.978921532427802e-08, + "loss": 0.84964311, + "num_input_tokens_seen": 343177815, + "step": 15909, + "time_per_iteration": 2.4922008514404297 + }, + { + "auxiliary_loss_clip": 0.01096386, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.03432977, + "balance_loss_mlp": 1.02032423, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 1.9970031798314969, + "language_loss": 0.67497563, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.69626188, + "num_input_tokens_seen": 343198140, + "step": 15910, + "time_per_iteration": 4.01730751991272 + }, + { + "auxiliary_loss_clip": 0.01101143, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.0379076, + "balance_loss_mlp": 1.0225153, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 1.806564296379171, + "language_loss": 0.74416715, + "learning_rate": 1.968006251276444e-08, + "loss": 0.76552355, + "num_input_tokens_seen": 343218280, + "step": 15911, + "time_per_iteration": 2.531261920928955 + }, + { + "auxiliary_loss_clip": 0.01096529, + "auxiliary_loss_mlp": 0.01027814, + "balance_loss_clip": 1.03533912, + "balance_loss_mlp": 1.01617956, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 2.1331808248271344, + "language_loss": 0.69405222, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.71529567, + "num_input_tokens_seen": 343236850, + "step": 15912, + "time_per_iteration": 2.433887243270874 + }, + { + "auxiliary_loss_clip": 0.01087883, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.03387201, + "balance_loss_mlp": 1.0229075, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 2.498580619308628, + "language_loss": 0.72760963, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74884546, + "num_input_tokens_seen": 343253065, + "step": 15913, + "time_per_iteration": 2.448058605194092 + }, + { + "auxiliary_loss_clip": 0.01034469, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.03921449, + "balance_loss_mlp": 1.01882696, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 2.100797205959879, + "language_loss": 0.73892063, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75956964, + "num_input_tokens_seen": 343270330, + "step": 15914, + "time_per_iteration": 2.6176798343658447 + }, + { + "auxiliary_loss_clip": 0.01106924, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.03663683, + "balance_loss_mlp": 1.01605177, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 1.3672380567842166, + "language_loss": 0.67079449, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.69214463, + "num_input_tokens_seen": 343289625, + "step": 15915, + "time_per_iteration": 2.474724531173706 + }, + { + "auxiliary_loss_clip": 0.01092945, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.03514051, + "balance_loss_mlp": 1.01936173, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 1.7137701133657006, + "language_loss": 0.63963795, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.66088027, + "num_input_tokens_seen": 343309200, + "step": 15916, + "time_per_iteration": 2.476736307144165 + }, + { + "auxiliary_loss_clip": 0.01102117, + "auxiliary_loss_mlp": 0.01027283, + "balance_loss_clip": 1.0354557, + "balance_loss_mlp": 1.01598215, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 2.041136114053634, + "language_loss": 0.80621839, + "learning_rate": 1.935440639853536e-08, + "loss": 0.82751238, + "num_input_tokens_seen": 343326270, + "step": 15917, + "time_per_iteration": 2.4556126594543457 + }, + { + "auxiliary_loss_clip": 0.01076545, + "auxiliary_loss_mlp": 0.01036018, + "balance_loss_clip": 1.03335047, + "balance_loss_mlp": 1.02346539, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 1.7813522597448395, + "language_loss": 0.72902942, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.75015509, + "num_input_tokens_seen": 343344430, + "step": 15918, + "time_per_iteration": 2.4583630561828613 + }, + { + "auxiliary_loss_clip": 0.0100291, + "auxiliary_loss_mlp": 0.01004131, + "balance_loss_clip": 1.00652504, + "balance_loss_mlp": 1.00303388, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6484549379613159, + "language_loss": 0.53051651, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55058694, + "num_input_tokens_seen": 343416155, + "step": 15919, + "time_per_iteration": 3.2028374671936035 + }, + { + "auxiliary_loss_clip": 0.01107688, + "auxiliary_loss_mlp": 0.01036461, + "balance_loss_clip": 1.04103017, + "balance_loss_mlp": 1.02279317, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 2.593123639352112, + "language_loss": 0.75445521, + "learning_rate": 1.919259224843972e-08, + "loss": 0.77589667, + "num_input_tokens_seen": 343431715, + "step": 15920, + "time_per_iteration": 2.441500186920166 + }, + { + "auxiliary_loss_clip": 0.01075744, + "auxiliary_loss_mlp": 0.01035512, + "balance_loss_clip": 1.0374651, + "balance_loss_mlp": 1.02267897, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 1.5813182676579307, + "language_loss": 0.79016459, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.81127715, + "num_input_tokens_seen": 343450425, + "step": 15921, + "time_per_iteration": 2.5086288452148438 + }, + { + "auxiliary_loss_clip": 0.01101692, + "auxiliary_loss_mlp": 0.01030197, + "balance_loss_clip": 1.03443038, + "balance_loss_mlp": 1.01711988, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 1.7769837851829466, + "language_loss": 0.50937581, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.53069466, + "num_input_tokens_seen": 343470445, + "step": 15922, + "time_per_iteration": 2.569955587387085 + }, + { + "auxiliary_loss_clip": 0.01055916, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.03011811, + "balance_loss_mlp": 1.02251053, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 2.0604460460447824, + "language_loss": 0.83747351, + "learning_rate": 1.903145411006557e-08, + "loss": 0.85839957, + "num_input_tokens_seen": 343485200, + "step": 15923, + "time_per_iteration": 4.0614306926727295 + }, + { + "auxiliary_loss_clip": 0.0108285, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.03216553, + "balance_loss_mlp": 1.02168393, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 1.6350267661707958, + "language_loss": 0.75175905, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77292287, + "num_input_tokens_seen": 343505080, + "step": 15924, + "time_per_iteration": 2.555145502090454 + }, + { + "auxiliary_loss_clip": 0.0108772, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.03631902, + "balance_loss_mlp": 1.01816273, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 10.586210414417712, + "language_loss": 0.8637377, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88491589, + "num_input_tokens_seen": 343523995, + "step": 15925, + "time_per_iteration": 2.570401668548584 + }, + { + "auxiliary_loss_clip": 0.01078596, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.03369868, + "balance_loss_mlp": 1.0201664, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 1.9461416847419468, + "language_loss": 0.7541762, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.77529246, + "num_input_tokens_seen": 343542015, + "step": 15926, + "time_per_iteration": 2.56595778465271 + }, + { + "auxiliary_loss_clip": 0.01085479, + "auxiliary_loss_mlp": 0.01030967, + "balance_loss_clip": 1.0404563, + "balance_loss_mlp": 1.01985669, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 2.0697348461217127, + "language_loss": 0.77967381, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.80083823, + "num_input_tokens_seen": 343561680, + "step": 15927, + "time_per_iteration": 2.5251405239105225 + }, + { + "auxiliary_loss_clip": 0.01067802, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.0384264, + "balance_loss_mlp": 1.02188587, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 1.6555086062762923, + "language_loss": 0.68547457, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.706505, + "num_input_tokens_seen": 343585290, + "step": 15928, + "time_per_iteration": 2.6793251037597656 + }, + { + "auxiliary_loss_clip": 0.01088268, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.03699422, + "balance_loss_mlp": 1.01770711, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 1.765694489643722, + "language_loss": 0.82479966, + "learning_rate": 1.871120608822485e-08, + "loss": 0.84598172, + "num_input_tokens_seen": 343604045, + "step": 15929, + "time_per_iteration": 2.5269949436187744 + }, + { + "auxiliary_loss_clip": 0.01079208, + "auxiliary_loss_mlp": 0.01040402, + "balance_loss_clip": 1.03575265, + "balance_loss_mlp": 1.02727735, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 1.5465231513110431, + "language_loss": 0.72519583, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74639189, + "num_input_tokens_seen": 343626595, + "step": 15930, + "time_per_iteration": 2.5882515907287598 + }, + { + "auxiliary_loss_clip": 0.0104661, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.03779948, + "balance_loss_mlp": 1.02206445, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 1.4202760715511284, + "language_loss": 0.62296981, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64379191, + "num_input_tokens_seen": 343646195, + "step": 15931, + "time_per_iteration": 2.629978895187378 + }, + { + "auxiliary_loss_clip": 0.01106136, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.0368135, + "balance_loss_mlp": 1.01780224, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 1.9791027445518237, + "language_loss": 0.69085419, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71221083, + "num_input_tokens_seen": 343663665, + "step": 15932, + "time_per_iteration": 2.4364118576049805 + }, + { + "auxiliary_loss_clip": 0.01080863, + "auxiliary_loss_mlp": 0.01037011, + "balance_loss_clip": 1.03523743, + "balance_loss_mlp": 1.02293277, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 1.96374696324161, + "language_loss": 0.75383258, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77501142, + "num_input_tokens_seen": 343682145, + "step": 15933, + "time_per_iteration": 2.5040700435638428 + }, + { + "auxiliary_loss_clip": 0.00999327, + "auxiliary_loss_mlp": 0.01001335, + "balance_loss_clip": 1.01194, + "balance_loss_mlp": 1.00031567, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.6952340858258264, + "language_loss": 0.57297599, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59298259, + "num_input_tokens_seen": 343744685, + "step": 15934, + "time_per_iteration": 4.862305641174316 + }, + { + "auxiliary_loss_clip": 0.01026586, + "auxiliary_loss_mlp": 0.00752964, + "balance_loss_clip": 1.0031935, + "balance_loss_mlp": 1.00028265, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.9132910742619694, + "language_loss": 0.65965438, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67744988, + "num_input_tokens_seen": 343801835, + "step": 15935, + "time_per_iteration": 2.939321994781494 + }, + { + "auxiliary_loss_clip": 0.01018598, + "auxiliary_loss_mlp": 0.01002682, + "balance_loss_clip": 1.0200398, + "balance_loss_mlp": 1.00136435, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 0.7869415264744127, + "language_loss": 0.56981647, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59002924, + "num_input_tokens_seen": 343861515, + "step": 15936, + "time_per_iteration": 3.0537214279174805 + }, + { + "auxiliary_loss_clip": 0.01052005, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.0353688, + "balance_loss_mlp": 1.01830602, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 1.716051929443559, + "language_loss": 0.78230393, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80313081, + "num_input_tokens_seen": 343881240, + "step": 15937, + "time_per_iteration": 4.003803730010986 + }, + { + "auxiliary_loss_clip": 0.01096654, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.03494143, + "balance_loss_mlp": 1.02025259, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 1.7252034132955079, + "language_loss": 0.6856482, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.7069484, + "num_input_tokens_seen": 343900885, + "step": 15938, + "time_per_iteration": 2.484396457672119 + }, + { + "auxiliary_loss_clip": 0.01076478, + "auxiliary_loss_mlp": 0.01026276, + "balance_loss_clip": 1.03516221, + "balance_loss_mlp": 1.0143199, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 2.3583062111031863, + "language_loss": 0.65533996, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.67636746, + "num_input_tokens_seen": 343918460, + "step": 15939, + "time_per_iteration": 2.56278657913208 + }, + { + "auxiliary_loss_clip": 0.01079883, + "auxiliary_loss_mlp": 0.01039525, + "balance_loss_clip": 1.03309846, + "balance_loss_mlp": 1.02686453, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 1.5220123518123525, + "language_loss": 0.7354629, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.756657, + "num_input_tokens_seen": 343938030, + "step": 15940, + "time_per_iteration": 2.5520730018615723 + }, + { + "auxiliary_loss_clip": 0.01108981, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.03674197, + "balance_loss_mlp": 1.0189414, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 4.699139219766385, + "language_loss": 0.7284289, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.74983716, + "num_input_tokens_seen": 343956635, + "step": 15941, + "time_per_iteration": 2.432119607925415 + }, + { + "auxiliary_loss_clip": 0.01086757, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.03697705, + "balance_loss_mlp": 1.02589679, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 1.4848294896149572, + "language_loss": 0.71417123, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.73541427, + "num_input_tokens_seen": 343976625, + "step": 15942, + "time_per_iteration": 2.545504093170166 + }, + { + "auxiliary_loss_clip": 0.01110653, + "auxiliary_loss_mlp": 0.01034169, + "balance_loss_clip": 1.03663623, + "balance_loss_mlp": 1.02113938, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 1.5224205456550808, + "language_loss": 0.71884549, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74029374, + "num_input_tokens_seen": 343997790, + "step": 15943, + "time_per_iteration": 2.543419599533081 + }, + { + "auxiliary_loss_clip": 0.01100048, + "auxiliary_loss_mlp": 0.01036734, + "balance_loss_clip": 1.03717864, + "balance_loss_mlp": 1.02413988, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 1.8008481004702612, + "language_loss": 0.68021762, + "learning_rate": 1.792242006001965e-08, + "loss": 0.70158547, + "num_input_tokens_seen": 344016935, + "step": 15944, + "time_per_iteration": 2.4666411876678467 + }, + { + "auxiliary_loss_clip": 0.01107799, + "auxiliary_loss_mlp": 0.01037685, + "balance_loss_clip": 1.03459895, + "balance_loss_mlp": 1.02527535, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 1.6856371509072465, + "language_loss": 0.66084331, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.68229818, + "num_input_tokens_seen": 344035590, + "step": 15945, + "time_per_iteration": 2.395226240158081 + }, + { + "auxiliary_loss_clip": 0.00973916, + "auxiliary_loss_mlp": 0.01001671, + "balance_loss_clip": 1.01650262, + "balance_loss_mlp": 1.00030041, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 0.7395731520731961, + "language_loss": 0.6186713, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.6384272, + "num_input_tokens_seen": 344100845, + "step": 15946, + "time_per_iteration": 3.485865354537964 + }, + { + "auxiliary_loss_clip": 0.01104636, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.03525496, + "balance_loss_mlp": 1.01843333, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 1.944504476269901, + "language_loss": 0.75241828, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.77376962, + "num_input_tokens_seen": 344121780, + "step": 15947, + "time_per_iteration": 2.830735921859741 + }, + { + "auxiliary_loss_clip": 0.01079058, + "auxiliary_loss_mlp": 0.0102678, + "balance_loss_clip": 1.03123629, + "balance_loss_mlp": 1.01512122, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 2.5456887058719486, + "language_loss": 0.69943726, + "learning_rate": 1.771493294473747e-08, + "loss": 0.72049564, + "num_input_tokens_seen": 344140150, + "step": 15948, + "time_per_iteration": 2.490262031555176 + }, + { + "auxiliary_loss_clip": 0.01059316, + "auxiliary_loss_mlp": 0.01029197, + "balance_loss_clip": 1.03610682, + "balance_loss_mlp": 1.0174371, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 2.5005760542915456, + "language_loss": 0.78752238, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.80840749, + "num_input_tokens_seen": 344158200, + "step": 15949, + "time_per_iteration": 2.592862367630005 + }, + { + "auxiliary_loss_clip": 0.01110451, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.0376761, + "balance_loss_mlp": 1.0228312, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 1.7816088606549976, + "language_loss": 0.68689167, + "learning_rate": 1.761164038992602e-08, + "loss": 0.70835644, + "num_input_tokens_seen": 344174720, + "step": 15950, + "time_per_iteration": 4.338485240936279 + }, + { + "auxiliary_loss_clip": 0.01088734, + "auxiliary_loss_mlp": 0.01028313, + "balance_loss_clip": 1.03601861, + "balance_loss_mlp": 1.01687527, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 1.8364863358317234, + "language_loss": 0.8611365, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88230699, + "num_input_tokens_seen": 344192580, + "step": 15951, + "time_per_iteration": 2.5491466522216797 + }, + { + "auxiliary_loss_clip": 0.01087074, + "auxiliary_loss_mlp": 0.01039072, + "balance_loss_clip": 1.03567719, + "balance_loss_mlp": 1.02614427, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 2.0741083447676347, + "language_loss": 0.80019754, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.82145905, + "num_input_tokens_seen": 344210345, + "step": 15952, + "time_per_iteration": 2.5297648906707764 + }, + { + "auxiliary_loss_clip": 0.01097733, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.03716969, + "balance_loss_mlp": 1.0177927, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 1.9764320931997625, + "language_loss": 0.69438106, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.71566474, + "num_input_tokens_seen": 344229540, + "step": 15953, + "time_per_iteration": 2.4786980152130127 + }, + { + "auxiliary_loss_clip": 0.01052988, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.03817058, + "balance_loss_mlp": 1.02248275, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 3.1772561572584257, + "language_loss": 0.58657295, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.60747224, + "num_input_tokens_seen": 344247830, + "step": 15954, + "time_per_iteration": 2.68356990814209 + }, + { + "auxiliary_loss_clip": 0.01098879, + "auxiliary_loss_mlp": 0.01036135, + "balance_loss_clip": 1.03569221, + "balance_loss_mlp": 1.02265263, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 2.1008468681564896, + "language_loss": 0.73965371, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.76100385, + "num_input_tokens_seen": 344267760, + "step": 15955, + "time_per_iteration": 2.5253961086273193 + }, + { + "auxiliary_loss_clip": 0.01088122, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.03661799, + "balance_loss_mlp": 1.02170801, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 2.123554614205998, + "language_loss": 0.62343454, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.64466184, + "num_input_tokens_seen": 344284905, + "step": 15956, + "time_per_iteration": 2.503074884414673 + }, + { + "auxiliary_loss_clip": 0.01071743, + "auxiliary_loss_mlp": 0.01029405, + "balance_loss_clip": 1.0373106, + "balance_loss_mlp": 1.01703727, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 2.537729456163641, + "language_loss": 0.60044408, + "learning_rate": 1.725248447997507e-08, + "loss": 0.62145555, + "num_input_tokens_seen": 344302025, + "step": 15957, + "time_per_iteration": 2.503417730331421 + }, + { + "auxiliary_loss_clip": 0.01074247, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.03540134, + "balance_loss_mlp": 1.02415133, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 1.8972187855199314, + "language_loss": 0.7429502, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76406372, + "num_input_tokens_seen": 344321935, + "step": 15958, + "time_per_iteration": 2.583047866821289 + }, + { + "auxiliary_loss_clip": 0.01085091, + "auxiliary_loss_mlp": 0.00776932, + "balance_loss_clip": 1.03291595, + "balance_loss_mlp": 1.00049734, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 1.6169608242296192, + "language_loss": 0.74758482, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.76620507, + "num_input_tokens_seen": 344340405, + "step": 15959, + "time_per_iteration": 2.5172533988952637 + }, + { + "auxiliary_loss_clip": 0.01095718, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.03562772, + "balance_loss_mlp": 1.02018452, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 2.1075028134272387, + "language_loss": 0.65142649, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.67271209, + "num_input_tokens_seen": 344359925, + "step": 15960, + "time_per_iteration": 2.476909637451172 + }, + { + "auxiliary_loss_clip": 0.011054, + "auxiliary_loss_mlp": 0.01035898, + "balance_loss_clip": 1.03553605, + "balance_loss_mlp": 1.02350056, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 2.7674930038695735, + "language_loss": 0.77868187, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.80009484, + "num_input_tokens_seen": 344379100, + "step": 15961, + "time_per_iteration": 2.452826976776123 + }, + { + "auxiliary_loss_clip": 0.0106481, + "auxiliary_loss_mlp": 0.01027916, + "balance_loss_clip": 1.03780818, + "balance_loss_mlp": 1.01634669, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 1.9282240453215065, + "language_loss": 0.75863338, + "learning_rate": 1.699820008484698e-08, + "loss": 0.77956057, + "num_input_tokens_seen": 344396895, + "step": 15962, + "time_per_iteration": 3.9885897636413574 + }, + { + "auxiliary_loss_clip": 0.01089837, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.0368042, + "balance_loss_mlp": 1.01884604, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 1.983375115206316, + "language_loss": 0.71827281, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.73948514, + "num_input_tokens_seen": 344415115, + "step": 15963, + "time_per_iteration": 2.559565544128418 + }, + { + "auxiliary_loss_clip": 0.01079856, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.03855467, + "balance_loss_mlp": 1.02018213, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 1.5644833079792084, + "language_loss": 0.74299711, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76411331, + "num_input_tokens_seen": 344435185, + "step": 15964, + "time_per_iteration": 2.5261237621307373 + }, + { + "auxiliary_loss_clip": 0.00991219, + "auxiliary_loss_mlp": 0.01001316, + "balance_loss_clip": 1.00537264, + "balance_loss_mlp": 1.00014138, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 0.8826825836712462, + "language_loss": 0.57615614, + "learning_rate": 1.684653177987161e-08, + "loss": 0.5960815, + "num_input_tokens_seen": 344488950, + "step": 15965, + "time_per_iteration": 3.062610149383545 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.0361346, + "balance_loss_mlp": 1.01932907, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 1.5889233037094264, + "language_loss": 0.78901565, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.81042171, + "num_input_tokens_seen": 344506740, + "step": 15966, + "time_per_iteration": 2.417900323867798 + }, + { + "auxiliary_loss_clip": 0.0108223, + "auxiliary_loss_mlp": 0.01031725, + "balance_loss_clip": 1.03194392, + "balance_loss_mlp": 1.01914835, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 1.6763633360113774, + "language_loss": 0.79037029, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81150985, + "num_input_tokens_seen": 344526670, + "step": 15967, + "time_per_iteration": 2.507822036743164 + }, + { + "auxiliary_loss_clip": 0.01056973, + "auxiliary_loss_mlp": 0.01033926, + "balance_loss_clip": 1.03293347, + "balance_loss_mlp": 1.02008605, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 1.9414510868327937, + "language_loss": 0.80723161, + "learning_rate": 1.669554028728348e-08, + "loss": 0.82814056, + "num_input_tokens_seen": 344541995, + "step": 15968, + "time_per_iteration": 2.5410990715026855 + }, + { + "auxiliary_loss_clip": 0.01064217, + "auxiliary_loss_mlp": 0.01041836, + "balance_loss_clip": 1.03559446, + "balance_loss_mlp": 1.02689338, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 2.698199576037238, + "language_loss": 0.67796844, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.69902897, + "num_input_tokens_seen": 344559980, + "step": 15969, + "time_per_iteration": 2.585458755493164 + }, + { + "auxiliary_loss_clip": 0.01097523, + "auxiliary_loss_mlp": 0.01037992, + "balance_loss_clip": 1.03590477, + "balance_loss_mlp": 1.02617264, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 3.3966458944038176, + "language_loss": 0.79208672, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.81344187, + "num_input_tokens_seen": 344577765, + "step": 15970, + "time_per_iteration": 2.4500656127929688 + }, + { + "auxiliary_loss_clip": 0.01095123, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.03664613, + "balance_loss_mlp": 1.01903057, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 1.6254724049094846, + "language_loss": 0.77493465, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79620272, + "num_input_tokens_seen": 344597650, + "step": 15971, + "time_per_iteration": 2.5097508430480957 + }, + { + "auxiliary_loss_clip": 0.01093619, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.03731108, + "balance_loss_mlp": 1.01513076, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 1.9484244603570269, + "language_loss": 0.67129815, + "learning_rate": 1.64952712054669e-08, + "loss": 0.69251579, + "num_input_tokens_seen": 344613580, + "step": 15972, + "time_per_iteration": 2.4943690299987793 + }, + { + "auxiliary_loss_clip": 0.01095145, + "auxiliary_loss_mlp": 0.00776594, + "balance_loss_clip": 1.03474164, + "balance_loss_mlp": 1.00056458, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 2.290697094972362, + "language_loss": 0.76209009, + "learning_rate": 1.644539196701844e-08, + "loss": 0.78080738, + "num_input_tokens_seen": 344626910, + "step": 15973, + "time_per_iteration": 2.4118199348449707 + }, + { + "auxiliary_loss_clip": 0.0107069, + "auxiliary_loss_mlp": 0.01046914, + "balance_loss_clip": 1.04015613, + "balance_loss_mlp": 1.03362846, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 1.6483332208832524, + "language_loss": 0.69131327, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71248937, + "num_input_tokens_seen": 344644330, + "step": 15974, + "time_per_iteration": 4.137258529663086 + }, + { + "auxiliary_loss_clip": 0.01097742, + "auxiliary_loss_mlp": 0.0102974, + "balance_loss_clip": 1.0348835, + "balance_loss_mlp": 1.01721716, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 3.6609840685399, + "language_loss": 0.68208832, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.70336312, + "num_input_tokens_seen": 344663910, + "step": 15975, + "time_per_iteration": 2.4748494625091553 + }, + { + "auxiliary_loss_clip": 0.01105407, + "auxiliary_loss_mlp": 0.01030209, + "balance_loss_clip": 1.03605008, + "balance_loss_mlp": 1.01799583, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 2.445338342553165, + "language_loss": 0.55638033, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.5777365, + "num_input_tokens_seen": 344682320, + "step": 15976, + "time_per_iteration": 2.4428887367248535 + }, + { + "auxiliary_loss_clip": 0.01079364, + "auxiliary_loss_mlp": 0.01026451, + "balance_loss_clip": 1.03097272, + "balance_loss_mlp": 1.01445305, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 1.8997993895391818, + "language_loss": 0.68446106, + "learning_rate": 1.624662719799219e-08, + "loss": 0.7055192, + "num_input_tokens_seen": 344701355, + "step": 15977, + "time_per_iteration": 3.9653899669647217 + }, + { + "auxiliary_loss_clip": 0.01096869, + "auxiliary_loss_mlp": 0.01037362, + "balance_loss_clip": 1.03373992, + "balance_loss_mlp": 1.02454066, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 1.8952335368407156, + "language_loss": 0.81922394, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.84056628, + "num_input_tokens_seen": 344717980, + "step": 15978, + "time_per_iteration": 2.4123029708862305 + }, + { + "auxiliary_loss_clip": 0.01101671, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.03518069, + "balance_loss_mlp": 1.01872015, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 2.8226071535803654, + "language_loss": 0.83416051, + "learning_rate": 1.614769615070921e-08, + "loss": 0.85548663, + "num_input_tokens_seen": 344733480, + "step": 15979, + "time_per_iteration": 2.452019691467285 + }, + { + "auxiliary_loss_clip": 0.01107925, + "auxiliary_loss_mlp": 0.01034176, + "balance_loss_clip": 1.03510618, + "balance_loss_mlp": 1.02257061, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 1.6419991529411007, + "language_loss": 0.80368173, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82510275, + "num_input_tokens_seen": 344752130, + "step": 15980, + "time_per_iteration": 2.4286792278289795 + }, + { + "auxiliary_loss_clip": 0.01099931, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.03563881, + "balance_loss_mlp": 1.016505, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 2.2014372857603357, + "language_loss": 0.68563336, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70692098, + "num_input_tokens_seen": 344771195, + "step": 15981, + "time_per_iteration": 2.4939780235290527 + }, + { + "auxiliary_loss_clip": 0.01094523, + "auxiliary_loss_mlp": 0.00776796, + "balance_loss_clip": 1.03460336, + "balance_loss_mlp": 1.00057721, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 1.5449495115278224, + "language_loss": 0.69589949, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71461266, + "num_input_tokens_seen": 344793150, + "step": 15982, + "time_per_iteration": 2.496534585952759 + }, + { + "auxiliary_loss_clip": 0.00998653, + "auxiliary_loss_mlp": 0.01004344, + "balance_loss_clip": 1.00762606, + "balance_loss_mlp": 1.00324178, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6743076669217362, + "language_loss": 0.53256929, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55259931, + "num_input_tokens_seen": 344852855, + "step": 15983, + "time_per_iteration": 3.1979479789733887 + }, + { + "auxiliary_loss_clip": 0.01107474, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.03624773, + "balance_loss_mlp": 1.02118528, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 2.029983450462909, + "language_loss": 0.6814754, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.70288557, + "num_input_tokens_seen": 344869830, + "step": 15984, + "time_per_iteration": 2.413050413131714 + }, + { + "auxiliary_loss_clip": 0.01073178, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.03570306, + "balance_loss_mlp": 1.02242708, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 1.6798662190121725, + "language_loss": 0.67122346, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.69228828, + "num_input_tokens_seen": 344888905, + "step": 15985, + "time_per_iteration": 2.525994062423706 + }, + { + "auxiliary_loss_clip": 0.01109509, + "auxiliary_loss_mlp": 0.01030179, + "balance_loss_clip": 1.03668046, + "balance_loss_mlp": 1.01799607, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 1.9307718550785522, + "language_loss": 0.78734499, + "learning_rate": 1.580380726142283e-08, + "loss": 0.80874193, + "num_input_tokens_seen": 344907160, + "step": 15986, + "time_per_iteration": 2.41095232963562 + }, + { + "auxiliary_loss_clip": 0.01063215, + "auxiliary_loss_mlp": 0.01031534, + "balance_loss_clip": 1.03777051, + "balance_loss_mlp": 1.01868963, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 4.55519629228754, + "language_loss": 0.64063859, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.66158605, + "num_input_tokens_seen": 344922400, + "step": 15987, + "time_per_iteration": 2.5344367027282715 + }, + { + "auxiliary_loss_clip": 0.01104992, + "auxiliary_loss_mlp": 0.01028931, + "balance_loss_clip": 1.03649199, + "balance_loss_mlp": 1.01795769, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 2.503289229426323, + "language_loss": 0.66184622, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.68318546, + "num_input_tokens_seen": 344941910, + "step": 15988, + "time_per_iteration": 2.4660093784332275 + }, + { + "auxiliary_loss_clip": 0.01096766, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.03547454, + "balance_loss_mlp": 1.02568388, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 1.8630228635305839, + "language_loss": 0.74463284, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.76597196, + "num_input_tokens_seen": 344960020, + "step": 15989, + "time_per_iteration": 4.172016143798828 + }, + { + "auxiliary_loss_clip": 0.01010983, + "auxiliary_loss_mlp": 0.01004876, + "balance_loss_clip": 1.00574327, + "balance_loss_mlp": 1.00343323, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 0.8343202291951819, + "language_loss": 0.63154894, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65170747, + "num_input_tokens_seen": 345018290, + "step": 15990, + "time_per_iteration": 3.026803970336914 + }, + { + "auxiliary_loss_clip": 0.01095865, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.03514934, + "balance_loss_mlp": 1.02113748, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 2.252532964364966, + "language_loss": 0.7814213, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.80270803, + "num_input_tokens_seen": 345040235, + "step": 15991, + "time_per_iteration": 2.5109708309173584 + }, + { + "auxiliary_loss_clip": 0.01113441, + "auxiliary_loss_mlp": 0.01035522, + "balance_loss_clip": 1.03676474, + "balance_loss_mlp": 1.0218668, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 2.7965007492234024, + "language_loss": 0.84339255, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.86488217, + "num_input_tokens_seen": 345054540, + "step": 15992, + "time_per_iteration": 2.4260611534118652 + }, + { + "auxiliary_loss_clip": 0.01084871, + "auxiliary_loss_mlp": 0.0102958, + "balance_loss_clip": 1.03285015, + "balance_loss_mlp": 1.01668191, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 1.8295685425789214, + "language_loss": 0.72698641, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.74813092, + "num_input_tokens_seen": 345074035, + "step": 15993, + "time_per_iteration": 2.5008444786071777 + }, + { + "auxiliary_loss_clip": 0.01074058, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.0364368, + "balance_loss_mlp": 1.01995707, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 1.5997054866736125, + "language_loss": 0.68217874, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70324445, + "num_input_tokens_seen": 345099270, + "step": 15994, + "time_per_iteration": 2.670239210128784 + }, + { + "auxiliary_loss_clip": 0.01073459, + "auxiliary_loss_mlp": 0.01029816, + "balance_loss_clip": 1.03726792, + "balance_loss_mlp": 1.01754987, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 1.769121372638831, + "language_loss": 0.84505445, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.8660872, + "num_input_tokens_seen": 345116975, + "step": 15995, + "time_per_iteration": 2.567678928375244 + }, + { + "auxiliary_loss_clip": 0.01100166, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.03603411, + "balance_loss_mlp": 1.02169597, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 1.8798286413573477, + "language_loss": 0.7603898, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.78173196, + "num_input_tokens_seen": 345133645, + "step": 15996, + "time_per_iteration": 2.4365005493164062 + }, + { + "auxiliary_loss_clip": 0.01080609, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.03306365, + "balance_loss_mlp": 1.01848245, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 1.9514807435795194, + "language_loss": 0.76989031, + "learning_rate": 1.52708595287494e-08, + "loss": 0.79100955, + "num_input_tokens_seen": 345150740, + "step": 15997, + "time_per_iteration": 2.4869658946990967 + }, + { + "auxiliary_loss_clip": 0.01103658, + "auxiliary_loss_mlp": 0.00777097, + "balance_loss_clip": 1.03488255, + "balance_loss_mlp": 1.00058329, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 1.772097066598351, + "language_loss": 0.67120063, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69000816, + "num_input_tokens_seen": 345170365, + "step": 15998, + "time_per_iteration": 2.454554796218872 + }, + { + "auxiliary_loss_clip": 0.01080041, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.02952373, + "balance_loss_mlp": 1.01813078, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 1.6019783570737667, + "language_loss": 0.73040873, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.75151992, + "num_input_tokens_seen": 345188930, + "step": 15999, + "time_per_iteration": 2.464202404022217 + }, + { + "auxiliary_loss_clip": 0.01082263, + "auxiliary_loss_mlp": 0.01023854, + "balance_loss_clip": 1.03395581, + "balance_loss_mlp": 1.01216578, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 2.671354985942322, + "language_loss": 0.65090805, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.67196923, + "num_input_tokens_seen": 345209615, + "step": 16000, + "time_per_iteration": 2.5253043174743652 + }, + { + "auxiliary_loss_clip": 0.01076878, + "auxiliary_loss_mlp": 0.01028329, + "balance_loss_clip": 1.0343678, + "balance_loss_mlp": 1.01554418, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 2.0592786357287762, + "language_loss": 0.75371403, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.77476609, + "num_input_tokens_seen": 345229175, + "step": 16001, + "time_per_iteration": 3.9609732627868652 + }, + { + "auxiliary_loss_clip": 0.01093285, + "auxiliary_loss_mlp": 0.010318, + "balance_loss_clip": 1.03344333, + "balance_loss_mlp": 1.01937222, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 1.7177633200059683, + "language_loss": 0.68512928, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70638007, + "num_input_tokens_seen": 345247815, + "step": 16002, + "time_per_iteration": 2.420924425125122 + }, + { + "auxiliary_loss_clip": 0.01098065, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.0375092, + "balance_loss_mlp": 1.01775169, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 1.3224016971990191, + "language_loss": 0.6428268, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.66410536, + "num_input_tokens_seen": 345269935, + "step": 16003, + "time_per_iteration": 2.5279717445373535 + }, + { + "auxiliary_loss_clip": 0.01060981, + "auxiliary_loss_mlp": 0.01033657, + "balance_loss_clip": 1.03723383, + "balance_loss_mlp": 1.02214766, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 1.8539814626849198, + "language_loss": 0.757954, + "learning_rate": 1.493645226826512e-08, + "loss": 0.77890038, + "num_input_tokens_seen": 345288310, + "step": 16004, + "time_per_iteration": 2.5375845432281494 + }, + { + "auxiliary_loss_clip": 0.01094938, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.03501487, + "balance_loss_mlp": 1.01910758, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 2.1461995003697183, + "language_loss": 0.79059625, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.81186187, + "num_input_tokens_seen": 345306615, + "step": 16005, + "time_per_iteration": 2.4666526317596436 + }, + { + "auxiliary_loss_clip": 0.01093587, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.03443575, + "balance_loss_mlp": 1.01767206, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 2.3944690954777466, + "language_loss": 0.68098634, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.70221472, + "num_input_tokens_seen": 345331935, + "step": 16006, + "time_per_iteration": 2.7847776412963867 + }, + { + "auxiliary_loss_clip": 0.01079272, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.03705931, + "balance_loss_mlp": 1.01904488, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 1.9037372301952435, + "language_loss": 0.77992594, + "learning_rate": 1.479426394188521e-08, + "loss": 0.8010242, + "num_input_tokens_seen": 345351510, + "step": 16007, + "time_per_iteration": 2.510795831680298 + }, + { + "auxiliary_loss_clip": 0.01110441, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.03734481, + "balance_loss_mlp": 1.0221231, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 1.9698562272228541, + "language_loss": 0.67820227, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.69965428, + "num_input_tokens_seen": 345367750, + "step": 16008, + "time_per_iteration": 2.43257212638855 + }, + { + "auxiliary_loss_clip": 0.0109243, + "auxiliary_loss_mlp": 0.01034034, + "balance_loss_clip": 1.04174173, + "balance_loss_mlp": 1.02056289, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 2.0031575564733717, + "language_loss": 0.735677, + "learning_rate": 1.469984811730529e-08, + "loss": 0.75694156, + "num_input_tokens_seen": 345384790, + "step": 16009, + "time_per_iteration": 2.5276477336883545 + }, + { + "auxiliary_loss_clip": 0.01097402, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.0369029, + "balance_loss_mlp": 1.01855743, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 3.0113642750154246, + "language_loss": 0.75569081, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.77697361, + "num_input_tokens_seen": 345403390, + "step": 16010, + "time_per_iteration": 2.442422866821289 + }, + { + "auxiliary_loss_clip": 0.01101816, + "auxiliary_loss_mlp": 0.01036511, + "balance_loss_clip": 1.03928447, + "balance_loss_mlp": 1.02131152, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 1.7673312158769245, + "language_loss": 0.69892305, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.72030634, + "num_input_tokens_seen": 345418685, + "step": 16011, + "time_per_iteration": 2.431983470916748 + }, + { + "auxiliary_loss_clip": 0.01095696, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.0361886, + "balance_loss_mlp": 1.01945972, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 1.8244506923541493, + "language_loss": 0.68362463, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70488948, + "num_input_tokens_seen": 345442380, + "step": 16012, + "time_per_iteration": 2.786428213119507 + }, + { + "auxiliary_loss_clip": 0.01094907, + "auxiliary_loss_mlp": 0.01035233, + "balance_loss_clip": 1.03499961, + "balance_loss_mlp": 1.02149391, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 2.0260055311972116, + "language_loss": 0.72103751, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.7423389, + "num_input_tokens_seen": 345463815, + "step": 16013, + "time_per_iteration": 4.151867389678955 + }, + { + "auxiliary_loss_clip": 0.01081352, + "auxiliary_loss_mlp": 0.01029509, + "balance_loss_clip": 1.03966808, + "balance_loss_mlp": 1.0165149, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 8.776927451686088, + "language_loss": 0.63847458, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65958315, + "num_input_tokens_seen": 345484525, + "step": 16014, + "time_per_iteration": 2.6869306564331055 + }, + { + "auxiliary_loss_clip": 0.01083281, + "auxiliary_loss_mlp": 0.01029524, + "balance_loss_clip": 1.03678203, + "balance_loss_mlp": 1.01884854, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 1.5920693248141715, + "language_loss": 0.71577358, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.73690164, + "num_input_tokens_seen": 345508295, + "step": 16015, + "time_per_iteration": 2.6955254077911377 + }, + { + "auxiliary_loss_clip": 0.01071379, + "auxiliary_loss_mlp": 0.01026854, + "balance_loss_clip": 1.03003764, + "balance_loss_mlp": 1.01404476, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 2.0673292684806537, + "language_loss": 0.77108181, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.79206419, + "num_input_tokens_seen": 345525155, + "step": 16016, + "time_per_iteration": 3.987612247467041 + }, + { + "auxiliary_loss_clip": 0.01026414, + "auxiliary_loss_mlp": 0.01002634, + "balance_loss_clip": 1.00314152, + "balance_loss_mlp": 1.00150728, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.8191509989772837, + "language_loss": 0.63064694, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65093744, + "num_input_tokens_seen": 345578905, + "step": 16017, + "time_per_iteration": 2.9226765632629395 + }, + { + "auxiliary_loss_clip": 0.01093884, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.03881216, + "balance_loss_mlp": 1.01733625, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 1.8166541514925034, + "language_loss": 0.66479522, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.68602967, + "num_input_tokens_seen": 345598965, + "step": 16018, + "time_per_iteration": 2.5922584533691406 + }, + { + "auxiliary_loss_clip": 0.01060989, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.03702986, + "balance_loss_mlp": 1.02197814, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 2.344457077416972, + "language_loss": 0.79462504, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.81557041, + "num_input_tokens_seen": 345617945, + "step": 16019, + "time_per_iteration": 2.570838451385498 + }, + { + "auxiliary_loss_clip": 0.01071132, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.0323565, + "balance_loss_mlp": 1.01811171, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 1.432953997193854, + "language_loss": 0.71904439, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.74004793, + "num_input_tokens_seen": 345637920, + "step": 16020, + "time_per_iteration": 2.5842275619506836 + }, + { + "auxiliary_loss_clip": 0.01084287, + "auxiliary_loss_mlp": 0.01023827, + "balance_loss_clip": 1.03585148, + "balance_loss_mlp": 1.0125792, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 1.656492105772192, + "language_loss": 0.76930821, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.7903893, + "num_input_tokens_seen": 345656195, + "step": 16021, + "time_per_iteration": 2.5498690605163574 + }, + { + "auxiliary_loss_clip": 0.01080569, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.03861117, + "balance_loss_mlp": 1.01791322, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 2.9775414361005077, + "language_loss": 0.65187007, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.67300373, + "num_input_tokens_seen": 345676700, + "step": 16022, + "time_per_iteration": 2.57122540473938 + }, + { + "auxiliary_loss_clip": 0.01080719, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.03097486, + "balance_loss_mlp": 1.01801014, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 2.185435380697002, + "language_loss": 0.72719318, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.74829745, + "num_input_tokens_seen": 345696725, + "step": 16023, + "time_per_iteration": 2.5347185134887695 + }, + { + "auxiliary_loss_clip": 0.01092936, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.03372788, + "balance_loss_mlp": 1.02276111, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 1.5375729703102723, + "language_loss": 0.81465346, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.83593965, + "num_input_tokens_seen": 345716245, + "step": 16024, + "time_per_iteration": 2.4774348735809326 + }, + { + "auxiliary_loss_clip": 0.01102111, + "auxiliary_loss_mlp": 0.01033994, + "balance_loss_clip": 1.03654087, + "balance_loss_mlp": 1.02118516, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 1.5313613211542745, + "language_loss": 0.81346273, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83482379, + "num_input_tokens_seen": 345739060, + "step": 16025, + "time_per_iteration": 2.5122780799865723 + }, + { + "auxiliary_loss_clip": 0.0110137, + "auxiliary_loss_mlp": 0.01026389, + "balance_loss_clip": 1.0358088, + "balance_loss_mlp": 1.01402104, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 2.133492829151516, + "language_loss": 0.76677233, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.78804994, + "num_input_tokens_seen": 345758325, + "step": 16026, + "time_per_iteration": 2.4934446811676025 + }, + { + "auxiliary_loss_clip": 0.01074788, + "auxiliary_loss_mlp": 0.00781478, + "balance_loss_clip": 1.03263521, + "balance_loss_mlp": 1.00060105, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 3.3981501678578345, + "language_loss": 0.6361388, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.65470141, + "num_input_tokens_seen": 345778530, + "step": 16027, + "time_per_iteration": 2.5610454082489014 + }, + { + "auxiliary_loss_clip": 0.01099456, + "auxiliary_loss_mlp": 0.01029363, + "balance_loss_clip": 1.03591704, + "balance_loss_mlp": 1.01617301, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 1.8710206054678842, + "language_loss": 0.87130105, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.89258921, + "num_input_tokens_seen": 345796535, + "step": 16028, + "time_per_iteration": 3.9502532482147217 + }, + { + "auxiliary_loss_clip": 0.00992418, + "auxiliary_loss_mlp": 0.01001413, + "balance_loss_clip": 1.01566195, + "balance_loss_mlp": 1.00019729, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 0.6829982268990007, + "language_loss": 0.53112912, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.55106735, + "num_input_tokens_seen": 345859700, + "step": 16029, + "time_per_iteration": 3.1065003871917725 + }, + { + "auxiliary_loss_clip": 0.01107902, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.0354315, + "balance_loss_mlp": 1.01795149, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 1.433182771909264, + "language_loss": 0.74207669, + "learning_rate": 1.372666546129797e-08, + "loss": 0.7634598, + "num_input_tokens_seen": 345878760, + "step": 16030, + "time_per_iteration": 2.458935022354126 + }, + { + "auxiliary_loss_clip": 0.01082656, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.03369403, + "balance_loss_mlp": 1.01913452, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 1.799910141444287, + "language_loss": 0.6594485, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.68058312, + "num_input_tokens_seen": 345900445, + "step": 16031, + "time_per_iteration": 2.5470738410949707 + }, + { + "auxiliary_loss_clip": 0.01020248, + "auxiliary_loss_mlp": 0.00753006, + "balance_loss_clip": 1.00699139, + "balance_loss_mlp": 1.00019312, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8340041632103192, + "language_loss": 0.60779124, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62552375, + "num_input_tokens_seen": 345961020, + "step": 16032, + "time_per_iteration": 3.119027853012085 + }, + { + "auxiliary_loss_clip": 0.01086159, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.03311276, + "balance_loss_mlp": 1.0204736, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 1.839449462783842, + "language_loss": 0.66611683, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68729115, + "num_input_tokens_seen": 345980210, + "step": 16033, + "time_per_iteration": 2.5022761821746826 + }, + { + "auxiliary_loss_clip": 0.01053752, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.03479588, + "balance_loss_mlp": 1.01970649, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 1.6238888587509854, + "language_loss": 0.65345776, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.67431676, + "num_input_tokens_seen": 345998280, + "step": 16034, + "time_per_iteration": 2.6018097400665283 + }, + { + "auxiliary_loss_clip": 0.01071186, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.03704143, + "balance_loss_mlp": 1.0180819, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 2.2609418614183547, + "language_loss": 0.74289155, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.7639122, + "num_input_tokens_seen": 346015545, + "step": 16035, + "time_per_iteration": 2.5774717330932617 + }, + { + "auxiliary_loss_clip": 0.01110475, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.03922963, + "balance_loss_mlp": 1.01952529, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 3.4917241252125177, + "language_loss": 0.81801647, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.83943731, + "num_input_tokens_seen": 346034055, + "step": 16036, + "time_per_iteration": 2.439578056335449 + }, + { + "auxiliary_loss_clip": 0.01081093, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.03545952, + "balance_loss_mlp": 1.01716316, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 2.0414707082693515, + "language_loss": 0.70025706, + "learning_rate": 1.340965177371789e-08, + "loss": 0.72136563, + "num_input_tokens_seen": 346054130, + "step": 16037, + "time_per_iteration": 2.549283027648926 + }, + { + "auxiliary_loss_clip": 0.01108088, + "auxiliary_loss_mlp": 0.01028521, + "balance_loss_clip": 1.03532863, + "balance_loss_mlp": 1.01655269, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 1.6658925444738801, + "language_loss": 0.63181543, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65318155, + "num_input_tokens_seen": 346072990, + "step": 16038, + "time_per_iteration": 2.425764560699463 + }, + { + "auxiliary_loss_clip": 0.0107512, + "auxiliary_loss_mlp": 0.00781735, + "balance_loss_clip": 1.03332531, + "balance_loss_mlp": 1.00062943, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 1.8182759060971734, + "language_loss": 0.70881224, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.72738075, + "num_input_tokens_seen": 346093745, + "step": 16039, + "time_per_iteration": 2.5907418727874756 + }, + { + "auxiliary_loss_clip": 0.01067675, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.03667355, + "balance_loss_mlp": 1.01965845, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 2.6494889421761973, + "language_loss": 0.72846293, + "learning_rate": 1.327491870605657e-08, + "loss": 0.74945974, + "num_input_tokens_seen": 346110115, + "step": 16040, + "time_per_iteration": 2.5766544342041016 + }, + { + "auxiliary_loss_clip": 0.01098539, + "auxiliary_loss_mlp": 0.01032012, + "balance_loss_clip": 1.03403544, + "balance_loss_mlp": 1.01901817, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 2.1485098965043603, + "language_loss": 0.73038274, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75168824, + "num_input_tokens_seen": 346127165, + "step": 16041, + "time_per_iteration": 3.920969009399414 + }, + { + "auxiliary_loss_clip": 0.01081796, + "auxiliary_loss_mlp": 0.01031437, + "balance_loss_clip": 1.03397536, + "balance_loss_mlp": 1.0195936, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 1.8089586613969848, + "language_loss": 0.72180212, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.74293441, + "num_input_tokens_seen": 346145950, + "step": 16042, + "time_per_iteration": 2.4838180541992188 + }, + { + "auxiliary_loss_clip": 0.01070747, + "auxiliary_loss_mlp": 0.01036189, + "balance_loss_clip": 1.03282404, + "balance_loss_mlp": 1.02420878, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 1.7877045275694807, + "language_loss": 0.80923617, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83030552, + "num_input_tokens_seen": 346165005, + "step": 16043, + "time_per_iteration": 2.568697452545166 + }, + { + "auxiliary_loss_clip": 0.01084013, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.03803182, + "balance_loss_mlp": 1.01856434, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 1.668162588520115, + "language_loss": 0.71506059, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.73619986, + "num_input_tokens_seen": 346185095, + "step": 16044, + "time_per_iteration": 2.5249955654144287 + }, + { + "auxiliary_loss_clip": 0.01083424, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.03328681, + "balance_loss_mlp": 1.01493549, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 1.9347245445036867, + "language_loss": 0.70060003, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.7217074, + "num_input_tokens_seen": 346202580, + "step": 16045, + "time_per_iteration": 2.491520404815674 + }, + { + "auxiliary_loss_clip": 0.01045877, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.03632927, + "balance_loss_mlp": 1.02201986, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 1.8746877172449858, + "language_loss": 0.75183761, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.77264977, + "num_input_tokens_seen": 346219395, + "step": 16046, + "time_per_iteration": 2.574056625366211 + }, + { + "auxiliary_loss_clip": 0.01100267, + "auxiliary_loss_mlp": 0.01034484, + "balance_loss_clip": 1.03587627, + "balance_loss_mlp": 1.0217464, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 1.622134962230603, + "language_loss": 0.62765652, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64900398, + "num_input_tokens_seen": 346239715, + "step": 16047, + "time_per_iteration": 2.501603126525879 + }, + { + "auxiliary_loss_clip": 0.01088257, + "auxiliary_loss_mlp": 0.01032254, + "balance_loss_clip": 1.03899348, + "balance_loss_mlp": 1.02032685, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 1.938473396273585, + "language_loss": 0.69124496, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71245009, + "num_input_tokens_seen": 346258500, + "step": 16048, + "time_per_iteration": 2.5914418697357178 + }, + { + "auxiliary_loss_clip": 0.01101203, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.03783703, + "balance_loss_mlp": 1.01760447, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 2.1397551974765086, + "language_loss": 0.64079165, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.66211063, + "num_input_tokens_seen": 346279110, + "step": 16049, + "time_per_iteration": 2.5551304817199707 + }, + { + "auxiliary_loss_clip": 0.01098255, + "auxiliary_loss_mlp": 0.01027586, + "balance_loss_clip": 1.03657889, + "balance_loss_mlp": 1.01542115, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 1.7937596732681285, + "language_loss": 0.71072543, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.7319839, + "num_input_tokens_seen": 346297860, + "step": 16050, + "time_per_iteration": 2.4644534587860107 + }, + { + "auxiliary_loss_clip": 0.01100644, + "auxiliary_loss_mlp": 0.01037398, + "balance_loss_clip": 1.0337311, + "balance_loss_mlp": 1.02314687, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 1.9197190284155337, + "language_loss": 0.69665956, + "learning_rate": 1.278669873970606e-08, + "loss": 0.71803999, + "num_input_tokens_seen": 346319860, + "step": 16051, + "time_per_iteration": 2.6605069637298584 + }, + { + "auxiliary_loss_clip": 0.01019724, + "auxiliary_loss_mlp": 0.01001856, + "balance_loss_clip": 1.00645733, + "balance_loss_mlp": 1.00072384, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 0.8393803499868989, + "language_loss": 0.59159714, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61181295, + "num_input_tokens_seen": 346379025, + "step": 16052, + "time_per_iteration": 4.579999685287476 + }, + { + "auxiliary_loss_clip": 0.01103047, + "auxiliary_loss_mlp": 0.01029484, + "balance_loss_clip": 1.03339362, + "balance_loss_mlp": 1.01718748, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 2.383453008505359, + "language_loss": 0.74265707, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76398236, + "num_input_tokens_seen": 346402250, + "step": 16053, + "time_per_iteration": 2.5035347938537598 + }, + { + "auxiliary_loss_clip": 0.01084862, + "auxiliary_loss_mlp": 0.01031028, + "balance_loss_clip": 1.03515553, + "balance_loss_mlp": 1.01855242, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 1.9457522978071664, + "language_loss": 0.68622816, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70738709, + "num_input_tokens_seen": 346419555, + "step": 16054, + "time_per_iteration": 2.4779231548309326 + }, + { + "auxiliary_loss_clip": 0.01092397, + "auxiliary_loss_mlp": 0.00776583, + "balance_loss_clip": 1.03813851, + "balance_loss_mlp": 1.00057113, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1.4515630709526623, + "language_loss": 0.62392628, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.64261603, + "num_input_tokens_seen": 346441245, + "step": 16055, + "time_per_iteration": 2.5576159954071045 + }, + { + "auxiliary_loss_clip": 0.01069262, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.03550029, + "balance_loss_mlp": 1.01806533, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 1.9265907504271262, + "language_loss": 0.76726276, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.78825551, + "num_input_tokens_seen": 346460065, + "step": 16056, + "time_per_iteration": 4.044718980789185 + }, + { + "auxiliary_loss_clip": 0.01079402, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.03162122, + "balance_loss_mlp": 1.019593, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 1.5797828981262165, + "language_loss": 0.71403086, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73514193, + "num_input_tokens_seen": 346478005, + "step": 16057, + "time_per_iteration": 2.510205030441284 + }, + { + "auxiliary_loss_clip": 0.01105859, + "auxiliary_loss_mlp": 0.01031771, + "balance_loss_clip": 1.03547549, + "balance_loss_mlp": 1.02032065, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 1.9141691590607826, + "language_loss": 0.71943152, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.74080789, + "num_input_tokens_seen": 346497575, + "step": 16058, + "time_per_iteration": 2.4762892723083496 + }, + { + "auxiliary_loss_clip": 0.01094965, + "auxiliary_loss_mlp": 0.01037513, + "balance_loss_clip": 1.03452706, + "balance_loss_mlp": 1.02565765, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 1.5317606028904132, + "language_loss": 0.73999774, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76132244, + "num_input_tokens_seen": 346520000, + "step": 16059, + "time_per_iteration": 2.5686168670654297 + }, + { + "auxiliary_loss_clip": 0.01090412, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.03537202, + "balance_loss_mlp": 1.02039504, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 1.9897451981352678, + "language_loss": 0.73497117, + "learning_rate": 1.239402791721722e-08, + "loss": 0.756199, + "num_input_tokens_seen": 346541605, + "step": 16060, + "time_per_iteration": 2.680408000946045 + }, + { + "auxiliary_loss_clip": 0.01084849, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.03700674, + "balance_loss_mlp": 1.02214026, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 1.6058072541952297, + "language_loss": 0.76552522, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.78670645, + "num_input_tokens_seen": 346560955, + "step": 16061, + "time_per_iteration": 2.568460464477539 + }, + { + "auxiliary_loss_clip": 0.01011571, + "auxiliary_loss_mlp": 0.01003301, + "balance_loss_clip": 1.00721455, + "balance_loss_mlp": 1.00212049, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.754526128280434, + "language_loss": 0.64125144, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66140014, + "num_input_tokens_seen": 346621615, + "step": 16062, + "time_per_iteration": 3.1284892559051514 + }, + { + "auxiliary_loss_clip": 0.01057417, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.03031337, + "balance_loss_mlp": 1.01852822, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 2.0571303508326584, + "language_loss": 0.93293214, + "learning_rate": 1.226449424760867e-08, + "loss": 0.95380318, + "num_input_tokens_seen": 346637460, + "step": 16063, + "time_per_iteration": 2.5675711631774902 + }, + { + "auxiliary_loss_clip": 0.0109833, + "auxiliary_loss_mlp": 0.01036308, + "balance_loss_clip": 1.036461, + "balance_loss_mlp": 1.02387452, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 1.8550801215240442, + "language_loss": 0.82089579, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84224212, + "num_input_tokens_seen": 346655625, + "step": 16064, + "time_per_iteration": 2.5114760398864746 + }, + { + "auxiliary_loss_clip": 0.01095524, + "auxiliary_loss_mlp": 0.00777852, + "balance_loss_clip": 1.03642738, + "balance_loss_mlp": 1.00053883, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 1.5052294639433093, + "language_loss": 0.84071344, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.85944718, + "num_input_tokens_seen": 346675220, + "step": 16065, + "time_per_iteration": 2.528745412826538 + }, + { + "auxiliary_loss_clip": 0.01086053, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.03638005, + "balance_loss_mlp": 1.01926351, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 1.6602427885388917, + "language_loss": 0.67612851, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.69730294, + "num_input_tokens_seen": 346694710, + "step": 16066, + "time_per_iteration": 2.6054115295410156 + }, + { + "auxiliary_loss_clip": 0.01107335, + "auxiliary_loss_mlp": 0.01026519, + "balance_loss_clip": 1.03514338, + "balance_loss_mlp": 1.01488447, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 1.8243936268677239, + "language_loss": 0.82175261, + "learning_rate": 1.209283794752558e-08, + "loss": 0.84309113, + "num_input_tokens_seen": 346712645, + "step": 16067, + "time_per_iteration": 3.895507335662842 + }, + { + "auxiliary_loss_clip": 0.01085216, + "auxiliary_loss_mlp": 0.01030468, + "balance_loss_clip": 1.03487837, + "balance_loss_mlp": 1.01821947, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 1.8458896108971319, + "language_loss": 0.69225156, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.71340841, + "num_input_tokens_seen": 346732375, + "step": 16068, + "time_per_iteration": 2.532435178756714 + }, + { + "auxiliary_loss_clip": 0.01087846, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.03321314, + "balance_loss_mlp": 1.02056503, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 2.0175604569378907, + "language_loss": 0.68170178, + "learning_rate": 1.20074620808146e-08, + "loss": 0.70289773, + "num_input_tokens_seen": 346750430, + "step": 16069, + "time_per_iteration": 2.4773201942443848 + }, + { + "auxiliary_loss_clip": 0.01089256, + "auxiliary_loss_mlp": 0.01028203, + "balance_loss_clip": 1.03843236, + "balance_loss_mlp": 1.01640177, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 2.047673120137426, + "language_loss": 0.88937294, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.91054755, + "num_input_tokens_seen": 346768455, + "step": 16070, + "time_per_iteration": 2.5204830169677734 + }, + { + "auxiliary_loss_clip": 0.01111287, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.03905177, + "balance_loss_mlp": 1.02521777, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 2.6304240346218366, + "language_loss": 0.77037227, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.79186398, + "num_input_tokens_seen": 346786530, + "step": 16071, + "time_per_iteration": 2.46976900100708 + }, + { + "auxiliary_loss_clip": 0.01083567, + "auxiliary_loss_mlp": 0.01029348, + "balance_loss_clip": 1.03285313, + "balance_loss_mlp": 1.0159905, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 1.7710406558475738, + "language_loss": 0.65590507, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.67703426, + "num_input_tokens_seen": 346804635, + "step": 16072, + "time_per_iteration": 2.490746259689331 + }, + { + "auxiliary_loss_clip": 0.01101261, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.03726816, + "balance_loss_mlp": 1.01863337, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 1.8940835917418781, + "language_loss": 0.77752388, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.79884207, + "num_input_tokens_seen": 346823070, + "step": 16073, + "time_per_iteration": 2.4949941635131836 + }, + { + "auxiliary_loss_clip": 0.01111353, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.03690159, + "balance_loss_mlp": 1.01943564, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 5.904749226701653, + "language_loss": 0.75958711, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.78101897, + "num_input_tokens_seen": 346841180, + "step": 16074, + "time_per_iteration": 2.401731252670288 + }, + { + "auxiliary_loss_clip": 0.01089963, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.03958178, + "balance_loss_mlp": 1.01874924, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 1.4778521557429787, + "language_loss": 0.75788999, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.77910286, + "num_input_tokens_seen": 346864250, + "step": 16075, + "time_per_iteration": 2.588752508163452 + }, + { + "auxiliary_loss_clip": 0.01079346, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.03751779, + "balance_loss_mlp": 1.02060091, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 2.038566920285898, + "language_loss": 0.79161668, + "learning_rate": 1.171102125547696e-08, + "loss": 0.81273526, + "num_input_tokens_seen": 346881955, + "step": 16076, + "time_per_iteration": 2.5399887561798096 + }, + { + "auxiliary_loss_clip": 0.01088022, + "auxiliary_loss_mlp": 0.01041721, + "balance_loss_clip": 1.03717375, + "balance_loss_mlp": 1.0290792, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 2.1673709493815068, + "language_loss": 0.72613156, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74742901, + "num_input_tokens_seen": 346900445, + "step": 16077, + "time_per_iteration": 2.5049195289611816 + }, + { + "auxiliary_loss_clip": 0.01093719, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.03356194, + "balance_loss_mlp": 1.02187562, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 1.919762684100387, + "language_loss": 0.59178942, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.6130746, + "num_input_tokens_seen": 346920135, + "step": 16078, + "time_per_iteration": 2.524768352508545 + }, + { + "auxiliary_loss_clip": 0.01100031, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.03602123, + "balance_loss_mlp": 1.01916599, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 1.831433652958018, + "language_loss": 0.72103179, + "learning_rate": 1.158510609718899e-08, + "loss": 0.74234962, + "num_input_tokens_seen": 346940450, + "step": 16079, + "time_per_iteration": 2.501544952392578 + }, + { + "auxiliary_loss_clip": 0.01091748, + "auxiliary_loss_mlp": 0.01026748, + "balance_loss_clip": 1.03397119, + "balance_loss_mlp": 1.01541758, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 1.886925994708146, + "language_loss": 0.72399485, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74517977, + "num_input_tokens_seen": 346960935, + "step": 16080, + "time_per_iteration": 2.493746042251587 + }, + { + "auxiliary_loss_clip": 0.01078355, + "auxiliary_loss_mlp": 0.01036418, + "balance_loss_clip": 1.03204799, + "balance_loss_mlp": 1.02295947, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 1.9863327786688734, + "language_loss": 0.73878145, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.75992924, + "num_input_tokens_seen": 346980100, + "step": 16081, + "time_per_iteration": 3.9748315811157227 + }, + { + "auxiliary_loss_clip": 0.01081036, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.03237343, + "balance_loss_mlp": 1.01627946, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 1.779942398476781, + "language_loss": 0.67436594, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69546664, + "num_input_tokens_seen": 347001250, + "step": 16082, + "time_per_iteration": 2.5588319301605225 + }, + { + "auxiliary_loss_clip": 0.01067073, + "auxiliary_loss_mlp": 0.01039187, + "balance_loss_clip": 1.03126287, + "balance_loss_mlp": 1.02522182, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 1.9694453465195183, + "language_loss": 0.76693702, + "learning_rate": 1.141827483932789e-08, + "loss": 0.78799963, + "num_input_tokens_seen": 347022975, + "step": 16083, + "time_per_iteration": 2.585932970046997 + }, + { + "auxiliary_loss_clip": 0.01062524, + "auxiliary_loss_mlp": 0.01030615, + "balance_loss_clip": 1.03498983, + "balance_loss_mlp": 1.01824737, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 1.9552396622706663, + "language_loss": 0.79193842, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.81286979, + "num_input_tokens_seen": 347038780, + "step": 16084, + "time_per_iteration": 2.587416410446167 + }, + { + "auxiliary_loss_clip": 0.01102493, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.03588533, + "balance_loss_mlp": 1.01744056, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 2.0586832862540776, + "language_loss": 0.67515618, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.69648552, + "num_input_tokens_seen": 347056705, + "step": 16085, + "time_per_iteration": 2.462576389312744 + }, + { + "auxiliary_loss_clip": 0.01090925, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.03691149, + "balance_loss_mlp": 1.01870584, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 2.0104213872900756, + "language_loss": 0.69090837, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.71213835, + "num_input_tokens_seen": 347075710, + "step": 16086, + "time_per_iteration": 2.5354621410369873 + }, + { + "auxiliary_loss_clip": 0.01096214, + "auxiliary_loss_mlp": 0.01034259, + "balance_loss_clip": 1.0351181, + "balance_loss_mlp": 1.02158666, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 1.5672771858511478, + "language_loss": 0.78527761, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80658233, + "num_input_tokens_seen": 347092325, + "step": 16087, + "time_per_iteration": 2.482912302017212 + }, + { + "auxiliary_loss_clip": 0.01079964, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.0326879, + "balance_loss_mlp": 1.01858222, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 1.7755996034906734, + "language_loss": 0.70986307, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73096836, + "num_input_tokens_seen": 347110595, + "step": 16088, + "time_per_iteration": 2.4985084533691406 + }, + { + "auxiliary_loss_clip": 0.01105697, + "auxiliary_loss_mlp": 0.00777462, + "balance_loss_clip": 1.03650308, + "balance_loss_mlp": 1.00046015, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 1.44932778693916, + "language_loss": 0.70747393, + "learning_rate": 1.117029020040916e-08, + "loss": 0.72630554, + "num_input_tokens_seen": 347131625, + "step": 16089, + "time_per_iteration": 2.5125327110290527 + }, + { + "auxiliary_loss_clip": 0.01109507, + "auxiliary_loss_mlp": 0.01029522, + "balance_loss_clip": 1.03679562, + "balance_loss_mlp": 1.01776826, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 2.325190629565948, + "language_loss": 0.75394058, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.7753309, + "num_input_tokens_seen": 347147910, + "step": 16090, + "time_per_iteration": 2.4464590549468994 + }, + { + "auxiliary_loss_clip": 0.01090381, + "auxiliary_loss_mlp": 0.01033611, + "balance_loss_clip": 1.03781188, + "balance_loss_mlp": 1.02126741, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 1.6945628035806122, + "language_loss": 0.68749839, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.70873833, + "num_input_tokens_seen": 347168805, + "step": 16091, + "time_per_iteration": 2.5547878742218018 + }, + { + "auxiliary_loss_clip": 0.01106794, + "auxiliary_loss_mlp": 0.01034221, + "balance_loss_clip": 1.03611898, + "balance_loss_mlp": 1.0214237, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 1.8539424239113103, + "language_loss": 0.77134347, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.79275358, + "num_input_tokens_seen": 347189455, + "step": 16092, + "time_per_iteration": 3.9228031635284424 + }, + { + "auxiliary_loss_clip": 0.01107413, + "auxiliary_loss_mlp": 0.0102944, + "balance_loss_clip": 1.03632593, + "balance_loss_mlp": 1.01781142, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 2.141341413721019, + "language_loss": 0.76369262, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.78506112, + "num_input_tokens_seen": 347206030, + "step": 16093, + "time_per_iteration": 2.4196767807006836 + }, + { + "auxiliary_loss_clip": 0.01082196, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.03552008, + "balance_loss_mlp": 1.01568627, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 1.7607774506207587, + "language_loss": 0.68701601, + "learning_rate": 1.096571027726112e-08, + "loss": 0.7081241, + "num_input_tokens_seen": 347226250, + "step": 16094, + "time_per_iteration": 2.519343137741089 + }, + { + "auxiliary_loss_clip": 0.01099655, + "auxiliary_loss_mlp": 0.01029813, + "balance_loss_clip": 1.03606272, + "balance_loss_mlp": 1.01811862, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 1.7121326813205677, + "language_loss": 0.75779259, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.77908725, + "num_input_tokens_seen": 347247350, + "step": 16095, + "time_per_iteration": 3.8626322746276855 + }, + { + "auxiliary_loss_clip": 0.01114281, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.03859138, + "balance_loss_mlp": 1.02268958, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 2.0559979306611234, + "language_loss": 0.70191002, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.72340882, + "num_input_tokens_seen": 347266870, + "step": 16096, + "time_per_iteration": 2.4468021392822266 + }, + { + "auxiliary_loss_clip": 0.01089208, + "auxiliary_loss_mlp": 0.01027918, + "balance_loss_clip": 1.03500605, + "balance_loss_mlp": 1.01542485, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 2.025882457395125, + "language_loss": 0.71910214, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.74027348, + "num_input_tokens_seen": 347290120, + "step": 16097, + "time_per_iteration": 2.720268487930298 + }, + { + "auxiliary_loss_clip": 0.01106689, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.03592801, + "balance_loss_mlp": 1.02369785, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 1.7069622666430064, + "language_loss": 0.77854794, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.79997247, + "num_input_tokens_seen": 347308785, + "step": 16098, + "time_per_iteration": 2.465188980102539 + }, + { + "auxiliary_loss_clip": 0.01075392, + "auxiliary_loss_mlp": 0.0102828, + "balance_loss_clip": 1.03716636, + "balance_loss_mlp": 1.01686597, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 2.0302269296153437, + "language_loss": 0.90815681, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.92919356, + "num_input_tokens_seen": 347326375, + "step": 16099, + "time_per_iteration": 2.4710004329681396 + }, + { + "auxiliary_loss_clip": 0.01098022, + "auxiliary_loss_mlp": 0.01031066, + "balance_loss_clip": 1.03502417, + "balance_loss_mlp": 1.01804805, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 2.2569862242171363, + "language_loss": 0.66040689, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.68169779, + "num_input_tokens_seen": 347348250, + "step": 16100, + "time_per_iteration": 2.569226026535034 + }, + { + "auxiliary_loss_clip": 0.01076355, + "auxiliary_loss_mlp": 0.01032632, + "balance_loss_clip": 1.04477262, + "balance_loss_mlp": 1.02008533, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 1.6355977888412787, + "language_loss": 0.73452818, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.7556181, + "num_input_tokens_seen": 347367400, + "step": 16101, + "time_per_iteration": 2.578061580657959 + }, + { + "auxiliary_loss_clip": 0.01085424, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.03555489, + "balance_loss_mlp": 1.01760983, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 1.6011160270244256, + "language_loss": 0.73187792, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.753034, + "num_input_tokens_seen": 347387600, + "step": 16102, + "time_per_iteration": 2.5449764728546143 + }, + { + "auxiliary_loss_clip": 0.01076895, + "auxiliary_loss_mlp": 0.01037069, + "balance_loss_clip": 1.03910923, + "balance_loss_mlp": 1.02334166, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 2.1315097446109323, + "language_loss": 0.77671218, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.7978518, + "num_input_tokens_seen": 347406915, + "step": 16103, + "time_per_iteration": 2.561474561691284 + }, + { + "auxiliary_loss_clip": 0.01088511, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.03471088, + "balance_loss_mlp": 1.0183568, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 1.7557973296727332, + "language_loss": 0.80442649, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82561243, + "num_input_tokens_seen": 347425140, + "step": 16104, + "time_per_iteration": 2.623581886291504 + }, + { + "auxiliary_loss_clip": 0.01087648, + "auxiliary_loss_mlp": 0.01035968, + "balance_loss_clip": 1.03046131, + "balance_loss_mlp": 1.02495956, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 1.4691113031319467, + "language_loss": 0.77727115, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.79850733, + "num_input_tokens_seen": 347446350, + "step": 16105, + "time_per_iteration": 2.5611679553985596 + }, + { + "auxiliary_loss_clip": 0.01002189, + "auxiliary_loss_mlp": 0.01007423, + "balance_loss_clip": 1.00544429, + "balance_loss_mlp": 1.00621915, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.8193221394475547, + "language_loss": 0.56693077, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58702695, + "num_input_tokens_seen": 347510135, + "step": 16106, + "time_per_iteration": 3.1332709789276123 + }, + { + "auxiliary_loss_clip": 0.01005182, + "auxiliary_loss_mlp": 0.01003217, + "balance_loss_clip": 1.00877285, + "balance_loss_mlp": 1.00192952, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.8946063296742706, + "language_loss": 0.6152851, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63536912, + "num_input_tokens_seen": 347562505, + "step": 16107, + "time_per_iteration": 4.513175010681152 + }, + { + "auxiliary_loss_clip": 0.01099806, + "auxiliary_loss_mlp": 0.01041355, + "balance_loss_clip": 1.03671265, + "balance_loss_mlp": 1.02762234, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 2.2826310755969397, + "language_loss": 0.73730844, + "learning_rate": 1.040291854638875e-08, + "loss": 0.7587201, + "num_input_tokens_seen": 347579150, + "step": 16108, + "time_per_iteration": 2.57011079788208 + }, + { + "auxiliary_loss_clip": 0.01093844, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.03437328, + "balance_loss_mlp": 1.01712334, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 2.3086123265456235, + "language_loss": 0.57091051, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.59215164, + "num_input_tokens_seen": 347596705, + "step": 16109, + "time_per_iteration": 2.5434365272521973 + }, + { + "auxiliary_loss_clip": 0.01017646, + "auxiliary_loss_mlp": 0.0099938, + "balance_loss_clip": 1.00305724, + "balance_loss_mlp": 0.99830168, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.6714497525803325, + "language_loss": 0.54239762, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56256783, + "num_input_tokens_seen": 347661870, + "step": 16110, + "time_per_iteration": 3.034696340560913 + }, + { + "auxiliary_loss_clip": 0.01046102, + "auxiliary_loss_mlp": 0.01042551, + "balance_loss_clip": 1.03560758, + "balance_loss_mlp": 1.02755415, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 2.188993983882894, + "language_loss": 0.62534726, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64623374, + "num_input_tokens_seen": 347684295, + "step": 16111, + "time_per_iteration": 2.7528162002563477 + }, + { + "auxiliary_loss_clip": 0.01084713, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.03458476, + "balance_loss_mlp": 1.0198456, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 2.1578613304196543, + "language_loss": 0.74592042, + "learning_rate": 1.024483677309118e-08, + "loss": 0.76707381, + "num_input_tokens_seen": 347702585, + "step": 16112, + "time_per_iteration": 2.5303213596343994 + }, + { + "auxiliary_loss_clip": 0.01095855, + "auxiliary_loss_mlp": 0.01023151, + "balance_loss_clip": 1.03528762, + "balance_loss_mlp": 1.0121541, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 2.4581836756162154, + "language_loss": 0.66694719, + "learning_rate": 1.020550495531558e-08, + "loss": 0.68813729, + "num_input_tokens_seen": 347721810, + "step": 16113, + "time_per_iteration": 2.502131938934326 + }, + { + "auxiliary_loss_clip": 0.01018656, + "auxiliary_loss_mlp": 0.00999784, + "balance_loss_clip": 1.00600135, + "balance_loss_mlp": 0.99852669, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.6918668187890332, + "language_loss": 0.56545866, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.58564305, + "num_input_tokens_seen": 347782330, + "step": 16114, + "time_per_iteration": 3.071563959121704 + }, + { + "auxiliary_loss_clip": 0.01082912, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.03421557, + "balance_loss_mlp": 1.02671087, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 1.922211076797178, + "language_loss": 0.82487553, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.84609771, + "num_input_tokens_seen": 347794835, + "step": 16115, + "time_per_iteration": 2.500256299972534 + }, + { + "auxiliary_loss_clip": 0.01092402, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.03558421, + "balance_loss_mlp": 1.01728678, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 1.610083050852025, + "language_loss": 0.72219592, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74340636, + "num_input_tokens_seen": 347814320, + "step": 16116, + "time_per_iteration": 2.482372283935547 + }, + { + "auxiliary_loss_clip": 0.01072749, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.03811789, + "balance_loss_mlp": 1.0217042, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 2.0737691926138795, + "language_loss": 0.76038134, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.78145468, + "num_input_tokens_seen": 347832125, + "step": 16117, + "time_per_iteration": 2.55845046043396 + }, + { + "auxiliary_loss_clip": 0.01109691, + "auxiliary_loss_mlp": 0.01030095, + "balance_loss_clip": 1.03618371, + "balance_loss_mlp": 1.01764917, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 1.9895014551999843, + "language_loss": 0.77745807, + "learning_rate": 1.000997769426548e-08, + "loss": 0.7988559, + "num_input_tokens_seen": 347850765, + "step": 16118, + "time_per_iteration": 2.4529154300689697 + }, + { + "auxiliary_loss_clip": 0.01086452, + "auxiliary_loss_mlp": 0.00779381, + "balance_loss_clip": 1.03351402, + "balance_loss_mlp": 1.00069785, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 1.689341606956414, + "language_loss": 0.78136635, + "learning_rate": 9.971098618001272e-09, + "loss": 0.80002475, + "num_input_tokens_seen": 347870125, + "step": 16119, + "time_per_iteration": 2.5252201557159424 + }, + { + "auxiliary_loss_clip": 0.01059737, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.03199005, + "balance_loss_mlp": 1.02198815, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 1.4386600038701878, + "language_loss": 0.75771546, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77865487, + "num_input_tokens_seen": 347890615, + "step": 16120, + "time_per_iteration": 4.029642581939697 + }, + { + "auxiliary_loss_clip": 0.01098941, + "auxiliary_loss_mlp": 0.01031702, + "balance_loss_clip": 1.03650451, + "balance_loss_mlp": 1.02002525, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 1.7670050803311157, + "language_loss": 0.69647741, + "learning_rate": 9.89356685323095e-09, + "loss": 0.71778375, + "num_input_tokens_seen": 347908685, + "step": 16121, + "time_per_iteration": 2.500778913497925 + }, + { + "auxiliary_loss_clip": 0.01094851, + "auxiliary_loss_mlp": 0.01032583, + "balance_loss_clip": 1.03467226, + "balance_loss_mlp": 1.02035785, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 3.325820981572421, + "language_loss": 0.68960828, + "learning_rate": 9.854914167664486e-09, + "loss": 0.71088254, + "num_input_tokens_seen": 347926385, + "step": 16122, + "time_per_iteration": 2.559483528137207 + }, + { + "auxiliary_loss_clip": 0.01067361, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.02999318, + "balance_loss_mlp": 1.0194509, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 1.7674158754182046, + "language_loss": 0.75833023, + "learning_rate": 9.81633694859907e-09, + "loss": 0.77931941, + "num_input_tokens_seen": 347945290, + "step": 16123, + "time_per_iteration": 2.5282251834869385 + }, + { + "auxiliary_loss_clip": 0.01076942, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.03720057, + "balance_loss_mlp": 1.02442312, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 1.7876664895867171, + "language_loss": 0.74732822, + "learning_rate": 9.777835197497753e-09, + "loss": 0.7684828, + "num_input_tokens_seen": 347966330, + "step": 16124, + "time_per_iteration": 2.598200559616089 + }, + { + "auxiliary_loss_clip": 0.0109768, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.03514254, + "balance_loss_mlp": 1.02532792, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 5.068473651612268, + "language_loss": 0.73868597, + "learning_rate": 9.739408915820258e-09, + "loss": 0.76003313, + "num_input_tokens_seen": 347982590, + "step": 16125, + "time_per_iteration": 2.4971091747283936 + }, + { + "auxiliary_loss_clip": 0.01018827, + "auxiliary_loss_mlp": 0.01003139, + "balance_loss_clip": 1.00523877, + "balance_loss_mlp": 1.00200009, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.8826876023184493, + "language_loss": 0.61429548, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63451517, + "num_input_tokens_seen": 348043310, + "step": 16126, + "time_per_iteration": 3.012446165084839 + }, + { + "auxiliary_loss_clip": 0.01096025, + "auxiliary_loss_mlp": 0.01038621, + "balance_loss_clip": 1.03695738, + "balance_loss_mlp": 1.02672374, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 1.7998314588604105, + "language_loss": 0.74813092, + "learning_rate": 9.662782766562738e-09, + "loss": 0.76947743, + "num_input_tokens_seen": 348062200, + "step": 16127, + "time_per_iteration": 2.4762439727783203 + }, + { + "auxiliary_loss_clip": 0.01063287, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.03206706, + "balance_loss_mlp": 1.01971591, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 1.659392901983182, + "language_loss": 0.69368529, + "learning_rate": 9.62458290188839e-09, + "loss": 0.71464533, + "num_input_tokens_seen": 348080685, + "step": 16128, + "time_per_iteration": 2.561795473098755 + }, + { + "auxiliary_loss_clip": 0.01080556, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.04066312, + "balance_loss_mlp": 1.01954412, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 1.551615990237612, + "language_loss": 0.65369517, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67481911, + "num_input_tokens_seen": 348102500, + "step": 16129, + "time_per_iteration": 2.66654896736145 + }, + { + "auxiliary_loss_clip": 0.01075539, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.03571582, + "balance_loss_mlp": 1.01752484, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 1.9105254403428034, + "language_loss": 0.63324988, + "learning_rate": 9.548409599691166e-09, + "loss": 0.65430999, + "num_input_tokens_seen": 348122515, + "step": 16130, + "time_per_iteration": 2.5701215267181396 + }, + { + "auxiliary_loss_clip": 0.01100971, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.03554821, + "balance_loss_mlp": 1.01990044, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 2.04080940688354, + "language_loss": 0.69694674, + "learning_rate": 9.510436165056867e-09, + "loss": 0.71828085, + "num_input_tokens_seen": 348138775, + "step": 16131, + "time_per_iteration": 3.908241033554077 + }, + { + "auxiliary_loss_clip": 0.0111043, + "auxiliary_loss_mlp": 0.00777971, + "balance_loss_clip": 1.03658664, + "balance_loss_mlp": 1.00060892, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 2.202822213031229, + "language_loss": 0.76477063, + "learning_rate": 9.472538209986058e-09, + "loss": 0.78365469, + "num_input_tokens_seen": 348157115, + "step": 16132, + "time_per_iteration": 2.474895477294922 + }, + { + "auxiliary_loss_clip": 0.0107613, + "auxiliary_loss_mlp": 0.01037597, + "balance_loss_clip": 1.03537822, + "balance_loss_mlp": 1.02460968, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 5.265082585165302, + "language_loss": 0.78594482, + "learning_rate": 9.434715735916477e-09, + "loss": 0.80708212, + "num_input_tokens_seen": 348173035, + "step": 16133, + "time_per_iteration": 2.547987699508667 + }, + { + "auxiliary_loss_clip": 0.01078048, + "auxiliary_loss_mlp": 0.01028526, + "balance_loss_clip": 1.03490758, + "balance_loss_mlp": 1.01701617, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 1.7047833747507413, + "language_loss": 0.64754838, + "learning_rate": 9.396968744281863e-09, + "loss": 0.66861415, + "num_input_tokens_seen": 348192960, + "step": 16134, + "time_per_iteration": 3.9875564575195312 + }, + { + "auxiliary_loss_clip": 0.01084226, + "auxiliary_loss_mlp": 0.01030396, + "balance_loss_clip": 1.03238046, + "balance_loss_mlp": 1.01798654, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 1.9716669742340018, + "language_loss": 0.80381227, + "learning_rate": 9.359297236513519e-09, + "loss": 0.82495844, + "num_input_tokens_seen": 348212805, + "step": 16135, + "time_per_iteration": 2.52359938621521 + }, + { + "auxiliary_loss_clip": 0.01099405, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.03580058, + "balance_loss_mlp": 1.02369213, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 2.865116339617387, + "language_loss": 0.73269182, + "learning_rate": 9.321701214040079e-09, + "loss": 0.75405329, + "num_input_tokens_seen": 348232900, + "step": 16136, + "time_per_iteration": 2.520098924636841 + }, + { + "auxiliary_loss_clip": 0.01107834, + "auxiliary_loss_mlp": 0.01034062, + "balance_loss_clip": 1.03711903, + "balance_loss_mlp": 1.02317214, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 1.696165934566527, + "language_loss": 0.76307166, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78449059, + "num_input_tokens_seen": 348253065, + "step": 16137, + "time_per_iteration": 2.4456217288970947 + }, + { + "auxiliary_loss_clip": 0.00999827, + "auxiliary_loss_mlp": 0.01002945, + "balance_loss_clip": 1.01895571, + "balance_loss_mlp": 1.00178266, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 0.7709662914454477, + "language_loss": 0.54854584, + "learning_rate": 9.246735630678015e-09, + "loss": 0.56857353, + "num_input_tokens_seen": 348316075, + "step": 16138, + "time_per_iteration": 3.2446670532226562 + }, + { + "auxiliary_loss_clip": 0.01088423, + "auxiliary_loss_mlp": 0.01030222, + "balance_loss_clip": 1.03562331, + "balance_loss_mlp": 1.01846814, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 5.345534995969902, + "language_loss": 0.70591742, + "learning_rate": 9.209366072632007e-09, + "loss": 0.72710389, + "num_input_tokens_seen": 348337605, + "step": 16139, + "time_per_iteration": 2.621910333633423 + }, + { + "auxiliary_loss_clip": 0.0109982, + "auxiliary_loss_mlp": 0.01031732, + "balance_loss_clip": 1.03697157, + "balance_loss_mlp": 1.01913738, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 1.5319569142179053, + "language_loss": 0.7254585, + "learning_rate": 9.172072005566134e-09, + "loss": 0.74677396, + "num_input_tokens_seen": 348359430, + "step": 16140, + "time_per_iteration": 2.5014753341674805 + }, + { + "auxiliary_loss_clip": 0.01102028, + "auxiliary_loss_mlp": 0.0077864, + "balance_loss_clip": 1.03700292, + "balance_loss_mlp": 1.00062346, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 2.199251817214973, + "language_loss": 0.68625045, + "learning_rate": 9.13485343089504e-09, + "loss": 0.70505714, + "num_input_tokens_seen": 348377890, + "step": 16141, + "time_per_iteration": 2.438697338104248 + }, + { + "auxiliary_loss_clip": 0.01093568, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.03452539, + "balance_loss_mlp": 1.02038813, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 2.0070997289081904, + "language_loss": 0.6873796, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70863461, + "num_input_tokens_seen": 348396550, + "step": 16142, + "time_per_iteration": 2.498690605163574 + }, + { + "auxiliary_loss_clip": 0.01054997, + "auxiliary_loss_mlp": 0.01033825, + "balance_loss_clip": 1.03322709, + "balance_loss_mlp": 1.02098036, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 1.7153384005441488, + "language_loss": 0.55785114, + "learning_rate": 9.060642764378457e-09, + "loss": 0.5787394, + "num_input_tokens_seen": 348417120, + "step": 16143, + "time_per_iteration": 2.6296277046203613 + }, + { + "auxiliary_loss_clip": 0.01100061, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.03738797, + "balance_loss_mlp": 1.01924896, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 2.2462980050196197, + "language_loss": 0.67889667, + "learning_rate": 9.023650675347382e-09, + "loss": 0.70020306, + "num_input_tokens_seen": 348437750, + "step": 16144, + "time_per_iteration": 2.527630090713501 + }, + { + "auxiliary_loss_clip": 0.01097457, + "auxiliary_loss_mlp": 0.01042175, + "balance_loss_clip": 1.03815329, + "balance_loss_mlp": 1.03060591, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 1.7791607814666444, + "language_loss": 0.72156292, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74295926, + "num_input_tokens_seen": 348460935, + "step": 16145, + "time_per_iteration": 2.6142735481262207 + }, + { + "auxiliary_loss_clip": 0.0108778, + "auxiliary_loss_mlp": 0.01027807, + "balance_loss_clip": 1.03614795, + "balance_loss_mlp": 1.01459873, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 3.2171735644912345, + "language_loss": 0.80213767, + "learning_rate": 8.949892992753395e-09, + "loss": 0.82329357, + "num_input_tokens_seen": 348474480, + "step": 16146, + "time_per_iteration": 3.954996109008789 + }, + { + "auxiliary_loss_clip": 0.01000785, + "auxiliary_loss_mlp": 0.0100079, + "balance_loss_clip": 1.01034105, + "balance_loss_mlp": 0.99962157, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 0.7635078167450825, + "language_loss": 0.54538953, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56540525, + "num_input_tokens_seen": 348541220, + "step": 16147, + "time_per_iteration": 3.238471746444702 + }, + { + "auxiliary_loss_clip": 0.01076688, + "auxiliary_loss_mlp": 0.0078069, + "balance_loss_clip": 1.03364539, + "balance_loss_mlp": 1.00059557, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 7.439743691797025, + "language_loss": 0.61099946, + "learning_rate": 8.876437313434682e-09, + "loss": 0.62957323, + "num_input_tokens_seen": 348559230, + "step": 16148, + "time_per_iteration": 2.591456413269043 + }, + { + "auxiliary_loss_clip": 0.01076173, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.04008079, + "balance_loss_mlp": 1.02892041, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 1.6724613392004037, + "language_loss": 0.73389369, + "learning_rate": 8.839822728487155e-09, + "loss": 0.75506103, + "num_input_tokens_seen": 348577850, + "step": 16149, + "time_per_iteration": 2.551384925842285 + }, + { + "auxiliary_loss_clip": 0.01097393, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.03473806, + "balance_loss_mlp": 1.02608728, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 2.5220849278353086, + "language_loss": 0.74995184, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77130699, + "num_input_tokens_seen": 348598345, + "step": 16150, + "time_per_iteration": 2.6396405696868896 + }, + { + "auxiliary_loss_clip": 0.01092674, + "auxiliary_loss_mlp": 0.01029538, + "balance_loss_clip": 1.03786314, + "balance_loss_mlp": 1.01500654, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 2.4940535679964846, + "language_loss": 0.73732203, + "learning_rate": 8.766820074958214e-09, + "loss": 0.75854409, + "num_input_tokens_seen": 348616300, + "step": 16151, + "time_per_iteration": 2.489719867706299 + }, + { + "auxiliary_loss_clip": 0.01096317, + "auxiliary_loss_mlp": 0.01028987, + "balance_loss_clip": 1.03612542, + "balance_loss_mlp": 1.01708353, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 1.8170635908722752, + "language_loss": 0.74695575, + "learning_rate": 8.730432009145027e-09, + "loss": 0.76820874, + "num_input_tokens_seen": 348633845, + "step": 16152, + "time_per_iteration": 2.476977825164795 + }, + { + "auxiliary_loss_clip": 0.01076803, + "auxiliary_loss_mlp": 0.01033458, + "balance_loss_clip": 1.04080594, + "balance_loss_mlp": 1.02144194, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 1.8505487728107035, + "language_loss": 0.67316931, + "learning_rate": 8.694119452473448e-09, + "loss": 0.69427192, + "num_input_tokens_seen": 348653070, + "step": 16153, + "time_per_iteration": 2.57519268989563 + }, + { + "auxiliary_loss_clip": 0.01050793, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.03357327, + "balance_loss_mlp": 1.01798534, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 1.711466998017948, + "language_loss": 0.70815825, + "learning_rate": 8.65788240632037e-09, + "loss": 0.72895443, + "num_input_tokens_seen": 348672145, + "step": 16154, + "time_per_iteration": 2.7256975173950195 + }, + { + "auxiliary_loss_clip": 0.01061223, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.03868008, + "balance_loss_mlp": 1.01788187, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 1.8507312312859927, + "language_loss": 0.81061125, + "learning_rate": 8.621720872059812e-09, + "loss": 0.83153749, + "num_input_tokens_seen": 348690615, + "step": 16155, + "time_per_iteration": 2.6570582389831543 + }, + { + "auxiliary_loss_clip": 0.01102263, + "auxiliary_loss_mlp": 0.00779733, + "balance_loss_clip": 1.04081416, + "balance_loss_mlp": 1.00065649, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 3.057654170081025, + "language_loss": 0.67661119, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69543117, + "num_input_tokens_seen": 348708665, + "step": 16156, + "time_per_iteration": 2.484386444091797 + }, + { + "auxiliary_loss_clip": 0.01098162, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.03453231, + "balance_loss_mlp": 1.02274895, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 2.49560360709723, + "language_loss": 0.90828192, + "learning_rate": 8.54962434469919e-09, + "loss": 0.92961186, + "num_input_tokens_seen": 348726105, + "step": 16157, + "time_per_iteration": 2.5032541751861572 + }, + { + "auxiliary_loss_clip": 0.01070973, + "auxiliary_loss_mlp": 0.00777378, + "balance_loss_clip": 1.03474581, + "balance_loss_mlp": 1.00065935, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 2.1394684553347694, + "language_loss": 0.72918081, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74766433, + "num_input_tokens_seen": 348743360, + "step": 16158, + "time_per_iteration": 2.5178678035736084 + }, + { + "auxiliary_loss_clip": 0.01059438, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.03333855, + "balance_loss_mlp": 1.02382815, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 2.231695354231557, + "language_loss": 0.60134542, + "learning_rate": 8.477829881326836e-09, + "loss": 0.62230057, + "num_input_tokens_seen": 348759045, + "step": 16159, + "time_per_iteration": 4.0111305713653564 + }, + { + "auxiliary_loss_clip": 0.01102516, + "auxiliary_loss_mlp": 0.01026703, + "balance_loss_clip": 1.03472793, + "balance_loss_mlp": 1.01546788, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 1.727581615675749, + "language_loss": 0.78622097, + "learning_rate": 8.44204592704112e-09, + "loss": 0.80751312, + "num_input_tokens_seen": 348779910, + "step": 16160, + "time_per_iteration": 2.5091309547424316 + }, + { + "auxiliary_loss_clip": 0.01026334, + "auxiliary_loss_mlp": 0.00999041, + "balance_loss_clip": 1.00303781, + "balance_loss_mlp": 0.99784929, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7704601550390514, + "language_loss": 0.54310751, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56336123, + "num_input_tokens_seen": 348838995, + "step": 16161, + "time_per_iteration": 3.0174899101257324 + }, + { + "auxiliary_loss_clip": 0.01094146, + "auxiliary_loss_mlp": 0.00777183, + "balance_loss_clip": 1.03580546, + "balance_loss_mlp": 1.00063515, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 1.8139902536907762, + "language_loss": 0.72155809, + "learning_rate": 8.3707045800554e-09, + "loss": 0.74027145, + "num_input_tokens_seen": 348858090, + "step": 16162, + "time_per_iteration": 2.460686206817627 + }, + { + "auxiliary_loss_clip": 0.01072812, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.03232992, + "balance_loss_mlp": 1.01800942, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 1.6052541193497745, + "language_loss": 0.78439057, + "learning_rate": 8.335147190060787e-09, + "loss": 0.80542207, + "num_input_tokens_seen": 348877885, + "step": 16163, + "time_per_iteration": 2.5568125247955322 + }, + { + "auxiliary_loss_clip": 0.0108637, + "auxiliary_loss_mlp": 0.01027886, + "balance_loss_clip": 1.0394094, + "balance_loss_mlp": 1.01610792, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 1.8107159025496242, + "language_loss": 0.73050225, + "learning_rate": 8.299665324196903e-09, + "loss": 0.75164479, + "num_input_tokens_seen": 348897720, + "step": 16164, + "time_per_iteration": 2.5289981365203857 + }, + { + "auxiliary_loss_clip": 0.01049657, + "auxiliary_loss_mlp": 0.01040688, + "balance_loss_clip": 1.03224385, + "balance_loss_mlp": 1.02611446, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 2.0150857732507803, + "language_loss": 0.84053135, + "learning_rate": 8.264258983809114e-09, + "loss": 0.86143482, + "num_input_tokens_seen": 348915410, + "step": 16165, + "time_per_iteration": 2.5964057445526123 + }, + { + "auxiliary_loss_clip": 0.01072487, + "auxiliary_loss_mlp": 0.01027238, + "balance_loss_clip": 1.03403163, + "balance_loss_mlp": 1.01670587, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 1.4855230295791455, + "language_loss": 0.78923762, + "learning_rate": 8.228928170240345e-09, + "loss": 0.8102349, + "num_input_tokens_seen": 348934335, + "step": 16166, + "time_per_iteration": 2.5926666259765625 + }, + { + "auxiliary_loss_clip": 0.01082586, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.03724885, + "balance_loss_mlp": 1.01898885, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 1.914438339507373, + "language_loss": 0.70787442, + "learning_rate": 8.193672884830195e-09, + "loss": 0.72900522, + "num_input_tokens_seen": 348952405, + "step": 16167, + "time_per_iteration": 2.483912467956543 + }, + { + "auxiliary_loss_clip": 0.01081156, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.03928459, + "balance_loss_mlp": 1.02453041, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 1.4374914308163185, + "language_loss": 0.7563256, + "learning_rate": 8.158493128915812e-09, + "loss": 0.77749902, + "num_input_tokens_seen": 348973580, + "step": 16168, + "time_per_iteration": 2.557131052017212 + }, + { + "auxiliary_loss_clip": 0.01057905, + "auxiliary_loss_mlp": 0.01051537, + "balance_loss_clip": 1.03622949, + "balance_loss_mlp": 1.03643346, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 2.6425996171344344, + "language_loss": 0.7273975, + "learning_rate": 8.123388903830797e-09, + "loss": 0.74849188, + "num_input_tokens_seen": 348992035, + "step": 16169, + "time_per_iteration": 2.585688352584839 + }, + { + "auxiliary_loss_clip": 0.01075452, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.03215528, + "balance_loss_mlp": 1.01990223, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 2.491593337502028, + "language_loss": 0.57544732, + "learning_rate": 8.088360210906309e-09, + "loss": 0.59653926, + "num_input_tokens_seen": 349013160, + "step": 16170, + "time_per_iteration": 4.170869827270508 + }, + { + "auxiliary_loss_clip": 0.01075586, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.03345871, + "balance_loss_mlp": 1.01719487, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 1.927404748470966, + "language_loss": 0.71521026, + "learning_rate": 8.053407051471062e-09, + "loss": 0.73626649, + "num_input_tokens_seen": 349033485, + "step": 16171, + "time_per_iteration": 2.5734171867370605 + }, + { + "auxiliary_loss_clip": 0.01074698, + "auxiliary_loss_mlp": 0.01035662, + "balance_loss_clip": 1.03332925, + "balance_loss_mlp": 1.02330637, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 1.7187524276176356, + "language_loss": 0.68594444, + "learning_rate": 8.018529426850218e-09, + "loss": 0.70704794, + "num_input_tokens_seen": 349051705, + "step": 16172, + "time_per_iteration": 2.5611371994018555 + }, + { + "auxiliary_loss_clip": 0.01093013, + "auxiliary_loss_mlp": 0.0102837, + "balance_loss_clip": 1.03352678, + "balance_loss_mlp": 1.01604998, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 1.6836730708821122, + "language_loss": 0.86184651, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88306034, + "num_input_tokens_seen": 349070825, + "step": 16173, + "time_per_iteration": 3.95324969291687 + }, + { + "auxiliary_loss_clip": 0.01062025, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.0325799, + "balance_loss_mlp": 1.01841521, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 1.7414169615187989, + "language_loss": 0.64305139, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66399884, + "num_input_tokens_seen": 349089730, + "step": 16174, + "time_per_iteration": 2.578667163848877 + }, + { + "auxiliary_loss_clip": 0.01096645, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.03496194, + "balance_loss_mlp": 1.01688588, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 1.5427487363543908, + "language_loss": 0.78033221, + "learning_rate": 7.914349775085538e-09, + "loss": 0.80158037, + "num_input_tokens_seen": 349111315, + "step": 16175, + "time_per_iteration": 2.505740165710449 + }, + { + "auxiliary_loss_clip": 0.01098413, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.03677893, + "balance_loss_mlp": 1.01981044, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 5.547813836491527, + "language_loss": 0.56769526, + "learning_rate": 7.879774302919307e-09, + "loss": 0.58901042, + "num_input_tokens_seen": 349129495, + "step": 16176, + "time_per_iteration": 2.4366610050201416 + }, + { + "auxiliary_loss_clip": 0.01087616, + "auxiliary_loss_mlp": 0.01030163, + "balance_loss_clip": 1.03753328, + "balance_loss_mlp": 1.01924944, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 2.253306559584769, + "language_loss": 0.72562993, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74680775, + "num_input_tokens_seen": 349148850, + "step": 16177, + "time_per_iteration": 2.554177761077881 + }, + { + "auxiliary_loss_clip": 0.01086388, + "auxiliary_loss_mlp": 0.01031035, + "balance_loss_clip": 1.03374159, + "balance_loss_mlp": 1.01904869, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 1.6549552780611945, + "language_loss": 0.68782723, + "learning_rate": 7.810849984090984e-09, + "loss": 0.70900148, + "num_input_tokens_seen": 349167620, + "step": 16178, + "time_per_iteration": 2.5275001525878906 + }, + { + "auxiliary_loss_clip": 0.01059762, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.03305805, + "balance_loss_mlp": 1.01889563, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 1.9938729328012899, + "language_loss": 0.67400956, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69492078, + "num_input_tokens_seen": 349185845, + "step": 16179, + "time_per_iteration": 2.648258924484253 + }, + { + "auxiliary_loss_clip": 0.01085337, + "auxiliary_loss_mlp": 0.00776268, + "balance_loss_clip": 1.03617287, + "balance_loss_mlp": 1.00055063, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 1.717559770424344, + "language_loss": 0.76818889, + "learning_rate": 7.742227841308624e-09, + "loss": 0.78680491, + "num_input_tokens_seen": 349204525, + "step": 16180, + "time_per_iteration": 2.5233840942382812 + }, + { + "auxiliary_loss_clip": 0.01100333, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.03691709, + "balance_loss_mlp": 1.0209949, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 1.9418831798444514, + "language_loss": 0.76613581, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78747213, + "num_input_tokens_seen": 349228075, + "step": 16181, + "time_per_iteration": 2.547711133956909 + }, + { + "auxiliary_loss_clip": 0.01107515, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.0357151, + "balance_loss_mlp": 1.02197528, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 1.4521104638693627, + "language_loss": 0.63405788, + "learning_rate": 7.67390788498079e-09, + "loss": 0.65547574, + "num_input_tokens_seen": 349246990, + "step": 16182, + "time_per_iteration": 2.410273551940918 + }, + { + "auxiliary_loss_clip": 0.01042823, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_clip": 1.04007602, + "balance_loss_mlp": 1.02300274, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 2.0601224095976445, + "language_loss": 0.62175119, + "learning_rate": 7.639861229977507e-09, + "loss": 0.64254236, + "num_input_tokens_seen": 349265890, + "step": 16183, + "time_per_iteration": 2.9128427505493164 + }, + { + "auxiliary_loss_clip": 0.01086607, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.03469396, + "balance_loss_mlp": 1.02088976, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 1.5828036575276805, + "language_loss": 0.7797097, + "learning_rate": 7.605890125470527e-09, + "loss": 0.80091095, + "num_input_tokens_seen": 349285275, + "step": 16184, + "time_per_iteration": 2.9707746505737305 + }, + { + "auxiliary_loss_clip": 0.01069838, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.02992272, + "balance_loss_mlp": 1.01808167, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 2.265859699475177, + "language_loss": 0.79188609, + "learning_rate": 7.571994572747709e-09, + "loss": 0.81289113, + "num_input_tokens_seen": 349301515, + "step": 16185, + "time_per_iteration": 3.9779412746429443 + }, + { + "auxiliary_loss_clip": 0.01078923, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.0356437, + "balance_loss_mlp": 1.01876533, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 1.6828152545668993, + "language_loss": 0.77630186, + "learning_rate": 7.538174573094469e-09, + "loss": 0.79739624, + "num_input_tokens_seen": 349319590, + "step": 16186, + "time_per_iteration": 2.512559413909912 + }, + { + "auxiliary_loss_clip": 0.01083353, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.03486037, + "balance_loss_mlp": 1.01690507, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 1.7491532377199857, + "language_loss": 0.65307283, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67419398, + "num_input_tokens_seen": 349339230, + "step": 16187, + "time_per_iteration": 2.512036085128784 + }, + { + "auxiliary_loss_clip": 0.01081003, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.03137481, + "balance_loss_mlp": 1.02445769, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 1.638331217463719, + "language_loss": 0.80603075, + "learning_rate": 7.47076123812418e-09, + "loss": 0.82721037, + "num_input_tokens_seen": 349361155, + "step": 16188, + "time_per_iteration": 2.6088318824768066 + }, + { + "auxiliary_loss_clip": 0.01074044, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.03529811, + "balance_loss_mlp": 1.01862931, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 2.1247223571745804, + "language_loss": 0.77910119, + "learning_rate": 7.437167905363084e-09, + "loss": 0.8001402, + "num_input_tokens_seen": 349379335, + "step": 16189, + "time_per_iteration": 2.588538408279419 + }, + { + "auxiliary_loss_clip": 0.01092056, + "auxiliary_loss_mlp": 0.01029433, + "balance_loss_clip": 1.03236127, + "balance_loss_mlp": 1.01660621, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 1.803399684240747, + "language_loss": 0.51506722, + "learning_rate": 7.403650130784367e-09, + "loss": 0.53628212, + "num_input_tokens_seen": 349401575, + "step": 16190, + "time_per_iteration": 2.6218771934509277 + }, + { + "auxiliary_loss_clip": 0.01097323, + "auxiliary_loss_mlp": 0.01029774, + "balance_loss_clip": 1.03657019, + "balance_loss_mlp": 1.01791883, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 7.808443570921943, + "language_loss": 0.80833066, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.82960159, + "num_input_tokens_seen": 349420650, + "step": 16191, + "time_per_iteration": 2.470203399658203 + }, + { + "auxiliary_loss_clip": 0.01092574, + "auxiliary_loss_mlp": 0.01029439, + "balance_loss_clip": 1.032619, + "balance_loss_mlp": 1.01755381, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 2.004838363673076, + "language_loss": 0.82795393, + "learning_rate": 7.336841261255111e-09, + "loss": 0.84917408, + "num_input_tokens_seen": 349436830, + "step": 16192, + "time_per_iteration": 2.4419567584991455 + }, + { + "auxiliary_loss_clip": 0.01049374, + "auxiliary_loss_mlp": 0.01034356, + "balance_loss_clip": 1.03615808, + "balance_loss_mlp": 1.02142811, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 1.7797445975499742, + "language_loss": 0.7489686, + "learning_rate": 7.303550168837658e-09, + "loss": 0.76980591, + "num_input_tokens_seen": 349454325, + "step": 16193, + "time_per_iteration": 2.601452589035034 + }, + { + "auxiliary_loss_clip": 0.01081858, + "auxiliary_loss_mlp": 0.01034509, + "balance_loss_clip": 1.03753936, + "balance_loss_mlp": 1.02373278, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 14.091692572390901, + "language_loss": 0.85181975, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87298346, + "num_input_tokens_seen": 349470230, + "step": 16194, + "time_per_iteration": 2.5381593704223633 + }, + { + "auxiliary_loss_clip": 0.01070808, + "auxiliary_loss_mlp": 0.0103199, + "balance_loss_clip": 1.03372312, + "balance_loss_mlp": 1.01967609, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 1.7641896957328893, + "language_loss": 0.75883311, + "learning_rate": 7.237194675009828e-09, + "loss": 0.77986109, + "num_input_tokens_seen": 349486250, + "step": 16195, + "time_per_iteration": 2.497772455215454 + }, + { + "auxiliary_loss_clip": 0.01007166, + "auxiliary_loss_mlp": 0.01004837, + "balance_loss_clip": 1.01057029, + "balance_loss_mlp": 1.0035193, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.713029027564211, + "language_loss": 0.52457047, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54469049, + "num_input_tokens_seen": 349545865, + "step": 16196, + "time_per_iteration": 3.0650439262390137 + }, + { + "auxiliary_loss_clip": 0.01082125, + "auxiliary_loss_mlp": 0.01030436, + "balance_loss_clip": 1.03574991, + "balance_loss_mlp": 1.01869345, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 1.9061551456959667, + "language_loss": 0.76640952, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78753507, + "num_input_tokens_seen": 349566080, + "step": 16197, + "time_per_iteration": 2.5675833225250244 + }, + { + "auxiliary_loss_clip": 0.01110198, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.03555942, + "balance_loss_mlp": 1.01531363, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 2.1229747816659224, + "language_loss": 0.67839813, + "learning_rate": 7.13822818063492e-09, + "loss": 0.69977713, + "num_input_tokens_seen": 349585665, + "step": 16198, + "time_per_iteration": 3.867083787918091 + }, + { + "auxiliary_loss_clip": 0.01107007, + "auxiliary_loss_mlp": 0.01028295, + "balance_loss_clip": 1.03456497, + "balance_loss_mlp": 1.01557565, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 1.93646797823993, + "language_loss": 0.77835369, + "learning_rate": 7.10539048654768e-09, + "loss": 0.7997067, + "num_input_tokens_seen": 349605125, + "step": 16199, + "time_per_iteration": 2.457148551940918 + }, + { + "auxiliary_loss_clip": 0.01087566, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.03962612, + "balance_loss_mlp": 1.02293551, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 2.3011642385456446, + "language_loss": 0.79558396, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81681013, + "num_input_tokens_seen": 349623360, + "step": 16200, + "time_per_iteration": 2.5250871181488037 + }, + { + "auxiliary_loss_clip": 0.01059161, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.0378269, + "balance_loss_mlp": 1.02145338, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 2.0116524816146457, + "language_loss": 0.68704951, + "learning_rate": 7.039941811905592e-09, + "loss": 0.70798481, + "num_input_tokens_seen": 349644390, + "step": 16201, + "time_per_iteration": 2.632915496826172 + }, + { + "auxiliary_loss_clip": 0.01076776, + "auxiliary_loss_mlp": 0.01028357, + "balance_loss_clip": 1.03352809, + "balance_loss_mlp": 1.01665044, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 1.386543978885837, + "language_loss": 0.72496206, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.7460134, + "num_input_tokens_seen": 349663200, + "step": 16202, + "time_per_iteration": 2.548532724380493 + }, + { + "auxiliary_loss_clip": 0.01084325, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.03435373, + "balance_loss_mlp": 1.01922774, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 2.5520198266543557, + "language_loss": 0.73079687, + "learning_rate": 6.974795430241265e-09, + "loss": 0.7519564, + "num_input_tokens_seen": 349681975, + "step": 16203, + "time_per_iteration": 2.4816157817840576 + }, + { + "auxiliary_loss_clip": 0.01106519, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.0349021, + "balance_loss_mlp": 1.01861846, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 1.7035163927049088, + "language_loss": 0.77665746, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79803091, + "num_input_tokens_seen": 349701185, + "step": 16204, + "time_per_iteration": 2.4610435962677 + }, + { + "auxiliary_loss_clip": 0.01093124, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.03879273, + "balance_loss_mlp": 1.02480388, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 1.8948294723822996, + "language_loss": 0.79363108, + "learning_rate": 6.909951351435905e-09, + "loss": 0.8149448, + "num_input_tokens_seen": 349720360, + "step": 16205, + "time_per_iteration": 2.504411220550537 + }, + { + "auxiliary_loss_clip": 0.01108104, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.03683901, + "balance_loss_mlp": 1.0205487, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 2.376150158640391, + "language_loss": 0.74360168, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76500547, + "num_input_tokens_seen": 349741040, + "step": 16206, + "time_per_iteration": 2.4633519649505615 + }, + { + "auxiliary_loss_clip": 0.01053452, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.03243005, + "balance_loss_mlp": 1.02066493, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 2.0900164575146993, + "language_loss": 0.84003294, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.86090088, + "num_input_tokens_seen": 349758895, + "step": 16207, + "time_per_iteration": 2.5716912746429443 + }, + { + "auxiliary_loss_clip": 0.01096807, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.03643394, + "balance_loss_mlp": 1.02081871, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 1.6299356914165275, + "language_loss": 0.70950365, + "learning_rate": 6.813252072591425e-09, + "loss": 0.73079962, + "num_input_tokens_seen": 349779740, + "step": 16208, + "time_per_iteration": 2.519249200820923 + }, + { + "auxiliary_loss_clip": 0.01068564, + "auxiliary_loss_mlp": 0.01024311, + "balance_loss_clip": 1.03312778, + "balance_loss_mlp": 1.01392245, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 2.0415183740979526, + "language_loss": 0.7674613, + "learning_rate": 6.781170141698878e-09, + "loss": 0.78839004, + "num_input_tokens_seen": 349796820, + "step": 16209, + "time_per_iteration": 2.5238494873046875 + }, + { + "auxiliary_loss_clip": 0.01072911, + "auxiliary_loss_mlp": 0.00782044, + "balance_loss_clip": 1.03242922, + "balance_loss_mlp": 1.00056994, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 1.728767821461761, + "language_loss": 0.78528714, + "learning_rate": 6.749163793864144e-09, + "loss": 0.8038367, + "num_input_tokens_seen": 349816550, + "step": 16210, + "time_per_iteration": 4.105688810348511 + }, + { + "auxiliary_loss_clip": 0.0108568, + "auxiliary_loss_mlp": 0.01037609, + "balance_loss_clip": 1.0345999, + "balance_loss_mlp": 1.02513373, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 4.371609949226664, + "language_loss": 0.78440243, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80563527, + "num_input_tokens_seen": 349834350, + "step": 16211, + "time_per_iteration": 2.5676465034484863 + }, + { + "auxiliary_loss_clip": 0.01080812, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.03978086, + "balance_loss_mlp": 1.02022827, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 2.2260292979497414, + "language_loss": 0.78281415, + "learning_rate": 6.685377852219787e-09, + "loss": 0.8039574, + "num_input_tokens_seen": 349853460, + "step": 16212, + "time_per_iteration": 2.567105293273926 + }, + { + "auxiliary_loss_clip": 0.0108103, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.03847969, + "balance_loss_mlp": 1.02185369, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 1.4615020862544186, + "language_loss": 0.80157399, + "learning_rate": 6.653598260829118e-09, + "loss": 0.8227222, + "num_input_tokens_seen": 349874830, + "step": 16213, + "time_per_iteration": 3.978624105453491 + }, + { + "auxiliary_loss_clip": 0.01062997, + "auxiliary_loss_mlp": 0.01026299, + "balance_loss_clip": 1.03266048, + "balance_loss_mlp": 1.01495004, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 2.126840137995832, + "language_loss": 0.66153926, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68243217, + "num_input_tokens_seen": 349893690, + "step": 16214, + "time_per_iteration": 2.5949864387512207 + }, + { + "auxiliary_loss_clip": 0.01093689, + "auxiliary_loss_mlp": 0.01030659, + "balance_loss_clip": 1.03933048, + "balance_loss_mlp": 1.01836228, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 1.7575376181522533, + "language_loss": 0.74260604, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76384956, + "num_input_tokens_seen": 349912480, + "step": 16215, + "time_per_iteration": 2.5188496112823486 + }, + { + "auxiliary_loss_clip": 0.01060728, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.03645182, + "balance_loss_mlp": 1.02269101, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 1.7553783548446142, + "language_loss": 0.66898203, + "learning_rate": 6.558713018834483e-09, + "loss": 0.68993634, + "num_input_tokens_seen": 349932470, + "step": 16216, + "time_per_iteration": 2.7072463035583496 + }, + { + "auxiliary_loss_clip": 0.01055792, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.03113508, + "balance_loss_mlp": 1.01747942, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 1.862736890352414, + "language_loss": 0.72189099, + "learning_rate": 6.527235786226937e-09, + "loss": 0.7427476, + "num_input_tokens_seen": 349949060, + "step": 16217, + "time_per_iteration": 2.591115951538086 + }, + { + "auxiliary_loss_clip": 0.01074691, + "auxiliary_loss_mlp": 0.01029537, + "balance_loss_clip": 1.03721941, + "balance_loss_mlp": 1.01762152, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 1.6309499092516795, + "language_loss": 0.78215665, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80319893, + "num_input_tokens_seen": 349968010, + "step": 16218, + "time_per_iteration": 2.602875232696533 + }, + { + "auxiliary_loss_clip": 0.0108089, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.03608584, + "balance_loss_mlp": 1.01719999, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 2.2997578404593715, + "language_loss": 0.77303618, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79414129, + "num_input_tokens_seen": 349985270, + "step": 16219, + "time_per_iteration": 2.503016471862793 + }, + { + "auxiliary_loss_clip": 0.0108842, + "auxiliary_loss_mlp": 0.01032286, + "balance_loss_clip": 1.03568828, + "balance_loss_mlp": 1.02055025, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 2.1985639869692926, + "language_loss": 0.80819005, + "learning_rate": 6.433257649285817e-09, + "loss": 0.82939708, + "num_input_tokens_seen": 350003935, + "step": 16220, + "time_per_iteration": 2.5302491188049316 + }, + { + "auxiliary_loss_clip": 0.01105788, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.03522027, + "balance_loss_mlp": 1.01961815, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 1.8532148542872366, + "language_loss": 0.7502743, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77164406, + "num_input_tokens_seen": 350023595, + "step": 16221, + "time_per_iteration": 2.41676664352417 + }, + { + "auxiliary_loss_clip": 0.01072146, + "auxiliary_loss_mlp": 0.01026639, + "balance_loss_clip": 1.03330493, + "balance_loss_mlp": 1.01521826, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 1.588437225980897, + "language_loss": 0.66386133, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68484914, + "num_input_tokens_seen": 350045920, + "step": 16222, + "time_per_iteration": 2.6029207706451416 + }, + { + "auxiliary_loss_clip": 0.01095818, + "auxiliary_loss_mlp": 0.01033683, + "balance_loss_clip": 1.03532016, + "balance_loss_mlp": 1.02235866, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 1.741303875359237, + "language_loss": 0.88338852, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.90468359, + "num_input_tokens_seen": 350063925, + "step": 16223, + "time_per_iteration": 2.498760938644409 + }, + { + "auxiliary_loss_clip": 0.01047352, + "auxiliary_loss_mlp": 0.01035321, + "balance_loss_clip": 1.03316712, + "balance_loss_mlp": 1.023597, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 1.8228227586808623, + "language_loss": 0.74690819, + "learning_rate": 6.309011819690457e-09, + "loss": 0.76773489, + "num_input_tokens_seen": 350080900, + "step": 16224, + "time_per_iteration": 4.079948425292969 + }, + { + "auxiliary_loss_clip": 0.0101087, + "auxiliary_loss_mlp": 0.01002595, + "balance_loss_clip": 1.00668013, + "balance_loss_mlp": 1.00135481, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.8077906761813416, + "language_loss": 0.59105563, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61119032, + "num_input_tokens_seen": 350144550, + "step": 16225, + "time_per_iteration": 3.0391385555267334 + }, + { + "auxiliary_loss_clip": 0.01078243, + "auxiliary_loss_mlp": 0.00777505, + "balance_loss_clip": 1.03833592, + "balance_loss_mlp": 1.00062561, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 1.847636834142266, + "language_loss": 0.68979985, + "learning_rate": 6.247342505960818e-09, + "loss": 0.70835733, + "num_input_tokens_seen": 350164050, + "step": 16226, + "time_per_iteration": 2.6067416667938232 + }, + { + "auxiliary_loss_clip": 0.01095071, + "auxiliary_loss_mlp": 0.01039976, + "balance_loss_clip": 1.03418446, + "balance_loss_mlp": 1.02719045, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 1.7440856131510865, + "language_loss": 0.83011878, + "learning_rate": 6.216621253462894e-09, + "loss": 0.85146928, + "num_input_tokens_seen": 350181350, + "step": 16227, + "time_per_iteration": 2.4419729709625244 + }, + { + "auxiliary_loss_clip": 0.01106078, + "auxiliary_loss_mlp": 0.01027113, + "balance_loss_clip": 1.03542733, + "balance_loss_mlp": 1.01573408, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 1.7126402145282065, + "language_loss": 0.77810967, + "learning_rate": 6.185975605430549e-09, + "loss": 0.79944164, + "num_input_tokens_seen": 350199765, + "step": 16228, + "time_per_iteration": 2.4594390392303467 + }, + { + "auxiliary_loss_clip": 0.01018637, + "auxiliary_loss_mlp": 0.00999975, + "balance_loss_clip": 1.00395322, + "balance_loss_mlp": 0.99880725, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.8469591402522958, + "language_loss": 0.55808252, + "learning_rate": 6.155405563025962e-09, + "loss": 0.57826865, + "num_input_tokens_seen": 350256420, + "step": 16229, + "time_per_iteration": 2.9576902389526367 + }, + { + "auxiliary_loss_clip": 0.01099482, + "auxiliary_loss_mlp": 0.01029684, + "balance_loss_clip": 1.03591037, + "balance_loss_mlp": 1.01691699, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 1.7445939230233045, + "language_loss": 0.75095928, + "learning_rate": 6.124911127407984e-09, + "loss": 0.77225101, + "num_input_tokens_seen": 350276270, + "step": 16230, + "time_per_iteration": 2.501345157623291 + }, + { + "auxiliary_loss_clip": 0.01082, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.0361613, + "balance_loss_mlp": 1.02010977, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 1.770769668820177, + "language_loss": 0.72204542, + "learning_rate": 6.094492299733245e-09, + "loss": 0.74317372, + "num_input_tokens_seen": 350295000, + "step": 16231, + "time_per_iteration": 2.4722321033477783 + }, + { + "auxiliary_loss_clip": 0.01089577, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.0371871, + "balance_loss_mlp": 1.01759148, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 4.077854211705255, + "language_loss": 0.76917815, + "learning_rate": 6.064149081155267e-09, + "loss": 0.79037839, + "num_input_tokens_seen": 350314980, + "step": 16232, + "time_per_iteration": 2.5389201641082764 + }, + { + "auxiliary_loss_clip": 0.01017637, + "auxiliary_loss_mlp": 0.01001356, + "balance_loss_clip": 1.01770556, + "balance_loss_mlp": 1.00017595, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.7379216594221529, + "language_loss": 0.53820872, + "learning_rate": 6.033881472824465e-09, + "loss": 0.5583986, + "num_input_tokens_seen": 350371985, + "step": 16233, + "time_per_iteration": 2.9890975952148438 + }, + { + "auxiliary_loss_clip": 0.01107769, + "auxiliary_loss_mlp": 0.01036049, + "balance_loss_clip": 1.03508735, + "balance_loss_mlp": 1.02373457, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 1.7370487234389997, + "language_loss": 0.71457922, + "learning_rate": 6.003689475888807e-09, + "loss": 0.73601735, + "num_input_tokens_seen": 350390590, + "step": 16234, + "time_per_iteration": 2.434983968734741 + }, + { + "auxiliary_loss_clip": 0.01100659, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.03647673, + "balance_loss_mlp": 1.01659405, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 2.3660395962654794, + "language_loss": 0.79149902, + "learning_rate": 5.973573091493156e-09, + "loss": 0.81280136, + "num_input_tokens_seen": 350403770, + "step": 16235, + "time_per_iteration": 2.4153366088867188 + }, + { + "auxiliary_loss_clip": 0.01091629, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.03611243, + "balance_loss_mlp": 1.02287912, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 1.8917397913695273, + "language_loss": 0.76976252, + "learning_rate": 5.943532320779265e-09, + "loss": 0.79104304, + "num_input_tokens_seen": 350421870, + "step": 16236, + "time_per_iteration": 2.4720375537872314 + }, + { + "auxiliary_loss_clip": 0.01095299, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.03375852, + "balance_loss_mlp": 1.01598382, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 1.6714281276723222, + "language_loss": 0.7568866, + "learning_rate": 5.913567164886446e-09, + "loss": 0.77811813, + "num_input_tokens_seen": 350440025, + "step": 16237, + "time_per_iteration": 2.4413039684295654 + }, + { + "auxiliary_loss_clip": 0.01063951, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.03230631, + "balance_loss_mlp": 1.02061462, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 1.690505809685054, + "language_loss": 0.72799873, + "learning_rate": 5.8836776249509e-09, + "loss": 0.7489835, + "num_input_tokens_seen": 350459435, + "step": 16238, + "time_per_iteration": 4.049142837524414 + }, + { + "auxiliary_loss_clip": 0.01086659, + "auxiliary_loss_mlp": 0.00777828, + "balance_loss_clip": 1.03524804, + "balance_loss_mlp": 1.00053525, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 2.304354299062998, + "language_loss": 0.83964837, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.85829324, + "num_input_tokens_seen": 350472655, + "step": 16239, + "time_per_iteration": 2.4920105934143066 + }, + { + "auxiliary_loss_clip": 0.01069023, + "auxiliary_loss_mlp": 0.01044047, + "balance_loss_clip": 1.03170419, + "balance_loss_mlp": 1.02923477, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 2.9009852923570856, + "language_loss": 0.59653646, + "learning_rate": 5.824125397483115e-09, + "loss": 0.6176672, + "num_input_tokens_seen": 350488160, + "step": 16240, + "time_per_iteration": 2.5033295154571533 + }, + { + "auxiliary_loss_clip": 0.01071494, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.03539705, + "balance_loss_mlp": 1.01903391, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 2.1969685440365034, + "language_loss": 0.82483625, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.84586108, + "num_input_tokens_seen": 350506065, + "step": 16241, + "time_per_iteration": 2.504380226135254 + }, + { + "auxiliary_loss_clip": 0.01071572, + "auxiliary_loss_mlp": 0.0103823, + "balance_loss_clip": 1.0343709, + "balance_loss_mlp": 1.02664256, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 1.731367055802584, + "language_loss": 0.83378339, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85488141, + "num_input_tokens_seen": 350524495, + "step": 16242, + "time_per_iteration": 2.552677631378174 + }, + { + "auxiliary_loss_clip": 0.01099413, + "auxiliary_loss_mlp": 0.01030064, + "balance_loss_clip": 1.03834808, + "balance_loss_mlp": 1.01799417, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 1.661651017881992, + "language_loss": 0.75575149, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77704626, + "num_input_tokens_seen": 350544185, + "step": 16243, + "time_per_iteration": 2.466900110244751 + }, + { + "auxiliary_loss_clip": 0.01097232, + "auxiliary_loss_mlp": 0.01038066, + "balance_loss_clip": 1.03416598, + "balance_loss_mlp": 1.02485716, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 1.6850340583458017, + "language_loss": 0.69739705, + "learning_rate": 5.705928383713754e-09, + "loss": 0.71875, + "num_input_tokens_seen": 350562675, + "step": 16244, + "time_per_iteration": 2.4564852714538574 + }, + { + "auxiliary_loss_clip": 0.01089365, + "auxiliary_loss_mlp": 0.01025776, + "balance_loss_clip": 1.03766692, + "balance_loss_mlp": 1.01359296, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 1.7935895050671449, + "language_loss": 0.83783603, + "learning_rate": 5.676568187055197e-09, + "loss": 0.85898739, + "num_input_tokens_seen": 350581535, + "step": 16245, + "time_per_iteration": 2.522792100906372 + }, + { + "auxiliary_loss_clip": 0.0105657, + "auxiliary_loss_mlp": 0.01028649, + "balance_loss_clip": 1.03338099, + "balance_loss_mlp": 1.01705027, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 1.331796245886297, + "language_loss": 0.78433144, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80518365, + "num_input_tokens_seen": 350601615, + "step": 16246, + "time_per_iteration": 2.6511130332946777 + }, + { + "auxiliary_loss_clip": 0.01100396, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.03477669, + "balance_loss_mlp": 1.01992774, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 1.9431238130242936, + "language_loss": 0.7425369, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.76384598, + "num_input_tokens_seen": 350619580, + "step": 16247, + "time_per_iteration": 2.4144561290740967 + }, + { + "auxiliary_loss_clip": 0.01057327, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.03430629, + "balance_loss_mlp": 1.01842082, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 1.8339981387862814, + "language_loss": 0.80126077, + "learning_rate": 5.58894135118404e-09, + "loss": 0.82214665, + "num_input_tokens_seen": 350640015, + "step": 16248, + "time_per_iteration": 2.675762414932251 + }, + { + "auxiliary_loss_clip": 0.01051826, + "auxiliary_loss_mlp": 0.01044885, + "balance_loss_clip": 1.03669834, + "balance_loss_mlp": 1.03046036, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 1.7138223330181033, + "language_loss": 0.79169899, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81266606, + "num_input_tokens_seen": 350659155, + "step": 16249, + "time_per_iteration": 4.120872735977173 + }, + { + "auxiliary_loss_clip": 0.01093882, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.03598583, + "balance_loss_mlp": 1.02013838, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 2.912917572648867, + "language_loss": 0.66608512, + "learning_rate": 5.530901600093507e-09, + "loss": 0.68734783, + "num_input_tokens_seen": 350676615, + "step": 16250, + "time_per_iteration": 2.445617198944092 + }, + { + "auxiliary_loss_clip": 0.01026875, + "auxiliary_loss_mlp": 0.01002506, + "balance_loss_clip": 1.00359535, + "balance_loss_mlp": 1.00139737, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.7771616407496864, + "language_loss": 0.59904647, + "learning_rate": 5.501995169700846e-09, + "loss": 0.6193403, + "num_input_tokens_seen": 350736805, + "step": 16251, + "time_per_iteration": 3.056509256362915 + }, + { + "auxiliary_loss_clip": 0.01094056, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.03354895, + "balance_loss_mlp": 1.01797295, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 1.6828646742937676, + "language_loss": 0.78531957, + "learning_rate": 5.473164370872307e-09, + "loss": 0.8065666, + "num_input_tokens_seen": 350753600, + "step": 16252, + "time_per_iteration": 2.5736873149871826 + }, + { + "auxiliary_loss_clip": 0.01090578, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.0330584, + "balance_loss_mlp": 1.02359545, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 4.164652942763839, + "language_loss": 0.64821392, + "learning_rate": 5.444409204701461e-09, + "loss": 0.6694808, + "num_input_tokens_seen": 350771225, + "step": 16253, + "time_per_iteration": 3.849644184112549 + }, + { + "auxiliary_loss_clip": 0.01101724, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.03788614, + "balance_loss_mlp": 1.02053678, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 2.2871919801324903, + "language_loss": 0.76464939, + "learning_rate": 5.415729672278324e-09, + "loss": 0.78601289, + "num_input_tokens_seen": 350789100, + "step": 16254, + "time_per_iteration": 2.454040050506592 + }, + { + "auxiliary_loss_clip": 0.01102211, + "auxiliary_loss_mlp": 0.0102994, + "balance_loss_clip": 1.03674304, + "balance_loss_mlp": 1.01756632, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 1.9548040355119924, + "language_loss": 0.6385181, + "learning_rate": 5.387125774690471e-09, + "loss": 0.65983957, + "num_input_tokens_seen": 350811085, + "step": 16255, + "time_per_iteration": 2.59696102142334 + }, + { + "auxiliary_loss_clip": 0.01082307, + "auxiliary_loss_mlp": 0.00778853, + "balance_loss_clip": 1.03652179, + "balance_loss_mlp": 1.0005939, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 1.6975778840990647, + "language_loss": 0.75642544, + "learning_rate": 5.358597513023033e-09, + "loss": 0.77503705, + "num_input_tokens_seen": 350831065, + "step": 16256, + "time_per_iteration": 2.55041766166687 + }, + { + "auxiliary_loss_clip": 0.0110579, + "auxiliary_loss_mlp": 0.01039519, + "balance_loss_clip": 1.03658009, + "balance_loss_mlp": 1.02622688, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 2.5812654508286306, + "language_loss": 0.78164542, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80309856, + "num_input_tokens_seen": 350849675, + "step": 16257, + "time_per_iteration": 2.4304895401000977 + }, + { + "auxiliary_loss_clip": 0.01095956, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.03953695, + "balance_loss_mlp": 1.02105618, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 1.589096063007332, + "language_loss": 0.75081003, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77210945, + "num_input_tokens_seen": 350868955, + "step": 16258, + "time_per_iteration": 2.5126001834869385 + }, + { + "auxiliary_loss_clip": 0.01020519, + "auxiliary_loss_mlp": 0.00999765, + "balance_loss_clip": 1.00669479, + "balance_loss_mlp": 0.99848986, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.6670162191937593, + "language_loss": 0.5980978, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61830068, + "num_input_tokens_seen": 350935110, + "step": 16259, + "time_per_iteration": 3.108086347579956 + }, + { + "auxiliary_loss_clip": 0.0109359, + "auxiliary_loss_mlp": 0.01033214, + "balance_loss_clip": 1.0370605, + "balance_loss_mlp": 1.02044106, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 1.953283173473866, + "language_loss": 0.73591113, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.7571792, + "num_input_tokens_seen": 350953220, + "step": 16260, + "time_per_iteration": 2.517510175704956 + }, + { + "auxiliary_loss_clip": 0.01099481, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.03624392, + "balance_loss_mlp": 1.02025247, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 2.4670043521534497, + "language_loss": 0.79517126, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.81649292, + "num_input_tokens_seen": 350971915, + "step": 16261, + "time_per_iteration": 2.4552743434906006 + }, + { + "auxiliary_loss_clip": 0.0110073, + "auxiliary_loss_mlp": 0.01024631, + "balance_loss_clip": 1.03656316, + "balance_loss_mlp": 1.01237583, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 2.9061076889548554, + "language_loss": 0.74260956, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76386321, + "num_input_tokens_seen": 350990470, + "step": 16262, + "time_per_iteration": 2.476452112197876 + }, + { + "auxiliary_loss_clip": 0.01098779, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.03541553, + "balance_loss_mlp": 1.0165472, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 2.3211180607952686, + "language_loss": 0.7004925, + "learning_rate": 5.16101757762133e-09, + "loss": 0.72178602, + "num_input_tokens_seen": 351010755, + "step": 16263, + "time_per_iteration": 2.526585340499878 + }, + { + "auxiliary_loss_clip": 0.01101074, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.03837621, + "balance_loss_mlp": 1.02038836, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 1.8352537677114598, + "language_loss": 0.66294217, + "learning_rate": 5.133094442018038e-09, + "loss": 0.68427509, + "num_input_tokens_seen": 351029965, + "step": 16264, + "time_per_iteration": 4.028187036514282 + }, + { + "auxiliary_loss_clip": 0.01064148, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.03809428, + "balance_loss_mlp": 1.01823127, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 1.9657643889551828, + "language_loss": 0.7310816, + "learning_rate": 5.105246951967679e-09, + "loss": 0.75204325, + "num_input_tokens_seen": 351046205, + "step": 16265, + "time_per_iteration": 2.5454840660095215 + }, + { + "auxiliary_loss_clip": 0.01096949, + "auxiliary_loss_mlp": 0.01031188, + "balance_loss_clip": 1.03517318, + "balance_loss_mlp": 1.01862907, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 1.8052747143089027, + "language_loss": 0.68600118, + "learning_rate": 5.077475108526297e-09, + "loss": 0.70728254, + "num_input_tokens_seen": 351065390, + "step": 16266, + "time_per_iteration": 2.5018484592437744 + }, + { + "auxiliary_loss_clip": 0.01062212, + "auxiliary_loss_mlp": 0.01027776, + "balance_loss_clip": 1.03310132, + "balance_loss_mlp": 1.01694632, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 1.8246586465123222, + "language_loss": 0.86963737, + "learning_rate": 5.049778912747049e-09, + "loss": 0.89053726, + "num_input_tokens_seen": 351084355, + "step": 16267, + "time_per_iteration": 2.5730490684509277 + }, + { + "auxiliary_loss_clip": 0.01047002, + "auxiliary_loss_mlp": 0.01025789, + "balance_loss_clip": 1.0349586, + "balance_loss_mlp": 1.01333737, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 2.318883947296159, + "language_loss": 0.70297527, + "learning_rate": 5.022158365679985e-09, + "loss": 0.72370327, + "num_input_tokens_seen": 351105870, + "step": 16268, + "time_per_iteration": 2.758601188659668 + }, + { + "auxiliary_loss_clip": 0.01089287, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.0347929, + "balance_loss_mlp": 1.0174284, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 1.5863552048945218, + "language_loss": 0.73910642, + "learning_rate": 4.994613468372711e-09, + "loss": 0.76029253, + "num_input_tokens_seen": 351124760, + "step": 16269, + "time_per_iteration": 2.49851393699646 + }, + { + "auxiliary_loss_clip": 0.01085934, + "auxiliary_loss_mlp": 0.01035684, + "balance_loss_clip": 1.03583908, + "balance_loss_mlp": 1.02208233, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 2.0988941693917114, + "language_loss": 0.70905864, + "learning_rate": 4.967144221869501e-09, + "loss": 0.7302748, + "num_input_tokens_seen": 351142820, + "step": 16270, + "time_per_iteration": 2.5605037212371826 + }, + { + "auxiliary_loss_clip": 0.0111085, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.03786957, + "balance_loss_mlp": 1.02386582, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 1.7531145341578045, + "language_loss": 0.63697243, + "learning_rate": 4.939750627212191e-09, + "loss": 0.65844071, + "num_input_tokens_seen": 351164805, + "step": 16271, + "time_per_iteration": 2.517632007598877 + }, + { + "auxiliary_loss_clip": 0.01083555, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.0377444, + "balance_loss_mlp": 1.01843691, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 2.260425002979301, + "language_loss": 0.70522559, + "learning_rate": 4.912432685439505e-09, + "loss": 0.72636855, + "num_input_tokens_seen": 351187005, + "step": 16272, + "time_per_iteration": 2.6720898151397705 + }, + { + "auxiliary_loss_clip": 0.01055942, + "auxiliary_loss_mlp": 0.01030384, + "balance_loss_clip": 1.04034877, + "balance_loss_mlp": 1.0182004, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 1.7210038397572998, + "language_loss": 0.66648239, + "learning_rate": 4.88519039758728e-09, + "loss": 0.68734562, + "num_input_tokens_seen": 351208450, + "step": 16273, + "time_per_iteration": 2.645498752593994 + }, + { + "auxiliary_loss_clip": 0.01077973, + "auxiliary_loss_mlp": 0.01023145, + "balance_loss_clip": 1.03599858, + "balance_loss_mlp": 1.01115811, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 1.6276168542202833, + "language_loss": 0.73959273, + "learning_rate": 4.85802376468869e-09, + "loss": 0.7606039, + "num_input_tokens_seen": 351229585, + "step": 16274, + "time_per_iteration": 2.5934863090515137 + }, + { + "auxiliary_loss_clip": 0.01084321, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.03450072, + "balance_loss_mlp": 1.01996684, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 1.6507947008195782, + "language_loss": 0.77725816, + "learning_rate": 4.830932787773579e-09, + "loss": 0.79842228, + "num_input_tokens_seen": 351249525, + "step": 16275, + "time_per_iteration": 2.5215818881988525 + }, + { + "auxiliary_loss_clip": 0.01044632, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.03931403, + "balance_loss_mlp": 1.01674664, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 1.8114372369299037, + "language_loss": 0.70681894, + "learning_rate": 4.803917467869567e-09, + "loss": 0.7275629, + "num_input_tokens_seen": 351272530, + "step": 16276, + "time_per_iteration": 3.0620594024658203 + }, + { + "auxiliary_loss_clip": 0.01078574, + "auxiliary_loss_mlp": 0.01033425, + "balance_loss_clip": 1.0327965, + "balance_loss_mlp": 1.0217067, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 1.828966007877774, + "language_loss": 0.85685575, + "learning_rate": 4.776977806000726e-09, + "loss": 0.87797576, + "num_input_tokens_seen": 351288530, + "step": 16277, + "time_per_iteration": 4.974852800369263 + }, + { + "auxiliary_loss_clip": 0.01090675, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.03375649, + "balance_loss_mlp": 1.01884687, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 1.7768642269490253, + "language_loss": 0.70807719, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.72930068, + "num_input_tokens_seen": 351305890, + "step": 16278, + "time_per_iteration": 2.5242319107055664 + }, + { + "auxiliary_loss_clip": 0.01088694, + "auxiliary_loss_mlp": 0.01032266, + "balance_loss_clip": 1.0338316, + "balance_loss_mlp": 1.01951671, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 1.896137756748222, + "language_loss": 0.84507769, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86628723, + "num_input_tokens_seen": 351325010, + "step": 16279, + "time_per_iteration": 2.4807233810424805 + }, + { + "auxiliary_loss_clip": 0.01097473, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.03433073, + "balance_loss_mlp": 1.01919365, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 1.748470170023985, + "language_loss": 0.78934771, + "learning_rate": 4.696612778808395e-09, + "loss": 0.81064194, + "num_input_tokens_seen": 351343060, + "step": 16280, + "time_per_iteration": 2.464012384414673 + }, + { + "auxiliary_loss_clip": 0.01065357, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.03417873, + "balance_loss_mlp": 1.02281022, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 1.8932331899300445, + "language_loss": 0.79515398, + "learning_rate": 4.669975759268085e-09, + "loss": 0.8161515, + "num_input_tokens_seen": 351363260, + "step": 16281, + "time_per_iteration": 2.5542500019073486 + }, + { + "auxiliary_loss_clip": 0.010943, + "auxiliary_loss_mlp": 0.01033483, + "balance_loss_clip": 1.03578222, + "balance_loss_mlp": 1.02072787, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 1.6117874942797623, + "language_loss": 0.80336618, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82464403, + "num_input_tokens_seen": 351382610, + "step": 16282, + "time_per_iteration": 2.4906070232391357 + }, + { + "auxiliary_loss_clip": 0.01087026, + "auxiliary_loss_mlp": 0.01040359, + "balance_loss_clip": 1.03523195, + "balance_loss_mlp": 1.0281992, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 2.2213548204685614, + "language_loss": 0.83037889, + "learning_rate": 4.616928710538204e-09, + "loss": 0.85165268, + "num_input_tokens_seen": 351401075, + "step": 16283, + "time_per_iteration": 2.492147922515869 + }, + { + "auxiliary_loss_clip": 0.01092381, + "auxiliary_loss_mlp": 0.01032898, + "balance_loss_clip": 1.03595471, + "balance_loss_mlp": 1.02005339, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 1.9759441049058317, + "language_loss": 0.72115403, + "learning_rate": 4.590518683360134e-09, + "loss": 0.74240685, + "num_input_tokens_seen": 351419275, + "step": 16284, + "time_per_iteration": 2.446820020675659 + }, + { + "auxiliary_loss_clip": 0.01094445, + "auxiliary_loss_mlp": 0.01035414, + "balance_loss_clip": 1.03573775, + "balance_loss_mlp": 1.02399373, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 1.9232420671784973, + "language_loss": 0.64139605, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66269463, + "num_input_tokens_seen": 351437375, + "step": 16285, + "time_per_iteration": 2.440307140350342 + }, + { + "auxiliary_loss_clip": 0.01080953, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.0338428, + "balance_loss_mlp": 1.01963961, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 1.5841920472065543, + "language_loss": 0.70889074, + "learning_rate": 4.537925628385286e-09, + "loss": 0.73001689, + "num_input_tokens_seen": 351457810, + "step": 16286, + "time_per_iteration": 2.5291898250579834 + }, + { + "auxiliary_loss_clip": 0.01094486, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.03864753, + "balance_loss_mlp": 1.01862872, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 1.3969466245771067, + "language_loss": 0.58445334, + "learning_rate": 4.511742602582691e-09, + "loss": 0.60570347, + "num_input_tokens_seen": 351478825, + "step": 16287, + "time_per_iteration": 2.537431001663208 + }, + { + "auxiliary_loss_clip": 0.01096112, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.03563237, + "balance_loss_mlp": 1.02187371, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 1.6536683667466476, + "language_loss": 0.81594837, + "learning_rate": 4.485635245894626e-09, + "loss": 0.83725107, + "num_input_tokens_seen": 351498785, + "step": 16288, + "time_per_iteration": 4.063505411148071 + }, + { + "auxiliary_loss_clip": 0.01083379, + "auxiliary_loss_mlp": 0.00778695, + "balance_loss_clip": 1.03375733, + "balance_loss_mlp": 1.00053632, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 1.7079478078838368, + "language_loss": 0.71657228, + "learning_rate": 4.459603559311631e-09, + "loss": 0.73519301, + "num_input_tokens_seen": 351520235, + "step": 16289, + "time_per_iteration": 2.562377691268921 + }, + { + "auxiliary_loss_clip": 0.01078562, + "auxiliary_loss_mlp": 0.01035542, + "balance_loss_clip": 1.04258966, + "balance_loss_mlp": 1.02309704, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 2.3823663539296915, + "language_loss": 0.75020266, + "learning_rate": 4.43364754382003e-09, + "loss": 0.77134371, + "num_input_tokens_seen": 351538900, + "step": 16290, + "time_per_iteration": 2.6074132919311523 + }, + { + "auxiliary_loss_clip": 0.01099692, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_clip": 1.03661251, + "balance_loss_mlp": 1.02221096, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 1.7072842604637684, + "language_loss": 0.67240614, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69375771, + "num_input_tokens_seen": 351558715, + "step": 16291, + "time_per_iteration": 2.4918406009674072 + }, + { + "auxiliary_loss_clip": 0.01111614, + "auxiliary_loss_mlp": 0.00779052, + "balance_loss_clip": 1.03671813, + "balance_loss_mlp": 1.00065017, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 1.7374682014111864, + "language_loss": 0.62996286, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64886951, + "num_input_tokens_seen": 351578450, + "step": 16292, + "time_per_iteration": 3.8414711952209473 + }, + { + "auxiliary_loss_clip": 0.01074536, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.0359087, + "balance_loss_mlp": 1.02008891, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 1.8094909313676515, + "language_loss": 0.73479956, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75586426, + "num_input_tokens_seen": 351597195, + "step": 16293, + "time_per_iteration": 2.521557569503784 + }, + { + "auxiliary_loss_clip": 0.01101061, + "auxiliary_loss_mlp": 0.01029363, + "balance_loss_clip": 1.03591287, + "balance_loss_mlp": 1.01691806, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 2.9213709590347676, + "language_loss": 0.84044969, + "learning_rate": 4.330580212414503e-09, + "loss": 0.86175394, + "num_input_tokens_seen": 351617460, + "step": 16294, + "time_per_iteration": 2.546409845352173 + }, + { + "auxiliary_loss_clip": 0.0107005, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.03225672, + "balance_loss_mlp": 1.02124298, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 1.8895689559885591, + "language_loss": 0.72102463, + "learning_rate": 4.305002567088767e-09, + "loss": 0.74204922, + "num_input_tokens_seen": 351635900, + "step": 16295, + "time_per_iteration": 2.5095512866973877 + }, + { + "auxiliary_loss_clip": 0.0110398, + "auxiliary_loss_mlp": 0.01039957, + "balance_loss_clip": 1.03726578, + "balance_loss_mlp": 1.02699351, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 1.6659828580062823, + "language_loss": 0.80767524, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.82911468, + "num_input_tokens_seen": 351655400, + "step": 16296, + "time_per_iteration": 2.4612972736358643 + }, + { + "auxiliary_loss_clip": 0.01079486, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.03251255, + "balance_loss_mlp": 1.02412093, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 1.846183660026933, + "language_loss": 0.75776696, + "learning_rate": 4.254074308266853e-09, + "loss": 0.77892536, + "num_input_tokens_seen": 351675505, + "step": 16297, + "time_per_iteration": 2.531442403793335 + }, + { + "auxiliary_loss_clip": 0.01100051, + "auxiliary_loss_mlp": 0.01035988, + "balance_loss_clip": 1.03508937, + "balance_loss_mlp": 1.02355433, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 2.000930059917288, + "language_loss": 0.78016871, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80152905, + "num_input_tokens_seen": 351697920, + "step": 16298, + "time_per_iteration": 2.557401418685913 + }, + { + "auxiliary_loss_clip": 0.0109232, + "auxiliary_loss_mlp": 0.01027223, + "balance_loss_clip": 1.03411746, + "balance_loss_mlp": 1.01570094, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 1.4823838816452966, + "language_loss": 0.72694355, + "learning_rate": 4.203448764984019e-09, + "loss": 0.74813902, + "num_input_tokens_seen": 351717615, + "step": 16299, + "time_per_iteration": 2.470536947250366 + }, + { + "auxiliary_loss_clip": 0.0108558, + "auxiliary_loss_mlp": 0.01033402, + "balance_loss_clip": 1.03325784, + "balance_loss_mlp": 1.02024102, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 2.175643216431674, + "language_loss": 0.89230943, + "learning_rate": 4.178249514071419e-09, + "loss": 0.91349924, + "num_input_tokens_seen": 351735260, + "step": 16300, + "time_per_iteration": 2.5448954105377197 + }, + { + "auxiliary_loss_clip": 0.01098889, + "auxiliary_loss_mlp": 0.01029351, + "balance_loss_clip": 1.0354743, + "balance_loss_mlp": 1.0170188, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 2.322094695115688, + "language_loss": 0.78651452, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.8077969, + "num_input_tokens_seen": 351755800, + "step": 16301, + "time_per_iteration": 2.4606122970581055 + }, + { + "auxiliary_loss_clip": 0.01087084, + "auxiliary_loss_mlp": 0.01040589, + "balance_loss_clip": 1.03561497, + "balance_loss_mlp": 1.02814388, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 2.096630626717178, + "language_loss": 0.75232941, + "learning_rate": 4.128078058480921e-09, + "loss": 0.77360612, + "num_input_tokens_seen": 351774790, + "step": 16302, + "time_per_iteration": 2.536837577819824 + }, + { + "auxiliary_loss_clip": 0.01079037, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.03462386, + "balance_loss_mlp": 1.01972103, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 2.270763870878265, + "language_loss": 0.79595768, + "learning_rate": 4.103105855705724e-09, + "loss": 0.81707293, + "num_input_tokens_seen": 351792855, + "step": 16303, + "time_per_iteration": 4.0142176151275635 + }, + { + "auxiliary_loss_clip": 0.01067413, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.03166103, + "balance_loss_mlp": 1.02258205, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 1.9040523485455094, + "language_loss": 0.83256745, + "learning_rate": 4.078209337540883e-09, + "loss": 0.85360348, + "num_input_tokens_seen": 351811450, + "step": 16304, + "time_per_iteration": 2.5190930366516113 + }, + { + "auxiliary_loss_clip": 0.01072001, + "auxiliary_loss_mlp": 0.01026797, + "balance_loss_clip": 1.03612161, + "balance_loss_mlp": 1.01590085, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 2.5451957727036842, + "language_loss": 0.70321196, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72419995, + "num_input_tokens_seen": 351831960, + "step": 16305, + "time_per_iteration": 2.556901454925537 + }, + { + "auxiliary_loss_clip": 0.01081124, + "auxiliary_loss_mlp": 0.01042795, + "balance_loss_clip": 1.0361228, + "balance_loss_mlp": 1.02833509, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 2.325486475716781, + "language_loss": 0.72140038, + "learning_rate": 4.028643358815032e-09, + "loss": 0.74263954, + "num_input_tokens_seen": 351851585, + "step": 16306, + "time_per_iteration": 2.503063917160034 + }, + { + "auxiliary_loss_clip": 0.01081688, + "auxiliary_loss_mlp": 0.010352, + "balance_loss_clip": 1.03186584, + "balance_loss_mlp": 1.02350521, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 1.6623590776574277, + "language_loss": 0.73509252, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75626135, + "num_input_tokens_seen": 351871085, + "step": 16307, + "time_per_iteration": 2.505171537399292 + }, + { + "auxiliary_loss_clip": 0.01076234, + "auxiliary_loss_mlp": 0.01027347, + "balance_loss_clip": 1.03784597, + "balance_loss_mlp": 1.01654065, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 1.4891116513045055, + "language_loss": 0.75088966, + "learning_rate": 3.979380129822018e-09, + "loss": 0.77192545, + "num_input_tokens_seen": 351891775, + "step": 16308, + "time_per_iteration": 2.5771045684814453 + }, + { + "auxiliary_loss_clip": 0.01008146, + "auxiliary_loss_mlp": 0.01004567, + "balance_loss_clip": 1.00412726, + "balance_loss_mlp": 1.00330353, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.7614801532171502, + "language_loss": 0.57815516, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59828228, + "num_input_tokens_seen": 351946770, + "step": 16309, + "time_per_iteration": 2.9305739402770996 + }, + { + "auxiliary_loss_clip": 0.01066735, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.03506279, + "balance_loss_mlp": 1.01777673, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 1.9718652388327897, + "language_loss": 0.66453522, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68550348, + "num_input_tokens_seen": 351966155, + "step": 16310, + "time_per_iteration": 2.6767327785491943 + }, + { + "auxiliary_loss_clip": 0.01007702, + "auxiliary_loss_mlp": 0.01001819, + "balance_loss_clip": 1.00688946, + "balance_loss_mlp": 1.00060904, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 0.821164314530859, + "language_loss": 0.54512179, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56521702, + "num_input_tokens_seen": 352031655, + "step": 16311, + "time_per_iteration": 3.159358263015747 + }, + { + "auxiliary_loss_clip": 0.01098074, + "auxiliary_loss_mlp": 0.01024036, + "balance_loss_clip": 1.03547978, + "balance_loss_mlp": 1.01225233, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 1.5893469262409454, + "language_loss": 0.79898626, + "learning_rate": 3.881761950876638e-09, + "loss": 0.82020736, + "num_input_tokens_seen": 352051920, + "step": 16312, + "time_per_iteration": 2.529017686843872 + }, + { + "auxiliary_loss_clip": 0.01084559, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.03453672, + "balance_loss_mlp": 1.01685596, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 1.8591352871320184, + "language_loss": 0.62980384, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.65093291, + "num_input_tokens_seen": 352069315, + "step": 16313, + "time_per_iteration": 2.524362802505493 + }, + { + "auxiliary_loss_clip": 0.01097594, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.03670526, + "balance_loss_mlp": 1.02177358, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 2.20144491470756, + "language_loss": 0.72337079, + "learning_rate": 3.833407015731316e-09, + "loss": 0.74468786, + "num_input_tokens_seen": 352089480, + "step": 16314, + "time_per_iteration": 2.4963326454162598 + }, + { + "auxiliary_loss_clip": 0.01011108, + "auxiliary_loss_mlp": 0.01001667, + "balance_loss_clip": 1.01636553, + "balance_loss_mlp": 1.00043917, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.694720196028923, + "language_loss": 0.51731378, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53744155, + "num_input_tokens_seen": 352150000, + "step": 16315, + "time_per_iteration": 3.0976321697235107 + }, + { + "auxiliary_loss_clip": 0.01095712, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.03356302, + "balance_loss_mlp": 1.01998448, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 1.3199912178844642, + "language_loss": 0.69726634, + "learning_rate": 3.785354859932033e-09, + "loss": 0.71854049, + "num_input_tokens_seen": 352170990, + "step": 16316, + "time_per_iteration": 2.4851009845733643 + }, + { + "auxiliary_loss_clip": 0.01108217, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.03541207, + "balance_loss_mlp": 1.01607394, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 2.4389815164978983, + "language_loss": 0.55469227, + "learning_rate": 3.76144232656661e-09, + "loss": 0.5760541, + "num_input_tokens_seen": 352195335, + "step": 16317, + "time_per_iteration": 4.006295680999756 + }, + { + "auxiliary_loss_clip": 0.01056099, + "auxiliary_loss_mlp": 0.01034911, + "balance_loss_clip": 1.0289793, + "balance_loss_mlp": 1.0226922, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 1.718204616874756, + "language_loss": 0.73185748, + "learning_rate": 3.737605490767404e-09, + "loss": 0.75276756, + "num_input_tokens_seen": 352214170, + "step": 16318, + "time_per_iteration": 2.5878424644470215 + }, + { + "auxiliary_loss_clip": 0.0108482, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.03606832, + "balance_loss_mlp": 1.01751351, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 2.096584820843949, + "language_loss": 0.81712675, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.83826226, + "num_input_tokens_seen": 352231470, + "step": 16319, + "time_per_iteration": 2.467818260192871 + }, + { + "auxiliary_loss_clip": 0.01018872, + "auxiliary_loss_mlp": 0.01010055, + "balance_loss_clip": 1.00385475, + "balance_loss_mlp": 1.00863636, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.7202814832255683, + "language_loss": 0.53582418, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55611348, + "num_input_tokens_seen": 352291770, + "step": 16320, + "time_per_iteration": 2.940032958984375 + }, + { + "auxiliary_loss_clip": 0.01060196, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.03305161, + "balance_loss_mlp": 1.02065122, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 1.6605228616936005, + "language_loss": 0.7346034, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.75553334, + "num_input_tokens_seen": 352310735, + "step": 16321, + "time_per_iteration": 2.600663423538208 + }, + { + "auxiliary_loss_clip": 0.01088249, + "auxiliary_loss_mlp": 0.01032565, + "balance_loss_clip": 1.03737843, + "balance_loss_mlp": 1.02037621, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 1.4997702104934636, + "language_loss": 0.78463119, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.80583942, + "num_input_tokens_seen": 352329545, + "step": 16322, + "time_per_iteration": 2.531982660293579 + }, + { + "auxiliary_loss_clip": 0.01096953, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.0358156, + "balance_loss_mlp": 1.02302742, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 1.4769984239737168, + "language_loss": 0.80951142, + "learning_rate": 3.619556806799595e-09, + "loss": 0.83083349, + "num_input_tokens_seen": 352352080, + "step": 16323, + "time_per_iteration": 2.5075619220733643 + }, + { + "auxiliary_loss_clip": 0.01110461, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.03699827, + "balance_loss_mlp": 1.02298403, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 3.9262402642790617, + "language_loss": 0.85048664, + "learning_rate": 3.596174175278799e-09, + "loss": 0.87193918, + "num_input_tokens_seen": 352366455, + "step": 16324, + "time_per_iteration": 2.415121078491211 + }, + { + "auxiliary_loss_clip": 0.01086445, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.03681731, + "balance_loss_mlp": 1.01909935, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 1.5858811212553763, + "language_loss": 0.74585754, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76703596, + "num_input_tokens_seen": 352386090, + "step": 16325, + "time_per_iteration": 2.643678903579712 + }, + { + "auxiliary_loss_clip": 0.0105641, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.03610468, + "balance_loss_mlp": 1.02159464, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 1.6433893262211898, + "language_loss": 0.76802677, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78891397, + "num_input_tokens_seen": 352404000, + "step": 16326, + "time_per_iteration": 2.5841331481933594 + }, + { + "auxiliary_loss_clip": 0.01076191, + "auxiliary_loss_mlp": 0.01032331, + "balance_loss_clip": 1.03453732, + "balance_loss_mlp": 1.01933122, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 1.7836225923525666, + "language_loss": 0.67205012, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.69313526, + "num_input_tokens_seen": 352423540, + "step": 16327, + "time_per_iteration": 2.546274185180664 + }, + { + "auxiliary_loss_clip": 0.01102758, + "auxiliary_loss_mlp": 0.01035866, + "balance_loss_clip": 1.03550887, + "balance_loss_mlp": 1.0227586, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 1.7607505135459947, + "language_loss": 0.73806679, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.75945294, + "num_input_tokens_seen": 352445530, + "step": 16328, + "time_per_iteration": 4.144175291061401 + }, + { + "auxiliary_loss_clip": 0.01093895, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.03574872, + "balance_loss_mlp": 1.02192819, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 1.9713990151531489, + "language_loss": 0.80978787, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.8310827, + "num_input_tokens_seen": 352466325, + "step": 16329, + "time_per_iteration": 2.5392136573791504 + }, + { + "auxiliary_loss_clip": 0.01111574, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.03548288, + "balance_loss_mlp": 1.02015376, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 2.2153240858842196, + "language_loss": 0.76030791, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78175879, + "num_input_tokens_seen": 352485505, + "step": 16330, + "time_per_iteration": 2.467595100402832 + }, + { + "auxiliary_loss_clip": 0.01118358, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.03867459, + "balance_loss_mlp": 1.01748872, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 2.535290966186173, + "language_loss": 0.66896552, + "learning_rate": 3.434615511252126e-09, + "loss": 0.69047469, + "num_input_tokens_seen": 352505360, + "step": 16331, + "time_per_iteration": 3.883596181869507 + }, + { + "auxiliary_loss_clip": 0.01095416, + "auxiliary_loss_mlp": 0.01031534, + "balance_loss_clip": 1.03428006, + "balance_loss_mlp": 1.0196135, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 1.719642744651936, + "language_loss": 0.73095226, + "learning_rate": 3.411838534981948e-09, + "loss": 0.75222182, + "num_input_tokens_seen": 352524035, + "step": 16332, + "time_per_iteration": 2.494715690612793 + }, + { + "auxiliary_loss_clip": 0.0109719, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.037323, + "balance_loss_mlp": 1.01854134, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 1.7634903352396047, + "language_loss": 0.76985991, + "learning_rate": 3.389137269534936e-09, + "loss": 0.79112858, + "num_input_tokens_seen": 352543210, + "step": 16333, + "time_per_iteration": 2.4716110229492188 + }, + { + "auxiliary_loss_clip": 0.01091016, + "auxiliary_loss_mlp": 0.00777193, + "balance_loss_clip": 1.03607988, + "balance_loss_mlp": 1.00051689, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 2.218667853708272, + "language_loss": 0.72906852, + "learning_rate": 3.366511715771958e-09, + "loss": 0.74775064, + "num_input_tokens_seen": 352559770, + "step": 16334, + "time_per_iteration": 2.436476945877075 + }, + { + "auxiliary_loss_clip": 0.01059889, + "auxiliary_loss_mlp": 0.01037954, + "balance_loss_clip": 1.03628767, + "balance_loss_mlp": 1.0256933, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 1.8449287583198466, + "language_loss": 0.7831732, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.80415165, + "num_input_tokens_seen": 352577690, + "step": 16335, + "time_per_iteration": 2.5939977169036865 + }, + { + "auxiliary_loss_clip": 0.01085569, + "auxiliary_loss_mlp": 0.0104306, + "balance_loss_clip": 1.03351593, + "balance_loss_mlp": 1.02821255, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 2.523745707238535, + "language_loss": 0.63868731, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.65997362, + "num_input_tokens_seen": 352598850, + "step": 16336, + "time_per_iteration": 2.6023707389831543 + }, + { + "auxiliary_loss_clip": 0.01081273, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_clip": 1.03955913, + "balance_loss_mlp": 1.02950788, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 1.9071386884991557, + "language_loss": 0.73272324, + "learning_rate": 3.299089333152372e-09, + "loss": 0.75397205, + "num_input_tokens_seen": 352616130, + "step": 16337, + "time_per_iteration": 2.511636257171631 + }, + { + "auxiliary_loss_clip": 0.01092628, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.03506494, + "balance_loss_mlp": 1.01939023, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 1.8539812822934316, + "language_loss": 0.73156166, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.75281513, + "num_input_tokens_seen": 352636885, + "step": 16338, + "time_per_iteration": 2.4631497859954834 + }, + { + "auxiliary_loss_clip": 0.01043145, + "auxiliary_loss_mlp": 0.01033025, + "balance_loss_clip": 1.03065014, + "balance_loss_mlp": 1.02062166, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 1.75282093848355, + "language_loss": 0.81377685, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.83453858, + "num_input_tokens_seen": 352657905, + "step": 16339, + "time_per_iteration": 2.6381659507751465 + }, + { + "auxiliary_loss_clip": 0.0105757, + "auxiliary_loss_mlp": 0.01039025, + "balance_loss_clip": 1.02995229, + "balance_loss_mlp": 1.02613854, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 1.9991676835187928, + "language_loss": 0.62923574, + "learning_rate": 3.232348386403405e-09, + "loss": 0.65020174, + "num_input_tokens_seen": 352676320, + "step": 16340, + "time_per_iteration": 2.5734519958496094 + }, + { + "auxiliary_loss_clip": 0.01111528, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.03759384, + "balance_loss_mlp": 1.01828265, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 2.3700301957850214, + "language_loss": 0.85897017, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.88039482, + "num_input_tokens_seen": 352692665, + "step": 16341, + "time_per_iteration": 2.395941734313965 + }, + { + "auxiliary_loss_clip": 0.0108343, + "auxiliary_loss_mlp": 0.01028706, + "balance_loss_clip": 1.03392339, + "balance_loss_mlp": 1.01645088, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 1.5394533688328504, + "language_loss": 0.67107511, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69219643, + "num_input_tokens_seen": 352716130, + "step": 16342, + "time_per_iteration": 4.139851331710815 + }, + { + "auxiliary_loss_clip": 0.01108861, + "auxiliary_loss_mlp": 0.01025096, + "balance_loss_clip": 1.035882, + "balance_loss_mlp": 1.01284134, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 1.5704618036539917, + "language_loss": 0.77468812, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79602766, + "num_input_tokens_seen": 352734705, + "step": 16343, + "time_per_iteration": 2.429246425628662 + }, + { + "auxiliary_loss_clip": 0.01072668, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.03489196, + "balance_loss_mlp": 1.01889431, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 1.513842121070673, + "language_loss": 0.75477588, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.77579701, + "num_input_tokens_seen": 352756225, + "step": 16344, + "time_per_iteration": 2.597806215286255 + }, + { + "auxiliary_loss_clip": 0.0108593, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.03458357, + "balance_loss_mlp": 1.0197978, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 1.8190781375276432, + "language_loss": 0.66366619, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68484962, + "num_input_tokens_seen": 352776210, + "step": 16345, + "time_per_iteration": 2.5588276386260986 + }, + { + "auxiliary_loss_clip": 0.01092237, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.03526437, + "balance_loss_mlp": 1.0162406, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 1.525433705829353, + "language_loss": 0.79686713, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81805658, + "num_input_tokens_seen": 352795455, + "step": 16346, + "time_per_iteration": 2.4835658073425293 + }, + { + "auxiliary_loss_clip": 0.01103147, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.04100037, + "balance_loss_mlp": 1.02053237, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 2.119894953387124, + "language_loss": 0.74932826, + "learning_rate": 3.079269666552031e-09, + "loss": 0.77069372, + "num_input_tokens_seen": 352812895, + "step": 16347, + "time_per_iteration": 2.510518789291382 + }, + { + "auxiliary_loss_clip": 0.01037019, + "auxiliary_loss_mlp": 0.0103782, + "balance_loss_clip": 1.0289669, + "balance_loss_mlp": 1.02622652, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 1.7162422306923375, + "language_loss": 0.6669457, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.68769407, + "num_input_tokens_seen": 352835470, + "step": 16348, + "time_per_iteration": 2.786384344100952 + }, + { + "auxiliary_loss_clip": 0.010885, + "auxiliary_loss_mlp": 0.01032001, + "balance_loss_clip": 1.03598762, + "balance_loss_mlp": 1.01954365, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 1.9092118852041315, + "language_loss": 0.69465339, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.7158584, + "num_input_tokens_seen": 352854295, + "step": 16349, + "time_per_iteration": 2.560246706008911 + }, + { + "auxiliary_loss_clip": 0.01075912, + "auxiliary_loss_mlp": 0.01028984, + "balance_loss_clip": 1.03503966, + "balance_loss_mlp": 1.01811218, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 1.975695835735081, + "language_loss": 0.76379365, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.78484261, + "num_input_tokens_seen": 352869695, + "step": 16350, + "time_per_iteration": 2.468780040740967 + }, + { + "auxiliary_loss_clip": 0.01080719, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.03527927, + "balance_loss_mlp": 1.01761603, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 1.9278794446337741, + "language_loss": 0.8417933, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86290205, + "num_input_tokens_seen": 352887430, + "step": 16351, + "time_per_iteration": 2.628495931625366 + }, + { + "auxiliary_loss_clip": 0.01079857, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.0366677, + "balance_loss_mlp": 1.01725745, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 2.082320042591113, + "language_loss": 0.6843884, + "learning_rate": 2.972199410170795e-09, + "loss": 0.70548278, + "num_input_tokens_seen": 352907555, + "step": 16352, + "time_per_iteration": 2.6476378440856934 + }, + { + "auxiliary_loss_clip": 0.01088314, + "auxiliary_loss_mlp": 0.00777084, + "balance_loss_clip": 1.03478742, + "balance_loss_mlp": 1.00057054, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 1.49363093041898, + "language_loss": 0.66454124, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68319523, + "num_input_tokens_seen": 352928670, + "step": 16353, + "time_per_iteration": 2.4910879135131836 + }, + { + "auxiliary_loss_clip": 0.0108361, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.03578866, + "balance_loss_mlp": 1.0182271, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 1.4869746926121539, + "language_loss": 0.74565864, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.76679087, + "num_input_tokens_seen": 352948345, + "step": 16354, + "time_per_iteration": 2.545213460922241 + }, + { + "auxiliary_loss_clip": 0.01096257, + "auxiliary_loss_mlp": 0.01031308, + "balance_loss_clip": 1.03535604, + "balance_loss_mlp": 1.01886857, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 2.299323939459211, + "language_loss": 0.77402222, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.79529786, + "num_input_tokens_seen": 352967250, + "step": 16355, + "time_per_iteration": 2.459751844406128 + }, + { + "auxiliary_loss_clip": 0.01097473, + "auxiliary_loss_mlp": 0.01030322, + "balance_loss_clip": 1.03712833, + "balance_loss_mlp": 1.01784086, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 1.999670373062409, + "language_loss": 0.73476887, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.75604677, + "num_input_tokens_seen": 352984725, + "step": 16356, + "time_per_iteration": 3.9077298641204834 + }, + { + "auxiliary_loss_clip": 0.01084074, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.0339458, + "balance_loss_mlp": 1.01952279, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 1.5874148751487813, + "language_loss": 0.75839162, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.77955359, + "num_input_tokens_seen": 353003480, + "step": 16357, + "time_per_iteration": 2.4925503730773926 + }, + { + "auxiliary_loss_clip": 0.01086325, + "auxiliary_loss_mlp": 0.01025735, + "balance_loss_clip": 1.03484476, + "balance_loss_mlp": 1.01299167, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 2.148636594090029, + "language_loss": 0.80465972, + "learning_rate": 2.846214118442436e-09, + "loss": 0.82578033, + "num_input_tokens_seen": 353021425, + "step": 16358, + "time_per_iteration": 2.5063979625701904 + }, + { + "auxiliary_loss_clip": 0.01096016, + "auxiliary_loss_mlp": 0.01026268, + "balance_loss_clip": 1.03320646, + "balance_loss_mlp": 1.0148772, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 2.4633765778856427, + "language_loss": 0.67592204, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.69714487, + "num_input_tokens_seen": 353039870, + "step": 16359, + "time_per_iteration": 2.50274658203125 + }, + { + "auxiliary_loss_clip": 0.01104169, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.0346086, + "balance_loss_mlp": 1.0192703, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 2.1652141827969493, + "language_loss": 0.69635332, + "learning_rate": 2.804824870920264e-09, + "loss": 0.71769845, + "num_input_tokens_seen": 353059750, + "step": 16360, + "time_per_iteration": 2.4211769104003906 + }, + { + "auxiliary_loss_clip": 0.01098863, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.03635168, + "balance_loss_mlp": 1.02295446, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 1.7688211227203157, + "language_loss": 0.84546101, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.86680484, + "num_input_tokens_seen": 353079940, + "step": 16361, + "time_per_iteration": 2.4683761596679688 + }, + { + "auxiliary_loss_clip": 0.0110634, + "auxiliary_loss_mlp": 0.01027414, + "balance_loss_clip": 1.03495562, + "balance_loss_mlp": 1.01610708, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 1.7550475545227897, + "language_loss": 0.7583195, + "learning_rate": 2.76373855876022e-09, + "loss": 0.77965707, + "num_input_tokens_seen": 353099990, + "step": 16362, + "time_per_iteration": 2.4486918449401855 + }, + { + "auxiliary_loss_clip": 0.01108235, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.03632832, + "balance_loss_mlp": 1.02097797, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 1.8126134146770896, + "language_loss": 0.71090996, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73232889, + "num_input_tokens_seen": 353118710, + "step": 16363, + "time_per_iteration": 2.4157321453094482 + }, + { + "auxiliary_loss_clip": 0.01082961, + "auxiliary_loss_mlp": 0.01029391, + "balance_loss_clip": 1.03433108, + "balance_loss_mlp": 1.01817346, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 2.145760033724704, + "language_loss": 0.6323241, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65344763, + "num_input_tokens_seen": 353136415, + "step": 16364, + "time_per_iteration": 2.4910356998443604 + }, + { + "auxiliary_loss_clip": 0.01071841, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.04150438, + "balance_loss_mlp": 1.02056181, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 1.5905010090054423, + "language_loss": 0.74971569, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77074778, + "num_input_tokens_seen": 353154650, + "step": 16365, + "time_per_iteration": 2.5925376415252686 + }, + { + "auxiliary_loss_clip": 0.01062211, + "auxiliary_loss_mlp": 0.01029092, + "balance_loss_clip": 1.0359695, + "balance_loss_mlp": 1.01645005, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 1.680021866603007, + "language_loss": 0.76573598, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78664899, + "num_input_tokens_seen": 353174065, + "step": 16366, + "time_per_iteration": 2.5916903018951416 + }, + { + "auxiliary_loss_clip": 0.01105042, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.03488016, + "balance_loss_mlp": 1.01627421, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 1.6480024768331365, + "language_loss": 0.77076608, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79209316, + "num_input_tokens_seen": 353193560, + "step": 16367, + "time_per_iteration": 2.4663443565368652 + }, + { + "auxiliary_loss_clip": 0.01085029, + "auxiliary_loss_mlp": 0.01032159, + "balance_loss_clip": 1.03734195, + "balance_loss_mlp": 1.01927209, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 1.5419895957590621, + "language_loss": 0.61747247, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63864434, + "num_input_tokens_seen": 353213525, + "step": 16368, + "time_per_iteration": 4.0585081577301025 + }, + { + "auxiliary_loss_clip": 0.01092838, + "auxiliary_loss_mlp": 0.01032597, + "balance_loss_clip": 1.03366113, + "balance_loss_mlp": 1.02151632, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 1.564323633097296, + "language_loss": 0.65842116, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67967552, + "num_input_tokens_seen": 353234000, + "step": 16369, + "time_per_iteration": 2.4967803955078125 + }, + { + "auxiliary_loss_clip": 0.01098545, + "auxiliary_loss_mlp": 0.00777921, + "balance_loss_clip": 1.03666425, + "balance_loss_mlp": 1.00067508, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 1.7014660643362443, + "language_loss": 0.68484497, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.7036097, + "num_input_tokens_seen": 353254940, + "step": 16370, + "time_per_iteration": 3.8739190101623535 + }, + { + "auxiliary_loss_clip": 0.01108445, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.03460252, + "balance_loss_mlp": 1.02096057, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 2.1172387621244946, + "language_loss": 0.73730546, + "learning_rate": 2.582599145159792e-09, + "loss": 0.75873244, + "num_input_tokens_seen": 353272590, + "step": 16371, + "time_per_iteration": 2.415015459060669 + }, + { + "auxiliary_loss_clip": 0.01018112, + "auxiliary_loss_mlp": 0.0100149, + "balance_loss_clip": 1.00369895, + "balance_loss_mlp": 1.00037575, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.7732169861708147, + "language_loss": 0.65154016, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67173618, + "num_input_tokens_seen": 353334380, + "step": 16372, + "time_per_iteration": 3.0808770656585693 + }, + { + "auxiliary_loss_clip": 0.01096825, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.0365386, + "balance_loss_mlp": 1.01891327, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 1.8159877359586611, + "language_loss": 0.70754606, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.72882432, + "num_input_tokens_seen": 353351640, + "step": 16373, + "time_per_iteration": 2.482767343521118 + }, + { + "auxiliary_loss_clip": 0.01106658, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.03638291, + "balance_loss_mlp": 1.01849031, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 1.7553852774231102, + "language_loss": 0.81403589, + "learning_rate": 2.523582674173186e-09, + "loss": 0.83540529, + "num_input_tokens_seen": 353372555, + "step": 16374, + "time_per_iteration": 2.4744179248809814 + }, + { + "auxiliary_loss_clip": 0.01065569, + "auxiliary_loss_mlp": 0.01032208, + "balance_loss_clip": 1.03957951, + "balance_loss_mlp": 1.0205617, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 1.728964640023679, + "language_loss": 0.69497025, + "learning_rate": 2.504062005197927e-09, + "loss": 0.71594799, + "num_input_tokens_seen": 353391385, + "step": 16375, + "time_per_iteration": 2.567558765411377 + }, + { + "auxiliary_loss_clip": 0.01085101, + "auxiliary_loss_mlp": 0.01042289, + "balance_loss_clip": 1.03181505, + "balance_loss_mlp": 1.02805555, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 1.8221650089854073, + "language_loss": 0.80971265, + "learning_rate": 2.484617081468521e-09, + "loss": 0.83098656, + "num_input_tokens_seen": 353411630, + "step": 16376, + "time_per_iteration": 2.547530174255371 + }, + { + "auxiliary_loss_clip": 0.01104427, + "auxiliary_loss_mlp": 0.01033532, + "balance_loss_clip": 1.03452802, + "balance_loss_mlp": 1.02116382, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 1.5084560034681054, + "language_loss": 0.62605262, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64743215, + "num_input_tokens_seen": 353432895, + "step": 16377, + "time_per_iteration": 2.46936297416687 + }, + { + "auxiliary_loss_clip": 0.01080899, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.0369072, + "balance_loss_mlp": 1.02202296, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 1.6197481443922914, + "language_loss": 0.72695279, + "learning_rate": 2.445954472695133e-09, + "loss": 0.74810696, + "num_input_tokens_seen": 353454195, + "step": 16378, + "time_per_iteration": 2.5756490230560303 + }, + { + "auxiliary_loss_clip": 0.01107576, + "auxiliary_loss_mlp": 0.01037062, + "balance_loss_clip": 1.03561604, + "balance_loss_mlp": 1.02539706, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 1.6305473447395376, + "language_loss": 0.71270764, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73415399, + "num_input_tokens_seen": 353475125, + "step": 16379, + "time_per_iteration": 2.4800686836242676 + }, + { + "auxiliary_loss_clip": 0.01078485, + "auxiliary_loss_mlp": 0.01033932, + "balance_loss_clip": 1.03791773, + "balance_loss_mlp": 1.02158773, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 4.965231252782603, + "language_loss": 0.68849832, + "learning_rate": 2.407594853716999e-09, + "loss": 0.7096225, + "num_input_tokens_seen": 353493265, + "step": 16380, + "time_per_iteration": 2.554093837738037 + }, + { + "auxiliary_loss_clip": 0.01079189, + "auxiliary_loss_mlp": 0.01038314, + "balance_loss_clip": 1.03384066, + "balance_loss_mlp": 1.02606511, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 2.1333797652383346, + "language_loss": 0.78823376, + "learning_rate": 2.38852866722139e-09, + "loss": 0.80940884, + "num_input_tokens_seen": 353511650, + "step": 16381, + "time_per_iteration": 2.4860470294952393 + }, + { + "auxiliary_loss_clip": 0.01093174, + "auxiliary_loss_mlp": 0.01028201, + "balance_loss_clip": 1.03430748, + "balance_loss_mlp": 1.01604795, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 1.4467985274516904, + "language_loss": 0.82448047, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84569418, + "num_input_tokens_seen": 353534035, + "step": 16382, + "time_per_iteration": 4.060721397399902 + }, + { + "auxiliary_loss_clip": 0.01083845, + "auxiliary_loss_mlp": 0.01035831, + "balance_loss_clip": 1.03201759, + "balance_loss_mlp": 1.02218807, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 1.8081213408064924, + "language_loss": 0.74067342, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.76187015, + "num_input_tokens_seen": 353549950, + "step": 16383, + "time_per_iteration": 2.4949593544006348 + }, + { + "auxiliary_loss_clip": 0.0106582, + "auxiliary_loss_mlp": 0.01027506, + "balance_loss_clip": 1.03763545, + "balance_loss_mlp": 1.01559067, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 1.5830393441018098, + "language_loss": 0.6593855, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.68031883, + "num_input_tokens_seen": 353573745, + "step": 16384, + "time_per_iteration": 2.692531108856201 + }, + { + "auxiliary_loss_clip": 0.01090467, + "auxiliary_loss_mlp": 0.01034366, + "balance_loss_clip": 1.03805268, + "balance_loss_mlp": 1.02100825, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 1.9618235389908185, + "language_loss": 0.70476621, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72601449, + "num_input_tokens_seen": 353595335, + "step": 16385, + "time_per_iteration": 2.640507221221924 + }, + { + "auxiliary_loss_clip": 0.01092832, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.0405643, + "balance_loss_mlp": 1.02092457, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 3.245731719957784, + "language_loss": 0.81110281, + "learning_rate": 2.294333993509978e-09, + "loss": 0.8323614, + "num_input_tokens_seen": 353614270, + "step": 16386, + "time_per_iteration": 2.496034860610962 + }, + { + "auxiliary_loss_clip": 0.01082243, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.03518093, + "balance_loss_mlp": 1.01886225, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 1.7986575609571869, + "language_loss": 0.67612869, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.69726467, + "num_input_tokens_seen": 353634900, + "step": 16387, + "time_per_iteration": 2.5838117599487305 + }, + { + "auxiliary_loss_clip": 0.01088639, + "auxiliary_loss_mlp": 0.00776342, + "balance_loss_clip": 1.03301382, + "balance_loss_mlp": 1.00050175, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 1.8454719908799198, + "language_loss": 0.74253333, + "learning_rate": 2.257186391438237e-09, + "loss": 0.76118314, + "num_input_tokens_seen": 353652890, + "step": 16388, + "time_per_iteration": 2.457054615020752 + }, + { + "auxiliary_loss_clip": 0.01094569, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.03834236, + "balance_loss_mlp": 1.01868451, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 1.9567238940421403, + "language_loss": 0.82086331, + "learning_rate": 2.238726221962528e-09, + "loss": 0.84211493, + "num_input_tokens_seen": 353671295, + "step": 16389, + "time_per_iteration": 2.4702489376068115 + }, + { + "auxiliary_loss_clip": 0.01087705, + "auxiliary_loss_mlp": 0.0077776, + "balance_loss_clip": 1.03385639, + "balance_loss_mlp": 1.00057888, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 2.2319739341131357, + "language_loss": 0.66747642, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.68613106, + "num_input_tokens_seen": 353690560, + "step": 16390, + "time_per_iteration": 2.5355844497680664 + }, + { + "auxiliary_loss_clip": 0.01071588, + "auxiliary_loss_mlp": 0.01033072, + "balance_loss_clip": 1.03722286, + "balance_loss_mlp": 1.02010179, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 1.58878230004332, + "language_loss": 0.7721861, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79323268, + "num_input_tokens_seen": 353710660, + "step": 16391, + "time_per_iteration": 2.611851215362549 + }, + { + "auxiliary_loss_clip": 0.01066458, + "auxiliary_loss_mlp": 0.00775722, + "balance_loss_clip": 1.0324738, + "balance_loss_mlp": 1.00052214, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 1.9172287961598184, + "language_loss": 0.68305439, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.70147622, + "num_input_tokens_seen": 353730440, + "step": 16392, + "time_per_iteration": 2.553964853286743 + }, + { + "auxiliary_loss_clip": 0.01080452, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.03538752, + "balance_loss_mlp": 1.01652265, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 2.056918866667623, + "language_loss": 0.55830151, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.57940865, + "num_input_tokens_seen": 353748360, + "step": 16393, + "time_per_iteration": 2.508007764816284 + }, + { + "auxiliary_loss_clip": 0.01077423, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.03284442, + "balance_loss_mlp": 1.0193125, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 3.316367367672481, + "language_loss": 0.78557742, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.80668217, + "num_input_tokens_seen": 353760880, + "step": 16394, + "time_per_iteration": 2.488389730453491 + }, + { + "auxiliary_loss_clip": 0.010974, + "auxiliary_loss_mlp": 0.01034487, + "balance_loss_clip": 1.03386807, + "balance_loss_mlp": 1.02139199, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 1.6237700634096999, + "language_loss": 0.76389301, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78521192, + "num_input_tokens_seen": 353782255, + "step": 16395, + "time_per_iteration": 2.4877493381500244 + }, + { + "auxiliary_loss_clip": 0.01095819, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.03488314, + "balance_loss_mlp": 1.01598167, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 2.046293135799683, + "language_loss": 0.75169301, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.77293313, + "num_input_tokens_seen": 353803580, + "step": 16396, + "time_per_iteration": 3.9771883487701416 + }, + { + "auxiliary_loss_clip": 0.01072568, + "auxiliary_loss_mlp": 0.01030507, + "balance_loss_clip": 1.03536427, + "balance_loss_mlp": 1.01825821, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 1.5408718537602824, + "language_loss": 0.70938593, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.73041666, + "num_input_tokens_seen": 353824200, + "step": 16397, + "time_per_iteration": 2.6079862117767334 + }, + { + "auxiliary_loss_clip": 0.01083298, + "auxiliary_loss_mlp": 0.01034002, + "balance_loss_clip": 1.03566551, + "balance_loss_mlp": 1.02217066, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 1.775690953871128, + "language_loss": 0.71310437, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.73427743, + "num_input_tokens_seen": 353843350, + "step": 16398, + "time_per_iteration": 2.5208892822265625 + }, + { + "auxiliary_loss_clip": 0.01072871, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.03382158, + "balance_loss_mlp": 1.01654434, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 1.393714268773888, + "language_loss": 0.74072933, + "learning_rate": 2.058291183208771e-09, + "loss": 0.76174009, + "num_input_tokens_seen": 353864520, + "step": 16399, + "time_per_iteration": 2.569082736968994 + }, + { + "auxiliary_loss_clip": 0.0110693, + "auxiliary_loss_mlp": 0.01028896, + "balance_loss_clip": 1.03439784, + "balance_loss_mlp": 1.01652205, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 2.2633603375151607, + "language_loss": 0.57483214, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.59619045, + "num_input_tokens_seen": 353882240, + "step": 16400, + "time_per_iteration": 2.4320733547210693 + }, + { + "auxiliary_loss_clip": 0.01092898, + "auxiliary_loss_mlp": 0.01028547, + "balance_loss_clip": 1.04049277, + "balance_loss_mlp": 1.01520789, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 2.53489702777482, + "language_loss": 0.80392599, + "learning_rate": 2.023113299582491e-09, + "loss": 0.82514048, + "num_input_tokens_seen": 353901590, + "step": 16401, + "time_per_iteration": 2.5081493854522705 + }, + { + "auxiliary_loss_clip": 0.01096241, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.03647625, + "balance_loss_mlp": 1.02094793, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 1.7973215995968608, + "language_loss": 0.78013581, + "learning_rate": 2.005638002662069e-09, + "loss": 0.80143422, + "num_input_tokens_seen": 353918785, + "step": 16402, + "time_per_iteration": 2.457071542739868 + }, + { + "auxiliary_loss_clip": 0.01098976, + "auxiliary_loss_mlp": 0.01034786, + "balance_loss_clip": 1.03692698, + "balance_loss_mlp": 1.02228665, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 1.655237498219078, + "language_loss": 0.69647861, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.71781623, + "num_input_tokens_seen": 353940390, + "step": 16403, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0109253, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.03214288, + "balance_loss_mlp": 1.01840401, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 2.5292673467983144, + "language_loss": 0.74716258, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76838887, + "num_input_tokens_seen": 353962180, + "step": 16404, + "time_per_iteration": 2.515350818634033 + }, + { + "auxiliary_loss_clip": 0.01097241, + "auxiliary_loss_mlp": 0.00777263, + "balance_loss_clip": 1.0342114, + "balance_loss_mlp": 1.00057936, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 1.886625281204984, + "language_loss": 0.70050693, + "learning_rate": 1.953666699415768e-09, + "loss": 0.71925193, + "num_input_tokens_seen": 353984305, + "step": 16405, + "time_per_iteration": 2.599660634994507 + }, + { + "auxiliary_loss_clip": 0.01085207, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.03624892, + "balance_loss_mlp": 1.02121365, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 1.6949076975232913, + "language_loss": 0.69739342, + "learning_rate": 1.93649446302846e-09, + "loss": 0.71856856, + "num_input_tokens_seen": 354004495, + "step": 16406, + "time_per_iteration": 2.5203564167022705 + }, + { + "auxiliary_loss_clip": 0.01052886, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.03687763, + "balance_loss_mlp": 1.01981902, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 2.7896093117285066, + "language_loss": 0.74626708, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.76711112, + "num_input_tokens_seen": 354015985, + "step": 16407, + "time_per_iteration": 2.5667011737823486 + }, + { + "auxiliary_loss_clip": 0.01083557, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.03381944, + "balance_loss_mlp": 1.01949716, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 1.8512473255177155, + "language_loss": 0.77396274, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79511023, + "num_input_tokens_seen": 354033260, + "step": 16408, + "time_per_iteration": 4.102532625198364 + }, + { + "auxiliary_loss_clip": 0.01102596, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.0379281, + "balance_loss_mlp": 1.01850104, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 3.566738424326201, + "language_loss": 0.67930216, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.70064819, + "num_input_tokens_seen": 354052825, + "step": 16409, + "time_per_iteration": 3.788700580596924 + }, + { + "auxiliary_loss_clip": 0.01012923, + "auxiliary_loss_mlp": 0.01002722, + "balance_loss_clip": 1.00824666, + "balance_loss_mlp": 1.00159514, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.7982394347520815, + "language_loss": 0.61056596, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.6307224, + "num_input_tokens_seen": 354113920, + "step": 16410, + "time_per_iteration": 3.1326661109924316 + }, + { + "auxiliary_loss_clip": 0.01098497, + "auxiliary_loss_mlp": 0.0103249, + "balance_loss_clip": 1.03602934, + "balance_loss_mlp": 1.02017021, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 2.096874343730024, + "language_loss": 0.66129148, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.68260133, + "num_input_tokens_seen": 354134210, + "step": 16411, + "time_per_iteration": 2.5375595092773438 + }, + { + "auxiliary_loss_clip": 0.01026452, + "auxiliary_loss_mlp": 0.01000379, + "balance_loss_clip": 1.00297189, + "balance_loss_mlp": 0.99916935, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.7241572524623929, + "language_loss": 0.56260681, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58287513, + "num_input_tokens_seen": 354198010, + "step": 16412, + "time_per_iteration": 3.076401472091675 + }, + { + "auxiliary_loss_clip": 0.01079728, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.03565001, + "balance_loss_mlp": 1.02024436, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 1.9031193603679344, + "language_loss": 0.73364758, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75477707, + "num_input_tokens_seen": 354220000, + "step": 16413, + "time_per_iteration": 2.5892229080200195 + }, + { + "auxiliary_loss_clip": 0.01060201, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.03385508, + "balance_loss_mlp": 1.01789367, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 1.4046753805801628, + "language_loss": 0.71335936, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.73425627, + "num_input_tokens_seen": 354240910, + "step": 16414, + "time_per_iteration": 2.602473020553589 + }, + { + "auxiliary_loss_clip": 0.01091125, + "auxiliary_loss_mlp": 0.01037846, + "balance_loss_clip": 1.03520668, + "balance_loss_mlp": 1.02597892, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 1.5740953978489296, + "language_loss": 0.70274049, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72403014, + "num_input_tokens_seen": 354259430, + "step": 16415, + "time_per_iteration": 2.4638922214508057 + }, + { + "auxiliary_loss_clip": 0.01069122, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.03437161, + "balance_loss_mlp": 1.02146983, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 3.347495930120541, + "language_loss": 0.75594366, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77696121, + "num_input_tokens_seen": 354279490, + "step": 16416, + "time_per_iteration": 2.6012768745422363 + }, + { + "auxiliary_loss_clip": 0.0108343, + "auxiliary_loss_mlp": 0.01025485, + "balance_loss_clip": 1.03785956, + "balance_loss_mlp": 1.01359987, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 1.9371997813685735, + "language_loss": 0.70377237, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.7248615, + "num_input_tokens_seen": 354295080, + "step": 16417, + "time_per_iteration": 2.4563045501708984 + }, + { + "auxiliary_loss_clip": 0.01088045, + "auxiliary_loss_mlp": 0.01035937, + "balance_loss_clip": 1.03810167, + "balance_loss_mlp": 1.02325952, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 1.4998538035592266, + "language_loss": 0.7086972, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.72993702, + "num_input_tokens_seen": 354314610, + "step": 16418, + "time_per_iteration": 2.4995691776275635 + }, + { + "auxiliary_loss_clip": 0.01027042, + "auxiliary_loss_mlp": 0.0100339, + "balance_loss_clip": 1.00367975, + "balance_loss_mlp": 1.00221562, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.6556944182274703, + "language_loss": 0.53695726, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55726159, + "num_input_tokens_seen": 354383115, + "step": 16419, + "time_per_iteration": 3.129866361618042 + }, + { + "auxiliary_loss_clip": 0.01088608, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.03375113, + "balance_loss_mlp": 1.02291059, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 1.865952678398025, + "language_loss": 0.78206003, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80330944, + "num_input_tokens_seen": 354403115, + "step": 16420, + "time_per_iteration": 2.523207426071167 + }, + { + "auxiliary_loss_clip": 0.01069215, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.03949356, + "balance_loss_mlp": 1.01647305, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 1.5596274669587604, + "language_loss": 0.71144211, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.73242396, + "num_input_tokens_seen": 354424520, + "step": 16421, + "time_per_iteration": 4.077122688293457 + }, + { + "auxiliary_loss_clip": 0.01100293, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.03607345, + "balance_loss_mlp": 1.01897252, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 4.615322516610739, + "language_loss": 0.82693362, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.84825897, + "num_input_tokens_seen": 354444800, + "step": 16422, + "time_per_iteration": 2.5145530700683594 + }, + { + "auxiliary_loss_clip": 0.01073436, + "auxiliary_loss_mlp": 0.01023966, + "balance_loss_clip": 1.03403294, + "balance_loss_mlp": 1.01188445, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 1.6481909077455261, + "language_loss": 0.86087549, + "learning_rate": 1.656159280223779e-09, + "loss": 0.88184953, + "num_input_tokens_seen": 354464590, + "step": 16423, + "time_per_iteration": 2.541562080383301 + }, + { + "auxiliary_loss_clip": 0.01100038, + "auxiliary_loss_mlp": 0.01027229, + "balance_loss_clip": 1.03655064, + "balance_loss_mlp": 1.01517153, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 2.107039721263299, + "language_loss": 0.70575762, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72703028, + "num_input_tokens_seen": 354484145, + "step": 16424, + "time_per_iteration": 2.4574296474456787 + }, + { + "auxiliary_loss_clip": 0.0109813, + "auxiliary_loss_mlp": 0.00777419, + "balance_loss_clip": 1.03438354, + "balance_loss_mlp": 1.00053883, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 2.7672466230616664, + "language_loss": 0.80287355, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.82162905, + "num_input_tokens_seen": 354502475, + "step": 16425, + "time_per_iteration": 2.509006977081299 + }, + { + "auxiliary_loss_clip": 0.01059049, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.03195632, + "balance_loss_mlp": 1.02282548, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 1.8850239750754318, + "language_loss": 0.80217856, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.82313013, + "num_input_tokens_seen": 354521855, + "step": 16426, + "time_per_iteration": 2.5844545364379883 + }, + { + "auxiliary_loss_clip": 0.01099166, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.03777778, + "balance_loss_mlp": 1.01955545, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 1.9507554829631515, + "language_loss": 0.84737915, + "learning_rate": 1.593380599750338e-09, + "loss": 0.86868525, + "num_input_tokens_seen": 354539535, + "step": 16427, + "time_per_iteration": 2.4771058559417725 + }, + { + "auxiliary_loss_clip": 0.01106735, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.03668165, + "balance_loss_mlp": 1.01758671, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 1.7016991176158742, + "language_loss": 0.70615613, + "learning_rate": 1.577875377599458e-09, + "loss": 0.72751832, + "num_input_tokens_seen": 354557430, + "step": 16428, + "time_per_iteration": 2.437666893005371 + }, + { + "auxiliary_loss_clip": 0.01070115, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.0326376, + "balance_loss_mlp": 1.02548313, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 1.9128347245260904, + "language_loss": 0.80077398, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.82185048, + "num_input_tokens_seen": 354574735, + "step": 16429, + "time_per_iteration": 2.5573058128356934 + }, + { + "auxiliary_loss_clip": 0.01105688, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.034554, + "balance_loss_mlp": 1.01995134, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 1.947493106611779, + "language_loss": 0.62419957, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.6455695, + "num_input_tokens_seen": 354597050, + "step": 16430, + "time_per_iteration": 2.561802625656128 + }, + { + "auxiliary_loss_clip": 0.01108085, + "auxiliary_loss_mlp": 0.01034953, + "balance_loss_clip": 1.03618836, + "balance_loss_mlp": 1.02305031, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 1.6578551841685307, + "language_loss": 0.72966242, + "learning_rate": 1.531814395687725e-09, + "loss": 0.75109291, + "num_input_tokens_seen": 354619095, + "step": 16431, + "time_per_iteration": 2.512849807739258 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.01030287, + "balance_loss_clip": 1.03756332, + "balance_loss_mlp": 1.01805651, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 3.005202909976157, + "language_loss": 0.81104225, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.83242881, + "num_input_tokens_seen": 354633790, + "step": 16432, + "time_per_iteration": 2.3782095909118652 + }, + { + "auxiliary_loss_clip": 0.01093785, + "auxiliary_loss_mlp": 0.01030622, + "balance_loss_clip": 1.03457081, + "balance_loss_mlp": 1.01962471, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 1.8735437447272854, + "language_loss": 0.80409598, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.82533997, + "num_input_tokens_seen": 354653180, + "step": 16433, + "time_per_iteration": 2.4721662998199463 + }, + { + "auxiliary_loss_clip": 0.0110469, + "auxiliary_loss_mlp": 0.01033849, + "balance_loss_clip": 1.03536701, + "balance_loss_mlp": 1.02137923, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 2.4938580298145587, + "language_loss": 0.64631063, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.667696, + "num_input_tokens_seen": 354669900, + "step": 16434, + "time_per_iteration": 2.448460817337036 + }, + { + "auxiliary_loss_clip": 0.01098445, + "auxiliary_loss_mlp": 0.01033052, + "balance_loss_clip": 1.0334605, + "balance_loss_mlp": 1.02076125, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 1.4635313016285756, + "language_loss": 0.69288963, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.71420455, + "num_input_tokens_seen": 354693165, + "step": 16435, + "time_per_iteration": 2.560978651046753 + }, + { + "auxiliary_loss_clip": 0.01050789, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.03364086, + "balance_loss_mlp": 1.02368855, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 1.6372413674586883, + "language_loss": 0.75661695, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77748799, + "num_input_tokens_seen": 354711915, + "step": 16436, + "time_per_iteration": 3.9796719551086426 + }, + { + "auxiliary_loss_clip": 0.01077257, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.03287983, + "balance_loss_mlp": 1.01927185, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 2.067138030824625, + "language_loss": 0.74100101, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.76209807, + "num_input_tokens_seen": 354729135, + "step": 16437, + "time_per_iteration": 2.4721338748931885 + }, + { + "auxiliary_loss_clip": 0.01070305, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.03370857, + "balance_loss_mlp": 1.01820827, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 1.5868452533886837, + "language_loss": 0.60363108, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62462884, + "num_input_tokens_seen": 354752530, + "step": 16438, + "time_per_iteration": 2.5891618728637695 + }, + { + "auxiliary_loss_clip": 0.01085235, + "auxiliary_loss_mlp": 0.01032348, + "balance_loss_clip": 1.0340569, + "balance_loss_mlp": 1.01934242, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 1.7945727572382495, + "language_loss": 0.72131306, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.74248886, + "num_input_tokens_seen": 354771135, + "step": 16439, + "time_per_iteration": 2.588620901107788 + }, + { + "auxiliary_loss_clip": 0.01094212, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.03542924, + "balance_loss_mlp": 1.01891887, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 1.6228875249378254, + "language_loss": 0.59721172, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.61846673, + "num_input_tokens_seen": 354791800, + "step": 16440, + "time_per_iteration": 2.545454263687134 + }, + { + "auxiliary_loss_clip": 0.01108548, + "auxiliary_loss_mlp": 0.01033674, + "balance_loss_clip": 1.03494263, + "balance_loss_mlp": 1.02140749, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 2.9167268660630663, + "language_loss": 0.76572359, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.78714585, + "num_input_tokens_seen": 354809200, + "step": 16441, + "time_per_iteration": 2.3809313774108887 + }, + { + "auxiliary_loss_clip": 0.01087908, + "auxiliary_loss_mlp": 0.01029338, + "balance_loss_clip": 1.03546524, + "balance_loss_mlp": 1.01694608, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 2.515442073464999, + "language_loss": 0.6798625, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.7010349, + "num_input_tokens_seen": 354829945, + "step": 16442, + "time_per_iteration": 2.6763107776641846 + }, + { + "auxiliary_loss_clip": 0.01095559, + "auxiliary_loss_mlp": 0.0102926, + "balance_loss_clip": 1.03481996, + "balance_loss_mlp": 1.01702869, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 2.5175177712102697, + "language_loss": 0.74596268, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.76721084, + "num_input_tokens_seen": 354845055, + "step": 16443, + "time_per_iteration": 2.420191764831543 + }, + { + "auxiliary_loss_clip": 0.01087519, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.03434062, + "balance_loss_mlp": 1.0175283, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 1.8002078547604734, + "language_loss": 0.73799646, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75918126, + "num_input_tokens_seen": 354864680, + "step": 16444, + "time_per_iteration": 2.4960620403289795 + }, + { + "auxiliary_loss_clip": 0.0106039, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.03465044, + "balance_loss_mlp": 1.02561617, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 2.1266222033023054, + "language_loss": 0.69480252, + "learning_rate": 1.325881465858547e-09, + "loss": 0.7157923, + "num_input_tokens_seen": 354885685, + "step": 16445, + "time_per_iteration": 2.593200445175171 + }, + { + "auxiliary_loss_clip": 0.0110161, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.03792191, + "balance_loss_mlp": 1.01523685, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 5.999876544358832, + "language_loss": 0.60702407, + "learning_rate": 1.311740377491155e-09, + "loss": 0.62832296, + "num_input_tokens_seen": 354901505, + "step": 16446, + "time_per_iteration": 2.4407169818878174 + }, + { + "auxiliary_loss_clip": 0.01080802, + "auxiliary_loss_mlp": 0.01034087, + "balance_loss_clip": 1.0340178, + "balance_loss_mlp": 1.02249992, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 2.501354591196402, + "language_loss": 0.70749688, + "learning_rate": 1.297675079582783e-09, + "loss": 0.7286458, + "num_input_tokens_seen": 354920060, + "step": 16447, + "time_per_iteration": 3.975369453430176 + }, + { + "auxiliary_loss_clip": 0.01105576, + "auxiliary_loss_mlp": 0.00776727, + "balance_loss_clip": 1.03485894, + "balance_loss_mlp": 1.00057578, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 1.7793034978479256, + "language_loss": 0.83842289, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.85724592, + "num_input_tokens_seen": 354938690, + "step": 16448, + "time_per_iteration": 2.4714457988739014 + }, + { + "auxiliary_loss_clip": 0.01092129, + "auxiliary_loss_mlp": 0.01025401, + "balance_loss_clip": 1.03448451, + "balance_loss_mlp": 1.01486301, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 1.6355977405967783, + "language_loss": 0.69753766, + "learning_rate": 1.26977185727406e-09, + "loss": 0.71871299, + "num_input_tokens_seen": 354956955, + "step": 16449, + "time_per_iteration": 3.8032777309417725 + }, + { + "auxiliary_loss_clip": 0.01099749, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.03694546, + "balance_loss_mlp": 1.01749671, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 2.35293195648164, + "language_loss": 0.73886895, + "learning_rate": 1.25593393393153e-09, + "loss": 0.76016116, + "num_input_tokens_seen": 354976800, + "step": 16450, + "time_per_iteration": 2.6092002391815186 + }, + { + "auxiliary_loss_clip": 0.0110863, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.03355527, + "balance_loss_mlp": 1.02022195, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 1.9683158453296523, + "language_loss": 0.79306549, + "learning_rate": 1.242171803164549e-09, + "loss": 0.81448126, + "num_input_tokens_seen": 354996625, + "step": 16451, + "time_per_iteration": 2.417898178100586 + }, + { + "auxiliary_loss_clip": 0.01070067, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.03127337, + "balance_loss_mlp": 1.02310789, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 2.6039737740941677, + "language_loss": 0.70111924, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.72218466, + "num_input_tokens_seen": 355014535, + "step": 16452, + "time_per_iteration": 2.5630195140838623 + }, + { + "auxiliary_loss_clip": 0.01106048, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.03694344, + "balance_loss_mlp": 1.01694179, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 1.8732895559461904, + "language_loss": 0.73744643, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.75878823, + "num_input_tokens_seen": 355033280, + "step": 16453, + "time_per_iteration": 2.41605806350708 + }, + { + "auxiliary_loss_clip": 0.01071714, + "auxiliary_loss_mlp": 0.01035862, + "balance_loss_clip": 1.03834677, + "balance_loss_mlp": 1.02490032, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 2.1961576431974636, + "language_loss": 0.69837826, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.71945405, + "num_input_tokens_seen": 355053320, + "step": 16454, + "time_per_iteration": 2.6030662059783936 + }, + { + "auxiliary_loss_clip": 0.01077385, + "auxiliary_loss_mlp": 0.01032693, + "balance_loss_clip": 1.03242588, + "balance_loss_mlp": 1.02104568, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 2.1160789704056207, + "language_loss": 0.75841206, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.77951282, + "num_input_tokens_seen": 355070230, + "step": 16455, + "time_per_iteration": 2.4955172538757324 + }, + { + "auxiliary_loss_clip": 0.01080414, + "auxiliary_loss_mlp": 0.01026683, + "balance_loss_clip": 1.03749931, + "balance_loss_mlp": 1.01469016, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 1.6530934074576498, + "language_loss": 0.65524757, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.67631853, + "num_input_tokens_seen": 355090125, + "step": 16456, + "time_per_iteration": 2.533888578414917 + }, + { + "auxiliary_loss_clip": 0.01098633, + "auxiliary_loss_mlp": 0.01028943, + "balance_loss_clip": 1.03637993, + "balance_loss_mlp": 1.01717687, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 1.8178546615093076, + "language_loss": 0.74056071, + "learning_rate": 1.161190691666203e-09, + "loss": 0.76183653, + "num_input_tokens_seen": 355107890, + "step": 16457, + "time_per_iteration": 2.4431638717651367 + }, + { + "auxiliary_loss_clip": 0.01108172, + "auxiliary_loss_mlp": 0.01028628, + "balance_loss_clip": 1.03636968, + "balance_loss_mlp": 1.01669502, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 2.211522819225033, + "language_loss": 0.68800139, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.70936942, + "num_input_tokens_seen": 355126340, + "step": 16458, + "time_per_iteration": 2.5023038387298584 + }, + { + "auxiliary_loss_clip": 0.01092562, + "auxiliary_loss_mlp": 0.01027893, + "balance_loss_clip": 1.03391266, + "balance_loss_mlp": 1.01653242, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 1.8316319621433201, + "language_loss": 0.7930339, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81423843, + "num_input_tokens_seen": 355144025, + "step": 16459, + "time_per_iteration": 2.443498373031616 + }, + { + "auxiliary_loss_clip": 0.0108265, + "auxiliary_loss_mlp": 0.01033449, + "balance_loss_clip": 1.03462529, + "balance_loss_mlp": 1.02158189, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 3.82330722471127, + "language_loss": 0.70949972, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.73066068, + "num_input_tokens_seen": 355163125, + "step": 16460, + "time_per_iteration": 4.036288022994995 + }, + { + "auxiliary_loss_clip": 0.01087777, + "auxiliary_loss_mlp": 0.01027685, + "balance_loss_clip": 1.0339272, + "balance_loss_mlp": 1.01537097, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 1.5169261117699226, + "language_loss": 0.87375379, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.89490843, + "num_input_tokens_seen": 355184060, + "step": 16461, + "time_per_iteration": 2.5538742542266846 + }, + { + "auxiliary_loss_clip": 0.0109678, + "auxiliary_loss_mlp": 0.01033535, + "balance_loss_clip": 1.03630471, + "balance_loss_mlp": 1.0207262, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 1.66206039254594, + "language_loss": 0.63192475, + "learning_rate": 1.09579082189315e-09, + "loss": 0.65322793, + "num_input_tokens_seen": 355204505, + "step": 16462, + "time_per_iteration": 2.499138832092285 + }, + { + "auxiliary_loss_clip": 0.01100703, + "auxiliary_loss_mlp": 0.01028794, + "balance_loss_clip": 1.03751135, + "balance_loss_mlp": 1.01703429, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 1.5258382668770378, + "language_loss": 0.72849005, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.74978501, + "num_input_tokens_seen": 355223055, + "step": 16463, + "time_per_iteration": 2.433401346206665 + }, + { + "auxiliary_loss_clip": 0.01095304, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.03463018, + "balance_loss_mlp": 1.01658034, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 1.9410937377242095, + "language_loss": 0.70183468, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.723077, + "num_input_tokens_seen": 355242000, + "step": 16464, + "time_per_iteration": 2.473426342010498 + }, + { + "auxiliary_loss_clip": 0.01079442, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.03511953, + "balance_loss_mlp": 1.01759684, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 2.2202803313754886, + "language_loss": 0.73404181, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75513482, + "num_input_tokens_seen": 355260175, + "step": 16465, + "time_per_iteration": 2.5140671730041504 + }, + { + "auxiliary_loss_clip": 0.01104688, + "auxiliary_loss_mlp": 0.01038452, + "balance_loss_clip": 1.03377962, + "balance_loss_mlp": 1.02722871, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 3.611502123301835, + "language_loss": 0.86512673, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.88655812, + "num_input_tokens_seen": 355281930, + "step": 16466, + "time_per_iteration": 2.465346097946167 + }, + { + "auxiliary_loss_clip": 0.01073753, + "auxiliary_loss_mlp": 0.01023475, + "balance_loss_clip": 1.03582501, + "balance_loss_mlp": 1.01152992, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 1.8477352587713127, + "language_loss": 0.71691179, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.73788404, + "num_input_tokens_seen": 355301555, + "step": 16467, + "time_per_iteration": 2.5374042987823486 + }, + { + "auxiliary_loss_clip": 0.01081878, + "auxiliary_loss_mlp": 0.01036136, + "balance_loss_clip": 1.03265095, + "balance_loss_mlp": 1.02332711, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 1.420053331810728, + "language_loss": 0.6506474, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67182755, + "num_input_tokens_seen": 355324925, + "step": 16468, + "time_per_iteration": 2.5991110801696777 + }, + { + "auxiliary_loss_clip": 0.01082665, + "auxiliary_loss_mlp": 0.01035082, + "balance_loss_clip": 1.0371182, + "balance_loss_mlp": 1.02267838, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 1.8713862790084035, + "language_loss": 0.62095702, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.64213443, + "num_input_tokens_seen": 355343875, + "step": 16469, + "time_per_iteration": 2.599855899810791 + }, + { + "auxiliary_loss_clip": 0.01074669, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.03335094, + "balance_loss_mlp": 1.01771545, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 2.8305052692754717, + "language_loss": 0.70119023, + "learning_rate": 9.950925847685976e-10, + "loss": 0.72223759, + "num_input_tokens_seen": 355358835, + "step": 16470, + "time_per_iteration": 2.512786865234375 + }, + { + "auxiliary_loss_clip": 0.01018812, + "auxiliary_loss_mlp": 0.01002154, + "balance_loss_clip": 1.00533271, + "balance_loss_mlp": 1.0009861, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6589887643572149, + "language_loss": 0.55521715, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57542682, + "num_input_tokens_seen": 355431225, + "step": 16471, + "time_per_iteration": 3.2280433177948 + }, + { + "auxiliary_loss_clip": 0.01088466, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.03796363, + "balance_loss_mlp": 1.02024591, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 2.7625294154889874, + "language_loss": 0.84444189, + "learning_rate": 9.706760407131032e-10, + "loss": 0.86565399, + "num_input_tokens_seen": 355448250, + "step": 16472, + "time_per_iteration": 2.486668109893799 + }, + { + "auxiliary_loss_clip": 0.010964, + "auxiliary_loss_mlp": 0.01027912, + "balance_loss_clip": 1.0360465, + "balance_loss_mlp": 1.0157764, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 2.0080752014017103, + "language_loss": 0.85694182, + "learning_rate": 9.585814735431075e-10, + "loss": 0.87818491, + "num_input_tokens_seen": 355467040, + "step": 16473, + "time_per_iteration": 2.4761054515838623 + }, + { + "auxiliary_loss_clip": 0.01105337, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.03429151, + "balance_loss_mlp": 1.01825452, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 2.373813609362592, + "language_loss": 0.84799665, + "learning_rate": 9.465627102240859e-10, + "loss": 0.86934596, + "num_input_tokens_seen": 355487825, + "step": 16474, + "time_per_iteration": 2.5049774646759033 + }, + { + "auxiliary_loss_clip": 0.0108233, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.03310692, + "balance_loss_mlp": 1.02523971, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 1.6775940538634748, + "language_loss": 0.76411921, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78530818, + "num_input_tokens_seen": 355507445, + "step": 16475, + "time_per_iteration": 3.969703197479248 + }, + { + "auxiliary_loss_clip": 0.0106918, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.03303957, + "balance_loss_mlp": 1.02245557, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 1.5785187733390358, + "language_loss": 0.75648457, + "learning_rate": 9.227525969588423e-10, + "loss": 0.77753127, + "num_input_tokens_seen": 355527205, + "step": 16476, + "time_per_iteration": 2.552018404006958 + }, + { + "auxiliary_loss_clip": 0.01100103, + "auxiliary_loss_mlp": 0.0077965, + "balance_loss_clip": 1.03484225, + "balance_loss_mlp": 1.00065613, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 2.1034459414826574, + "language_loss": 0.67540693, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69420439, + "num_input_tokens_seen": 355544740, + "step": 16477, + "time_per_iteration": 2.454625129699707 + }, + { + "auxiliary_loss_clip": 0.01094455, + "auxiliary_loss_mlp": 0.01034399, + "balance_loss_clip": 1.0386095, + "balance_loss_mlp": 1.02130437, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 2.1074579273809766, + "language_loss": 0.71866941, + "learning_rate": 8.992457045289282e-10, + "loss": 0.73995793, + "num_input_tokens_seen": 355564385, + "step": 16478, + "time_per_iteration": 2.51879620552063 + }, + { + "auxiliary_loss_clip": 0.01109038, + "auxiliary_loss_mlp": 0.010377, + "balance_loss_clip": 1.03613853, + "balance_loss_mlp": 1.02459264, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 2.671256271424058, + "language_loss": 0.8094126, + "learning_rate": 8.876059672433545e-10, + "loss": 0.83087999, + "num_input_tokens_seen": 355579260, + "step": 16479, + "time_per_iteration": 2.371673583984375 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.03520072, + "balance_loss_mlp": 1.0219779, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 2.616441281794303, + "language_loss": 0.66170567, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68303007, + "num_input_tokens_seen": 355599790, + "step": 16480, + "time_per_iteration": 2.5456008911132812 + }, + { + "auxiliary_loss_clip": 0.01094402, + "auxiliary_loss_mlp": 0.01029135, + "balance_loss_clip": 1.03426409, + "balance_loss_mlp": 1.01700544, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 1.9118804778348444, + "language_loss": 0.72335815, + "learning_rate": 8.645539127374313e-10, + "loss": 0.74459344, + "num_input_tokens_seen": 355620925, + "step": 16481, + "time_per_iteration": 2.6277565956115723 + }, + { + "auxiliary_loss_clip": 0.01095361, + "auxiliary_loss_mlp": 0.01026119, + "balance_loss_clip": 1.03435957, + "balance_loss_mlp": 1.01424003, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 1.812648604626426, + "language_loss": 0.77622449, + "learning_rate": 8.531415963912713e-10, + "loss": 0.79743934, + "num_input_tokens_seen": 355639165, + "step": 16482, + "time_per_iteration": 2.4554221630096436 + }, + { + "auxiliary_loss_clip": 0.01098998, + "auxiliary_loss_mlp": 0.01033158, + "balance_loss_clip": 1.03472507, + "balance_loss_mlp": 1.02114153, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 1.999590915193097, + "language_loss": 0.7528283, + "learning_rate": 8.418050878944427e-10, + "loss": 0.77414978, + "num_input_tokens_seen": 355657320, + "step": 16483, + "time_per_iteration": 2.4292361736297607 + }, + { + "auxiliary_loss_clip": 0.01017439, + "auxiliary_loss_mlp": 0.01002154, + "balance_loss_clip": 1.00305581, + "balance_loss_mlp": 1.00087857, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.6742405601562032, + "language_loss": 0.53582662, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55602258, + "num_input_tokens_seen": 355726370, + "step": 16484, + "time_per_iteration": 3.167501926422119 + }, + { + "auxiliary_loss_clip": 0.01103328, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.03469729, + "balance_loss_mlp": 1.01877594, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 1.9272344120797793, + "language_loss": 0.82250577, + "learning_rate": 8.19359496165184e-10, + "loss": 0.84384429, + "num_input_tokens_seen": 355745840, + "step": 16485, + "time_per_iteration": 2.4516820907592773 + }, + { + "auxiliary_loss_clip": 0.01067518, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.03205442, + "balance_loss_mlp": 1.02637959, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 1.6388027499033417, + "language_loss": 0.81427693, + "learning_rate": 8.082504137836288e-10, + "loss": 0.835347, + "num_input_tokens_seen": 355763385, + "step": 16486, + "time_per_iteration": 3.9745044708251953 + }, + { + "auxiliary_loss_clip": 0.01099454, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.036309, + "balance_loss_mlp": 1.01725519, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 1.3668564984296556, + "language_loss": 0.65841979, + "learning_rate": 7.972171409538209e-10, + "loss": 0.67970133, + "num_input_tokens_seen": 355786075, + "step": 16487, + "time_per_iteration": 2.6452150344848633 + }, + { + "auxiliary_loss_clip": 0.01095122, + "auxiliary_loss_mlp": 0.00776777, + "balance_loss_clip": 1.03616595, + "balance_loss_mlp": 1.0006088, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 1.7617300143002816, + "language_loss": 0.7684778, + "learning_rate": 7.862596780936481e-10, + "loss": 0.78719676, + "num_input_tokens_seen": 355806295, + "step": 16488, + "time_per_iteration": 3.793163537979126 + }, + { + "auxiliary_loss_clip": 0.01081219, + "auxiliary_loss_mlp": 0.01030184, + "balance_loss_clip": 1.03588283, + "balance_loss_mlp": 1.01749444, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 2.482060763419829, + "language_loss": 0.68472302, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70583701, + "num_input_tokens_seen": 355825730, + "step": 16489, + "time_per_iteration": 2.5628769397735596 + }, + { + "auxiliary_loss_clip": 0.00993709, + "auxiliary_loss_mlp": 0.01006926, + "balance_loss_clip": 1.00687432, + "balance_loss_mlp": 1.00579345, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 0.6092186984335277, + "language_loss": 0.52594036, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54594672, + "num_input_tokens_seen": 355891545, + "step": 16490, + "time_per_iteration": 3.1912612915039062 + }, + { + "auxiliary_loss_clip": 0.01085289, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.03659856, + "balance_loss_mlp": 1.02418041, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 1.5435891084421307, + "language_loss": 0.75162768, + "learning_rate": 7.538421534734052e-10, + "loss": 0.77285773, + "num_input_tokens_seen": 355909920, + "step": 16491, + "time_per_iteration": 2.572789430618286 + }, + { + "auxiliary_loss_clip": 0.01075375, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.04561341, + "balance_loss_mlp": 1.01763606, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 2.698373928052923, + "language_loss": 0.70434666, + "learning_rate": 7.431879346191383e-10, + "loss": 0.72540969, + "num_input_tokens_seen": 355923130, + "step": 16492, + "time_per_iteration": 2.609145402908325 + }, + { + "auxiliary_loss_clip": 0.01075561, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.0342083, + "balance_loss_mlp": 1.01984668, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 2.07928349477568, + "language_loss": 0.6803596, + "learning_rate": 7.326095277837563e-10, + "loss": 0.70144689, + "num_input_tokens_seen": 355941960, + "step": 16493, + "time_per_iteration": 2.5573928356170654 + }, + { + "auxiliary_loss_clip": 0.0108449, + "auxiliary_loss_mlp": 0.01036761, + "balance_loss_clip": 1.03726721, + "balance_loss_mlp": 1.02431548, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 3.4641290699864187, + "language_loss": 0.71203488, + "learning_rate": 7.221069333678276e-10, + "loss": 0.7332474, + "num_input_tokens_seen": 355961640, + "step": 16494, + "time_per_iteration": 2.5689661502838135 + }, + { + "auxiliary_loss_clip": 0.01099312, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.03656936, + "balance_loss_mlp": 1.01714182, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 2.1698186741000938, + "language_loss": 0.68080223, + "learning_rate": 7.116801517701443e-10, + "loss": 0.70209885, + "num_input_tokens_seen": 355977980, + "step": 16495, + "time_per_iteration": 2.450010061264038 + }, + { + "auxiliary_loss_clip": 0.01010476, + "auxiliary_loss_mlp": 0.0100376, + "balance_loss_clip": 1.00567508, + "balance_loss_mlp": 1.00260985, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.7157116203561174, + "language_loss": 0.53388959, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55403197, + "num_input_tokens_seen": 356042900, + "step": 16496, + "time_per_iteration": 3.190354585647583 + }, + { + "auxiliary_loss_clip": 0.01085594, + "auxiliary_loss_mlp": 0.00782953, + "balance_loss_clip": 1.03465617, + "balance_loss_mlp": 1.00059581, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 1.8025641347802464, + "language_loss": 0.71540773, + "learning_rate": 6.91054028607585e-10, + "loss": 0.73409325, + "num_input_tokens_seen": 356063000, + "step": 16497, + "time_per_iteration": 2.5633347034454346 + }, + { + "auxiliary_loss_clip": 0.01082557, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.03693748, + "balance_loss_mlp": 1.02074528, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 2.1875739573594344, + "language_loss": 0.82506192, + "learning_rate": 6.808546878249721e-10, + "loss": 0.84623015, + "num_input_tokens_seen": 356078130, + "step": 16498, + "time_per_iteration": 2.5185739994049072 + }, + { + "auxiliary_loss_clip": 0.01077259, + "auxiliary_loss_mlp": 0.01037641, + "balance_loss_clip": 1.03915238, + "balance_loss_mlp": 1.02539778, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 1.520151262358136, + "language_loss": 0.68105268, + "learning_rate": 6.707311614246869e-10, + "loss": 0.70220166, + "num_input_tokens_seen": 356101655, + "step": 16499, + "time_per_iteration": 2.6137125492095947 + }, + { + "auxiliary_loss_clip": 0.0111013, + "auxiliary_loss_mlp": 0.01028168, + "balance_loss_clip": 1.03724837, + "balance_loss_mlp": 1.01619935, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 2.093384775884514, + "language_loss": 0.81832802, + "learning_rate": 6.606834497904223e-10, + "loss": 0.83971101, + "num_input_tokens_seen": 356121425, + "step": 16500, + "time_per_iteration": 3.9625794887542725 + }, + { + "auxiliary_loss_clip": 0.0108289, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.03449047, + "balance_loss_mlp": 1.0191977, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 1.8747262924960835, + "language_loss": 0.81670427, + "learning_rate": 6.507115533036511e-10, + "loss": 0.83785188, + "num_input_tokens_seen": 356140710, + "step": 16501, + "time_per_iteration": 2.5567235946655273 + }, + { + "auxiliary_loss_clip": 0.011009, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.03683817, + "balance_loss_mlp": 1.01811481, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 1.8494328131900075, + "language_loss": 0.77122843, + "learning_rate": 6.408154723420711e-10, + "loss": 0.7925458, + "num_input_tokens_seen": 356159835, + "step": 16502, + "time_per_iteration": 2.468890428543091 + }, + { + "auxiliary_loss_clip": 0.01087232, + "auxiliary_loss_mlp": 0.0103105, + "balance_loss_clip": 1.03855205, + "balance_loss_mlp": 1.01776361, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 3.619952375043459, + "language_loss": 0.71609032, + "learning_rate": 6.309952072811597e-10, + "loss": 0.7372731, + "num_input_tokens_seen": 356177555, + "step": 16503, + "time_per_iteration": 2.4789042472839355 + }, + { + "auxiliary_loss_clip": 0.01018545, + "auxiliary_loss_mlp": 0.01006872, + "balance_loss_clip": 1.00358164, + "balance_loss_mlp": 1.00543559, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.6377132721566651, + "language_loss": 0.55059004, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57084417, + "num_input_tokens_seen": 356244975, + "step": 16504, + "time_per_iteration": 3.1355698108673096 + }, + { + "auxiliary_loss_clip": 0.01077346, + "auxiliary_loss_mlp": 0.01025298, + "balance_loss_clip": 1.03548491, + "balance_loss_mlp": 1.01449156, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 34.8306852030376, + "language_loss": 0.69468474, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71571124, + "num_input_tokens_seen": 356262605, + "step": 16505, + "time_per_iteration": 2.5230135917663574 + }, + { + "auxiliary_loss_clip": 0.01076879, + "auxiliary_loss_mlp": 0.01037673, + "balance_loss_clip": 1.0353539, + "balance_loss_mlp": 1.02372551, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 2.1848769530652388, + "language_loss": 0.65499508, + "learning_rate": 6.019893112119146e-10, + "loss": 0.67614061, + "num_input_tokens_seen": 356278935, + "step": 16506, + "time_per_iteration": 2.54762864112854 + }, + { + "auxiliary_loss_clip": 0.01048197, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.03547609, + "balance_loss_mlp": 1.01652098, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 2.9823790546258278, + "language_loss": 0.6300683, + "learning_rate": 5.924723134487219e-10, + "loss": 0.65084398, + "num_input_tokens_seen": 356295675, + "step": 16507, + "time_per_iteration": 2.629622459411621 + }, + { + "auxiliary_loss_clip": 0.01109121, + "auxiliary_loss_mlp": 0.01033092, + "balance_loss_clip": 1.03636742, + "balance_loss_mlp": 1.02064621, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 2.1632872205482334, + "language_loss": 0.72686148, + "learning_rate": 5.830311334193983e-10, + "loss": 0.74828357, + "num_input_tokens_seen": 356312885, + "step": 16508, + "time_per_iteration": 2.455556869506836 + }, + { + "auxiliary_loss_clip": 0.01107351, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.03520489, + "balance_loss_mlp": 1.01785111, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 1.6766719789202167, + "language_loss": 0.70277965, + "learning_rate": 5.736657714818793e-10, + "loss": 0.72416288, + "num_input_tokens_seen": 356334070, + "step": 16509, + "time_per_iteration": 2.456666946411133 + }, + { + "auxiliary_loss_clip": 0.01095314, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.03353596, + "balance_loss_mlp": 1.02048945, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 1.8020796786517128, + "language_loss": 0.68391579, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70520186, + "num_input_tokens_seen": 356359410, + "step": 16510, + "time_per_iteration": 2.820282459259033 + }, + { + "auxiliary_loss_clip": 0.01076123, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.03484678, + "balance_loss_mlp": 1.02143788, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 2.441002175507692, + "language_loss": 0.81109446, + "learning_rate": 5.551625032997886e-10, + "loss": 0.832196, + "num_input_tokens_seen": 356378345, + "step": 16511, + "time_per_iteration": 2.5388834476470947 + }, + { + "auxiliary_loss_clip": 0.01072413, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.03969622, + "balance_loss_mlp": 1.01861811, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 1.786362282437672, + "language_loss": 0.91140771, + "learning_rate": 5.460245977570998e-10, + "loss": 0.93243921, + "num_input_tokens_seen": 356397345, + "step": 16512, + "time_per_iteration": 2.5754408836364746 + }, + { + "auxiliary_loss_clip": 0.01001666, + "auxiliary_loss_mlp": 0.00999696, + "balance_loss_clip": 1.00705206, + "balance_loss_mlp": 0.99850434, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.7017588094101357, + "language_loss": 0.55154437, + "learning_rate": 5.369625117095378e-10, + "loss": 0.571558, + "num_input_tokens_seen": 356459160, + "step": 16513, + "time_per_iteration": 3.1965267658233643 + }, + { + "auxiliary_loss_clip": 0.01080888, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.03518081, + "balance_loss_mlp": 1.01698422, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 1.3183822941944858, + "language_loss": 0.64907074, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67017335, + "num_input_tokens_seen": 356486405, + "step": 16514, + "time_per_iteration": 2.830111265182495 + }, + { + "auxiliary_loss_clip": 0.01081101, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.03427124, + "balance_loss_mlp": 1.01371539, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 2.0227726952884613, + "language_loss": 0.73409009, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75517297, + "num_input_tokens_seen": 356502905, + "step": 16515, + "time_per_iteration": 3.954413414001465 + }, + { + "auxiliary_loss_clip": 0.01068559, + "auxiliary_loss_mlp": 0.0104107, + "balance_loss_clip": 1.03402328, + "balance_loss_mlp": 1.02718842, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 1.3912128669665171, + "language_loss": 0.77207792, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79317427, + "num_input_tokens_seen": 356523830, + "step": 16516, + "time_per_iteration": 2.5414247512817383 + }, + { + "auxiliary_loss_clip": 0.01079583, + "auxiliary_loss_mlp": 0.01027769, + "balance_loss_clip": 1.0363704, + "balance_loss_mlp": 1.01659334, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 1.5063171541671236, + "language_loss": 0.78040218, + "learning_rate": 5.014723692997602e-10, + "loss": 0.8014757, + "num_input_tokens_seen": 356543965, + "step": 16517, + "time_per_iteration": 2.575014114379883 + }, + { + "auxiliary_loss_clip": 0.01097021, + "auxiliary_loss_mlp": 0.01036835, + "balance_loss_clip": 1.03785443, + "balance_loss_mlp": 1.02297723, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 2.8732789393317772, + "language_loss": 0.67359257, + "learning_rate": 4.927893858248655e-10, + "loss": 0.69493109, + "num_input_tokens_seen": 356561530, + "step": 16518, + "time_per_iteration": 2.44767427444458 + }, + { + "auxiliary_loss_clip": 0.01018085, + "auxiliary_loss_mlp": 0.01001965, + "balance_loss_clip": 1.01823497, + "balance_loss_mlp": 1.00077844, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.7449155801083708, + "language_loss": 0.53414714, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55434763, + "num_input_tokens_seen": 356616845, + "step": 16519, + "time_per_iteration": 2.9649062156677246 + }, + { + "auxiliary_loss_clip": 0.01066954, + "auxiliary_loss_mlp": 0.0104189, + "balance_loss_clip": 1.03205431, + "balance_loss_mlp": 1.02816343, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 2.4577659058884085, + "language_loss": 0.60248637, + "learning_rate": 4.756508837426842e-10, + "loss": 0.62357485, + "num_input_tokens_seen": 356633560, + "step": 16520, + "time_per_iteration": 2.512970209121704 + }, + { + "auxiliary_loss_clip": 0.01080791, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.03469419, + "balance_loss_mlp": 1.02017117, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 1.7235738620599077, + "language_loss": 0.62340665, + "learning_rate": 4.671953657853223e-10, + "loss": 0.64454055, + "num_input_tokens_seen": 356657600, + "step": 16521, + "time_per_iteration": 2.6189990043640137 + }, + { + "auxiliary_loss_clip": 0.01087133, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.03885591, + "balance_loss_mlp": 1.02142739, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 5.030615161892199, + "language_loss": 0.73875046, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.75996339, + "num_input_tokens_seen": 356675880, + "step": 16522, + "time_per_iteration": 2.499324083328247 + }, + { + "auxiliary_loss_clip": 0.01073974, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.03457832, + "balance_loss_mlp": 1.01797557, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 1.6641213503257282, + "language_loss": 0.73243552, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.75347316, + "num_input_tokens_seen": 356696000, + "step": 16523, + "time_per_iteration": 2.5610227584838867 + }, + { + "auxiliary_loss_clip": 0.01085141, + "auxiliary_loss_mlp": 0.00777373, + "balance_loss_clip": 1.03314352, + "balance_loss_mlp": 1.00066054, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 1.5223992341869075, + "language_loss": 0.70703763, + "learning_rate": 4.422837480875241e-10, + "loss": 0.72566283, + "num_input_tokens_seen": 356716845, + "step": 16524, + "time_per_iteration": 2.5110936164855957 + }, + { + "auxiliary_loss_clip": 0.01076471, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.03616714, + "balance_loss_mlp": 1.02013469, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 2.177388413854162, + "language_loss": 0.79701674, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81809884, + "num_input_tokens_seen": 356732100, + "step": 16525, + "time_per_iteration": 4.01172137260437 + }, + { + "auxiliary_loss_clip": 0.01068076, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.03932333, + "balance_loss_mlp": 1.01711524, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 2.021337497134367, + "language_loss": 0.74635458, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.76732856, + "num_input_tokens_seen": 356751480, + "step": 16526, + "time_per_iteration": 2.548645496368408 + }, + { + "auxiliary_loss_clip": 0.01104046, + "auxiliary_loss_mlp": 0.00778332, + "balance_loss_clip": 1.03474295, + "balance_loss_mlp": 1.00051534, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 1.6024196324090285, + "language_loss": 0.72278666, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74161047, + "num_input_tokens_seen": 356772650, + "step": 16527, + "time_per_iteration": 3.902634382247925 + }, + { + "auxiliary_loss_clip": 0.01086108, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.03442478, + "balance_loss_mlp": 1.01719105, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 2.124341767483731, + "language_loss": 0.75973904, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78088754, + "num_input_tokens_seen": 356788510, + "step": 16528, + "time_per_iteration": 2.4620678424835205 + }, + { + "auxiliary_loss_clip": 0.01089319, + "auxiliary_loss_mlp": 0.01028027, + "balance_loss_clip": 1.03352916, + "balance_loss_mlp": 1.01484203, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 2.489132418058887, + "language_loss": 0.67674154, + "learning_rate": 4.022808578922898e-10, + "loss": 0.69791502, + "num_input_tokens_seen": 356809115, + "step": 16529, + "time_per_iteration": 2.5532896518707275 + }, + { + "auxiliary_loss_clip": 0.01103357, + "auxiliary_loss_mlp": 0.01038472, + "balance_loss_clip": 1.03826892, + "balance_loss_mlp": 1.02446461, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 2.7911039881227397, + "language_loss": 0.65761518, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.67903346, + "num_input_tokens_seen": 356826410, + "step": 16530, + "time_per_iteration": 2.443532705307007 + }, + { + "auxiliary_loss_clip": 0.01093849, + "auxiliary_loss_mlp": 0.01030364, + "balance_loss_clip": 1.03472435, + "balance_loss_mlp": 1.01891994, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 2.1388352190555784, + "language_loss": 0.71436787, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.73560995, + "num_input_tokens_seen": 356844990, + "step": 16531, + "time_per_iteration": 2.4775443077087402 + }, + { + "auxiliary_loss_clip": 0.01095998, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.03564119, + "balance_loss_mlp": 1.01705217, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 1.4830821189073642, + "language_loss": 0.74149787, + "learning_rate": 3.791890207045512e-10, + "loss": 0.76275575, + "num_input_tokens_seen": 356866530, + "step": 16532, + "time_per_iteration": 2.5062596797943115 + }, + { + "auxiliary_loss_clip": 0.01059047, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.03825188, + "balance_loss_mlp": 1.02187204, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 1.579195357748166, + "language_loss": 0.70173317, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.72264999, + "num_input_tokens_seen": 356884660, + "step": 16533, + "time_per_iteration": 2.5582005977630615 + }, + { + "auxiliary_loss_clip": 0.01097291, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.03694987, + "balance_loss_mlp": 1.02060139, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 2.1458575613627615, + "language_loss": 0.83882225, + "learning_rate": 3.641735912007782e-10, + "loss": 0.86013079, + "num_input_tokens_seen": 356900895, + "step": 16534, + "time_per_iteration": 2.434142589569092 + }, + { + "auxiliary_loss_clip": 0.0106899, + "auxiliary_loss_mlp": 0.0102548, + "balance_loss_clip": 1.03301477, + "balance_loss_mlp": 1.01433957, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 1.6686492620242188, + "language_loss": 0.66281766, + "learning_rate": 3.567796158934211e-10, + "loss": 0.68376231, + "num_input_tokens_seen": 356920985, + "step": 16535, + "time_per_iteration": 2.5766513347625732 + }, + { + "auxiliary_loss_clip": 0.01070288, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.03719425, + "balance_loss_mlp": 1.01681304, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 1.7071379757490948, + "language_loss": 0.64828551, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.66926646, + "num_input_tokens_seen": 356939800, + "step": 16536, + "time_per_iteration": 2.538581132888794 + }, + { + "auxiliary_loss_clip": 0.01066028, + "auxiliary_loss_mlp": 0.01039345, + "balance_loss_clip": 1.03365159, + "balance_loss_mlp": 1.02558804, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 1.9490023771315717, + "language_loss": 0.78750551, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.8085593, + "num_input_tokens_seen": 356957780, + "step": 16537, + "time_per_iteration": 2.4858288764953613 + }, + { + "auxiliary_loss_clip": 0.0110388, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.03716946, + "balance_loss_mlp": 1.01933527, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 1.675114462659554, + "language_loss": 0.68924105, + "learning_rate": 3.35052651107004e-10, + "loss": 0.71060008, + "num_input_tokens_seen": 356979185, + "step": 16538, + "time_per_iteration": 2.4867732524871826 + }, + { + "auxiliary_loss_clip": 0.01066256, + "auxiliary_loss_mlp": 0.01036777, + "balance_loss_clip": 1.03058207, + "balance_loss_mlp": 1.02368784, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 2.197202942709024, + "language_loss": 0.74690056, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.76793098, + "num_input_tokens_seen": 356997735, + "step": 16539, + "time_per_iteration": 2.541247606277466 + }, + { + "auxiliary_loss_clip": 0.01060901, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.03511, + "balance_loss_mlp": 1.01814759, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 2.086130713657473, + "language_loss": 0.70352137, + "learning_rate": 3.209471449341361e-10, + "loss": 0.72443545, + "num_input_tokens_seen": 357015660, + "step": 16540, + "time_per_iteration": 3.9811110496520996 + }, + { + "auxiliary_loss_clip": 0.01093041, + "auxiliary_loss_mlp": 0.01028955, + "balance_loss_clip": 1.03429163, + "balance_loss_mlp": 1.01829171, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 1.7496752832793818, + "language_loss": 0.75088596, + "learning_rate": 3.140081337600353e-10, + "loss": 0.77210593, + "num_input_tokens_seen": 357034800, + "step": 16541, + "time_per_iteration": 2.4793646335601807 + }, + { + "auxiliary_loss_clip": 0.01081494, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.03406787, + "balance_loss_mlp": 1.01755917, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 1.9052052646572102, + "language_loss": 0.76641321, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78752857, + "num_input_tokens_seen": 357053785, + "step": 16542, + "time_per_iteration": 2.550480842590332 + }, + { + "auxiliary_loss_clip": 0.01099627, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.03655899, + "balance_loss_mlp": 1.01788485, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 2.4090015325523217, + "language_loss": 0.74703097, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.76833391, + "num_input_tokens_seen": 357072025, + "step": 16543, + "time_per_iteration": 2.4958016872406006 + }, + { + "auxiliary_loss_clip": 0.01095399, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.03454232, + "balance_loss_mlp": 1.02044344, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 2.4104569753736707, + "language_loss": 0.81875765, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.84004998, + "num_input_tokens_seen": 357086960, + "step": 16544, + "time_per_iteration": 2.4227635860443115 + }, + { + "auxiliary_loss_clip": 0.01106728, + "auxiliary_loss_mlp": 0.01029106, + "balance_loss_clip": 1.03541636, + "balance_loss_mlp": 1.01719141, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 2.129018908603042, + "language_loss": 0.79157364, + "learning_rate": 2.870103745831187e-10, + "loss": 0.81293201, + "num_input_tokens_seen": 357105095, + "step": 16545, + "time_per_iteration": 2.399075984954834 + }, + { + "auxiliary_loss_clip": 0.01080661, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.0340544, + "balance_loss_mlp": 1.01931643, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 1.7794632100073662, + "language_loss": 0.72275627, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74387902, + "num_input_tokens_seen": 357125065, + "step": 16546, + "time_per_iteration": 2.599919080734253 + }, + { + "auxiliary_loss_clip": 0.01094828, + "auxiliary_loss_mlp": 0.01037683, + "balance_loss_clip": 1.03374481, + "balance_loss_mlp": 1.02588165, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 1.7867741667566241, + "language_loss": 0.77570868, + "learning_rate": 2.739664698798716e-10, + "loss": 0.79703385, + "num_input_tokens_seen": 357141600, + "step": 16547, + "time_per_iteration": 2.4641311168670654 + }, + { + "auxiliary_loss_clip": 0.0108497, + "auxiliary_loss_mlp": 0.01029645, + "balance_loss_clip": 1.03284895, + "balance_loss_mlp": 1.01810503, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 2.3059234669184727, + "language_loss": 0.70289046, + "learning_rate": 2.67558262122769e-10, + "loss": 0.72403657, + "num_input_tokens_seen": 357157880, + "step": 16548, + "time_per_iteration": 2.514492988586426 + }, + { + "auxiliary_loss_clip": 0.01094589, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.03386331, + "balance_loss_mlp": 1.018682, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 1.8314570247967659, + "language_loss": 0.75614405, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.77740288, + "num_input_tokens_seen": 357176705, + "step": 16549, + "time_per_iteration": 2.4474031925201416 + }, + { + "auxiliary_loss_clip": 0.01081472, + "auxiliary_loss_mlp": 0.01034447, + "balance_loss_clip": 1.03621352, + "balance_loss_mlp": 1.02173889, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 1.630119912739675, + "language_loss": 0.74383909, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.76499826, + "num_input_tokens_seen": 357197630, + "step": 16550, + "time_per_iteration": 2.612227201461792 + }, + { + "auxiliary_loss_clip": 0.01060692, + "auxiliary_loss_mlp": 0.00778138, + "balance_loss_clip": 1.03224063, + "balance_loss_mlp": 1.00054002, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 2.1326957180423363, + "language_loss": 0.78104389, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.79943228, + "num_input_tokens_seen": 357215445, + "step": 16551, + "time_per_iteration": 2.5859529972076416 + }, + { + "auxiliary_loss_clip": 0.01090917, + "auxiliary_loss_mlp": 0.0103302, + "balance_loss_clip": 1.03544617, + "balance_loss_mlp": 1.02212989, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 1.7598828314680413, + "language_loss": 0.66421795, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68545735, + "num_input_tokens_seen": 357234285, + "step": 16552, + "time_per_iteration": 2.4784226417541504 + }, + { + "auxiliary_loss_clip": 0.01109413, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.03623927, + "balance_loss_mlp": 1.0172298, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 1.5393972508647564, + "language_loss": 0.81562579, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83701605, + "num_input_tokens_seen": 357257565, + "step": 16553, + "time_per_iteration": 2.554415464401245 + }, + { + "auxiliary_loss_clip": 0.01019278, + "auxiliary_loss_mlp": 0.01000945, + "balance_loss_clip": 1.00647223, + "balance_loss_mlp": 0.99985391, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.7562495260741676, + "language_loss": 0.57256746, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59276962, + "num_input_tokens_seen": 357320205, + "step": 16554, + "time_per_iteration": 3.1543869972229004 + }, + { + "auxiliary_loss_clip": 0.01092218, + "auxiliary_loss_mlp": 0.01035457, + "balance_loss_clip": 1.0354383, + "balance_loss_mlp": 1.02329218, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 1.8939751688985038, + "language_loss": 0.77160931, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79288602, + "num_input_tokens_seen": 357340695, + "step": 16555, + "time_per_iteration": 3.9695498943328857 + }, + { + "auxiliary_loss_clip": 0.01077936, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.03224909, + "balance_loss_mlp": 1.0210402, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 2.0909769744408253, + "language_loss": 0.86139023, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.8825084, + "num_input_tokens_seen": 357357505, + "step": 16556, + "time_per_iteration": 2.567711114883423 + }, + { + "auxiliary_loss_clip": 0.01064129, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.03544116, + "balance_loss_mlp": 1.01817441, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 1.9573795148664814, + "language_loss": 0.73098439, + "learning_rate": 2.132967729762125e-10, + "loss": 0.75193036, + "num_input_tokens_seen": 357375395, + "step": 16557, + "time_per_iteration": 2.5631985664367676 + }, + { + "auxiliary_loss_clip": 0.01096896, + "auxiliary_loss_mlp": 0.01032541, + "balance_loss_clip": 1.0367198, + "balance_loss_mlp": 1.02138877, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 1.8915164214066278, + "language_loss": 0.76506543, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.78635979, + "num_input_tokens_seen": 357397375, + "step": 16558, + "time_per_iteration": 2.531916379928589 + }, + { + "auxiliary_loss_clip": 0.0108295, + "auxiliary_loss_mlp": 0.01032207, + "balance_loss_clip": 1.03284061, + "balance_loss_mlp": 1.01967192, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 1.9172204612991828, + "language_loss": 0.63523638, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.65638793, + "num_input_tokens_seen": 357418880, + "step": 16559, + "time_per_iteration": 2.5681025981903076 + }, + { + "auxiliary_loss_clip": 0.01095025, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.03429317, + "balance_loss_mlp": 1.01960337, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 2.062663259949102, + "language_loss": 0.74718463, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76845217, + "num_input_tokens_seen": 357438310, + "step": 16560, + "time_per_iteration": 2.480686902999878 + }, + { + "auxiliary_loss_clip": 0.01052161, + "auxiliary_loss_mlp": 0.01027745, + "balance_loss_clip": 1.03306079, + "balance_loss_mlp": 1.015908, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 1.7308768641737984, + "language_loss": 0.78861952, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.80941856, + "num_input_tokens_seen": 357457155, + "step": 16561, + "time_per_iteration": 2.6044087409973145 + }, + { + "auxiliary_loss_clip": 0.01105035, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.03619099, + "balance_loss_mlp": 1.02037323, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 2.770644016465643, + "language_loss": 0.65794146, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.67931134, + "num_input_tokens_seen": 357468060, + "step": 16562, + "time_per_iteration": 2.3771493434906006 + }, + { + "auxiliary_loss_clip": 0.01085201, + "auxiliary_loss_mlp": 0.007784, + "balance_loss_clip": 1.0371573, + "balance_loss_mlp": 1.00064158, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 1.8053887469368541, + "language_loss": 0.64225006, + "learning_rate": 1.805348815528962e-10, + "loss": 0.66088611, + "num_input_tokens_seen": 357489665, + "step": 16563, + "time_per_iteration": 2.565345048904419 + }, + { + "auxiliary_loss_clip": 0.01086328, + "auxiliary_loss_mlp": 0.01031909, + "balance_loss_clip": 1.03680396, + "balance_loss_mlp": 1.01923132, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 1.9111819451637297, + "language_loss": 0.64794803, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.66913038, + "num_input_tokens_seen": 357511975, + "step": 16564, + "time_per_iteration": 4.0504090785980225 + }, + { + "auxiliary_loss_clip": 0.01085483, + "auxiliary_loss_mlp": 0.0077657, + "balance_loss_clip": 1.03653145, + "balance_loss_mlp": 1.00056946, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 1.893024394401384, + "language_loss": 0.74226022, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.76088077, + "num_input_tokens_seen": 357529345, + "step": 16565, + "time_per_iteration": 2.4863758087158203 + }, + { + "auxiliary_loss_clip": 0.01087152, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.03367829, + "balance_loss_mlp": 1.01784921, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 1.7863952226414743, + "language_loss": 0.79627848, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81743753, + "num_input_tokens_seen": 357547615, + "step": 16566, + "time_per_iteration": 3.8830456733703613 + }, + { + "auxiliary_loss_clip": 0.01058354, + "auxiliary_loss_mlp": 0.00776178, + "balance_loss_clip": 1.03452277, + "balance_loss_mlp": 1.00059891, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 1.6311736395062653, + "language_loss": 0.7077356, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.72608095, + "num_input_tokens_seen": 357567380, + "step": 16567, + "time_per_iteration": 2.570563316345215 + }, + { + "auxiliary_loss_clip": 0.01097855, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.03432405, + "balance_loss_mlp": 1.0208869, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 2.7640723110560192, + "language_loss": 0.78417814, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.80550116, + "num_input_tokens_seen": 357586435, + "step": 16568, + "time_per_iteration": 2.5282037258148193 + }, + { + "auxiliary_loss_clip": 0.01094276, + "auxiliary_loss_mlp": 0.01026069, + "balance_loss_clip": 1.03673816, + "balance_loss_mlp": 1.01517296, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 1.809154405092218, + "language_loss": 0.8231746, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.84437805, + "num_input_tokens_seen": 357604720, + "step": 16569, + "time_per_iteration": 2.5610239505767822 + }, + { + "auxiliary_loss_clip": 0.01071833, + "auxiliary_loss_mlp": 0.0077778, + "balance_loss_clip": 1.03691018, + "balance_loss_mlp": 1.00056159, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 1.9341960153315876, + "language_loss": 0.70321542, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72171158, + "num_input_tokens_seen": 357622345, + "step": 16570, + "time_per_iteration": 2.544016122817993 + }, + { + "auxiliary_loss_clip": 0.01081336, + "auxiliary_loss_mlp": 0.01025996, + "balance_loss_clip": 1.03820705, + "balance_loss_mlp": 1.01360464, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 1.822045993482313, + "language_loss": 0.74999559, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.77106893, + "num_input_tokens_seen": 357642710, + "step": 16571, + "time_per_iteration": 2.614741086959839 + }, + { + "auxiliary_loss_clip": 0.01087431, + "auxiliary_loss_mlp": 0.01034188, + "balance_loss_clip": 1.03484178, + "balance_loss_mlp": 1.02206469, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 1.7140576507787653, + "language_loss": 0.79583108, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.81704724, + "num_input_tokens_seen": 357659870, + "step": 16572, + "time_per_iteration": 2.4780569076538086 + }, + { + "auxiliary_loss_clip": 0.01084349, + "auxiliary_loss_mlp": 0.01032187, + "balance_loss_clip": 1.03589272, + "balance_loss_mlp": 1.02048707, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 1.845220076942408, + "language_loss": 0.70561045, + "learning_rate": 1.3199841727074e-10, + "loss": 0.72677583, + "num_input_tokens_seen": 357677075, + "step": 16573, + "time_per_iteration": 2.543036937713623 + }, + { + "auxiliary_loss_clip": 0.01084215, + "auxiliary_loss_mlp": 0.01033823, + "balance_loss_clip": 1.03426683, + "balance_loss_mlp": 1.02072167, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 1.7155962987061577, + "language_loss": 0.63270712, + "learning_rate": 1.275618614968721e-10, + "loss": 0.65388751, + "num_input_tokens_seen": 357696715, + "step": 16574, + "time_per_iteration": 2.5597691535949707 + }, + { + "auxiliary_loss_clip": 0.01082838, + "auxiliary_loss_mlp": 0.01033715, + "balance_loss_clip": 1.04217863, + "balance_loss_mlp": 1.0199585, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 3.482878318036601, + "language_loss": 0.76253432, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.78369987, + "num_input_tokens_seen": 357712345, + "step": 16575, + "time_per_iteration": 2.604520320892334 + }, + { + "auxiliary_loss_clip": 0.01084039, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.0348587, + "balance_loss_mlp": 1.01851535, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 1.8091830307773418, + "language_loss": 0.70153105, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72267962, + "num_input_tokens_seen": 357731815, + "step": 16576, + "time_per_iteration": 2.525141954421997 + }, + { + "auxiliary_loss_clip": 0.01094405, + "auxiliary_loss_mlp": 0.01028019, + "balance_loss_clip": 1.03545904, + "balance_loss_mlp": 1.01594353, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 1.73473497015512, + "language_loss": 0.72090721, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.74213141, + "num_input_tokens_seen": 357751640, + "step": 16577, + "time_per_iteration": 2.5410425662994385 + }, + { + "auxiliary_loss_clip": 0.01081944, + "auxiliary_loss_mlp": 0.01032807, + "balance_loss_clip": 1.03502321, + "balance_loss_mlp": 1.02089834, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 2.0418224745685922, + "language_loss": 0.78721511, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.8083626, + "num_input_tokens_seen": 357769850, + "step": 16578, + "time_per_iteration": 2.4929611682891846 + }, + { + "auxiliary_loss_clip": 0.01070262, + "auxiliary_loss_mlp": 0.00776942, + "balance_loss_clip": 1.03865337, + "balance_loss_mlp": 1.00060415, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 2.130656592794751, + "language_loss": 0.76358473, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.78205669, + "num_input_tokens_seen": 357789550, + "step": 16579, + "time_per_iteration": 3.9696404933929443 + }, + { + "auxiliary_loss_clip": 0.01085312, + "auxiliary_loss_mlp": 0.01037432, + "balance_loss_clip": 1.03649545, + "balance_loss_mlp": 1.02391934, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 1.9976720103170207, + "language_loss": 0.69492692, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.71615434, + "num_input_tokens_seen": 357809525, + "step": 16580, + "time_per_iteration": 2.6211941242218018 + }, + { + "auxiliary_loss_clip": 0.0106074, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.03592849, + "balance_loss_mlp": 1.01773381, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 2.2709645944223236, + "language_loss": 0.79815835, + "learning_rate": 9.862937031113184e-11, + "loss": 0.81906193, + "num_input_tokens_seen": 357829795, + "step": 16581, + "time_per_iteration": 2.6544101238250732 + }, + { + "auxiliary_loss_clip": 0.01080707, + "auxiliary_loss_mlp": 0.01026495, + "balance_loss_clip": 1.03820276, + "balance_loss_mlp": 1.01559353, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 1.783355233314975, + "language_loss": 0.80090833, + "learning_rate": 9.479950191249031e-11, + "loss": 0.8219803, + "num_input_tokens_seen": 357851655, + "step": 16582, + "time_per_iteration": 2.5539848804473877 + }, + { + "auxiliary_loss_clip": 0.01092953, + "auxiliary_loss_mlp": 0.01031041, + "balance_loss_clip": 1.03318024, + "balance_loss_mlp": 1.01932263, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 1.710351262532123, + "language_loss": 0.60522604, + "learning_rate": 9.104547011951069e-11, + "loss": 0.62646592, + "num_input_tokens_seen": 357871205, + "step": 16583, + "time_per_iteration": 2.478992223739624 + }, + { + "auxiliary_loss_clip": 0.01087756, + "auxiliary_loss_mlp": 0.01036344, + "balance_loss_clip": 1.03589201, + "balance_loss_mlp": 1.02463806, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 1.7036810876508215, + "language_loss": 0.77731717, + "learning_rate": 8.736727507452357e-11, + "loss": 0.79855818, + "num_input_tokens_seen": 357892145, + "step": 16584, + "time_per_iteration": 2.5343542098999023 + }, + { + "auxiliary_loss_clip": 0.01080813, + "auxiliary_loss_mlp": 0.01027587, + "balance_loss_clip": 1.03355289, + "balance_loss_mlp": 1.01639342, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 1.4948902430554545, + "language_loss": 0.69390613, + "learning_rate": 8.376491691697297e-11, + "loss": 0.71499014, + "num_input_tokens_seen": 357911205, + "step": 16585, + "time_per_iteration": 2.5258333683013916 + }, + { + "auxiliary_loss_clip": 0.01106772, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.03605843, + "balance_loss_mlp": 1.01960897, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 5.028815483526634, + "language_loss": 0.81222665, + "learning_rate": 8.023839578363834e-11, + "loss": 0.83361238, + "num_input_tokens_seen": 357928190, + "step": 16586, + "time_per_iteration": 2.3967697620391846 + }, + { + "auxiliary_loss_clip": 0.01084494, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.032377, + "balance_loss_mlp": 1.02123296, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 1.6137344513270235, + "language_loss": 0.78000689, + "learning_rate": 7.678771180796851e-11, + "loss": 0.80117893, + "num_input_tokens_seen": 357946985, + "step": 16587, + "time_per_iteration": 2.5625317096710205 + }, + { + "auxiliary_loss_clip": 0.01081504, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.03558385, + "balance_loss_mlp": 1.02037525, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 1.8256593134938535, + "language_loss": 0.72701311, + "learning_rate": 7.341286512074773e-11, + "loss": 0.74815369, + "num_input_tokens_seen": 357966720, + "step": 16588, + "time_per_iteration": 2.493858575820923 + }, + { + "auxiliary_loss_clip": 0.0111354, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.03689384, + "balance_loss_mlp": 1.01687872, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 2.485134713145113, + "language_loss": 0.8317256, + "learning_rate": 7.011385585031781e-11, + "loss": 0.85315984, + "num_input_tokens_seen": 357981375, + "step": 16589, + "time_per_iteration": 2.4107773303985596 + }, + { + "auxiliary_loss_clip": 0.01103071, + "auxiliary_loss_mlp": 0.01038066, + "balance_loss_clip": 1.03582168, + "balance_loss_mlp": 1.02350497, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 2.0101923269589084, + "language_loss": 0.70010298, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72151434, + "num_input_tokens_seen": 358000290, + "step": 16590, + "time_per_iteration": 2.484041213989258 + }, + { + "auxiliary_loss_clip": 0.01087942, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.03578997, + "balance_loss_mlp": 1.01814651, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 1.8777962347673567, + "language_loss": 0.63677537, + "learning_rate": 6.374335005676634e-11, + "loss": 0.6579656, + "num_input_tokens_seen": 358022075, + "step": 16591, + "time_per_iteration": 2.6408851146698 + }, + { + "auxiliary_loss_clip": 0.01084725, + "auxiliary_loss_mlp": 0.0102974, + "balance_loss_clip": 1.03376329, + "balance_loss_mlp": 1.0180335, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 1.7264168915898421, + "language_loss": 0.73112386, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75226855, + "num_input_tokens_seen": 358043940, + "step": 16592, + "time_per_iteration": 2.6261537075042725 + }, + { + "auxiliary_loss_clip": 0.01087963, + "auxiliary_loss_mlp": 0.01028048, + "balance_loss_clip": 1.0368638, + "balance_loss_mlp": 1.0155195, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 2.357414805137928, + "language_loss": 0.85106266, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87222278, + "num_input_tokens_seen": 358062720, + "step": 16593, + "time_per_iteration": 2.4913461208343506 + }, + { + "auxiliary_loss_clip": 0.01104147, + "auxiliary_loss_mlp": 0.00777046, + "balance_loss_clip": 1.03515136, + "balance_loss_mlp": 1.00059986, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 2.2332467855435816, + "language_loss": 0.69734412, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71615601, + "num_input_tokens_seen": 358081560, + "step": 16594, + "time_per_iteration": 3.8873884677886963 + }, + { + "auxiliary_loss_clip": 0.01069832, + "auxiliary_loss_mlp": 0.01026364, + "balance_loss_clip": 1.03997254, + "balance_loss_mlp": 1.01415133, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 2.07429861926707, + "language_loss": 0.72584522, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.74680716, + "num_input_tokens_seen": 358099065, + "step": 16595, + "time_per_iteration": 2.612579822540283 + }, + { + "auxiliary_loss_clip": 0.01008061, + "auxiliary_loss_mlp": 0.0100078, + "balance_loss_clip": 1.00318336, + "balance_loss_mlp": 0.9996292, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.7887449015742308, + "language_loss": 0.6037662, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62385464, + "num_input_tokens_seen": 358156095, + "step": 16596, + "time_per_iteration": 2.929790735244751 + }, + { + "auxiliary_loss_clip": 0.01097353, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.03634381, + "balance_loss_mlp": 1.02248907, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 1.836766577785098, + "language_loss": 0.77516258, + "learning_rate": 4.645194309227385e-11, + "loss": 0.79648018, + "num_input_tokens_seen": 358175230, + "step": 16597, + "time_per_iteration": 2.477062463760376 + }, + { + "auxiliary_loss_clip": 0.01098414, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.03482032, + "balance_loss_mlp": 1.01957345, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 2.1593513194443363, + "language_loss": 0.82202148, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84332538, + "num_input_tokens_seen": 358197075, + "step": 16598, + "time_per_iteration": 2.5896694660186768 + }, + { + "auxiliary_loss_clip": 0.01083829, + "auxiliary_loss_mlp": 0.0104365, + "balance_loss_clip": 1.03559661, + "balance_loss_mlp": 1.03007817, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 2.145832809624287, + "language_loss": 0.64598405, + "learning_rate": 4.129484715709175e-11, + "loss": 0.66725874, + "num_input_tokens_seen": 358215925, + "step": 16599, + "time_per_iteration": 2.5090718269348145 + }, + { + "auxiliary_loss_clip": 0.01012729, + "auxiliary_loss_mlp": 0.00998981, + "balance_loss_clip": 1.00890994, + "balance_loss_mlp": 0.99776477, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.8546338369382761, + "language_loss": 0.62263101, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64274812, + "num_input_tokens_seen": 358269035, + "step": 16600, + "time_per_iteration": 2.998063802719116 + }, + { + "auxiliary_loss_clip": 0.01085367, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.03639364, + "balance_loss_mlp": 1.0201292, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 1.6597100089816188, + "language_loss": 0.78590614, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80706942, + "num_input_tokens_seen": 358287680, + "step": 16601, + "time_per_iteration": 2.515044689178467 + }, + { + "auxiliary_loss_clip": 0.01079027, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.035074, + "balance_loss_mlp": 1.02041554, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 1.9054820807667656, + "language_loss": 0.82327247, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84439141, + "num_input_tokens_seen": 358304080, + "step": 16602, + "time_per_iteration": 2.5398662090301514 + }, + { + "auxiliary_loss_clip": 0.01069259, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.03835857, + "balance_loss_mlp": 1.02563214, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 2.0778494852448746, + "language_loss": 0.6287148, + "learning_rate": 3.189071962883538e-11, + "loss": 0.64978695, + "num_input_tokens_seen": 358323670, + "step": 16603, + "time_per_iteration": 2.546210289001465 + }, + { + "auxiliary_loss_clip": 0.01085173, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.03312349, + "balance_loss_mlp": 1.01757526, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 3.1001380588621195, + "language_loss": 0.71317375, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73432422, + "num_input_tokens_seen": 358341980, + "step": 16604, + "time_per_iteration": 3.9707109928131104 + }, + { + "auxiliary_loss_clip": 0.01107305, + "auxiliary_loss_mlp": 0.01026655, + "balance_loss_clip": 1.0355401, + "balance_loss_mlp": 1.0142926, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 1.7356054560399652, + "language_loss": 0.64515835, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66649795, + "num_input_tokens_seen": 358360400, + "step": 16605, + "time_per_iteration": 2.4189348220825195 + }, + { + "auxiliary_loss_clip": 0.01074481, + "auxiliary_loss_mlp": 0.01028943, + "balance_loss_clip": 1.03538704, + "balance_loss_mlp": 1.01777923, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 1.7387557465197239, + "language_loss": 0.71604753, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73708177, + "num_input_tokens_seen": 358378990, + "step": 16606, + "time_per_iteration": 3.97162127494812 + }, + { + "auxiliary_loss_clip": 0.01097031, + "auxiliary_loss_mlp": 0.00776286, + "balance_loss_clip": 1.03534842, + "balance_loss_mlp": 1.00057006, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 1.9934026962752962, + "language_loss": 0.82005775, + "learning_rate": 2.370001590090709e-11, + "loss": 0.83879095, + "num_input_tokens_seen": 358395970, + "step": 16607, + "time_per_iteration": 2.464900493621826 + }, + { + "auxiliary_loss_clip": 0.01075528, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.03267395, + "balance_loss_mlp": 1.01812387, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 1.7327434062996478, + "language_loss": 0.66848266, + "learning_rate": 2.184193803622669e-11, + "loss": 0.68954372, + "num_input_tokens_seen": 358417355, + "step": 16608, + "time_per_iteration": 2.6378676891326904 + }, + { + "auxiliary_loss_clip": 0.01065531, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.03701103, + "balance_loss_mlp": 1.01941586, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 2.992833323329804, + "language_loss": 0.80892009, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.82989347, + "num_input_tokens_seen": 358434345, + "step": 16609, + "time_per_iteration": 2.596517562866211 + }, + { + "auxiliary_loss_clip": 0.01087966, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.03456759, + "balance_loss_mlp": 1.02038765, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 1.5247217961559936, + "language_loss": 0.63079923, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.65200996, + "num_input_tokens_seen": 358452870, + "step": 16610, + "time_per_iteration": 2.507598400115967 + }, + { + "auxiliary_loss_clip": 0.01090005, + "auxiliary_loss_mlp": 0.01033679, + "balance_loss_clip": 1.03433168, + "balance_loss_mlp": 1.02219963, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 5.478660283086222, + "language_loss": 0.67571944, + "learning_rate": 1.672274094288717e-11, + "loss": 0.69695628, + "num_input_tokens_seen": 358472210, + "step": 16611, + "time_per_iteration": 2.4633641242980957 + }, + { + "auxiliary_loss_clip": 0.01068317, + "auxiliary_loss_mlp": 0.01038698, + "balance_loss_clip": 1.03631377, + "balance_loss_mlp": 1.02548993, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 1.5536136669915894, + "language_loss": 0.69637871, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71744883, + "num_input_tokens_seen": 358493840, + "step": 16612, + "time_per_iteration": 2.645962715148926 + }, + { + "auxiliary_loss_clip": 0.01082614, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.0355171, + "balance_loss_mlp": 1.02078998, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 1.5007962873299399, + "language_loss": 0.73804826, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.75919497, + "num_input_tokens_seen": 358515060, + "step": 16613, + "time_per_iteration": 2.571424961090088 + }, + { + "auxiliary_loss_clip": 0.01078265, + "auxiliary_loss_mlp": 0.00783333, + "balance_loss_clip": 1.03917956, + "balance_loss_mlp": 1.00053585, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 1.9301182038798632, + "language_loss": 0.73479152, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.75340754, + "num_input_tokens_seen": 358528200, + "step": 16614, + "time_per_iteration": 2.5300674438476562 + }, + { + "auxiliary_loss_clip": 0.01094491, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.03406143, + "balance_loss_mlp": 1.02043903, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 2.6615398473511047, + "language_loss": 0.73148835, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.75275016, + "num_input_tokens_seen": 358548360, + "step": 16615, + "time_per_iteration": 2.497079610824585 + }, + { + "auxiliary_loss_clip": 0.0111164, + "auxiliary_loss_mlp": 0.00777834, + "balance_loss_clip": 1.03746223, + "balance_loss_mlp": 1.0006485, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 5.9265875276611135, + "language_loss": 0.77552772, + "learning_rate": 9.70753783247069e-12, + "loss": 0.79442251, + "num_input_tokens_seen": 358566270, + "step": 16616, + "time_per_iteration": 2.468456506729126 + }, + { + "auxiliary_loss_clip": 0.01082275, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.03598928, + "balance_loss_mlp": 1.01674986, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 1.8821069138648205, + "language_loss": 0.83053666, + "learning_rate": 8.532016508855378e-12, + "loss": 0.85164726, + "num_input_tokens_seen": 358584710, + "step": 16617, + "time_per_iteration": 2.4676363468170166 + }, + { + "auxiliary_loss_clip": 0.01087362, + "auxiliary_loss_mlp": 0.01026289, + "balance_loss_clip": 1.03560746, + "balance_loss_mlp": 1.01515448, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 1.563168768355281, + "language_loss": 0.78702366, + "learning_rate": 7.43233506206309e-12, + "loss": 0.80816019, + "num_input_tokens_seen": 358606750, + "step": 16618, + "time_per_iteration": 4.002735376358032 + }, + { + "auxiliary_loss_clip": 0.01107486, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.03618193, + "balance_loss_mlp": 1.01741111, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 1.838456085934967, + "language_loss": 0.74477351, + "learning_rate": 6.408493534060255e-12, + "loss": 0.76614487, + "num_input_tokens_seen": 358624675, + "step": 16619, + "time_per_iteration": 2.452850341796875 + }, + { + "auxiliary_loss_clip": 0.01093166, + "auxiliary_loss_mlp": 0.01027824, + "balance_loss_clip": 1.03319764, + "balance_loss_mlp": 1.01674891, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 2.0345467914616324, + "language_loss": 0.86406505, + "learning_rate": 5.460491963260594e-12, + "loss": 0.88527501, + "num_input_tokens_seen": 358640715, + "step": 16620, + "time_per_iteration": 2.4809741973876953 + }, + { + "auxiliary_loss_clip": 0.01069509, + "auxiliary_loss_mlp": 0.01026629, + "balance_loss_clip": 1.03078341, + "balance_loss_mlp": 1.0148927, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 2.104513310366952, + "language_loss": 0.7266593, + "learning_rate": 4.58833038607942e-12, + "loss": 0.74762058, + "num_input_tokens_seen": 358659630, + "step": 16621, + "time_per_iteration": 2.6518168449401855 + }, + { + "auxiliary_loss_clip": 0.01010088, + "auxiliary_loss_mlp": 0.0100401, + "balance_loss_clip": 1.01582718, + "balance_loss_mlp": 1.00276458, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.8968174460186399, + "language_loss": 0.5651502, + "learning_rate": 3.79200883515729e-12, + "loss": 0.58529121, + "num_input_tokens_seen": 358727840, + "step": 16622, + "time_per_iteration": 3.3075969219207764 + }, + { + "auxiliary_loss_clip": 0.01068499, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.03163457, + "balance_loss_mlp": 1.01937795, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 1.8719690016996677, + "language_loss": 0.71172571, + "learning_rate": 3.071527340914315e-12, + "loss": 0.73273158, + "num_input_tokens_seen": 358744125, + "step": 16623, + "time_per_iteration": 2.483433723449707 + }, + { + "auxiliary_loss_clip": 0.01065979, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.03640342, + "balance_loss_mlp": 1.01914144, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 2.2386025073739266, + "language_loss": 0.74494374, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.76592547, + "num_input_tokens_seen": 358761420, + "step": 16624, + "time_per_iteration": 2.5091946125030518 + }, + { + "auxiliary_loss_clip": 0.01075168, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.03639579, + "balance_loss_mlp": 1.01762223, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 1.5079372765553807, + "language_loss": 0.73744482, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.75849485, + "num_input_tokens_seen": 358782600, + "step": 16625, + "time_per_iteration": 2.6014578342437744 + }, + { + "auxiliary_loss_clip": 0.01093041, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.03415775, + "balance_loss_mlp": 1.0200057, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 2.829617383048552, + "language_loss": 0.77432644, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.7955727, + "num_input_tokens_seen": 358801220, + "step": 16626, + "time_per_iteration": 2.486210823059082 + }, + { + "auxiliary_loss_clip": 0.01105489, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.03609276, + "balance_loss_mlp": 1.022802, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 1.8615835690829612, + "language_loss": 0.82271969, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84411997, + "num_input_tokens_seen": 358819190, + "step": 16627, + "time_per_iteration": 2.4715166091918945 + }, + { + "auxiliary_loss_clip": 0.01095212, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.03601956, + "balance_loss_mlp": 1.02379942, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 1.8764083932825364, + "language_loss": 0.70870954, + "learning_rate": 6.067215747584952e-13, + "loss": 0.73003161, + "num_input_tokens_seen": 358839850, + "step": 16628, + "time_per_iteration": 2.5205745697021484 + }, + { + "auxiliary_loss_clip": 0.01098296, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.03420603, + "balance_loss_mlp": 1.01682878, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 1.4654205331476053, + "language_loss": 0.75515521, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77642751, + "num_input_tokens_seen": 358859805, + "step": 16629, + "time_per_iteration": 2.463367223739624 + }, + { + "auxiliary_loss_clip": 0.01090272, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.03693652, + "balance_loss_mlp": 1.02081645, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 1.8133711264245815, + "language_loss": 0.60408998, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62532651, + "num_input_tokens_seen": 358877900, + "step": 16630, + "time_per_iteration": 2.459104537963867 + }, + { + "auxiliary_loss_clip": 0.0106586, + "auxiliary_loss_mlp": 0.0102537, + "balance_loss_clip": 1.03373599, + "balance_loss_mlp": 1.01292419, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 2.0872767716113287, + "language_loss": 0.60793078, + "learning_rate": 3.792010017100722e-14, + "loss": 0.62884307, + "num_input_tokens_seen": 358897285, + "step": 16631, + "time_per_iteration": 2.561814308166504 + }, + { + "auxiliary_loss_clip": 0.01049324, + "auxiliary_loss_mlp": 0.00777895, + "balance_loss_clip": 1.03416288, + "balance_loss_mlp": 1.00052953, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 1.8113888334143962, + "language_loss": 0.72381723, + "learning_rate": 0.0, + "loss": 0.74208939, + "num_input_tokens_seen": 358911570, + "step": 16632, + "time_per_iteration": 2.5323617458343506 + }, + { + "epoch": 0.999969938373666, + "num_input_tokens_seen": 358911570, + "step": 16632, + "total_flos": 1.3992169073237033e+18, + "train_loss": 0.7695426323010751, + "train_runtime": 46454.0317, + "train_samples_per_second": 14.322, + "train_steps_per_second": 0.358 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992169073237033e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/training_args.bin b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c25011e2e7ba409b57de99a91ed3472ec6986a63 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3368de8c7027967a2c74be84a935a4090535aa9e8533641e4f4b02232e6e70a +size 7992